xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision cfc3fa87da824bdf593eb8b817a83b6376e77935)
1 /*
2  * Copyright 2001-2008, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 //! B+Tree implementation
10 
11 
12 #include "Debug.h"
13 #include "BPlusTree.h"
14 #include "Inode.h"
15 #include "Utility.h"
16 
17 
18 #ifdef DEBUG
class NodeChecker {
	public:
		// RAII debugging helper: runs the node's integrity check on
		// construction and again on destruction, so that corruption
		// introduced within the guarded scope is reported together with
		// the caller-supplied context string.
		NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text)
			:
			fNode(node),
			fSize(nodeSize),
			fText(text)
		{
			Check("integrity check failed on construction.");
		}

		~NodeChecker()
		{
			Check("integrity check failed on destruction.");
		}

		// Runs CheckIntegrity() on the wrapped node and logs the given
		// message (prefixed with the context text) on failure.
		void
		Check(const char *message)
		{
			if (fNode->CheckIntegrity(fSize) < B_OK)
				dprintf("%s: %s\n", fText, message);
		}

	private:
		const bplustree_node	*fNode;	// node under observation (not owned)
		int32					fSize;	// node size in bytes
		const char				*fText;	// context string for log output
};
47 #endif
48 
49 
50 // Node Caching for the BPlusTree class
51 //
52 // With write support, there is the need for a function that allocates new
53 // nodes by either returning empty nodes, or by growing the file's data stream
54 //
55 // !! The CachedNode class assumes that you have properly locked the stream
56 // !! before asking for nodes.
57 //
58 // Note: This code will fail if the block size is smaller than the node size!
59 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
60 // is hard-coded to 1024 bytes, that's not an issue now.
61 
62 void
63 CachedNode::UnsetUnchanged(Transaction &transaction)
64 {
65 	if (fTree == NULL || fTree->fStream == NULL)
66 		return;
67 
68 	if (fNode != NULL) {
69 		void *cache = fTree->fStream->GetVolume()->BlockCache();
70 
71 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
72 		block_cache_put(cache, fBlockNumber);
73 		fNode = NULL;
74 	}
75 }
76 
77 
78 void
79 CachedNode::Unset()
80 {
81 	if (fTree == NULL || fTree->fStream == NULL)
82 		return;
83 
84 	if (fNode != NULL) {
85 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber);
86 		fNode = NULL;
87 	}
88 }
89 
90 
/*!	Fetches the node at byte position \a offset in the tree stream,
	read-only.
	Returns NULL if the tree is unusable, the offset is out of bounds or
	misaligned, the block could not be read, or - with \a check set -
	the node fails the link/key-count sanity checks.
*/
const bplustree_node *
CachedNode::SetTo(off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(NULL, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}
119 
120 
/*!	Fetches the node at byte position \a offset in the tree stream,
	writable within the given \a transaction.
	Returns NULL if the tree is unusable, the offset is out of bounds or
	misaligned, the block could not be fetched, or - with \a check set -
	the node fails the link/key-count sanity checks.
*/
bplustree_node *
CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(&transaction, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}
149 
150 
151 bplustree_node *
152 CachedNode::MakeWritable(Transaction &transaction)
153 {
154 	if (fNode == NULL)
155 		return NULL;
156 
157 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
158 			fBlockNumber, transaction.ID()) == B_OK)
159 		return fNode;
160 
161 	return NULL;
162 }
163 
164 
165 bplustree_header *
166 CachedNode::MakeWritableHeader(Transaction &transaction)
167 {
168 	return (bplustree_header *)MakeWritable(transaction);
169 }
170 
171 
172 const bplustree_header *
173 CachedNode::SetToHeader()
174 {
175 	if (fTree == NULL || fTree->fStream == NULL) {
176 		REPORT_ERROR(B_BAD_VALUE);
177 		return NULL;
178 	}
179 
180 	Unset();
181 
182 	InternalSetTo(NULL, 0LL);
183 	return (bplustree_header *)fNode;
184 }
185 
186 
187 bplustree_header *
188 CachedNode::SetToWritableHeader(Transaction &transaction)
189 {
190 	if (fTree == NULL || fTree->fStream == NULL) {
191 		REPORT_ERROR(B_BAD_VALUE);
192 		return NULL;
193 	}
194 
195 	Unset();
196 
197 	InternalSetTo(&transaction, 0LL);
198 	return (bplustree_header *)fNode;
199 }
200 
201 
/*!	Does the actual work for SetTo()/SetToHeader() and their writable
	variants: translates the tree-internal byte \a offset into a block
	number of the underlying stream, fetches that block from the block
	cache (writable within \a transaction if one is given, read-only
	otherwise), and points fNode at the node inside that block.
	Returns the node, or NULL if the offset lies beyond the stream's
	size or the block could not be retrieved.
*/
bplustree_node *
CachedNode::InternalSetTo(Transaction *transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume *volume = fTree->fStream->GetVolume();

		// fileOffset is the stream position at which the found block_run
		// starts; blockOffset selects the block within that run
		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8 *block;

		if (transaction != NULL) {
			block = (uint8 *)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// the node is somewhere in that block... the subtrahend is the
			// stream position at which the fetched block begins, so the
			// difference is the node's byte offset inside the block
			fNode = (bplustree_node *)(block + offset -
						(fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}
236 
237 
/*!	Prepends the currently cached node (located at \a offset) to the
	tree's free node list and marks it free by setting its overflow
	link to BPLUSTREE_FREE.
	NOTE(review): this writes through fNode directly - it presumably
	expects the node to have been fetched writable already; confirm
	against the callers.
*/
status_t
CachedNode::Free(Transaction &transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// TODO: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done every time that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

#if 0
	// TODO: temporarily disabled because CheckNode() doesn't like this...
	// 		Also, it's such an edge case that it's almost useless, anyway.
	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}
#endif

	// add the node to the free nodes list: its left link keeps the old
	// list head (both values are already disk endian)
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}
276 
277 
/*!	Hands out a writable, initialized node: either recycles the head of
	the tree's free node list, or appends a new node to the tree stream
	(growing the header's maximum_size accordingly).
	On success, \a _node points to the node and \a _offset receives its
	byte position within the tree.
*/
status_t
CachedNode::Allocate(Transaction &transaction, bplustree_node **_node,
	off_t *_offset)
{
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header *header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer (a free node stores its successor in
		// its left link, see Free())
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode *stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		- fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}
333 
334 
335 //	#pragma mark -
336 
337 
338 BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize)
339 	:
340 	fStream(NULL),
341 	fHeader(NULL),
342 	fCachedHeader(this)
343 {
344 	SetTo(transaction, stream);
345 }
346 
347 
/*!	Opens an existing on-disk B+Tree from the given \a stream; check
	InitCheck() for the result of the operation.
*/
BPlusTree::BPlusTree(Inode *stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(stream);
}
356 
357 
/*!	Creates an uninitialized tree object; a later SetTo() is required
	before it can be used (InitCheck() returns B_NO_INIT until then).
*/
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
}
368 
369 
370 BPlusTree::~BPlusTree()
371 {
372 	// if there are any TreeIterators left, we need to stop them
373 	// (can happen when the tree's inode gets deleted while
374 	// traversing the tree - a TreeIterator doesn't lock the inode)
375 	if (fIteratorLock.Lock() < B_OK)
376 		return;
377 
378 	TreeIterator *iterator = NULL;
379 	while ((iterator = fIterators.Next(iterator)) != NULL)
380 		iterator->Stop();
381 
382 	fIteratorLock.Unlock();
383 }
384 
385 
/*!	Creates a new B+Tree on the specified stream: makes sure the stream
	is large enough for a header plus one node, writes a fresh header,
	and initializes an empty root node.
	Sets fStatus and returns it.
*/
status_t
BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	// duplicates are allowed for indices and for anything explicitly
	// flagged S_ALLOW_DUPS; the BlockRun() == Parent() exception
	// presumably identifies the index root directory - TODO confirm
	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header (all fields are stored disk endian)
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node (located right after the header)
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}
432 
433 
/*!	Attaches this object to an existing on-disk B+Tree in \a stream and
	validates the header: magic, root/free link sanity, node alignment,
	and the stored data type against the inode's index mode.
	Sets fStatus and returns it.
*/
status_t
BPlusTree::SetTo(Inode *stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?

	if (fHeader->MaximumSize() != stream->Size()) {
		// report the inconsistency, but don't fail because of it
		FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size()));
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char *)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	{
		// table to map the on-disk key type back to the inode's index
		// mode bit (indexed by BPLUSTREE_*_TYPE)
		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
			S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
			S_DOUBLE_INDEX};
		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
			| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
			| S_FLOAT_INDEX | S_DOUBLE_INDEX);

		// the data type must be in range, and for indices it has to
		// match the mode bits of the inode
		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| ((stream->Mode() & S_INDEX_DIR) != 0
				&& toMode[fHeader->DataType()] != mode)
			|| !stream->IsContainer()) {
			D(	dump_bplustree_header(fHeader);
				dump_inode(&stream->Node());
			);
			RETURN_ERROR(fStatus = B_BAD_TYPE);
		}

		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
		// in the original BFS code - we will honour it nevertheless
		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
			|| (stream->Mode() & S_ALLOW_DUPS) != 0;
	}

	// make sure the root node can actually be read
	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}
498 
499 
/*!	Returns the status of the last SetTo() call, or B_NO_INIT if the
	tree was never initialized.
*/
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}
505 
506 
507 int32
508 BPlusTree::TypeCodeToKeyType(type_code code)
509 {
510 	switch (code) {
511 		case B_STRING_TYPE:
512 			return BPLUSTREE_STRING_TYPE;
513 		case B_SSIZE_T_TYPE:
514 		case B_INT32_TYPE:
515 			return BPLUSTREE_INT32_TYPE;
516 		case B_SIZE_T_TYPE:
517 		case B_UINT32_TYPE:
518 			return BPLUSTREE_UINT32_TYPE;
519 		case B_OFF_T_TYPE:
520 		case B_INT64_TYPE:
521 			return BPLUSTREE_INT64_TYPE;
522 		case B_UINT64_TYPE:
523 			return BPLUSTREE_UINT64_TYPE;
524 		case B_FLOAT_TYPE:
525 			return BPLUSTREE_FLOAT_TYPE;
526 		case B_DOUBLE_TYPE:
527 			return BPLUSTREE_DOUBLE_TYPE;
528 	}
529 	return -1;
530 }
531 
532 
533 int32
534 BPlusTree::ModeToKeyType(mode_t mode)
535 {
536 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
537 				   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
538 		case S_INT_INDEX:
539 			return BPLUSTREE_INT32_TYPE;
540 		case S_UINT_INDEX:
541 			return BPLUSTREE_UINT32_TYPE;
542 		case S_LONG_LONG_INDEX:
543 			return BPLUSTREE_INT64_TYPE;
544 		case S_ULONG_LONG_INDEX:
545 			return BPLUSTREE_UINT64_TYPE;
546 		case S_FLOAT_INDEX:
547 			return BPLUSTREE_FLOAT_TYPE;
548 		case S_DOUBLE_INDEX:
549 			return BPLUSTREE_DOUBLE_TYPE;
550 		case S_STR_INDEX:
551 		default:
552 			// default is for standard directories
553 			return BPLUSTREE_STRING_TYPE;
554 	}
555 }
556 
557 
558 //	#pragma mark -
559 
560 
561 void
562 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
563 	uint16 splitAt, int8 change)
564 {
565 	// Although every iterator which is affected by this update currently
566 	// waits on a semaphore, other iterators could be added/removed at
567 	// any time, so we need to protect this loop
568 	if (fIteratorLock.Lock() < B_OK)
569 		return;
570 
571 	TreeIterator *iterator = NULL;
572 	while ((iterator = fIterators.Next(iterator)) != NULL)
573 		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);
574 
575 	fIteratorLock.Unlock();
576 }
577 
578 
579 void
580 BPlusTree::_AddIterator(TreeIterator *iterator)
581 {
582 	if (fIteratorLock.Lock() < B_OK)
583 		return;
584 
585 	fIterators.Add(iterator);
586 
587 	fIteratorLock.Unlock();
588 }
589 
590 
591 void
592 BPlusTree::_RemoveIterator(TreeIterator *iterator)
593 {
594 	if (fIteratorLock.Lock() < B_OK)
595 		return;
596 
597 	fIterators.Remove(iterator);
598 
599 	fIteratorLock.Unlock();
600 }
601 
602 
603 int32
604 BPlusTree::_CompareKeys(const void *key1, int keyLength1, const void *key2,
605 	int keyLength2)
606 {
607 	type_code type = 0;
608 	switch (fHeader->data_type) {
609 	    case BPLUSTREE_STRING_TYPE:
610 	    	type = B_STRING_TYPE;
611 	    	break;
612 		case BPLUSTREE_INT32_TYPE:
613 	    	type = B_INT32_TYPE;
614 	    	break;
615 		case BPLUSTREE_UINT32_TYPE:
616 	    	type = B_UINT32_TYPE;
617 	    	break;
618 		case BPLUSTREE_INT64_TYPE:
619 	    	type = B_INT64_TYPE;
620 	    	break;
621 		case BPLUSTREE_UINT64_TYPE:
622 	    	type = B_UINT64_TYPE;
623 	    	break;
624 		case BPLUSTREE_FLOAT_TYPE:
625 	    	type = B_FLOAT_TYPE;
626 	    	break;
627 		case BPLUSTREE_DOUBLE_TYPE:
628 	    	type = B_DOUBLE_TYPE;
629 	    	break;
630 	}
631    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
632 }
633 
634 
/*!	Searches the given node for \a key using binary search.
	On an exact match, returns B_OK, stores the key's position in
	\a _index and its value in \a _next (which is the child node offset
	if \a node is an index node).
	Otherwise returns B_ENTRY_NOT_FOUND; \a _index is then the position
	where the key would have to be inserted, and \a _next is the child
	node to descend into (the overflow link if the key is greater than
	all keys in the node).
*/
status_t
BPlusTree::_FindKey(const bplustree_node *node, const uint8 *key,
	uint16 keyLength, uint16 *_index, off_t *_next)
{
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "find");
#endif

	// empty node: insertion position 0, descend via the overflow link
	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t *values = node->Values();
	int16 saveIndex = -1;
		// tracks the would-be insertion position when no exact match

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8 *searchKey = node->KeyAt(i, &searchLength);
		// make sure the key data actually lies within the node before
		// touching it - corruption here would otherwise read out of
		// bounds
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8 *)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			// exact match
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		// past the last key, the overflow link is the right child
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}
692 
693 
/*!
	Prepares the stack to contain all nodes that were passed while
	following the key, from the root node to the leaf node that could
	or should contain that key.
	Returns B_OK once the leaf has been pushed, B_ERROR if a node could
	not be read, the tree links to itself, or the walk exceeds the
	maximum number of levels recorded in the header.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key> &stack, const uint8 *key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node *node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		// a node linking to itself would loop forever
		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		// guard against corrupted trees that are deeper than the header
		// claims
		if ((uint32)stack.CountItems() > fHeader->MaxNumberOfLevels()) {
			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
			break;
		}

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::_SeekDown() could not open node %Ld\n",
		nodeAndKey.nodeOffset));
	return B_ERROR;
}
741 
742 
/*!
	This will find a free duplicate fragment in the given bplustree_node.
	The CachedNode will be set to the writable fragment on success.
	Scans the values of \a node for links of type
	BPLUSTREE_DUPLICATE_FRAGMENT and looks for an unused (count == 0)
	array slot inside each referenced fragment node; on success,
	\a _fragment points to the writable fragment node, \a _offset to its
	position in the tree, and \a _index is the free slot.
	Returns B_ENTRY_NOT_FOUND if no fragment has room left.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction &transaction,
	const bplustree_node *node, CachedNode &cached,
	off_t *_offset, bplustree_node **_fragment, uint32 *_index)
{
	off_t *values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node *fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			// just skip unreadable fragments, another one may still do
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array *array = fragment->FragmentAt(j);

			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}
786 
787 
/*!	Inserts \a value as a duplicate of the key at \a index in \a node.
	Depending on the current state of that key's value, this either
	- creates a new duplicate fragment holding the old and the new value,
	- inserts into an existing fragment array (or promotes it to a full
	  duplicate node when the fragment overflows), or
	- appends to the chain of dedicated duplicate nodes, allocating a
	  new one when the last node in the chain is full.
	\a cached must hold \a node; it is made writable when the node's
	value array needs to be relinked.
*/
status_t
BPlusTree::_InsertDuplicate(Transaction &transaction, CachedNode &cached,
	const bplustree_node *node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
				bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			// sanity check the fragment array before using it
			duplicate_array *array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue),
					array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				// the fragment still has room for the new value
				array->Insert(value);
			} else {
				// test if the fragment will be empty if we remove this key's values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// the node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					// move our array to the node-wide duplicate array and
					// detach the node from any neighbours
					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// the old duplicate has not been touched, so we can reuse it

					bplustree_node *newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// copy the array from the fragment node to the duplicate node
					// and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
						array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
					BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key,
		// walking the right links of the chain until a node with room
		// (or the end of the chain) is found
		duplicate_array *array;
		const bplustree_node *duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
					duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			// the last node in the chain still has room
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node *newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
			if (status < B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node *fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		if ((status = cachedDuplicate.Allocate(transaction, &fragment,
				&offset)) < B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	// the fragment holds both the previously single value and the new one
	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}
944 
945 
/*!	Inserts the given \a key with its \a value at position \a index into
	the (writable) \a node, shifting the value array, the key length
	index, and the packed key data to make room.
	The caller must have verified that the key fits into the node.
*/
void
BPlusTree::_InsertKey(bplustree_node *node, uint16 index, uint8 *key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	// grab the array positions before the counts change - Values() and
	// KeyLengths() are computed from all_key_count/all_key_length
	off_t *values = node->Values();
	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	// the new positions after the counts were bumped
	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index (the entries are cumulative end
	// offsets, so everything behind the new key grows by keyLength)
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}
992 
993 
994 /*!
995 	Splits the \a node into two halves - the other half will be put into \a other.
996 	It also takes care to create a new overflow link if the node to split is an
997 	index node.
998 */
status_t
BPlusTree::_SplitNode(bplustree_node *node, off_t nodeOffset,
	bplustree_node *other, off_t otherOffset, uint16 *_keyIndex, uint8 *key,
	uint16 *_keyLength, off_t *_value)
{
	// On input, *_keyIndex/key/*_keyLength/*_value describe the new key to
	// insert into \a node; on success they are overwritten with the
	// separator key that the caller must insert into the parent node (its
	// value then points to \a other).
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	uint16 *inKeyLengths = node->KeyLengths();
	off_t *inKeyValues = node->Values();
	uint8 *inKeys = node->Keys();
	uint8 *outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	if (keyIndex > node->NumKeys()) {
		FATAL(("key index out of bounds: %d, num keys: %u\n", (int)keyIndex,
			node->NumKeys()));
		return B_BAD_VALUE;
	}

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	size_t size = fNodeSize >> 1;
	int32 out, in;
	// "in" counts keys taken from the old node, "out" counts keys placed
	// into the new ("other") node; the new key occupies an "out" slot
	// without consuming an "in" one
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes) {
			bytesBefore = in > 0
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
		}

		if (in == keyIndex && !bytes) {
			bytes = *_keyLength;
		} else {
			if (keyIndex < out) {
				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
					- bytesBefore;
			}

			in++;
		}
		out++;

		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	// the new ("other") node becomes the left sibling of "node"
	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16 *outKeyLengths = other->KeyLengths();
	off_t *outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
				bytesAfter);
			keys = out - keyIndex - 1;
			// key lengths are cumulative end offsets, so the following
			// entries all shift by the size of the inserted key
			for (int32 i = 0;i < keys;i++) {
				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes);
			}
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
				keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8 *newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8 *droppedKey = node->KeyAt(in, &newLength);
			// defend against a corrupted node before dereferencing
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
					> (uint8 *)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8 *)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;

			newAllocated = true;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
			bytes = *_keyLength;
			out++;
		} else {
			if (in < node->NumKeys()) {
				// rebase the cumulative key length to the remaining node
				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

				if (bytes) {
					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
						- bytesBefore - bytes;
				}
				out++;
			}
			in++;
		}
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0) {
		if (newAllocated)
			free(newKey);
		return B_BAD_DATA;
	}

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;
	in = out - keyIndex - 1;
		// Note: keyIndex and in will contain invalid values when the new key
		// went to the other node. But in this case bytes and bytesAfter are
		// 0 and subsequently we never use keyIndex and in.

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter) {
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
			bytesAfter);
	}

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	if (bytesAfter) {
		// if byteAfter is > 0, keyIndex is larger than skip
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
			in * sizeof(uint16));
	}

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter) {
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
			in * sizeof(off_t));
	}

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}
1252 
1253 
1254 /*!
1255 	This inserts a key into the tree. The changes made to the tree will
1256 	all be part of the \a transaction.
1257 */
status_t
BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	// record the path from the root down to the leaf the key belongs into
	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	// local copy of the key - _SplitNode() overwrites it with the
	// separator key that has to go into the parent node
	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node *node;

	// Walk back up the recorded path; the leaf is popped first. The loop
	// only continues with a parent when the child had to be split.
	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = _InsertDuplicate(transaction, cached, node,
						nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(round_up(sizeof(bplustree_node)
				+ writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
				+ sizeof(off_t))) < fNodeSize) {
			// it fits - insert here and we are done
			_InsertKey(writableNode, nodeAndKey.keyIndex,
				keyBuffer, keyLength, value);
			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
				nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node *root;
				status_t status = cachedNewRoot.Allocate(transaction, &root,
					&newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node *other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other,
				&otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			// _SplitNode() also replaces keyBuffer/keyLength/value with the
			// separator key to be inserted into the parent node
			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
					&value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction,
					other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node *root = cachedNewRoot.Node();

				_InsertKey(root, 0, keyBuffer, keyLength,
					writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
					nodeAndKey.nodeOffset);

				bplustree_header *header
					= fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
					header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}
1405 
1406 
1407 /*!
1408 	Removes the duplicate index/value pair from the tree.
1409 	It's part of the private tree interface.
1410 */
status_t
BPlusTree::_RemoveDuplicate(Transaction &transaction,
	const bplustree_node *node, CachedNode &cached, uint16 index,
	off_t value)
{
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
		// NOTE(review): "values" is written to below, after
		// cached.MakeWritable() succeeded - this assumes the writable
		// block has the same address as "node"; confirm with the block
		// cache semantics

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array *array = duplicate->FragmentAt(
			bplustree_node::FragmentIndex(oldValue));

		// sanity check the on-disk array size before touching it
		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// if only a single value is left, turn it back into a regular
		// (non-duplicate) entry, and get rid of the fragment array
		if (array->count == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction, duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array *array = NULL;

	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
			duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		cachedDuplicate.UnsetUnchanged(transaction);
			// this node was not modified, don't let it dirty the transaction
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (ie. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// this is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no duplicate
					// anymore
					values[index] = array->values[0];
				} else {
					// move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status;
			if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node *fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}
1588 
1589 
1590 /*!
1591 	Removes the key with the given index from the specified node.
1592 	Since it has to get the key from the node anyway (to obtain it's
1593 	pointer), it's not needed to pass the key & its length, although
1594 	the calling method (BPlusTree::Remove()) have this data.
1595 */
1596 void
1597 BPlusTree::_RemoveKey(bplustree_node *node, uint16 index)
1598 {
1599 	// should never happen, but who knows?
1600 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1601 		FATAL(("Asked me to remove key outer limits: %u\n", index));
1602 		return;
1603 	}
1604 
1605 	off_t *values = node->Values();
1606 
1607 	// if we would have to drop the overflow link, drop
1608 	// the last key instead and update the overflow link
1609 	// to the value of that one
1610 	if (!node->IsLeaf() && index == node->NumKeys())
1611 		node->overflow_link = values[--index];
1612 
1613 	uint16 length;
1614 	uint8 *key = node->KeyAt(index, &length);
1615 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1616 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1617 		FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length,
1618 			(int)fStream->BlockRun().allocation_group, fStream->BlockRun().start));
1619 		fStream->GetVolume()->Panic();
1620 		return;
1621 	}
1622 
1623 	uint16 *keyLengths = node->KeyLengths();
1624 	uint8 *keys = node->Keys();
1625 
1626 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1627 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length);
1628 
1629 	off_t *newValues = node->Values();
1630 	uint16 *newKeyLengths = node->KeyLengths();
1631 
1632 	// move key data
1633 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1634 
1635 	// move and update key lengths
1636 	if (index > 0 && newKeyLengths != keyLengths)
1637 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1638 	for (uint16 i = index; i < node->NumKeys(); i++) {
1639 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1640 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
1641 	}
1642 
1643 	// move values
1644 	if (index > 0)
1645 		memmove(newValues, values, index * sizeof(off_t));
1646 	if (node->NumKeys() > index)
1647 		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
1648 }
1649 
1650 
1651 /*!
1652 	Removes the specified key from the tree. The "value" parameter is only used
1653 	for trees which allow duplicates, so you may safely ignore it.
1654 	It's not an optional parameter, so at least you have to think about it.
1655 */
status_t
BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	// record the path from the root down to the leaf containing the key
	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	node_and_key nodeAndKey;
	const bplustree_node *node;

	// Walk back up the recorded path; the leaf is popped first. The loop
	// only continues with a parent node when a child node was freed, to
	// remove the key that pointed to it.
	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "remove");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);
			if (status < B_OK)
				RETURN_ERROR(status);

			// If we will remove the last key, the iterator will be set
			// to the next node after the current - if there aren't any
			// more nodes, we need a way to prevent the TreeIterators to
			// touch the old node again, we use BPLUSTREE_FREE for this
			off_t next = node->RightLink() == BPLUSTREE_NULL
				? BPLUSTREE_FREE : node->RightLink();
			_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
				? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);

			// is this a duplicate entry?
			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
					node->Values()[nodeAndKey.keyIndex]))) {
				if (fAllowDuplicates) {
					return _RemoveDuplicate(transaction, node, cached,
						nodeAndKey.keyIndex, value);
				} else {
					FATAL(("dupliate node found where no duplicates are allowed!\n"));
					RETURN_ERROR(B_ERROR);
				}
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// if it's an empty root node, we have to convert it
		// to a leaf node by dropping the overflow link, or,
		// if it's already a leaf node, just empty it
		if (nodeAndKey.nodeOffset == fHeader->RootNode()
			&& (node->NumKeys() == 0 || (node->NumKeys() == 1 && node->IsLeaf()))) {
			writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
			writableNode->all_key_count = 0;
			writableNode->all_key_length = 0;

			// if we've made a leaf node out of the root node, we need
			// to reset the maximum number of levels in the header
			if (fHeader->MaxNumberOfLevels() != 1) {
				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
			}
			return B_OK;
		}

		// if there is only one key left, we don't have to remove
		// it, we can just dump the node (index nodes still have
		// the overflow link, so we have to drop the last key)
		if (writableNode->NumKeys() > 1
			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
			_RemoveKey(writableNode, nodeAndKey.keyIndex);
			return B_OK;
		}

		// when we are here, we can just free the node, but
		// we have to update the right/left link of the
		// siblings first
		CachedNode otherCached(this);
		bplustree_node *other = otherCached.SetToWritable(transaction,
			writableNode->LeftLink());
		if (other != NULL)
			other->right_link = writableNode->right_link;

		if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL)
			other->left_link = writableNode->left_link;

		cached.Free(transaction, nodeAndKey.nodeOffset);
			// the next iteration removes the parent's key for this node
	}
	RETURN_ERROR(B_ERROR);
}
1759 
1760 
1761 /*!
1762 	Replaces the value for the key in the tree.
1763 	Returns B_OK if the key could be found and its value replaced,
1764 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1765 	to indicate that something went terribly wrong.
1766 	Note that this doesn't work with duplicates - it will just
1767 	return B_BAD_TYPE if you call this function on a tree where
1768 	duplicates are allowed.
1769 */
1770 status_t
1771 BPlusTree::Replace(Transaction &transaction, const uint8 *key,
1772 	uint16 keyLength, off_t value)
1773 {
1774 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1775 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1776 		|| key == NULL)
1777 		RETURN_ERROR(B_BAD_VALUE);
1778 
1779 	if (fAllowDuplicates)
1780 		RETURN_ERROR(B_BAD_TYPE);
1781 
1782 	// lock access to stream (a read lock is okay for this purpose)
1783 	ReadLocked locked(fStream->Lock());
1784 
1785 	off_t nodeOffset = fHeader->RootNode();
1786 	CachedNode cached(this);
1787 	const bplustree_node *node;
1788 
1789 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1790 		uint16 keyIndex = 0;
1791 		off_t nextOffset;
1792 		status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1793 
1794 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1795 			if (status == B_OK) {
1796 				bplustree_node *writableNode = cached.MakeWritable(transaction);
1797 				if (writableNode != NULL)
1798 					writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value);
1799 				else
1800 					status = B_IO_ERROR;
1801 			}
1802 
1803 			return status;
1804 		} else if (nextOffset == nodeOffset)
1805 			RETURN_ERROR(B_ERROR);
1806 
1807 		nodeOffset = nextOffset;
1808 	}
1809 	RETURN_ERROR(B_ERROR);
1810 }
1811 
1812 
1813 /*!
1814 	Searches the key in the tree, and stores the offset found in
1815 	_value, if successful.
1816 	It's very similar to BPlusTree::SeekDown(), but doesn't fill
1817 	a stack while it descends the tree.
1818 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
1819 	if not. It can also return other errors to indicate that
1820 	something went wrong.
1821 	Note that this doesn't work with duplicates - it will just
1822 	return B_BAD_TYPE if you call this function on a tree where
1823 	duplicates are allowed.
1824 */
status_t
BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	// lock access to stream
	ReadLocked locked(fStream->Lock());

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	const bplustree_node *node;

#ifdef DEBUG
	int32 levels = 0;
#endif

	// descend from the root to the leaf level
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset);

#ifdef DEBUG
		levels++;
#endif
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// we have reached a leaf node; "status" tells us whether the
			// key exists there
			if (status == B_OK && _value != NULL)
				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);

#ifdef DEBUG
			if (levels != (int32)fHeader->MaxNumberOfLevels())
				DEBUGGER(("levels don't match"));
#endif
			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);
				// defend against self references in a corrupted tree

		nodeOffset = nextOffset;
	}
	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
	RETURN_ERROR(B_ERROR);
}
1872 
1873 
1874 //	#pragma mark -
1875 
1876 
/*!	Constructs an iterator for \a tree and registers it with the tree, so
	that the tree can keep it up to date when keys are inserted or removed
	(presumably via _UpdateIterators() - confirm in BPlusTree).
*/
TreeIterator::TreeIterator(BPlusTree *tree)
	:
	fTree(tree),
	fCurrentNodeOffset(BPLUSTREE_NULL),
	fNext(NULL)
{
	tree->_AddIterator(this);
}
1885 
1886 
1887 TreeIterator::~TreeIterator()
1888 {
1889 	if (fTree)
1890 		fTree->_RemoveIterator(this);
1891 }
1892 
1893 
/*!	Positions the iterator at one end of the leaf level: before the first
	key for \a to == BPLUSTREE_BEGIN, or past the last key for
	\a to == BPLUSTREE_END, so that the next Traverse() starts from there.
*/
status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fHeader == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	off_t nodeOffset = fTree->fHeader->RootNode();
	CachedNode cached(fTree);
	const bplustree_node *node;

	// descend to the left-most or right-most leaf
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			// sanity check the node before reading its first value
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
					- 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		// defend against self references in a corrupted tree
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}
1939 
1940 
1941 /*!
1942 	Iterates through the tree in the specified direction.
1943 	When it iterates through duplicates, the "key" is only updated for the
1944 	first entry - if you need to know when this happens, use the "duplicate"
1945 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
1946 	the other duplicates.
1947 	That's not too nice, but saves the 256 bytes that would be needed to
1948 	store the last key - if this will ever become an issue, it will be
1949 	easy to change.
1950 	The other advantage of this is, that the queries can skip all duplicates
1951 	at once when they are not relevant to them.
1952 */
status_t
TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength,
	uint16 maxLength, off_t *value, uint16 *duplicate)
{
	// Stop() invalidates the iterator by clearing fTree
	if (fTree == NULL)
		return B_INTERRUPTED;

	bool forward = direction == BPLUSTREE_FORWARD;

	// an unpositioned iterator implicitly starts at the begin resp. end
	// of the tree
	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
		RETURN_ERROR(B_ERROR);

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	CachedNode cached(fTree);
	const bplustree_node *node;

	// first, hand out any remaining duplicates of the current key
	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented in
		// the same order; since they are all considered as equal, this shouldn't
		// cause any problems

		if (!fIsFragment || fDuplicate < fNumDuplicates)
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// if the node is out of duplicates, we go directly to the next one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode, false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		// no duplicates left - fall through to the regular key traversal
		fDuplicateNode = BPLUSTREE_NULL;
	}

	// remember the current position so we can turn back if we run off the
	// end of the tree
	off_t savedNodeOffset = fCurrentNodeOffset;
	int32 savedKey = fCurrentKey;

	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	// direction is +1 (forward) resp. -1 (backward)
	fCurrentKey += direction;

	// is the current key in the current node?
	while ((forward && fCurrentKey >= node->NumKeys())
			|| (!forward && fCurrentKey < 0)) {
		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			fCurrentKey = forward ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = savedKey;

			return B_ENTRY_NOT_FOUND;
		}
	}

	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?

	// make sure the key we are about to copy lies completely within the node
	uint16 length;
	uint8 *keyStart = node->KeyAt(fCurrentKey, &length);
	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	// copy the key (truncated to the caller's buffer size)
	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);

	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	{
		// terminate string type
		if (length == maxLength)
			length--;
		((char *)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			// return the first duplicate now; subsequent calls continue
			// with the rest via the fDuplicateNode branch above
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// shouldn't happen, but we're dealing here with potentially corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}
2090 
2091 
2092 /*!
2093 	This is more or less a copy of BPlusTree::Find() - but it just
2094 	sets the current position in the iterator, regardless of if the
2095 	key could be found or not.
2096 */
2097 status_t
2098 TreeIterator::Find(const uint8 *key, uint16 keyLength)
2099 {
2100 	if (fTree == NULL)
2101 		return B_INTERRUPTED;
2102 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
2103 		|| key == NULL)
2104 		RETURN_ERROR(B_BAD_VALUE);
2105 
2106 	// lock access to stream
2107 	ReadLocked locked(fTree->fStream->Lock());
2108 
2109 	off_t nodeOffset = fTree->fHeader->RootNode();
2110 
2111 	CachedNode cached(fTree);
2112 	const bplustree_node *node;
2113 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2114 		uint16 keyIndex = 0;
2115 		off_t nextOffset;
2116 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2117 			&nextOffset);
2118 
2119 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2120 			fCurrentNodeOffset = nodeOffset;
2121 			fCurrentKey = keyIndex - 1;
2122 			fDuplicateNode = BPLUSTREE_NULL;
2123 
2124 			return status;
2125 		} else if (nextOffset == nodeOffset)
2126 			RETURN_ERROR(B_ERROR);
2127 
2128 		nodeOffset = nextOffset;
2129 	}
2130 	RETURN_ERROR(B_ERROR);
2131 }
2132 
2133 
void
TreeIterator::SkipDuplicates()
{
	// Aborts iterating over the remaining duplicates of the current key;
	// the next Traverse() call continues with the next key instead.
	fDuplicateNode = BPLUSTREE_NULL;
}
2139 
2140 
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
{
	// Adjusts the iterator's position after the tree has been modified,
	// so that it keeps pointing at the same key.
	// "offset" is the node that changed; "nextOffset" is the new node if
	// it was split (BPLUSTREE_NULL otherwise); "keyIndex" is the position
	// of the change; "change" is the key count delta (+1 insert/-1 remove).

	// only react to changes in the node we are currently standing in
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node was split - presumably the caller only passes a
		// "nextOffset" when our position moved into the new node;
		// TODO(review): confirm against the callers of Update()
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			// both indices become relative to the new node
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// ToDo: duplicate handling!
}
2163 
2164 
void
TreeIterator::Stop()
{
	// Invalidates the iterator: with fTree set to NULL, subsequent
	// Traverse()/Find() calls return B_INTERRUPTED.
	fTree = NULL;
}
2170 
2171 
#ifdef DEBUG
//!	Dumps the iterator's complete state to the debug output (DEBUG builds only).
void
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif
2186 
2187 
2188 //	#pragma mark -
2189 
2190 
2191 void
2192 bplustree_node::Initialize()
2193 {
2194 	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2195 	all_key_count = 0;
2196 	all_key_length = 0;
2197 }
2198 
2199 
2200 uint8 *
2201 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
2202 {
2203 	if (index < 0 || index > NumKeys())
2204 		return NULL;
2205 
2206 	uint8 *keyStart = Keys();
2207 	uint16 *keyLengths = KeyLengths();
2208 
2209 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2210 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2211 	if (index > 0)
2212 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2213 
2214 	return keyStart;
2215 }
2216 
2217 
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// Returns the number of duplicate values stored at "offset": for a
	// duplicate node the count lives in the overflow link, for a fragment
	// in the first off_t entry of the fragment's slot.

	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		// the low 10 bits of a fragment link select the fragment slot;
		// each slot spans NUM_FRAGMENT_VALUES + 1 off_t entries (the first
		// one holds the count) - NUM_FRAGMENT_VALUES + 1 is expected to be
		// 8 here, matching the hard-coded 8 in DuplicateAt()
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t *)this)[fragment];
	}
	return OverflowLink();
}
2231 
2232 
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	// Returns the "index"th duplicate value stored at "offset"; the caller
	// must make sure that index < CountDuplicates(offset, isFragment).
	uint32 start;
	if (isFragment)
		// 8 off_t entries per fragment slot, hard-coded for 1024 byte
		// nodes (see CountDuplicates()); the low 10 bits of the link
		// select the slot
		start = 8 * ((uint64)offset & 0x3ff);
	else
		// skip the left and right links of a duplicate node
		start = 2;

	// the additional +1 skips the count field (resp. the overflow link)
	return ((off_t *)this)[start + 1 + index];
}
2244 
2245 
2246 /*!	Although the name suggests it, this function doesn't return the real
2247 	used fragment count; at least, it can only count to two: it returns
2248 	0, if there is no fragment used, 1 if there is only one fragment
2249 	used, and 2 if there are at least 2 fragments used.
2250 */
2251 uint32
2252 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2253 {
2254 	uint32 used = 0;
2255 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2256 		duplicate_array *array = FragmentAt(i);
2257 		if (array->count > 0 && ++used > 1)
2258 			return used;
2259 	}
2260 	return used;
2261 }
2262 
2263 
#ifdef DEBUG
/*!	Performs some sanity checks on the node's keys and values; returns
	B_BAD_DATA if they are obviously corrupted, B_OK otherwise
	(DEBUG builds only).
*/
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8 *key = KeyAt(i, &length);
		// every key (plus its value and length entry) must lie completely
		// within the node
		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %ld\n", this, i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		// -1 is never a valid value (neither block number nor duplicate link)
		if (Values()[i] == -1) {
			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
#endif
2289 
2290 
2291 //	#pragma mark -
2292 
2293 
2294 int32
2295 compareKeys(type_code type, const void *key1, int keyLength1,
2296 	const void *key2, int keyLength2)
2297 {
2298 	// if one of the keys is NULL, bail out gracefully
2299 	if (key1 == NULL || key2 == NULL) {
2300 		// even if it's most probably a bug in the calling code, we try to
2301 		// give a meaningful result
2302 		if (key1 == NULL && key2 != NULL)
2303 			return 1;
2304 		if (key2 == NULL && key1 != NULL)
2305 			return -1;
2306 
2307 		return 0;
2308 	}
2309 
2310 	switch (type) {
2311 	    case B_STRING_TYPE:
2312     	{
2313 			int result = memcmp(key1, key2, min_c(keyLength1, keyLength2));
2314 			if (result == 0)
2315 				result = keyLength1 - keyLength2;
2316 
2317 			return result;
2318 		}
2319 
2320 		case B_SSIZE_T_TYPE:
2321 		case B_INT32_TYPE:
2322 			return *(int32 *)key1 - *(int32 *)key2;
2323 
2324 		case B_SIZE_T_TYPE:
2325 		case B_UINT32_TYPE:
2326 			if (*(uint32 *)key1 == *(uint32 *)key2)
2327 				return 0;
2328 			if (*(uint32 *)key1 > *(uint32 *)key2)
2329 				return 1;
2330 
2331 			return -1;
2332 
2333 		case B_OFF_T_TYPE:
2334 		case B_INT64_TYPE:
2335 			if (*(int64 *)key1 == *(int64 *)key2)
2336 				return 0;
2337 			if (*(int64 *)key1 > *(int64 *)key2)
2338 				return 1;
2339 
2340 			return -1;
2341 
2342 		case B_UINT64_TYPE:
2343 			if (*(uint64 *)key1 == *(uint64 *)key2)
2344 				return 0;
2345 			if (*(uint64 *)key1 > *(uint64 *)key2)
2346 				return 1;
2347 
2348 			return -1;
2349 
2350 		case B_FLOAT_TYPE:
2351 		{
2352 			float result = *(float *)key1 - *(float *)key2;
2353 			if (result == 0.0f)
2354 				return 0;
2355 
2356 			return (result < 0.0f) ? -1 : 1;
2357 		}
2358 
2359 		case B_DOUBLE_TYPE:
2360 		{
2361 			double result = *(double *)key1 - *(double *)key2;
2362 			if (result == 0.0)
2363 				return 0;
2364 
2365 			return (result < 0.0) ? -1 : 1;
2366 		}
2367 	}
2368 
2369 	// if the type is unknown, the entries don't match...
2370 	return -1;
2371 }
2372 
2373