xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 45bd7bb3db9d9e4dcb02b89a3e7c2bf382c0a88c)
1 /*
2  * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 
10 //! B+Tree implementation
11 
12 
13 #include "Debug.h"
14 #include "BPlusTree.h"
15 #include "Inode.h"
16 #include "Utility.h"
17 
18 
19 #ifdef DEBUG
20 class NodeChecker {
21 public:
22 	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
23 		:
24 		fNode(node),
25 		fSize(nodeSize),
26 		fText(text)
27 	{
28 		Check("integrity check failed on construction.");
29 	}
30 
31 	~NodeChecker()
32 	{
33 		Check("integrity check failed on destruction.");
34 	}
35 
36 	void
37 	Check(const char* message)
38 	{
39 		if (fNode->CheckIntegrity(fSize) < B_OK)
40 			dprintf("%s: %s\n", fText, message);
41 	}
42 
43 private:
44 	const bplustree_node*	fNode;
45 	int32					fSize;
46 	const char*				fText;
47 };
48 #endif
49 
50 
51 // Node Caching for the BPlusTree class
52 //
53 // With write support, there is the need for a function that allocates new
54 // nodes by either returning empty nodes, or by growing the file's data stream
55 //
56 // !! The CachedNode class assumes that you have properly locked the stream
57 // !! before asking for nodes.
58 //
59 // Note: This code will fail if the block size is smaller than the node size!
60 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
61 // is hard-coded to 1024 bytes, that's not an issue now.
62 
63 void
64 CachedNode::UnsetUnchanged(Transaction& transaction)
65 {
66 	if (fTree == NULL || fTree->fStream == NULL)
67 		return;
68 
69 	if (fNode != NULL) {
70 		void* cache = fTree->fStream->GetVolume()->BlockCache();
71 
72 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
73 		block_cache_put(cache, fBlockNumber);
74 		fNode = NULL;
75 	}
76 }
77 
78 
79 void
80 CachedNode::Unset()
81 {
82 	if (fTree == NULL || fTree->fStream == NULL)
83 		return;
84 
85 	if (fNode != NULL) {
86 		if (fWritable && fOffset == 0) {
87 			// The B+tree header has been updated - we need to update the
88 			// BPlusTrees copy of it, as well.
89 			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
90 		}
91 
92 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
93 			fBlockNumber);
94 		fNode = NULL;
95 	}
96 }
97 
98 
99 const bplustree_node*
100 CachedNode::SetTo(off_t offset, bool check)
101 {
102 	if (fTree == NULL || fTree->fStream == NULL) {
103 		REPORT_ERROR(B_BAD_VALUE);
104 		return NULL;
105 	}
106 
107 	Unset();
108 
109 	// You can only ask for nodes at valid positions - you can't
110 	// even access the b+tree header with this method (use SetToHeader()
111 	// instead)
112 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
113 		|| offset <= 0
114 		|| (offset % fTree->fNodeSize) != 0)
115 		return NULL;
116 
117 	if (InternalSetTo(NULL, offset) != NULL && check) {
118 		// sanity checks (links, all_key_count)
119 		if (!fTree->fHeader.CheckNode(fNode)) {
120 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
121 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
122 				fBlockNumber, fTree->fStream->ID()));
123 			return NULL;
124 		}
125 	}
126 	return fNode;
127 }
128 
129 
130 bplustree_node*
131 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
132 {
133 	if (fTree == NULL || fTree->fStream == NULL) {
134 		REPORT_ERROR(B_BAD_VALUE);
135 		return NULL;
136 	}
137 
138 	Unset();
139 
140 	// You can only ask for nodes at valid positions - you can't
141 	// even access the b+tree header with this method (use SetToHeader()
142 	// instead)
143 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
144 		|| offset <= 0
145 		|| (offset % fTree->fNodeSize) != 0)
146 		return NULL;
147 
148 	if (InternalSetTo(&transaction, offset) != NULL && check) {
149 		// sanity checks (links, all_key_count)
150 		if (!fTree->fHeader.CheckNode(fNode)) {
151 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
152 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
153 				fBlockNumber, fTree->fStream->ID()));
154 			return NULL;
155 		}
156 	}
157 	return fNode;
158 }
159 
160 
161 bplustree_node*
162 CachedNode::MakeWritable(Transaction& transaction)
163 {
164 	if (fNode == NULL)
165 		return NULL;
166 
167 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
168 			fBlockNumber, transaction.ID()) == B_OK)
169 		return fNode;
170 
171 	return NULL;
172 }
173 
174 
175 const bplustree_header*
176 CachedNode::SetToHeader()
177 {
178 	if (fTree == NULL || fTree->fStream == NULL) {
179 		REPORT_ERROR(B_BAD_VALUE);
180 		return NULL;
181 	}
182 
183 	Unset();
184 
185 	InternalSetTo(NULL, 0LL);
186 	return (bplustree_header*)fNode;
187 }
188 
189 
/*!	Maps the B+tree header writable within \a transaction and returns it,
	or NULL on failure. On the first writable header access of a
	transaction, the tree registers itself as a transaction listener and
	takes an extra vnode reference so the inode stays alive until the
	transaction is done (see RemovedFromTransaction()).
*/
bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// The header lives at offset 0 of the tree stream
	InternalSetTo(&transaction, 0LL);

	if (fNode != NULL && !fTree->fInTransaction) {
		// First writable header access in this transaction: register the
		// tree so it gets notified when the transaction ends or aborts
		transaction.AddListener(fTree);
		fTree->fInTransaction = true;

		if (!transaction.GetVolume()->IsInitializing()) {
			// Pin the inode; the matching put_vnode() happens in
			// BPlusTree::RemovedFromTransaction()
			acquire_vnode(transaction.GetVolume()->FSVolume(),
				fTree->fStream->ID());
		}
	}

	return (bplustree_header*)fNode;
}
214 
215 
/*!	Common back end for the SetTo*() methods: translates the stream-relative
	\a offset into a disk block, fetches that block from the block cache
	(writable if \a transaction is non-NULL), and points fNode at the node's
	position inside the block.
	Returns the node, or NULL if the offset is beyond the stream or the
	block could not be retrieved.
*/
bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume* volume = fTree->fStream->GetVolume();

		// fileOffset is where the found block_run starts in the stream;
		// step to the block within the run that contains our offset
		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8* block;

		if (transaction != NULL) {
			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block != NULL) {
			// The node is somewhere in that block...
			// (confusing offset calculation): subtract the stream position
			// of the fetched block to get the node's offset within it
			fNode = (bplustree_node*)(block + offset
				- (fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}
251 
252 
/*!	Puts the currently cached node (at \a offset) onto the tree's free node
	list instead of shrinking the file.
	NOTE(review): this writes to fNode's links, so the caller must have set
	this CachedNode writable beforehand - confirm at the call sites.
*/
status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// TODO: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done every time that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	CachedNode cached(fTree);
	bplustree_header* header = cached.SetToWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

#if 0
	// TODO: temporarily disabled because CheckNode() doesn't like this...
	// 		Also, it's such an edge case that it's almost useless, anyway.
	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}
#endif

	// add the node to the free nodes list: the freed node points to the
	// old list head, and is marked free via its overflow link
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}
292 
293 
/*!	Hands out a writable node for use within \a transaction, either by
	recycling the head of the free node list, or by growing the tree stream
	by one node.
	\param _node where the (initialized) node is stored on success.
	\param _offset where the node's stream offset is stored on success.
*/
status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
	off_t* _offset)
{
	if (fTree == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header* header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
		CachedNode cached(fTree);
		header = cached.SetToWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer: the recycled node's left link was the
		// next entry of the free list
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode* stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	CachedNode cached(fTree);
	header = cached.SetToWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	cached.Unset();
		// SetToWritable() below needs the new values in the tree's header

	if (SetToWritable(transaction, offset, false) == NULL)
		RETURN_ERROR(B_ERROR);

	fNode->Initialize();

	*_offset = offset;
	*_node = fNode;
	return B_OK;
}
350 
351 
352 //	#pragma mark -
353 
354 
/*!	Creates a new B+tree on \a stream; the real work is done in SetTo().
	Check InitCheck() for the result.
*/
BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
	:
	fStream(NULL),
	fInTransaction(false)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(transaction, stream);
}
363 
364 
/*!	Opens an existing B+tree from \a stream; the real work is done in
	SetTo(). Check InitCheck() for the result.
*/
BPlusTree::BPlusTree(Inode* stream)
	:
	fStream(NULL),
	fInTransaction(false)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(stream);
}
373 
374 
/*!	Creates an uninitialized B+tree object (InitCheck() reports B_NO_INIT
	until one of the SetTo() variants succeeds).
*/
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fInTransaction(false),
	fStatus(B_NO_INIT)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
}
385 
386 
BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	mutex_lock(&fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Stop();

	// the lock is intentionally destroyed while held; no new iterator can
	// register once the tree is being destructed
	mutex_destroy(&fIteratorLock);
}
401 
402 
403 /*! Create a new B+Tree on the specified stream */
/*!	Initializes a fresh B+tree on \a stream within \a transaction: writes a
	new header plus an empty root node. Existing header contents are
	overwritten. Sets and returns fStatus.
*/
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	CachedNode cached(this);
	bplustree_header* header = cached.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = cached.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header: root node directly follows the header,
	// free list is empty, stream holds exactly header + root node
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	cached.Unset();
		// this also copies the new header into fHeader, which the
		// SetToWritable() below relies on

	// initialize b+tree root node
	cached.SetToWritable(transaction, fHeader.RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}
450 
451 
/*!	Opens the existing B+tree stored in \a stream: reads the on-disk header
	into fHeader and validates it (magic, link sanity, data type vs. inode
	mode). Sets and returns fStatus.
*/
status_t
BPlusTree::SetTo(Inode* stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	fStream = stream;

	// get on-disk B+Tree header

	CachedNode cached(this);
	const bplustree_header* header = cached.SetToHeader();
	if (header != NULL)
		memcpy(&fHeader, header, sizeof(bplustree_header));
	else
		RETURN_ERROR(fStatus = B_IO_ERROR);

	// is header valid?

	if (fHeader.MaximumSize() != stream->Size()) {
		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
		// we can't change the header since we don't have a transaction
		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader.Magic() != BPLUSTREE_MAGIC
		|| (fHeader.RootNode() % fHeader.NodeSize()) != 0
		|| !fHeader.IsValidLink(fHeader.RootNode())
		|| !fHeader.IsValidLink(fHeader.FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(&fHeader);
		dump_block((const char*)&fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader.NodeSize();

	// validity check: the header's data type must agree with the index type
	// encoded in the inode's mode bits (kToMode is indexed by data type)
	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
		S_DOUBLE_INDEX};
	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
		| S_FLOAT_INDEX | S_DOUBLE_INDEX);

	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
		|| ((stream->Mode() & S_INDEX_DIR) != 0
			&& kToMode[fHeader.DataType()] != mode)
		|| !stream->IsContainer()) {
		D(	dump_bplustree_header(&fHeader);
			dump_inode(&stream->Node());
		);
		RETURN_ERROR(fStatus = B_BAD_TYPE);
	}

	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
	// in the original BFS code - we will honour it nevertheless
	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	// final sanity check: the root node must be readable
	cached.SetTo(fHeader.RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}
516 
517 
/*!	Returns the status of the last initialization (B_NO_INIT if no SetTo()
	variant has been called yet).
*/
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}
523 
524 
525 int32
526 BPlusTree::TypeCodeToKeyType(type_code code)
527 {
528 	switch (code) {
529 		case B_STRING_TYPE:
530 			return BPLUSTREE_STRING_TYPE;
531 		case B_SSIZE_T_TYPE:
532 		case B_INT32_TYPE:
533 			return BPLUSTREE_INT32_TYPE;
534 		case B_SIZE_T_TYPE:
535 		case B_UINT32_TYPE:
536 			return BPLUSTREE_UINT32_TYPE;
537 		case B_OFF_T_TYPE:
538 		case B_INT64_TYPE:
539 			return BPLUSTREE_INT64_TYPE;
540 		case B_UINT64_TYPE:
541 			return BPLUSTREE_UINT64_TYPE;
542 		case B_FLOAT_TYPE:
543 			return BPLUSTREE_FLOAT_TYPE;
544 		case B_DOUBLE_TYPE:
545 			return BPLUSTREE_DOUBLE_TYPE;
546 	}
547 	return -1;
548 }
549 
550 
551 int32
552 BPlusTree::ModeToKeyType(mode_t mode)
553 {
554 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
555 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
556 		case S_INT_INDEX:
557 			return BPLUSTREE_INT32_TYPE;
558 		case S_UINT_INDEX:
559 			return BPLUSTREE_UINT32_TYPE;
560 		case S_LONG_LONG_INDEX:
561 			return BPLUSTREE_INT64_TYPE;
562 		case S_ULONG_LONG_INDEX:
563 			return BPLUSTREE_UINT64_TYPE;
564 		case S_FLOAT_INDEX:
565 			return BPLUSTREE_FLOAT_TYPE;
566 		case S_DOUBLE_INDEX:
567 			return BPLUSTREE_DOUBLE_TYPE;
568 		case S_STR_INDEX:
569 		default:
570 			// default is for standard directories
571 			return BPLUSTREE_STRING_TYPE;
572 	}
573 }
574 
575 
576 //	#pragma mark - TransactionListener implementation
577 
578 
579 void
580 BPlusTree::TransactionDone(bool success)
581 {
582 	if (!success) {
583 		// update header from disk
584 		CachedNode cached(this);
585 		const bplustree_header* header = cached.SetToHeader();
586 		if (header != NULL)
587 			memcpy(&fHeader, header, sizeof(bplustree_header));
588 	}
589 }
590 
591 
592 void
593 BPlusTree::RemovedFromTransaction()
594 {
595 	fInTransaction = false;
596 
597 	if (!fStream->GetVolume()->IsInitializing())
598 		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
599 }
600 
601 
602 //	#pragma mark -
603 
604 
605 void
606 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
607 	uint16 splitAt, int8 change)
608 {
609 	// Although every iterator which is affected by this update currently
610 	// waits on a semaphore, other iterators could be added/removed at
611 	// any time, so we need to protect this loop
612 	MutexLocker _(fIteratorLock);
613 
614 	SinglyLinkedList<TreeIterator>::Iterator iterator
615 		= fIterators.GetIterator();
616 	while (iterator.HasNext())
617 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
618 }
619 
620 
621 void
622 BPlusTree::_AddIterator(TreeIterator* iterator)
623 {
624 	MutexLocker _(fIteratorLock);
625 	fIterators.Add(iterator);
626 }
627 
628 
629 void
630 BPlusTree::_RemoveIterator(TreeIterator* iterator)
631 {
632 	MutexLocker _(fIteratorLock);
633 	fIterators.Remove(iterator);
634 }
635 
636 
637 int32
638 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
639 	int keyLength2)
640 {
641 	type_code type = 0;
642 	switch (fHeader.DataType()) {
643 	    case BPLUSTREE_STRING_TYPE:
644 	    	type = B_STRING_TYPE;
645 	    	break;
646 		case BPLUSTREE_INT32_TYPE:
647 	    	type = B_INT32_TYPE;
648 	    	break;
649 		case BPLUSTREE_UINT32_TYPE:
650 	    	type = B_UINT32_TYPE;
651 	    	break;
652 		case BPLUSTREE_INT64_TYPE:
653 	    	type = B_INT64_TYPE;
654 	    	break;
655 		case BPLUSTREE_UINT64_TYPE:
656 	    	type = B_UINT64_TYPE;
657 	    	break;
658 		case BPLUSTREE_FLOAT_TYPE:
659 	    	type = B_FLOAT_TYPE;
660 	    	break;
661 		case BPLUSTREE_DOUBLE_TYPE:
662 	    	type = B_DOUBLE_TYPE;
663 	    	break;
664 	}
665    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
666 }
667 
668 
/*!	Binary-searches \a node for \a key.
	\param _index if non-NULL, receives the key's index on success, or the
		index where the key would have to be inserted on B_ENTRY_NOT_FOUND.
	\param _next if non-NULL, receives the value at the found index, or -
		when not found - the child link to follow next (the overflow link
		if the key sorts after all keys in the node).
	\return B_OK if found, B_ENTRY_NOT_FOUND otherwise, B_BAD_DATA on a
		corrupt key table.
*/
status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
	uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "find");
#endif

	// empty node: insertion point is 0, follow the overflow link
	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t* values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array; saveIndex tracks the would-be
	// insertion position of the key when it is not present
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8* searchKey = node->KeyAt(i, &searchLength);
		// guard against corrupt key data running past the node
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8*)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		// past the last key the overflow link is the next child to visit
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}
726 
727 
728 /*!	Prepares the stack to contain all nodes that were passed while
729 	following the key, from the root node to the leaf node that could
730 	or should contain that key.
731 */
status_t
BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader.RootNode();

	CachedNode cached(this);
	const bplustree_node* node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		// a child link pointing back to the node itself means corruption
		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		// guard against cycles in a corrupted tree: never descend deeper
		// than the recorded maximum number of levels
		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
			break;
		}

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF "\n",
		nodeAndKey.nodeOffset));
	return B_ERROR;
}
774 
775 
776 /*!	This will find a free duplicate fragment in the given bplustree_node.
777 	The CachedNode will be set to the writable fragment on success.
778 */
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached,
	off_t* _offset, bplustree_node** _fragment, uint32* _index)
{
	// scan every value of the node for a fragment page with a free slot
	off_t* values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node* fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF "\n",
				value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array* array = fragment->FragmentAt(j);

			// a count of 0 marks an unused fragment slot
			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}
819 
820 
/*!	Inserts \a value as a duplicate of the key at \a index in \a node.
	Depending on the current state of the key's value this either starts a
	new duplicate fragment, appends to an existing fragment, upgrades a full
	fragment to a dedicated duplicate node, or appends to (a chain of)
	duplicate nodes.
*/
status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
	const bplustree_node* node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue)
				== BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node* duplicate
				= cachedDuplicate.SetToWritable(transaction,
				bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array* array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			// an existing fragment must hold 1..NUM_FRAGMENT_VALUES entries
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%d] size in fragment "
					"%" B_PRIdOFF " == %" B_PRIdOFF "!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue), array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// Test if the fragment will be empty if we remove this key's
				// values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// The node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// Create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// The old duplicate has not been touched, so we can
						// reuse it

					bplustree_node* newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// Copy the array from the fragment node to the duplicate
					// node and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
						array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// Update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index]
					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		// (walking the right-link chain until a node has room or the
		// chain ends)
		duplicate_array* array;
		const bplustree_node* duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %"
					B_PRIdOFF " == %" B_PRIdOFF "!\n", duplicateOffset,
					array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node* writableDuplicate
			= cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node* newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
				&offset);
			if (status != B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node* fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
		if (status != B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	// the fragment now holds both the original value and the new duplicate
	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}
985 
986 
/*!	Inserts the key/value pair into \a node at \a index, shifting the value,
	key-length, and key arrays in place. The caller must have made sure the
	key fits into the node and that the node is writable.
*/
void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t* values = node->Values();
	uint16* keyLengths = node->KeyLengths();
	uint8* keys = node->Keys();

	// bump the counts first - the array positions returned by Values() and
	// KeyLengths() depend on them
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	off_t* newValues = node->Values();
	uint16* newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index (the stored lengths are cumulative,
	// so every entry after the new key grows by keyLength)
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}
1033 
1034 
/*!	Splits the \a node into two halves - the other half will be put into
	\a other. It also takes care to create a new overflow link if the node
	to split is an index node.
	On input, \a _keyIndex, \a key, \a _keyLength, and \a _value describe
	the new key to be inserted as part of the split. On output, they are
	overwritten with the key/value pair that the caller has to insert into
	the parent node (the value being the offset of the new \a other node).
	Returns B_OK on success, B_BAD_VALUE if the key index is out of range,
	B_BAD_DATA if on-disk corruption is detected, or B_NO_MEMORY.
*/
status_t
BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
	uint16* _keyLength, off_t* _value)
{
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	uint16* inKeyLengths = node->KeyLengths();
	off_t* inKeyValues = node->Values();
	uint8* inKeys = node->Keys();
	uint8* outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	if (keyIndex > node->NumKeys()) {
		FATAL(("key index out of bounds: %d, num keys: %u\n", (int)keyIndex,
			node->NumKeys()));
		return B_BAD_VALUE;
	}

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	size_t size = fNodeSize >> 1;
	int32 out, in;
	// "in" walks the existing keys, "out" counts keys that go to "other";
	// the new key is accounted for once when "in" reaches keyIndex
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes) {
			bytesBefore = in > 0
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
		}

		if (in == keyIndex && !bytes) {
			bytes = *_keyLength;
		} else {
			if (keyIndex < out) {
				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
					- bytesBefore;
			}

			in++;
		}
		out++;

		// stop as soon as the aligned key data plus the per-key length/value
		// arrays no longer fit into half a node
		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	// "other" becomes the left sibling of "node"
	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16* outKeyLengths = other->KeyLengths();
	off_t* outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
				bytesAfter);
			keys = out - keyIndex - 1;
			for (int32 i = 0;i < keys;i++) {
				// shift the cumulative length indices by the new key's size
				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
						+ bytes);
			}
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
				keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	// "total" is the number of key bytes that moved to the other node
	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8* newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8* droppedKey = node->KeyAt(in, &newLength);
			// sanity check the on-disk key before trusting its length
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
					> (uint8*)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8*)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;

			newAllocated = true;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
			bytes = *_keyLength;
			out++;
		} else {
			if (in < node->NumKeys()) {
				// rebase the cumulative length index to the keys remaining
				// in this node (subtract what moved away, add the new key)
				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

				if (bytes) {
					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
						- bytesBefore - bytes;
				}
				out++;
			}
			in++;
		}
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0) {
		if (newAllocated)
			free(newKey);
		return B_BAD_DATA;
	}

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;
	in = out - keyIndex - 1;
		// Note: keyIndex and in will contain invalid values when the new key
		// went to the other node. But in this case bytes and bytesAfter are
		// 0 and subsequently we never use keyIndex and in.

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter) {
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
			bytesAfter);
	}

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	if (bytesAfter) {
		// if byteAfter is > 0, keyIndex is larger than skip
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
			in * sizeof(uint16));
	}

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter) {
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
			in * sizeof(off_t));
	}

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}
1293 
1294 
/*!	This inserts a key into the tree. The changes made to the tree will
	all be part of the \a transaction.
	You need to have the inode write locked.
	The insertion walks back up the path collected by _SeekDown(): if a node
	overflows, it is split via _SplitNode() and the loop continues with the
	parent, possibly growing the tree by allocating a new root.
	Returns B_OK on success, B_NAME_IN_USE if the key already exists and
	duplicates are not allowed, or an error code on failure.
*/
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	ASSERT_WRITE_LOCKED_INODE(fStream);

	// collect the path from the root down to the leaf the key belongs into
	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	// local copy of the key - _SplitNode() rewrites it in place with the key
	// that has to be propagated to the parent node
	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node* node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = _InsertDuplicate(transaction, cached, node,
						nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				return B_NAME_IN_USE;
			}
		}

		bplustree_node* writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(key_align(sizeof(bplustree_node)
				+ writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
				+ sizeof(off_t))) < fNodeSize) {
			// it fits - insert here and we're done
			_InsertKey(writableNode, nodeAndKey.keyIndex,
				keyBuffer, keyLength, value);
			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
				nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
				bplustree_node* root;
				status_t status = cachedNewRoot.Allocate(transaction, &root,
					&newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node* other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other,
				&otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			// _SplitNode() also updates keyBuffer/keyLength/value to the
			// pair that has to be inserted into the parent on the next
			// loop iteration
			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
					&value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction,
					other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node* root = cachedNewRoot.Node();

				_InsertKey(root, 0, keyBuffer, keyLength,
					writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
					nodeAndKey.nodeOffset);

				CachedNode cached(this);
				bplustree_header* header
					= cached.SetToWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
					header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}
1446 
1447 
/*!	Removes the duplicate index/value pair from the tree.
	It's part of the private tree interface.
	\a node is the leaf node whose entry at \a index holds the duplicate
	link (either a fragment or a duplicate node link), \a cached is the
	CachedNode it came from (made writable on demand), and \a value is the
	value to remove from the duplicate list.
	Besides removing the value, this also shrinks the duplicate structures:
	empty fragments/nodes are freed, and a duplicate list that fits into a
	fragment again is converted back (possibly into a plain value).
	Returns B_OK, B_ENTRY_NOT_FOUND, B_BAD_DATA, or B_IO_ERROR.
*/
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached, uint16 index,
	off_t value)
{
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array* array = duplicate->FragmentAt(
			bplustree_node::FragmentIndex(oldValue));

		// sanity check the on-disk array size before touching it
		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %"
				B_PRIdOFF " == %" B_PRIdOFF "!\n",
				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
				array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
				"%" B_PRIdOFF "...\n", value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		// (count == 1 here means only a single value remains after the
		// removal above, so the entry is no duplicate anymore)
		if (array->count == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction,
					duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array* array = NULL;

	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
			"!\n", duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %"
				B_PRIdOFF " == %" B_PRIdOFF "!\n", duplicateOffset,
				array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
			false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	// This loop only repeats via "continue" when a neighbouring node became
	// the new single node of the list and may be convertible as well.
	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (ie. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// This is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no
					// duplicate anymore
					values[index] = array->values[0];
				} else {
					// Move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(
						bplustree_node::MakeLink(
							BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status = cachedDuplicate.Free(transaction,
				duplicateOffset);
			if (status < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
						false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL
					&& duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
						right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a
				// normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL
					&& duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node* fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}
1642 
1643 
1644 /*!	Removes the key with the given index from the specified node.
1645 	Since it has to get the key from the node anyway (to obtain it's
1646 	pointer), it's not needed to pass the key & its length, although
1647 	the calling method (BPlusTree::Remove()) have this data.
1648 */
1649 void
1650 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
1651 {
1652 	// should never happen, but who knows?
1653 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1654 		FATAL(("Asked me to remove key outer limits: %u\n", index));
1655 		return;
1656 	}
1657 
1658 	off_t* values = node->Values();
1659 
1660 	// if we would have to drop the overflow link, drop
1661 	// the last key instead and update the overflow link
1662 	// to the value of that one
1663 	if (!node->IsLeaf() && index == node->NumKeys())
1664 		node->overflow_link = values[--index];
1665 
1666 	uint16 length;
1667 	uint8* key = node->KeyAt(index, &length);
1668 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
1669 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1670 		FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length,
1671 			(int)fStream->BlockRun().allocation_group,
1672 			fStream->BlockRun().start));
1673 		fStream->GetVolume()->Panic();
1674 		return;
1675 	}
1676 
1677 	uint16* keyLengths = node->KeyLengths();
1678 	uint8* keys = node->Keys();
1679 
1680 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1681 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
1682 		node->AllKeyLength() - length);
1683 
1684 	off_t* newValues = node->Values();
1685 	uint16* newKeyLengths = node->KeyLengths();
1686 
1687 	// move key data
1688 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1689 
1690 	// move and update key lengths
1691 	if (index > 0 && newKeyLengths != keyLengths)
1692 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1693 	for (uint16 i = index; i < node->NumKeys(); i++) {
1694 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1695 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
1696 	}
1697 
1698 	// move values
1699 	if (index > 0)
1700 		memmove(newValues, values, index * sizeof(off_t));
1701 	if (node->NumKeys() > index) {
1702 		memmove(newValues + index, values + index + 1,
1703 			(node->NumKeys() - index) * sizeof(off_t));
1704 	}
1705 }
1706 
1707 
1708 /*!	Removes the specified key from the tree. The "value" parameter is only used
1709 	for trees which allow duplicates, so you may safely ignore it.
1710 	It's not an optional parameter, so at least you have to think about it.
1711 	You need to have the inode write locked.
1712 */
1713 status_t
1714 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
1715 	off_t value)
1716 {
1717 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1718 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1719 		RETURN_ERROR(B_BAD_VALUE);
1720 
1721 	ASSERT_WRITE_LOCKED_INODE(fStream);
1722 
1723 	Stack<node_and_key> stack;
1724 	if (_SeekDown(stack, key, keyLength) != B_OK)
1725 		RETURN_ERROR(B_ERROR);
1726 
1727 	node_and_key nodeAndKey;
1728 	const bplustree_node* node;
1729 
1730 	CachedNode cached(this);
1731 	while (stack.Pop(&nodeAndKey)
1732 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1733 #ifdef DEBUG
1734 		NodeChecker checker(node, fNodeSize, "remove");
1735 #endif
1736 		if (node->IsLeaf()) {
1737 			// first round, check for duplicate entries
1738 			status_t status = _FindKey(node, key, keyLength,
1739 				&nodeAndKey.keyIndex);
1740 			if (status != B_OK)
1741 				RETURN_ERROR(status);
1742 
1743 			// Is this a duplicate entry?
1744 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
1745 					node->Values()[nodeAndKey.keyIndex]))) {
1746 				if (fAllowDuplicates) {
1747 					return _RemoveDuplicate(transaction, node, cached,
1748 						nodeAndKey.keyIndex, value);
1749 				}
1750 
1751 				FATAL(("dupliate node found where no duplicates are "
1752 					"allowed!\n"));
1753 				RETURN_ERROR(B_ERROR);
1754 			} else {
1755 				if (node->Values()[nodeAndKey.keyIndex] != value)
1756 					return B_ENTRY_NOT_FOUND;
1757 
1758 				// If we will remove the last key, the iterator will be set
1759 				// to the next node after the current - if there aren't any
1760 				// more nodes, we need a way to prevent the TreeIterators to
1761 				// touch the old node again, we use BPLUSTREE_FREE for this
1762 				off_t next = node->RightLink() == BPLUSTREE_NULL
1763 					? BPLUSTREE_FREE : node->RightLink();
1764 				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
1765 					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
1766 			}
1767 		}
1768 
1769 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1770 		if (writableNode == NULL)
1771 			return B_IO_ERROR;
1772 
1773 		// if it's an empty root node, we have to convert it
1774 		// to a leaf node by dropping the overflow link, or,
1775 		// if it's already a leaf node, just empty it
1776 		if (nodeAndKey.nodeOffset == fHeader.RootNode()
1777 			&& (node->NumKeys() == 0
1778 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
1779 			writableNode->overflow_link
1780 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1781 			writableNode->all_key_count = 0;
1782 			writableNode->all_key_length = 0;
1783 
1784 			// if we've made a leaf node out of the root node, we need
1785 			// to reset the maximum number of levels in the header
1786 			if (fHeader.MaxNumberOfLevels() != 1) {
1787 				CachedNode cached(this);
1788 				bplustree_header* header
1789 					= cached.SetToWritableHeader(transaction);
1790 				if (header == NULL)
1791 					return B_IO_ERROR;
1792 
1793 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
1794 			}
1795 			return B_OK;
1796 		}
1797 
1798 		// if there is only one key left, we don't have to remove
1799 		// it, we can just dump the node (index nodes still have
1800 		// the overflow link, so we have to drop the last key)
1801 		if (writableNode->NumKeys() > 1
1802 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
1803 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
1804 			return B_OK;
1805 		}
1806 
1807 		// when we are here, we can just free the node, but
1808 		// we have to update the right/left link of the
1809 		// siblings first
1810 		CachedNode otherCached(this);
1811 		bplustree_node* other = otherCached.SetToWritable(transaction,
1812 			writableNode->LeftLink());
1813 		if (other != NULL)
1814 			other->right_link = writableNode->right_link;
1815 
1816 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
1817 				!= NULL) {
1818 			other->left_link = writableNode->left_link;
1819 		}
1820 
1821 		cached.Free(transaction, nodeAndKey.nodeOffset);
1822 	}
1823 	RETURN_ERROR(B_ERROR);
1824 }
1825 
1826 
1827 /*!	Replaces the value for the key in the tree.
1828 	Returns B_OK if the key could be found and its value replaced,
1829 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1830 	to indicate that something went terribly wrong.
1831 	Note that this doesn't work with duplicates - it will just
1832 	return B_BAD_TYPE if you call this function on a tree where
1833 	duplicates are allowed.
1834 	You need to have the inode write locked.
1835 */
1836 status_t
1837 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
1838 	off_t value)
1839 {
1840 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1841 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1842 		|| key == NULL)
1843 		RETURN_ERROR(B_BAD_VALUE);
1844 
1845 	if (fAllowDuplicates)
1846 		RETURN_ERROR(B_BAD_TYPE);
1847 
1848 	ASSERT_WRITE_LOCKED_INODE(fStream);
1849 
1850 	off_t nodeOffset = fHeader.RootNode();
1851 	CachedNode cached(this);
1852 	const bplustree_node* node;
1853 
1854 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1855 		uint16 keyIndex = 0;
1856 		off_t nextOffset;
1857 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
1858 			&nextOffset);
1859 
1860 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1861 			if (status == B_OK) {
1862 				bplustree_node* writableNode = cached.MakeWritable(transaction);
1863 				if (writableNode != NULL) {
1864 					writableNode->Values()[keyIndex]
1865 						= HOST_ENDIAN_TO_BFS_INT64(value);
1866 				} else
1867 					status = B_IO_ERROR;
1868 			}
1869 
1870 			return status;
1871 		} else if (nextOffset == nodeOffset)
1872 			RETURN_ERROR(B_ERROR);
1873 
1874 		nodeOffset = nextOffset;
1875 	}
1876 	RETURN_ERROR(B_ERROR);
1877 }
1878 
1879 
1880 /*!	Searches the key in the tree, and stores the offset found in _value,
1881 	if successful.
1882 	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
1883 	while it descends the tree.
1884 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
1885 	It can also return other errors to indicate that something went wrong.
1886 	Note that this doesn't work with duplicates - it will just return
1887 	B_BAD_TYPE if you call this function on a tree where duplicates are
1888 	allowed.
1889 	You need to have the inode read or write locked.
1890 */
1891 status_t
1892 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
1893 {
1894 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1895 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1896 		|| key == NULL)
1897 		RETURN_ERROR(B_BAD_VALUE);
1898 
1899 	if (fAllowDuplicates)
1900 		RETURN_ERROR(B_BAD_TYPE);
1901 
1902 	ASSERT_READ_LOCKED_INODE(fStream);
1903 
1904 	off_t nodeOffset = fHeader.RootNode();
1905 	CachedNode cached(this);
1906 	const bplustree_node* node;
1907 
1908 #ifdef DEBUG
1909 	int32 levels = 0;
1910 #endif
1911 
1912 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1913 		uint16 keyIndex = 0;
1914 		off_t nextOffset;
1915 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
1916 			&nextOffset);
1917 
1918 #ifdef DEBUG
1919 		levels++;
1920 #endif
1921 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1922 			if (status == B_OK && _value != NULL)
1923 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
1924 
1925 #ifdef DEBUG
1926 			if (levels != (int32)fHeader.MaxNumberOfLevels())
1927 				DEBUGGER(("levels don't match"));
1928 #endif
1929 			return status;
1930 		} else if (nextOffset == nodeOffset)
1931 			RETURN_ERROR(B_ERROR);
1932 
1933 		nodeOffset = nextOffset;
1934 	}
1935 	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded\n", nodeOffset));
1936 	RETURN_ERROR(B_ERROR);
1937 }
1938 
1939 
1940 //	#pragma mark -
1941 
1942 
1943 TreeIterator::TreeIterator(BPlusTree* tree)
1944 	:
1945 	fTree(tree),
1946 	fCurrentNodeOffset(BPLUSTREE_NULL)
1947 {
1948 	tree->_AddIterator(this);
1949 }
1950 
1951 
1952 TreeIterator::~TreeIterator()
1953 {
1954 	if (fTree)
1955 		fTree->_RemoveIterator(this);
1956 }
1957 
1958 
/*!	Positions the iterator at the very beginning or end of the tree,
	depending on \a to (BPLUSTREE_BEGIN or BPLUSTREE_END), by descending
	to the leftmost or rightmost leaf.
	Returns B_OK on success, B_BAD_VALUE if the iterator has no tree or
	stream, or B_ERROR if the tree could not be traversed.
*/
status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	off_t nodeOffset = fTree->fHeader.RootNode();
	CachedNode cached(fTree);
	const bplustree_node* node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// position just before the first key, or just after the last one
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			// sanity check the node before dereferencing its values array;
			// the "8" presumably stands for sizeof(off_t) - TODO confirm
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
					- 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		// a self-referencing link would loop forever - bail out
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}
2004 
2005 
2006 /*!	Iterates through the tree in the specified direction.
2007 	When it iterates through duplicates, the "key" is only updated for the
2008 	first entry - if you need to know when this happens, use the "duplicate"
2009 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2010 	the other duplicates.
2011 	That's not too nice, but saves the 256 bytes that would be needed to
2012 	store the last key - if this will ever become an issue, it will be
2013 	easy to change.
2014 	The other advantage of this is, that the queries can skip all duplicates
2015 	at once when they are not relevant to them.
2016 */
status_t
TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
	uint16 maxLength, off_t* value, uint16* duplicate)
{
	// fTree is set to NULL by Stop(), see below
	if (fTree == NULL)
		return B_INTERRUPTED;

	bool forward = direction == BPLUSTREE_FORWARD;

	// not positioned yet? Then start at the beginning resp. end of the tree
	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
		RETURN_ERROR(B_ERROR);

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	CachedNode cached(fTree);
	const bplustree_node* node;

	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented
		// in the same order; since they are all considered as equal, this
		// shouldn't cause any problems

		// a fragment that ran out of duplicates is done; a full duplicate
		// node may have a successor and is looked at below
		if (!fIsFragment || fDuplicate < fNumDuplicates) {
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
				false);
		} else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// If the node is out of duplicates, we go directly to the next
				// one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
						false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				// hand out the next duplicate value
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
					fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		// all duplicates have been delivered, continue with the next key
		fDuplicateNode = BPLUSTREE_NULL;
	}

	// remember the current position, in case there are no more entries
	// in the requested direction
	off_t savedNodeOffset = fCurrentNodeOffset;
	int32 savedKey = fCurrentKey;

	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	// direction is +1/-1 (BPLUSTREE_FORWARD/BPLUSTREE_BACKWARD)
	fCurrentKey += direction;

	// is the current key in the current node?
	while ((forward && fCurrentKey >= node->NumKeys())
			|| (!forward && fCurrentKey < 0)) {
		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			fCurrentKey = forward ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = savedKey;

			return B_ENTRY_NOT_FOUND;
		}
	}

	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?

	// make sure the key lies completely within the node - if it doesn't,
	// the on-disk data is corrupt
	uint16 length;
	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
			> (uint8*)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	// copy the key (truncated to the caller's buffer size)
	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);

	if (fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE)	{
		// terminate string type
		if (length == maxLength)
			length--;
		((char*)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
		|| type == BPLUSTREE_DUPLICATE_NODE) {
		// the value is not a block number but points to a duplicate
		// fragment/node - deliver the first duplicate now, the rest on
		// subsequent calls (see the fDuplicateNode branch above)
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
			false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// Shouldn't happen, but we're dealing here with potentially
			// corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}
2162 
2163 
2164 /*!	This is more or less a copy of BPlusTree::Find() - but it just
2165 	sets the current position in the iterator, regardless of if the
2166 	key could be found or not.
2167 */
2168 status_t
2169 TreeIterator::Find(const uint8* key, uint16 keyLength)
2170 {
2171 	if (fTree == NULL)
2172 		return B_INTERRUPTED;
2173 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2174 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2175 		|| key == NULL)
2176 		RETURN_ERROR(B_BAD_VALUE);
2177 
2178 	// lock access to stream
2179 	InodeReadLocker locker(fTree->fStream);
2180 
2181 	off_t nodeOffset = fTree->fHeader.RootNode();
2182 
2183 	CachedNode cached(fTree);
2184 	const bplustree_node* node;
2185 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2186 		uint16 keyIndex = 0;
2187 		off_t nextOffset;
2188 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2189 			&nextOffset);
2190 
2191 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2192 			fCurrentNodeOffset = nodeOffset;
2193 			fCurrentKey = keyIndex - 1;
2194 			fDuplicateNode = BPLUSTREE_NULL;
2195 
2196 			return status;
2197 		} else if (nextOffset == nodeOffset)
2198 			RETURN_ERROR(B_ERROR);
2199 
2200 		nodeOffset = nextOffset;
2201 	}
2202 	RETURN_ERROR(B_ERROR);
2203 }
2204 
2205 
/*!	Makes the next Traverse() call skip any remaining duplicates of the
	current key and continue with the following key instead.
*/
void
TreeIterator::SkipDuplicates()
{
	fDuplicateNode = BPLUSTREE_NULL;
}
2211 
2212 
/*!	Keeps the iterator's position valid after the tree has been modified.
	\a offset is the node in which the change took place, \a keyIndex the
	position at which \a change (+1/-1) keys were inserted/removed.
	If the node was split, \a nextOffset is the new node that received the
	keys from index \a splitAt on (else it is \c BPLUSTREE_NULL).
	Presumably invoked by the tree for every registered iterator whenever
	it modifies a node (see _AddIterator) - verify against BPlusTree.
*/
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// only changes in the node we currently point to concern us
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// The node was split; follow to the new node. NOTE(review): this
		// moves the iterator even when fCurrentKey < splitAt - confirm
		// the caller only passes nextOffset when appropriate.
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			// translate our key index into the new node's numbering
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// TODO: duplicate handling!
}
2236 
2237 
/*!	Detaches the iterator from its tree. Any further Traverse()/Find()
	will return \c B_INTERRUPTED, and the destructor will not try to
	unregister from the (then possibly gone) tree.
*/
void
TreeIterator::Stop()
{
	fTree = NULL;
}
2243 
2244 
#ifdef DEBUG
/*!	Dumps the iterator's complete state to the debug output.
	Uses the portable B_PRI* format macros (as the rest of the file does,
	eg. in Traverse()) instead of the non-standard "%Ld"/"%Lx" extensions,
	which break on LP64 targets and non-gcc2 compilers.
*/
void
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %" B_PRIdOFF "\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
	__out("\tfDuplicateNode = %" B_PRIdOFF " (%" B_PRIdOFF ", 0x%" B_PRIx64
		")\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
		fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif
2261 
2262 
2263 //	#pragma mark -
2264 
2265 
2266 void
2267 bplustree_node::Initialize()
2268 {
2269 	left_link = right_link = overflow_link
2270 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2271 	all_key_count = 0;
2272 	all_key_length = 0;
2273 }
2274 
2275 
2276 uint8*
2277 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2278 {
2279 	if (index < 0 || index > NumKeys())
2280 		return NULL;
2281 
2282 	uint8* keyStart = Keys();
2283 	uint16* keyLengths = KeyLengths();
2284 
2285 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2286 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2287 	if (index > 0)
2288 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2289 
2290 	return keyStart;
2291 }
2292 
2293 
2294 uint8
2295 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2296 {
2297 	// the duplicate fragment handling is currently hard-coded to a node size
2298 	// of 1024 bytes - with future versions of BFS, this may be a problem
2299 
2300 	if (isFragment) {
2301 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2302 
2303 		return ((off_t*)this)[fragment];
2304 	}
2305 	return OverflowLink();
2306 }
2307 
2308 
2309 off_t
2310 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2311 {
2312 	uint32 start;
2313 	if (isFragment)
2314 		start = 8 * ((uint64)offset & 0x3ff);
2315 	else
2316 		start = 2;
2317 
2318 	return ((off_t*)this)[start + 1 + index];
2319 }
2320 
2321 
2322 /*!	Although the name suggests it, this function doesn't return the real
2323 	used fragment count; at least, it can only count to two: it returns
2324 	0, if there is no fragment used, 1 if there is only one fragment
2325 	used, and 2 if there are at least 2 fragments used.
2326 */
2327 uint32
2328 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2329 {
2330 	uint32 used = 0;
2331 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2332 		duplicate_array* array = FragmentAt(i);
2333 		if (array->count > 0 && ++used > 1)
2334 			return used;
2335 	}
2336 	return used;
2337 }
2338 
2339 
#ifdef DEBUG
/*!	Performs some sanity checks on this node: the key count and total key
	length must fit into the node, every key must lie completely within
	the node with a sane length, and no value may be -1.
	Returns \c B_OK if the node looks consistent, \c B_BAD_DATA if not.
	Debug builds only.
*/
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8* key = KeyAt(i, &length);
		// key data (plus its value and length entry) must not reach
		// beyond the node's end
		if (key + length + sizeof(off_t) + sizeof(uint16)
				> (uint8*)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %d\n", this, (int)i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		if (Values()[i] == -1) {
			// use B_PRIdOFF for the off_t value - "%lld" is wrong on
			// targets where int64 is a plain long
			dprintf("node %p, value %d: %" B_PRIdOFF "\n", this, (int)i,
				Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
#endif
2366 
2367 
2368 //	#pragma mark -
2369 
2370 
2371 int32
2372 compareKeys(type_code type, const void* key1, int keyLength1,
2373 	const void* key2, int keyLength2)
2374 {
2375 	// if one of the keys is NULL, bail out gracefully
2376 	if (key1 == NULL || key2 == NULL) {
2377 		// even if it's most probably a bug in the calling code, we try to
2378 		// give a meaningful result
2379 		if (key1 == NULL && key2 != NULL)
2380 			return 1;
2381 		if (key2 == NULL && key1 != NULL)
2382 			return -1;
2383 
2384 		return 0;
2385 	}
2386 
2387 	switch (type) {
2388 	    case B_STRING_TYPE:
2389     	{
2390 			int result = memcmp(key1, key2, min_c(keyLength1, keyLength2));
2391 			if (result == 0) {
2392 				// ignore trailing null bytes
2393 				if ((keyLength1 == keyLength2 + 1
2394 						&& ((uint8*)key1)[keyLength2] == '\0')
2395 					|| (keyLength2 == keyLength1 + 1
2396 						&& ((uint8*)key2)[keyLength1] == '\0'))
2397 					return 0;
2398 
2399 				result = keyLength1 - keyLength2;
2400 			}
2401 
2402 			return result;
2403 		}
2404 
2405 		case B_SSIZE_T_TYPE:
2406 		case B_INT32_TYPE:
2407 			return *(int32*)key1 - *(int32*)key2;
2408 
2409 		case B_SIZE_T_TYPE:
2410 		case B_UINT32_TYPE:
2411 			if (*(uint32*)key1 == *(uint32*)key2)
2412 				return 0;
2413 			if (*(uint32*)key1 > *(uint32*)key2)
2414 				return 1;
2415 
2416 			return -1;
2417 
2418 		case B_OFF_T_TYPE:
2419 		case B_INT64_TYPE:
2420 			if (*(int64*)key1 == *(int64*)key2)
2421 				return 0;
2422 			if (*(int64*)key1 > *(int64*)key2)
2423 				return 1;
2424 
2425 			return -1;
2426 
2427 		case B_UINT64_TYPE:
2428 			if (*(uint64*)key1 == *(uint64*)key2)
2429 				return 0;
2430 			if (*(uint64*)key1 > *(uint64*)key2)
2431 				return 1;
2432 
2433 			return -1;
2434 
2435 		case B_FLOAT_TYPE:
2436 		{
2437 			float result = *(float*)key1 - *(float*)key2;
2438 			if (result == 0.0f)
2439 				return 0;
2440 
2441 			return (result < 0.0f) ? -1 : 1;
2442 		}
2443 
2444 		case B_DOUBLE_TYPE:
2445 		{
2446 			double result = *(double*)key1 - *(double*)key2;
2447 			if (result == 0.0)
2448 				return 0;
2449 
2450 			return (result < 0.0) ? -1 : 1;
2451 		}
2452 	}
2453 
2454 	// if the type is unknown, the entries don't match...
2455 	return -1;
2456 }
2457 
2458