xref: /haiku/src/bin/bfs_tools/lib/BPlusTree.cpp (revision 1e60bdeab63fa7a57bc9a55b032052e95a18bd2c)
1 /* BPlusTree - BFS B+Tree implementation
2 **
3 ** Copyright 2001-2002 pinc Software. All Rights Reserved.
4 ** Released under the terms of the MIT license.
5 */
6 
7 
8 #include "BPlusTree.h"
9 #include "Inode.h"
10 #include "Stack.h"
11 #include "dump.h"
12 
13 #include <Debug.h>
14 
15 #include <string.h>
16 #include <stdlib.h>
17 #include <stdio.h>
18 
19 
20 #define MAX_NODES_IN_CACHE 20
21 
22 class CacheableNode : public NodeCache::Cacheable
23 {
24 	public:
25 		CacheableNode(off_t offset,bplustree_node *node)
26 			:
27 			fOffset(offset),
28 			fNode(node)
29 		{
30 		}
31 
32 		virtual ~CacheableNode()
33 		{
34 			if (fNode)
35 				free(fNode);
36 		}
37 
38 		virtual bool Equals(off_t offset)
39 		{
40 			return offset == fOffset;
41 		}
42 
43 		off_t			fOffset;
44 		bplustree_node	*fNode;
45 };
46 
47 
48 NodeCache::NodeCache(BPlusTree *tree)
49 	:	Cache<off_t>(),
50 	fTree(tree)
51 {
52 }
53 
54 
55 Cache<off_t>::Cacheable *
56 NodeCache::NewCacheable(off_t offset)
57 {
58 	return new CacheableNode(offset,fTree->Node(offset,false));
59 }
60 
61 
62 bplustree_node *
63 NodeCache::Get(off_t offset)
64 {
65 	CacheableNode *node = (CacheableNode *)Cache<off_t>::Get(offset);
66 	return node->fNode;
67 }
68 
69 
70 //	#pragma mark -
71 
72 
73 BPlusTree::BPlusTree(int32 keyType,int32 nodeSize,bool allowDuplicates)
74 	:
75 	fStream(NULL),
76 	fHeader(NULL),
77 	fCache(this),
78 	fCurrentNodeOffset(BPLUSTREE_NULL)
79 {
80 	SetTo(keyType,nodeSize,allowDuplicates);
81 }
82 
83 
84 BPlusTree::BPlusTree(BPositionIO *stream,bool allowDuplicates)
85 	:
86 	fStream(NULL),
87 	fHeader(NULL),
88 	fCache(this),
89 	fCurrentNodeOffset(BPLUSTREE_NULL)
90 {
91 	SetTo(stream,allowDuplicates);
92 }
93 
94 
95 BPlusTree::BPlusTree()
96 	:
97 	fStream(NULL),
98 	fHeader(NULL),
99 	fNodeSize(BPLUSTREE_NODE_SIZE),
100 	fAllowDuplicates(true),
101 	fStatus(B_NO_INIT),
102 	fCache(this),
103 	fCurrentNodeOffset(BPLUSTREE_NULL)
104 {
105 }
106 
107 
108 BPlusTree::~BPlusTree()
109 {
110 	fCache.Clear();
111 }
112 
113 
114 void BPlusTree::Initialize(int32 nodeSize)
115 {
116 	// free old data
117 	fCache.Clear(0,true);
118 
119 	if (fHeader)
120 		free(fHeader);
121 
122 	fStream = NULL;
123 
124 	fNodeSize = nodeSize;
125 	fHeader = (bplustree_header *)malloc(fNodeSize);
126  	memset(fHeader,0,fNodeSize);
127 
128  	fCurrentNodeOffset = BPLUSTREE_NULL;
129 }
130 
131 
132 status_t BPlusTree::SetTo(int32 keyType,int32 nodeSize,bool allowDuplicates)
133 {
134 	// initializes in-memory B+Tree
135 
136 	Initialize(nodeSize);
137 
138 	fAllowDuplicates = allowDuplicates;
139 	fCache.SetHoldChanges(true);
140 
141 	// initialize b+tree header
142  	fHeader->magic = BPLUSTREE_MAGIC;
143  	fHeader->node_size = fNodeSize;
144  	fHeader->max_number_of_levels = 1;
145  	fHeader->data_type = keyType;
146  	fHeader->root_node_pointer = fNodeSize;
147  	fHeader->free_node_pointer = BPLUSTREE_NULL;
148  	fHeader->maximum_size = fNodeSize * 2;
149 
150 	return fStatus = B_OK;
151 }
152 
153 
154 status_t BPlusTree::SetTo(BPositionIO *stream,bool allowDuplicates)
155 {
156 	// initializes on-disk B+Tree
157 
158 	bplustree_header header;
159 	ssize_t read = stream->ReadAt(0,&header,sizeof(bplustree_header));
160 	if (read < 0)
161 		return fStatus = read;
162 
163 	// is header valid?
164 
165 	stream->Seek(0,SEEK_END);
166 	off_t size = stream->Position();
167 	stream->Seek(0,SEEK_SET);
168 
169 	//dump_bplustree_header(&header);
170 
171 	if (header.magic != BPLUSTREE_MAGIC
172 		|| header.maximum_size != size
173 		|| (header.root_node_pointer % header.node_size) != 0
174 		|| !header.IsValidLink(header.root_node_pointer)
175 		|| !header.IsValidLink(header.free_node_pointer))
176 		return fStatus = B_BAD_DATA;
177 
178 	fAllowDuplicates = allowDuplicates;
179 
180 	if (DataStream *dataStream = dynamic_cast<DataStream *>(stream))
181 	{
182 		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
183 						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
184 		uint32 mode = dataStream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
185 						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
186 
187 		if (header.data_type > BPLUSTREE_DOUBLE_TYPE
188 			|| (dataStream->Mode() & S_INDEX_DIR) && toMode[header.data_type] != mode
189 			|| !dataStream->IsDirectory())
190 			return fStatus = B_BAD_TYPE;
191 
192 		 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
193 		fAllowDuplicates = (dataStream->Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR;
194 
195 		//printf("allows duplicates? %s\n",fAllowDuplicates ? "yes" : "no");
196 	}
197 
198 	Initialize(header.node_size);
199 
200 	memcpy(fHeader,&header,sizeof(bplustree_header));
201 
202 	fStream = stream;
203 
204 	bplustree_node *node = fCache.Get(header.root_node_pointer);
205 	//if (node)
206 	//	dump_bplustree_node(node);
207 
208 	return fStatus = node && CheckNode(node) ? B_OK : B_BAD_DATA;
209 }
210 
211 
212 status_t BPlusTree::InitCheck()
213 {
214 	return fStatus;
215 }
216 
217 
218 struct validate_info {
219 	off_t	offset;
220 	off_t	from;
221 	bool	free;
222 };
223 
224 
225 status_t
226 BPlusTree::Validate(bool verbose)
227 {
228 	// validates the on-disk b+tree
229 	// (for now only the node integrity is checked)
230 
231 	if (!fStream)
232 		return B_OK;
233 
234 	struct validate_info info;
235 
236 	Stack<validate_info> stack;
237 	info.offset = fHeader->root_node_pointer;
238 	info.from = BPLUSTREE_NULL;
239 	info.free = false;
240 	stack.Push(info);
241 
242 	if (fHeader->free_node_pointer != BPLUSTREE_NULL) {
243 		info.offset = fHeader->free_node_pointer;
244 		info.free = true;
245 		stack.Push(info);
246 	}
247 
248 	char escape[3] = {0x1b, '[', 0};
249 	int32 count = 0,freeCount = 0;
250 
251 	while (true) {
252 		if (!stack.Pop(&info)) {
253 			if (verbose) {
254 				printf("  %" B_PRId32 " B+tree node(s) processed (%" B_PRId32
255 					" free node(s)).\n", count, freeCount);
256 			}
257 			return B_OK;
258 		}
259 
260 		if (!info.free && verbose && ++count % 10 == 0)
261 			printf("  %" B_PRId32 "%s1A\n", count, escape);
262 		else if (info.free)
263 			freeCount++;
264 
265 		bplustree_node *node;
266 		if ((node = fCache.Get(info.offset)) == NULL
267 			|| !info.free && !CheckNode(node)) {
268 			if (verbose) {
269 				fprintf(stderr,"  B+Tree: Could not get data at position %"
270 					B_PRIdOFF " (referenced by node at %" B_PRIdOFF ")\n",
271 					info.offset, info.from);
272 				if ((node = fCache.Get(info.from)) != NULL)
273 					dump_bplustree_node(node,fHeader);
274 			}
275 			return B_BAD_DATA;
276 		}
277 
278 		info.from = info.offset;
279 
280 		if (node->overflow_link != BPLUSTREE_NULL && !info.free) {
281 			info.offset = node->overflow_link;
282 			stack.Push(info);
283 
284 			//dump_bplustree_node(node,fHeader);
285 			off_t *values = node->Values();
286 			for (int32 i = 0;i < node->all_key_count;i++) {
287 				info.offset = values[i];
288 				stack.Push(info);
289 			}
290 		} else if (node->left_link != BPLUSTREE_NULL && info.free) {
291 			info.offset = node->left_link;
292 			stack.Push(info);
293 		}
294 	}
295 }
296 
297 
298 status_t
299 BPlusTree::WriteTo(BPositionIO *stream)
300 {
301 	ssize_t written = stream->WriteAt(0,fHeader,fNodeSize);
302 	if (written < fNodeSize)
303 		return written < B_OK ? written : B_IO_ERROR;
304 
305 	for (off_t offset = fNodeSize;offset < fHeader->maximum_size;offset += fNodeSize)
306 	{
307 		bplustree_node *node = fCache.Get(offset);
308 		if (node == NULL)
309 			return B_ERROR;
310 
311 		written = stream->WriteAt(offset,node,fNodeSize);
312 		if (written < fNodeSize)
313 			return written < B_OK ? written : B_IO_ERROR;
314 	}
315 	return stream->SetSize(fHeader->maximum_size);
316 }
317 
318 
319 //	#pragma mark -
320 
321 
322 void BPlusTree::SetCurrentNode(bplustree_node *node,off_t offset,int8 to)
323 {
324 	fCurrentNodeOffset = offset;
325 	fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->all_key_count;
326 	fDuplicateNode = BPLUSTREE_NULL;
327 }
328 
329 
330 status_t BPlusTree::Goto(int8 to)
331 {
332 	if (fHeader == NULL)
333 		return B_BAD_VALUE;
334 
335 	Stack<off_t> stack;
336 	if (stack.Push(fHeader->root_node_pointer) < B_OK)
337 		return B_NO_MEMORY;
338 
339 	bplustree_node *node;
340 	off_t pos;
341 	while (stack.Pop(&pos) && (node = fCache.Get(pos)) != NULL && CheckNode(node))
342 	{
343 		// is the node a leaf node?
344 		if (node->overflow_link == BPLUSTREE_NULL)
345 		{
346 			SetCurrentNode(node,pos,to);
347 
348 			return B_OK;
349 		}
350 
351 		if (to == BPLUSTREE_END || node->all_key_count == 0)
352 			pos = node->overflow_link;
353 		else
354 		{
355 			if (node->all_key_length > fNodeSize
356 				|| (addr_t)node->Values() > (addr_t)node + fNodeSize
357 					- 8 * node->all_key_count)
358 				return B_ERROR;
359 
360 			pos = *node->Values();
361 		}
362 
363 		if (stack.Push(pos) < B_OK)
364 			break;
365 	}
366 	return B_ERROR;
367 }
368 
369 
370 status_t BPlusTree::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value)
371 {
372 	if (fCurrentNodeOffset == BPLUSTREE_NULL
373 		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
374 		return B_ERROR;
375 
376 	bplustree_node *node;
377 
378 	if (fDuplicateNode != BPLUSTREE_NULL)
379 	{
380 		// regardless of traverse direction the duplicates are always presented in
381 		// the same order; since they are all considered as equal, this shouldn't
382 		// cause any problems
383 
384 		if (!fIsFragment || fDuplicate < fNumDuplicates)
385 			node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode));
386 		else
387 			node = NULL;
388 
389 		if (node != NULL)
390 		{
391 			if (!fIsFragment && fDuplicate >= fNumDuplicates)
392 			{
393 				// if the node is out of duplicates, we go directly to the next one
394 				fDuplicateNode = node->right_link;
395 				if (fDuplicateNode != BPLUSTREE_NULL
396 					&& (node = fCache.Get(fDuplicateNode)) != NULL)
397 				{
398 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,false);
399 					fDuplicate = 0;
400 				}
401 			}
402 			if (fDuplicate < fNumDuplicates)
403 			{
404 				*value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++);
405 				return B_OK;
406 			}
407 		}
408 		fDuplicateNode = BPLUSTREE_NULL;
409 	}
410 
411 	off_t savedNodeOffset = fCurrentNodeOffset;
412 	if ((node = fCache.Get(fCurrentNodeOffset)) == NULL || !CheckNode(node))
413 		return B_ERROR;
414 
415 	fCurrentKey += direction;
416 
417 	// is the current key in the current node?
418 	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count)
419 		   || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0))
420 	{
421 		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link;
422 
423 		// are there any more nodes?
424 		if (fCurrentNodeOffset != BPLUSTREE_NULL)
425 		{
426 			node = fCache.Get(fCurrentNodeOffset);
427 			if (!node || !CheckNode(node))
428 				return B_ERROR;
429 
430 			// reset current key
431 			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->all_key_count;
432 		}
433 		else
434 		{
435 			// there are no nodes left, so turn back to the last key
436 			fCurrentNodeOffset = savedNodeOffset;
437 			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1;
438 
439 			return B_ENTRY_NOT_FOUND;
440 		}
441 	}
442 
443 	if (node->all_key_count == 0)
444 		return B_ERROR; //B_ENTRY_NOT_FOUND;
445 
446 	uint16 length;
447 	uint8 *keyStart = node->KeyAt(fCurrentKey,&length);
448 
449 	length = min_c(length,maxLength);
450 	memcpy(key,keyStart,length);
451 
452 	if (fHeader->data_type == BPLUSTREE_STRING_TYPE)	// terminate string type
453 	{
454 		if (length == maxLength)
455 			length--;
456 		((char *)key)[length] = '\0';
457 	}
458 	*keyLength = length;
459 
460 	off_t offset = node->Values()[fCurrentKey];
461 
462 	// duplicate fragments?
463 	uint8 type = bplustree_node::LinkType(offset);
464 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE)
465 	{
466 		fDuplicateNode = offset;
467 
468 		node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode));
469 		if (node == NULL)
470 			return B_ERROR;
471 
472 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
473 
474 		fNumDuplicates = node->CountDuplicates(offset,fIsFragment);
475 		if (fNumDuplicates)
476 		{
477 			// give back first duplicate
478 			fDuplicate = 1;
479 			offset = node->DuplicateAt(offset,fIsFragment,0);
480 		}
481 		else
482 		{
483 			// shouldn't happen, but we're dealing here with corrupt disks...
484 			fDuplicateNode = BPLUSTREE_NULL;
485 			offset = 0;
486 		}
487 	}
488 	*value = offset;
489 
490 	return B_OK;
491 }
492 
493 
494 int32 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
495 {
496 	switch (fHeader->data_type)
497 	{
498 	    case BPLUSTREE_STRING_TYPE:
499     	{
500 			int len = min_c(keyLength1,keyLength2);
501 			int result = strncmp((const char *)key1,(const char *)key2,len);
502 
503 			if (result == 0)
504 				result = keyLength1 - keyLength2;
505 
506 			return result;
507 		}
508 
509 		case BPLUSTREE_INT32_TYPE:
510 			return *(int32 *)key1 - *(int32 *)key2;
511 
512 		case BPLUSTREE_UINT32_TYPE:
513 		{
514 			if (*(uint32 *)key1 == *(uint32 *)key2)
515 				return 0;
516 			else if (*(uint32 *)key1 > *(uint32 *)key2)
517 				return 1;
518 
519 			return -1;
520 		}
521 
522 		case BPLUSTREE_INT64_TYPE:
523 		{
524 			if (*(int64 *)key1 == *(int64 *)key2)
525 				return 0;
526 			else if (*(int64 *)key1 > *(int64 *)key2)
527 				return 1;
528 
529 			return -1;
530 		}
531 
532 		case BPLUSTREE_UINT64_TYPE:
533 		{
534 			if (*(uint64 *)key1 == *(uint64 *)key2)
535 				return 0;
536 			else if (*(uint64 *)key1 > *(uint64 *)key2)
537 				return 1;
538 
539 			return -1;
540 		}
541 
542 		case BPLUSTREE_FLOAT_TYPE:
543 		{
544 			float result = *(float *)key1 - *(float *)key2;
545 			if (result == 0.0f)
546 				return 0;
547 
548 			return (result < 0.0f) ? -1 : 1;
549 		}
550 
551 		case BPLUSTREE_DOUBLE_TYPE:
552 		{
553 			double result = *(double *)key1 - *(double *)key2;
554 			if (result == 0.0)
555 				return 0;
556 
557 			return (result < 0.0) ? -1 : 1;
558 		}
559 	}
560 	return 0;
561 }
562 
563 
564 status_t BPlusTree::FindKey(bplustree_node *node,uint8 *key,uint16 keyLength,uint16 *index,off_t *next)
565 {
566 	if (node->all_key_count == 0)
567 	{
568 		if (index)
569 			*index = 0;
570 		if (next)
571 			*next = node->overflow_link;
572 		return B_ENTRY_NOT_FOUND;
573 	}
574 
575 	off_t *values = node->Values();
576 	int16 saveIndex = 0;
577 
578 	// binary search in the key array
579 	for (int16 first = 0,last = node->all_key_count - 1;first <= last;)
580 	{
581 		uint16 i = (first + last) >> 1;
582 
583 		uint16 searchLength;
584 		uint8 *searchKey = node->KeyAt(i,&searchLength);
585 
586 		int32 cmp = CompareKeys(key,keyLength,searchKey,searchLength);
587 		if (cmp < 0)
588 		{
589 			last = i - 1;
590 			saveIndex = i;
591 		}
592 		else if (cmp > 0)
593 		{
594 			saveIndex = first = i + 1;
595 		}
596 		else
597 		{
598 			if (index)
599 				*index = i;
600 			if (next)
601 				*next = values[i];
602 			return B_OK;
603 		}
604 	}
605 
606 	if (index)
607 		*index = saveIndex;
608 	if (next)
609 	{
610 		if (saveIndex == node->all_key_count)
611 			*next = node->overflow_link;
612 		else
613 			*next = values[saveIndex];
614 	}
615 	return B_ENTRY_NOT_FOUND;
616 }
617 
618 
619 status_t BPlusTree::SeekDown(Stack<node_and_key> &stack,uint8 *key,uint16 keyLength)
620 {
621 	node_and_key nodeAndKey;
622 	nodeAndKey.nodeOffset = fHeader->root_node_pointer;
623 	nodeAndKey.keyIndex = 0;
624 
625 	bplustree_node *node;
626 	while ((node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) {
627 		// if we are already on leaf level, we're done
628 		if (node->overflow_link == BPLUSTREE_NULL) {
629 			// put the node on the stack
630 			// node that the keyIndex is not properly set here!
631 			nodeAndKey.keyIndex = 0;
632 			stack.Push(nodeAndKey);
633 			return B_OK;
634 		}
635 
636 		off_t nextOffset;
637 		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset);
638 
639 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
640 			return B_ERROR;
641 
642 		// put the node & the correct keyIndex on the stack
643 		stack.Push(nodeAndKey);
644 
645 		nodeAndKey.nodeOffset = nextOffset;
646 	}
647 	return B_ERROR;
648 }
649 
650 
651 void BPlusTree::InsertKey(bplustree_node *node,uint8 *key,uint16 keyLength,off_t value,uint16 index)
652 {
653 	// should never happen, but who knows?
654 	if (index > node->all_key_count)
655 		return;
656 
657 	off_t *values = node->Values();
658 	uint16 *keyLengths = node->KeyLengths();
659 	uint8 *keys = node->Keys();
660 
661 	node->all_key_count++;
662 	node->all_key_length += keyLength;
663 
664 	off_t *newValues = node->Values();
665 	uint16 *newKeyLengths = node->KeyLengths();
666 
667 	// move values and copy new value into them
668 	memmove(newValues + index + 1,values + index,sizeof(off_t) * (node->all_key_count - 1 - index));
669 	memmove(newValues,values,sizeof(off_t) * index);
670 
671 	newValues[index] = value;
672 
673 	// move and update key length index
674 	for (uint16 i = node->all_key_count;i-- > index + 1;)
675 		newKeyLengths[i] = keyLengths[i - 1] + keyLength;
676 	memmove(newKeyLengths,keyLengths,sizeof(uint16) * index);
677 
678 	int32 keyStart;
679 	newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0);
680 
681 	// move keys and copy new key into them
682 	int32 size = node->all_key_length - newKeyLengths[index];
683 	if (size > 0)
684 		memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size);
685 
686 	memcpy(keys + keyStart,key,keyLength);
687 }
688 
689 
690 status_t BPlusTree::InsertDuplicate(bplustree_node */*node*/,uint16 /*index*/)
691 {
692 	printf("DUPLICATE ENTRY!!\n");
693 
694 //		/* handle dup keys */
695 //		if (dupflg == 0)
696 //		{
697 //			bt_errno(b) = BT_DUPKEY;
698 //			goto bombout;
699 //		}
700 //		else
701 //		{
702 //			/* paste new data ptr in page */
703 //			/* and write it out again. */
704 //			off_t	*p;
705 //			p = (off_t *)KEYCLD(op->p);
706 //			*(p + keyat) = rrn;
707 //			op->flags = BT_CHE_DIRTY;
708 //			if(bt_wpage(b,op) == BT_ERR ||
709 //				bt_wpage(b,kp) == BT_ERR)
710 //				return(BT_ERR);
711 //
712 //			/* mark all as well with tree */
713 //			bt_clearerr(b);
714 //			return(BT_OK);
715 //		}
716 
717 	return B_OK;
718 }
719 
720 
721 status_t BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value)
722 {
723 	if (*_keyIndex > node->all_key_count + 1)
724 		return B_BAD_VALUE;
725 
726 	//printf("before (insert \"%s\" (%d bytes) at %d):\n\n",key,*_keyLength,*_keyIndex);
727 	//dump_bplustree_node(node,fHeader);
728 	//hexdump(node,fNodeSize);
729 
730 	off_t otherOffset;
731 	bplustree_node *other = fCache.Get(otherOffset = fHeader->maximum_size); //Node(otherOffset = fHeader->maximum_size/*,false*/);
732 	if (other == NULL)
733 		return B_NO_MEMORY;
734 
735 	uint16 *inKeyLengths = node->KeyLengths();
736 	off_t *inKeyValues = node->Values();
737 	uint8 *inKeys = node->Keys();
738 	uint8 *outKeys = other->Keys();
739 	uint16 keyIndex = *_keyIndex;
740 
741 	// how many keys will fit in one (half) page?
742 
743 	// "bytes" is the number of bytes written for the new key,
744 	// "bytesBefore" are the bytes before that key
745 	// "bytesAfter" are the bytes after the new key, if any
746 	int32 bytes = 0,bytesBefore = 0,bytesAfter = 0;
747 
748 	size_t size = fNodeSize >> 1;
749 	int32 out,in;
750 	for (in = out = 0;in < node->all_key_count + 1;) {
751 		if (!bytes)
752 			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;
753 		if (in == keyIndex && !bytes) {
754 			bytes = *_keyLength;
755 		} else {
756 			if (keyIndex < out) {
757 				bytesAfter = inKeyLengths[in] - bytesBefore;
758 				// fix the key lengths for the new node
759 				inKeyLengths[in] = bytesAfter + bytesBefore + bytes;
760 			} //else
761 			in++;
762 		}
763 		out++;
764 
765 		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
766 						out * (sizeof(uint16) + sizeof(off_t)) >= size) {
767 			// we have found the number of keys in the new node!
768 			break;
769 		}
770 	}
771 
772 	// if the new key was not inserted, set the length of the keys
773 	// that can be copied directly
774 	if (keyIndex >= out && in > 0)
775 		bytesBefore = inKeyLengths[in - 1];
776 
777 	if (bytesBefore < 0 || bytesAfter < 0)
778 		return B_BAD_DATA;
779 
780 	//printf("put %ld keys in the other node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
781 
782 	other->left_link = node->left_link;
783 	other->right_link = nodeOffset;
784 	other->all_key_length = bytes + bytesBefore + bytesAfter;
785 	other->all_key_count = out;
786 	//printf("-> used = %ld (bplustree_node = %ld bytes)\n",other->Used(),sizeof(bplustree_node));
787 
788 	uint16 *outKeyLengths = other->KeyLengths();
789 	off_t *outKeyValues = other->Values();
790 	int32 keys = out > keyIndex ? keyIndex : out;
791 
792 	if (bytesBefore) {
793 		// copy the keys
794 		memcpy(outKeys,inKeys,bytesBefore);
795 		memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16));
796 		memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t));
797 	}
798 	if (bytes) {
799 		// copy the newly inserted key
800 		memcpy(outKeys + bytesBefore,key,bytes);
801 		outKeyLengths[keyIndex] = bytes + bytesBefore;
802 		outKeyValues[keyIndex] = *_value;
803 
804 		if (bytesAfter) {
805 			// copy the keys after the new key
806 			memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter);
807 			keys = out - keyIndex - 1;
808 			memcpy(outKeyLengths + keyIndex + 1,inKeyLengths + keyIndex,keys * sizeof(uint16));
809 			memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t));
810 		}
811 	}
812 
813 	// if the new key was already inserted, we shouldn't use it again
814 	if (in != out)
815 		keyIndex--;
816 
817 	int32 total = bytesBefore + bytesAfter;
818 
819 	// if we have split an index node, we have to drop the first key
820 	// of the next node (which can also be the new key to insert)
821 	if (node->overflow_link != BPLUSTREE_NULL) {
822 		if (in == keyIndex) {
823 			other->overflow_link = *_value;
824 			keyIndex--;
825 		} else {
826 			other->overflow_link = inKeyValues[in];
827 			total = inKeyLengths[in++];
828 		}
829 	}
830 
831 	// and now the same game for the other page and the rest of the keys
832 	// (but with memmove() instead of memcpy(), because they may overlap)
833 
834 	bytesBefore = bytesAfter = bytes = 0;
835 	out = 0;
836 	int32 skip = in;
837 	while (in < node->all_key_count + 1) {
838 		//printf("in = %ld, keyIndex = %d, bytes = %ld\n",in,keyIndex,bytes);
839 		if (in == keyIndex && !bytes) {
840 			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
841 			//printf("bytesBefore = %ld\n",bytesBefore);
842 			bytes = *_keyLength;
843 		} else if (in < node->all_key_count) {
844 			//printf("1.inKeyLength[%ld] = %u\n",in,inKeyLengths[in]);
845 			inKeyLengths[in] -= total;
846 			if (bytes) {
847 				inKeyLengths[in] += bytes;
848 				bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
849 				//printf("2.inKeyLength[%ld] = %u, bytesAfter = %ld, bytesBefore = %ld\n",in,inKeyLengths[in],bytesAfter,bytesBefore);
850 			}
851 
852 			in++;
853 		} else
854 			in++;
855 
856 		out++;
857 
858 		//printf("-> out = %ld, keylen = %ld, %ld bytes total\n",out,bytesBefore,round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
859 		//				out * (sizeof(uint16) + sizeof(off_t)));
860 		if (in > node->all_key_count && keyIndex < in)
861 			break;
862 	}
863 
864 	//printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
865 
866 	if (keyIndex >= in && keyIndex - skip < out) {
867 		bytesAfter = inKeyLengths[in] - bytesBefore - total;
868 	} else if (keyIndex < skip)
869 		bytesBefore = node->all_key_length - total;
870 
871 	//printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
872 
873 	if (bytesBefore < 0 || bytesAfter < 0)
874 		return B_BAD_DATA;
875 
876 	node->left_link = otherOffset;
877 		// right link, and overflow link can stay the same
878 	node->all_key_length = bytes + bytesBefore + bytesAfter;
879 	node->all_key_count = out - 1;
880 
881 	// array positions have changed
882 	outKeyLengths = node->KeyLengths();
883 	outKeyValues = node->Values();
884 
885 	//printf("put %ld keys in the first node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
886 
887 	// move the keys in the old node: the order is important here,
888 	// because we don't want to overwrite any contents
889 
890 	keys = keyIndex <= skip ? out : keyIndex - skip;
891 	keyIndex -= skip;
892 
893 	if (bytesBefore)
894 		memmove(inKeys,inKeys + total,bytesBefore);
895 	if (bytesAfter)
896 		memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter);
897 
898 	if (bytesBefore)
899 		memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16));
900 	in = out - keyIndex - 1;
901 	if (bytesAfter)
902 		memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16));
903 
904 	if (bytesBefore)
905 		memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t));
906 	if (bytesAfter)
907 		memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t));
908 
909 	if (bytes) {
910 		// finally, copy the newly inserted key (don't overwrite anything)
911 		memcpy(inKeys + bytesBefore,key,bytes);
912 		outKeyLengths[keyIndex] = bytes + bytesBefore;
913 		outKeyValues[keyIndex] = *_value;
914 	}
915 
916 	//puts("\n!!!!!!!!!!!!!!! after: !!!!!!!!!!!!!!!\n");
917 	//dump_bplustree_node(other,fHeader);
918 	//hexdump(other,fNodeSize);
919 	//puts("\n");
920 	//dump_bplustree_node(node,fHeader);
921 	//hexdump(node,fNodeSize);
922 
923 	// write the updated nodes back
924 
925 	fCache.SetDirty(otherOffset,true);
926 	fCache.SetDirty(nodeOffset,true);
927 
928 	// update the right link of the node in the left of the new node
929 	if (other->left_link != BPLUSTREE_NULL) {
930 		bplustree_node *left = fCache.Get(other->left_link);
931 		if (left != NULL) {
932 			left->right_link = otherOffset;
933 			fCache.SetDirty(other->left_link,true);
934 		}
935 	}
936 
937 	// prepare key to insert in the parent node
938 
939 	//printf("skip: %ld, count-1: %u\n",skip,other->all_key_count - 1);
940 	uint16 length;
941 	uint8 *lastKey = other->KeyAt(other->all_key_count - 1,&length);
942 	memcpy(key,lastKey,length);
943 	*_keyLength = length;
944 	*_value = otherOffset;
945 
946 	return B_OK;
947 }
948 
949 
950 status_t BPlusTree::Insert(uint8 *key,uint16 keyLength,off_t value)
951 {
952 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
953 		return B_BAD_VALUE;
954 
955 	Stack<node_and_key> stack;
956 	if (SeekDown(stack,key,keyLength) != B_OK)
957 		return B_ERROR;
958 
959 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
960 
961 	memcpy(keyBuffer,key,keyLength);
962 	keyBuffer[keyLength] = 0;
963 
964 	off_t valueToInsert = value;
965 
966 	fCurrentNodeOffset = BPLUSTREE_NULL;
967 
968 	node_and_key nodeAndKey;
969 	bplustree_node *node;
970 	uint32 count = 0;
971 
972 	while (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node))
973 	{
974 		if (count++ == 0)	// first round, check for duplicate entries
975 		{
976 			status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
977 			if (status == B_ERROR)
978 				return B_ERROR;
979 
980 			// is this a duplicate entry?
981 			if (status == B_OK && node->overflow_link == BPLUSTREE_NULL)
982 			{
983 				if (fAllowDuplicates)
984 					return InsertDuplicate(node,nodeAndKey.keyIndex);
985 				else
986 					return B_NAME_IN_USE;
987 			}
988 		}
989 
990 		// is the node big enough to hold the pair?
991 		if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength)
992 			+ (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
993 		{
994 			InsertKey(node,keyBuffer,keyLength,valueToInsert,nodeAndKey.keyIndex);
995 			fCache.SetDirty(nodeAndKey.nodeOffset,true);
996 
997 			return B_OK;
998 		}
999 		else
1000 		{
1001 			// do we need to allocate a new root node? if so, then do
1002 			// it now
1003 			bplustree_node *rootNode = NULL;
1004 			off_t newRoot = BPLUSTREE_NULL;
1005 			if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) {
1006 				rootNode = fCache.Get(newRoot = fHeader->maximum_size);
1007 				if (rootNode == NULL) {
1008 					// the tree is most likely dead
1009 					return B_NO_MEMORY;
1010 				}
1011 			}
1012 
1013 			if (SplitNode(node,nodeAndKey.nodeOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&valueToInsert) < B_OK) {
1014 				if (newRoot != BPLUSTREE_NULL) {
1015 					// free root node
1016 				}
1017 				return B_ERROR;
1018 			}
1019 
1020 			if (newRoot != BPLUSTREE_NULL) {
1021 				rootNode->overflow_link = nodeAndKey.nodeOffset;
1022 
1023 				InsertKey(rootNode,keyBuffer,keyLength,node->left_link,0);
1024 
1025 				fHeader->root_node_pointer = newRoot;
1026 				fHeader->max_number_of_levels++;
1027 				// write header
1028 
1029 				fCache.SetDirty(newRoot,true);
1030 
1031 				return B_OK;
1032 			}
1033 		}
1034 	}
1035 
1036 	return B_ERROR;
1037 }
1038 
1039 
1040 status_t BPlusTree::Find(uint8 *key,uint16 keyLength,off_t *value)
1041 {
1042 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1043 		return B_BAD_VALUE;
1044 
1045 	Stack<node_and_key> stack;
1046 	if (SeekDown(stack,key,keyLength) != B_OK)
1047 		return B_ERROR;
1048 
1049 	fCurrentNodeOffset = BPLUSTREE_NULL;
1050 
1051 	node_and_key nodeAndKey;
1052 	bplustree_node *node;
1053 
1054 	if (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node))
1055 	{
1056 		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
1057 		if (status == B_ERROR)
1058 			return B_ERROR;
1059 
1060 		SetCurrentNode(node,nodeAndKey.nodeOffset);
1061 
1062 		if (status == B_OK && node->overflow_link == BPLUSTREE_NULL)
1063 		{
1064 			*value = node->Values()[nodeAndKey.keyIndex];
1065 			return B_OK;
1066 		}
1067 	}
1068 	return B_ENTRY_NOT_FOUND;
1069 }
1070 
1071 
1072 //	#pragma mark -
1073 
1074 
1075 bool
1076 BPlusTree::CheckNode(bplustree_node *node)
1077 {
1078 	if (!fHeader->IsValidLink(node->left_link)
1079 		|| !fHeader->IsValidLink(node->right_link)
1080 		|| !fHeader->IsValidLink(node->overflow_link)
1081 		|| (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize)
1082 		return false;
1083 
1084 	return true;
1085 }
1086 
1087 
1088 bplustree_node *BPlusTree::Node(off_t nodeOffset,bool check)
1089 {
1090 /*printf("1: %d,%d,%d\n",
1091 	nodeOffset > fHeader->maximum_size - fNodeSize,
1092 	nodeOffset < 0 && nodeOffset != BPLUSTREE_NULL,
1093 	(nodeOffset % fNodeSize) != 0);
1094 */
1095 	// the super node is always in memory, and shouldn't
1096 	// never be taken out of the cache
1097 	if (nodeOffset > fHeader->maximum_size /*- fNodeSize*/
1098 		|| nodeOffset <= 0 && nodeOffset != BPLUSTREE_NULL
1099 		|| (nodeOffset % fNodeSize) != 0)
1100 		return NULL;
1101 
1102 	bplustree_node *node = (bplustree_node *)malloc(fNodeSize);
1103 	if (node == NULL)
1104 		return NULL;
1105 
1106 	if (nodeOffset == BPLUSTREE_NULL || !fStream)
1107 	{
1108 	 	node->left_link = BPLUSTREE_NULL;
1109 	 	node->right_link = BPLUSTREE_NULL;
1110 	 	node->overflow_link = BPLUSTREE_NULL;
1111 	 	node->all_key_count = 0;
1112 	 	node->all_key_length = 0;
1113 	}
1114 	else if (fStream && fStream->ReadAt(nodeOffset,node,fNodeSize) < fNodeSize)
1115 	{
1116 		free(node);
1117 		return NULL;
1118 	}
1119 
1120 	if (check && node != NULL)
1121 	{
1122 		// sanity checks (links, all_key_count)
1123 		if (!fHeader->IsValidLink(node->left_link)
1124 			|| !fHeader->IsValidLink(node->right_link)
1125 			|| !fHeader->IsValidLink(node->overflow_link)
1126 			|| (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize)
1127 			return NULL;
1128 	}
1129 
1130 	if (!fStream && nodeOffset > fHeader->maximum_size - fNodeSize) {
1131 		// do some hacks here
1132 		fHeader->maximum_size += fNodeSize;
1133 	}
1134 
1135 	return node;
1136 }
1137 
1138 
1139 void BPlusTree::SetHoldChanges(bool hold)
1140 {
1141 	fCache.SetHoldChanges(hold);
1142 }
1143 
1144 
1145 //	#pragma mark -
1146 
1147 
1148 uint8 *bplustree_node::KeyAt(int32 index,uint16 *keyLength) const
1149 {
1150 	if (index < 0 || index > all_key_count)
1151 		return NULL;
1152 
1153 	uint8 *keyStart = Keys();
1154 	uint16 *keyLengths = KeyLengths();
1155 
1156 	*keyLength = keyLengths[index] - (index != 0 ? keyLengths[index - 1] : 0);
1157 	if (index > 0)
1158 		keyStart += keyLengths[index - 1];
1159 
1160 	return keyStart;
1161 }
1162 
1163 
1164 uint8 bplustree_node::CountDuplicates(off_t offset,bool isFragment) const
1165 {
1166 	// the duplicate fragment handling is currently hard-coded to a node size
1167 	// of 1024 bytes - with future versions of BFS, this may be a problem
1168 
1169 	if (isFragment)
1170 	{
1171 		uint32 fragment = 8 * ((uint64)offset & 0x3ff);
1172 
1173 		return ((off_t *)this)[fragment];
1174 	}
1175 	return overflow_link;
1176 }
1177 
1178 
1179 off_t bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const
1180 {
1181 	uint32 start;
1182 	if (isFragment)
1183 		start = 8 * ((uint64)offset & 0x3ff);
1184 	else
1185 		start = 2;
1186 
1187 	return ((off_t *)this)[start + 1 + index];
1188 }
1189 
1190