xref: /haiku/src/bin/bfs_tools/lib/BPlusTree.cpp (revision f8da8f3477d3c18142e59d17d05a545982faa5a8)
1 /* BPlusTree - BFS B+Tree implementation
2 **
3 ** Copyright 2001-2002 pinc Software. All Rights Reserved.
4 */
5 
6 
7 #include "BPlusTree.h"
8 #include "Inode.h"
9 #include "Stack.h"
10 #include "dump.h"
11 
12 #include <Debug.h>
13 
14 #include <string.h>
15 #include <stdlib.h>
16 #include <stdio.h>
17 
18 
19 #define MAX_NODES_IN_CACHE 20
20 
21 class CacheableNode : public NodeCache::Cacheable
22 {
23 	public:
24 		CacheableNode(off_t offset,bplustree_node *node)
25 			:
26 			fOffset(offset),
27 			fNode(node)
28 		{
29 		}
30 
31 		virtual ~CacheableNode()
32 		{
33 			if (fNode)
34 				free(fNode);
35 		}
36 
37 		virtual bool Equals(off_t offset)
38 		{
39 			return offset == fOffset;
40 		}
41 
42 		off_t			fOffset;
43 		bplustree_node	*fNode;
44 };
45 
46 
47 NodeCache::NodeCache(BPlusTree *tree)
48 	:	Cache<off_t>(),
49 	fTree(tree)
50 {
51 }
52 
53 
54 Cache<off_t>::Cacheable *
55 NodeCache::NewCacheable(off_t offset)
56 {
57 	return new CacheableNode(offset,fTree->Node(offset,false));
58 }
59 
60 
61 bplustree_node *
62 NodeCache::Get(off_t offset)
63 {
64 	CacheableNode *node = (CacheableNode *)Cache<off_t>::Get(offset);
65 	return node->fNode;
66 }
67 
68 
69 //	#pragma mark -
70 
71 
72 BPlusTree::BPlusTree(int32 keyType,int32 nodeSize,bool allowDuplicates)
73 	:
74 	fStream(NULL),
75 	fHeader(NULL),
76 	fCache(this),
77 	fCurrentNodeOffset(BPLUSTREE_NULL)
78 {
79 	SetTo(keyType,nodeSize,allowDuplicates);
80 }
81 
82 
83 BPlusTree::BPlusTree(BPositionIO *stream,bool allowDuplicates)
84 	:
85 	fStream(NULL),
86 	fHeader(NULL),
87 	fCache(this),
88 	fCurrentNodeOffset(BPLUSTREE_NULL)
89 {
90 	SetTo(stream,allowDuplicates);
91 }
92 
93 
94 BPlusTree::BPlusTree()
95 	:
96 	fStream(NULL),
97 	fHeader(NULL),
98 	fNodeSize(BPLUSTREE_NODE_SIZE),
99 	fAllowDuplicates(true),
100 	fStatus(B_NO_INIT),
101 	fCache(this),
102 	fCurrentNodeOffset(BPLUSTREE_NULL)
103 {
104 }
105 
106 
107 BPlusTree::~BPlusTree()
108 {
109 	fCache.Clear();
110 }
111 
112 
113 void BPlusTree::Initialize(int32 nodeSize)
114 {
115 	// free old data
116 	fCache.Clear(0,true);
117 
118 	if (fHeader)
119 		free(fHeader);
120 
121 	fStream = NULL;
122 
123 	fNodeSize = nodeSize;
124 	fHeader = (bplustree_header *)malloc(fNodeSize);
125  	memset(fHeader,0,fNodeSize);
126 
127  	fCurrentNodeOffset = BPLUSTREE_NULL;
128 }
129 
130 
131 status_t BPlusTree::SetTo(int32 keyType,int32 nodeSize,bool allowDuplicates)
132 {
133 	// initializes in-memory B+Tree
134 
135 	Initialize(nodeSize);
136 
137 	fAllowDuplicates = allowDuplicates;
138 	fCache.SetHoldChanges(true);
139 
140 	// initialize b+tree header
141  	fHeader->magic = BPLUSTREE_MAGIC;
142  	fHeader->node_size = fNodeSize;
143  	fHeader->max_number_of_levels = 1;
144  	fHeader->data_type = keyType;
145  	fHeader->root_node_pointer = fNodeSize;
146  	fHeader->free_node_pointer = BPLUSTREE_NULL;
147  	fHeader->maximum_size = fNodeSize * 2;
148 
149 	return fStatus = B_OK;
150 }
151 
152 
153 status_t BPlusTree::SetTo(BPositionIO *stream,bool allowDuplicates)
154 {
155 	// initializes on-disk B+Tree
156 
157 	bplustree_header header;
158 	ssize_t read = stream->ReadAt(0,&header,sizeof(bplustree_header));
159 	if (read < 0)
160 		return fStatus = read;
161 
162 	// is header valid?
163 
164 	stream->Seek(0,SEEK_END);
165 	off_t size = stream->Position();
166 	stream->Seek(0,SEEK_SET);
167 
168 	//dump_bplustree_header(&header);
169 
170 	if (header.magic != BPLUSTREE_MAGIC
171 		|| header.maximum_size != size
172 		|| (header.root_node_pointer % header.node_size) != 0
173 		|| !header.IsValidLink(header.root_node_pointer)
174 		|| !header.IsValidLink(header.free_node_pointer))
175 		return fStatus = B_BAD_DATA;
176 
177 	fAllowDuplicates = allowDuplicates;
178 
179 	if (DataStream *dataStream = dynamic_cast<DataStream *>(stream))
180 	{
181 		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
182 						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
183 		uint32 mode = dataStream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
184 						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
185 
186 		if (header.data_type > BPLUSTREE_DOUBLE_TYPE
187 			|| (dataStream->Mode() & S_INDEX_DIR) && toMode[header.data_type] != mode
188 			|| !dataStream->IsDirectory())
189 			return fStatus = B_BAD_TYPE;
190 
191 		 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
192 		fAllowDuplicates = (dataStream->Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR;
193 
194 		//printf("allows duplicates? %s\n",fAllowDuplicates ? "yes" : "no");
195 	}
196 
197 	Initialize(header.node_size);
198 
199 	memcpy(fHeader,&header,sizeof(bplustree_header));
200 
201 	fStream = stream;
202 
203 	bplustree_node *node = fCache.Get(header.root_node_pointer);
204 	//if (node)
205 	//	dump_bplustree_node(node);
206 
207 	return fStatus = node && CheckNode(node) ? B_OK : B_BAD_DATA;
208 }
209 
210 
211 status_t BPlusTree::InitCheck()
212 {
213 	return fStatus;
214 }
215 
216 
217 struct validate_info {
218 	off_t	offset;
219 	off_t	from;
220 	bool	free;
221 };
222 
223 
224 status_t
225 BPlusTree::Validate(bool verbose)
226 {
227 	// validates the on-disk b+tree
228 	// (for now only the node integrity is checked)
229 
230 	if (!fStream)
231 		return B_OK;
232 
233 	struct validate_info info;
234 
235 	Stack<validate_info> stack;
236 	info.offset = fHeader->root_node_pointer;
237 	info.from = BPLUSTREE_NULL;
238 	info.free = false;
239 	stack.Push(info);
240 
241 	if (fHeader->free_node_pointer != BPLUSTREE_NULL) {
242 		info.offset = fHeader->free_node_pointer;
243 		info.free = true;
244 		stack.Push(info);
245 	}
246 
247 	char escape[3] = {0x1b, '[', 0};
248 	int32 count = 0,freeCount = 0;
249 
250 	while (true) {
251 		if (!stack.Pop(&info)) {
252 			if (verbose) {
253 				printf("  %" B_PRId32 " B+tree node(s) processed (%" B_PRId32
254 					" free node(s)).\n", count, freeCount);
255 			}
256 			return B_OK;
257 		}
258 
259 		if (!info.free && verbose && ++count % 10 == 0)
260 			printf("  %" B_PRId32 "%s1A\n", count, escape);
261 		else if (info.free)
262 			freeCount++;
263 
264 		bplustree_node *node;
265 		if ((node = fCache.Get(info.offset)) == NULL
266 			|| !info.free && !CheckNode(node)) {
267 			if (verbose) {
268 				fprintf(stderr,"  B+Tree: Could not get data at position %"
269 					B_PRIdOFF " (referenced by node at %" B_PRIdOFF ")\n",
270 					info.offset, info.from);
271 				if ((node = fCache.Get(info.from)) != NULL)
272 					dump_bplustree_node(node,fHeader);
273 			}
274 			return B_BAD_DATA;
275 		}
276 
277 		info.from = info.offset;
278 
279 		if (node->overflow_link != BPLUSTREE_NULL && !info.free) {
280 			info.offset = node->overflow_link;
281 			stack.Push(info);
282 
283 			//dump_bplustree_node(node,fHeader);
284 			off_t *values = node->Values();
285 			for (int32 i = 0;i < node->all_key_count;i++) {
286 				info.offset = values[i];
287 				stack.Push(info);
288 			}
289 		} else if (node->left_link != BPLUSTREE_NULL && info.free) {
290 			info.offset = node->left_link;
291 			stack.Push(info);
292 		}
293 	}
294 }
295 
296 
297 status_t
298 BPlusTree::WriteTo(BPositionIO *stream)
299 {
300 	ssize_t written = stream->WriteAt(0,fHeader,fNodeSize);
301 	if (written < fNodeSize)
302 		return written < B_OK ? written : B_IO_ERROR;
303 
304 	for (off_t offset = fNodeSize;offset < fHeader->maximum_size;offset += fNodeSize)
305 	{
306 		bplustree_node *node = fCache.Get(offset);
307 		if (node == NULL)
308 			return B_ERROR;
309 
310 		written = stream->WriteAt(offset,node,fNodeSize);
311 		if (written < fNodeSize)
312 			return written < B_OK ? written : B_IO_ERROR;
313 	}
314 	return stream->SetSize(fHeader->maximum_size);
315 }
316 
317 
318 //	#pragma mark -
319 
320 
321 void BPlusTree::SetCurrentNode(bplustree_node *node,off_t offset,int8 to)
322 {
323 	fCurrentNodeOffset = offset;
324 	fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->all_key_count;
325 	fDuplicateNode = BPLUSTREE_NULL;
326 }
327 
328 
329 status_t BPlusTree::Goto(int8 to)
330 {
331 	if (fHeader == NULL)
332 		return B_BAD_VALUE;
333 
334 	Stack<off_t> stack;
335 	if (stack.Push(fHeader->root_node_pointer) < B_OK)
336 		return B_NO_MEMORY;
337 
338 	bplustree_node *node;
339 	off_t pos;
340 	while (stack.Pop(&pos) && (node = fCache.Get(pos)) != NULL && CheckNode(node))
341 	{
342 		// is the node a leaf node?
343 		if (node->overflow_link == BPLUSTREE_NULL)
344 		{
345 			SetCurrentNode(node,pos,to);
346 
347 			return B_OK;
348 		}
349 
350 		if (to == BPLUSTREE_END || node->all_key_count == 0)
351 			pos = node->overflow_link;
352 		else
353 		{
354 			if (node->all_key_length > fNodeSize
355 				|| (addr_t)node->Values() > (addr_t)node + fNodeSize
356 					- 8 * node->all_key_count)
357 				return B_ERROR;
358 
359 			pos = *node->Values();
360 		}
361 
362 		if (stack.Push(pos) < B_OK)
363 			break;
364 	}
365 	return B_ERROR;
366 }
367 
368 
369 status_t BPlusTree::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value)
370 {
371 	if (fCurrentNodeOffset == BPLUSTREE_NULL
372 		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
373 		return B_ERROR;
374 
375 	bplustree_node *node;
376 
377 	if (fDuplicateNode != BPLUSTREE_NULL)
378 	{
379 		// regardless of traverse direction the duplicates are always presented in
380 		// the same order; since they are all considered as equal, this shouldn't
381 		// cause any problems
382 
383 		if (!fIsFragment || fDuplicate < fNumDuplicates)
384 			node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode));
385 		else
386 			node = NULL;
387 
388 		if (node != NULL)
389 		{
390 			if (!fIsFragment && fDuplicate >= fNumDuplicates)
391 			{
392 				// if the node is out of duplicates, we go directly to the next one
393 				fDuplicateNode = node->right_link;
394 				if (fDuplicateNode != BPLUSTREE_NULL
395 					&& (node = fCache.Get(fDuplicateNode)) != NULL)
396 				{
397 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,false);
398 					fDuplicate = 0;
399 				}
400 			}
401 			if (fDuplicate < fNumDuplicates)
402 			{
403 				*value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++);
404 				return B_OK;
405 			}
406 		}
407 		fDuplicateNode = BPLUSTREE_NULL;
408 	}
409 
410 	off_t savedNodeOffset = fCurrentNodeOffset;
411 	if ((node = fCache.Get(fCurrentNodeOffset)) == NULL || !CheckNode(node))
412 		return B_ERROR;
413 
414 	fCurrentKey += direction;
415 
416 	// is the current key in the current node?
417 	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count)
418 		   || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0))
419 	{
420 		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link;
421 
422 		// are there any more nodes?
423 		if (fCurrentNodeOffset != BPLUSTREE_NULL)
424 		{
425 			node = fCache.Get(fCurrentNodeOffset);
426 			if (!node || !CheckNode(node))
427 				return B_ERROR;
428 
429 			// reset current key
430 			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->all_key_count;
431 		}
432 		else
433 		{
434 			// there are no nodes left, so turn back to the last key
435 			fCurrentNodeOffset = savedNodeOffset;
436 			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1;
437 
438 			return B_ENTRY_NOT_FOUND;
439 		}
440 	}
441 
442 	if (node->all_key_count == 0)
443 		return B_ERROR; //B_ENTRY_NOT_FOUND;
444 
445 	uint16 length;
446 	uint8 *keyStart = node->KeyAt(fCurrentKey,&length);
447 
448 	length = min_c(length,maxLength);
449 	memcpy(key,keyStart,length);
450 
451 	if (fHeader->data_type == BPLUSTREE_STRING_TYPE)	// terminate string type
452 	{
453 		if (length == maxLength)
454 			length--;
455 		((char *)key)[length] = '\0';
456 	}
457 	*keyLength = length;
458 
459 	off_t offset = node->Values()[fCurrentKey];
460 
461 	// duplicate fragments?
462 	uint8 type = bplustree_node::LinkType(offset);
463 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE)
464 	{
465 		fDuplicateNode = offset;
466 
467 		node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode));
468 		if (node == NULL)
469 			return B_ERROR;
470 
471 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
472 
473 		fNumDuplicates = node->CountDuplicates(offset,fIsFragment);
474 		if (fNumDuplicates)
475 		{
476 			// give back first duplicate
477 			fDuplicate = 1;
478 			offset = node->DuplicateAt(offset,fIsFragment,0);
479 		}
480 		else
481 		{
482 			// shouldn't happen, but we're dealing here with corrupt disks...
483 			fDuplicateNode = BPLUSTREE_NULL;
484 			offset = 0;
485 		}
486 	}
487 	*value = offset;
488 
489 	return B_OK;
490 }
491 
492 
493 int32 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
494 {
495 	switch (fHeader->data_type)
496 	{
497 	    case BPLUSTREE_STRING_TYPE:
498     	{
499 			int len = min_c(keyLength1,keyLength2);
500 			int result = strncmp((const char *)key1,(const char *)key2,len);
501 
502 			if (result == 0)
503 				result = keyLength1 - keyLength2;
504 
505 			return result;
506 		}
507 
508 		case BPLUSTREE_INT32_TYPE:
509 			return *(int32 *)key1 - *(int32 *)key2;
510 
511 		case BPLUSTREE_UINT32_TYPE:
512 		{
513 			if (*(uint32 *)key1 == *(uint32 *)key2)
514 				return 0;
515 			else if (*(uint32 *)key1 > *(uint32 *)key2)
516 				return 1;
517 
518 			return -1;
519 		}
520 
521 		case BPLUSTREE_INT64_TYPE:
522 		{
523 			if (*(int64 *)key1 == *(int64 *)key2)
524 				return 0;
525 			else if (*(int64 *)key1 > *(int64 *)key2)
526 				return 1;
527 
528 			return -1;
529 		}
530 
531 		case BPLUSTREE_UINT64_TYPE:
532 		{
533 			if (*(uint64 *)key1 == *(uint64 *)key2)
534 				return 0;
535 			else if (*(uint64 *)key1 > *(uint64 *)key2)
536 				return 1;
537 
538 			return -1;
539 		}
540 
541 		case BPLUSTREE_FLOAT_TYPE:
542 		{
543 			float result = *(float *)key1 - *(float *)key2;
544 			if (result == 0.0f)
545 				return 0;
546 
547 			return (result < 0.0f) ? -1 : 1;
548 		}
549 
550 		case BPLUSTREE_DOUBLE_TYPE:
551 		{
552 			double result = *(double *)key1 - *(double *)key2;
553 			if (result == 0.0)
554 				return 0;
555 
556 			return (result < 0.0) ? -1 : 1;
557 		}
558 	}
559 	return 0;
560 }
561 
562 
563 status_t BPlusTree::FindKey(bplustree_node *node,uint8 *key,uint16 keyLength,uint16 *index,off_t *next)
564 {
565 	if (node->all_key_count == 0)
566 	{
567 		if (index)
568 			*index = 0;
569 		if (next)
570 			*next = node->overflow_link;
571 		return B_ENTRY_NOT_FOUND;
572 	}
573 
574 	off_t *values = node->Values();
575 	int16 saveIndex = 0;
576 
577 	// binary search in the key array
578 	for (int16 first = 0,last = node->all_key_count - 1;first <= last;)
579 	{
580 		uint16 i = (first + last) >> 1;
581 
582 		uint16 searchLength;
583 		uint8 *searchKey = node->KeyAt(i,&searchLength);
584 
585 		int32 cmp = CompareKeys(key,keyLength,searchKey,searchLength);
586 		if (cmp < 0)
587 		{
588 			last = i - 1;
589 			saveIndex = i;
590 		}
591 		else if (cmp > 0)
592 		{
593 			saveIndex = first = i + 1;
594 		}
595 		else
596 		{
597 			if (index)
598 				*index = i;
599 			if (next)
600 				*next = values[i];
601 			return B_OK;
602 		}
603 	}
604 
605 	if (index)
606 		*index = saveIndex;
607 	if (next)
608 	{
609 		if (saveIndex == node->all_key_count)
610 			*next = node->overflow_link;
611 		else
612 			*next = values[saveIndex];
613 	}
614 	return B_ENTRY_NOT_FOUND;
615 }
616 
617 
618 status_t BPlusTree::SeekDown(Stack<node_and_key> &stack,uint8 *key,uint16 keyLength)
619 {
620 	node_and_key nodeAndKey;
621 	nodeAndKey.nodeOffset = fHeader->root_node_pointer;
622 	nodeAndKey.keyIndex = 0;
623 
624 	bplustree_node *node;
625 	while ((node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) {
626 		// if we are already on leaf level, we're done
627 		if (node->overflow_link == BPLUSTREE_NULL) {
628 			// put the node on the stack
629 			// node that the keyIndex is not properly set here!
630 			nodeAndKey.keyIndex = 0;
631 			stack.Push(nodeAndKey);
632 			return B_OK;
633 		}
634 
635 		off_t nextOffset;
636 		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset);
637 
638 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
639 			return B_ERROR;
640 
641 		// put the node & the correct keyIndex on the stack
642 		stack.Push(nodeAndKey);
643 
644 		nodeAndKey.nodeOffset = nextOffset;
645 	}
646 	return B_ERROR;
647 }
648 
649 
650 void BPlusTree::InsertKey(bplustree_node *node,uint8 *key,uint16 keyLength,off_t value,uint16 index)
651 {
652 	// should never happen, but who knows?
653 	if (index > node->all_key_count)
654 		return;
655 
656 	off_t *values = node->Values();
657 	uint16 *keyLengths = node->KeyLengths();
658 	uint8 *keys = node->Keys();
659 
660 	node->all_key_count++;
661 	node->all_key_length += keyLength;
662 
663 	off_t *newValues = node->Values();
664 	uint16 *newKeyLengths = node->KeyLengths();
665 
666 	// move values and copy new value into them
667 	memmove(newValues + index + 1,values + index,sizeof(off_t) * (node->all_key_count - 1 - index));
668 	memmove(newValues,values,sizeof(off_t) * index);
669 
670 	newValues[index] = value;
671 
672 	// move and update key length index
673 	for (uint16 i = node->all_key_count;i-- > index + 1;)
674 		newKeyLengths[i] = keyLengths[i - 1] + keyLength;
675 	memmove(newKeyLengths,keyLengths,sizeof(uint16) * index);
676 
677 	int32 keyStart;
678 	newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0);
679 
680 	// move keys and copy new key into them
681 	int32 size = node->all_key_length - newKeyLengths[index];
682 	if (size > 0)
683 		memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size);
684 
685 	memcpy(keys + keyStart,key,keyLength);
686 }
687 
688 
689 status_t BPlusTree::InsertDuplicate(bplustree_node */*node*/,uint16 /*index*/)
690 {
691 	printf("DUPLICATE ENTRY!!\n");
692 
693 //		/* handle dup keys */
694 //		if (dupflg == 0)
695 //		{
696 //			bt_errno(b) = BT_DUPKEY;
697 //			goto bombout;
698 //		}
699 //		else
700 //		{
701 //			/* paste new data ptr in page */
702 //			/* and write it out again. */
703 //			off_t	*p;
704 //			p = (off_t *)KEYCLD(op->p);
705 //			*(p + keyat) = rrn;
706 //			op->flags = BT_CHE_DIRTY;
707 //			if(bt_wpage(b,op) == BT_ERR ||
708 //				bt_wpage(b,kp) == BT_ERR)
709 //				return(BT_ERR);
710 //
711 //			/* mark all as well with tree */
712 //			bt_clearerr(b);
713 //			return(BT_OK);
714 //		}
715 
716 	return B_OK;
717 }
718 
719 
720 status_t BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value)
721 {
722 	if (*_keyIndex > node->all_key_count + 1)
723 		return B_BAD_VALUE;
724 
725 	//printf("before (insert \"%s\" (%d bytes) at %d):\n\n",key,*_keyLength,*_keyIndex);
726 	//dump_bplustree_node(node,fHeader);
727 	//hexdump(node,fNodeSize);
728 
729 	off_t otherOffset;
730 	bplustree_node *other = fCache.Get(otherOffset = fHeader->maximum_size); //Node(otherOffset = fHeader->maximum_size/*,false*/);
731 	if (other == NULL)
732 		return B_NO_MEMORY;
733 
734 	uint16 *inKeyLengths = node->KeyLengths();
735 	off_t *inKeyValues = node->Values();
736 	uint8 *inKeys = node->Keys();
737 	uint8 *outKeys = other->Keys();
738 	uint16 keyIndex = *_keyIndex;
739 
740 	// how many keys will fit in one (half) page?
741 
742 	// "bytes" is the number of bytes written for the new key,
743 	// "bytesBefore" are the bytes before that key
744 	// "bytesAfter" are the bytes after the new key, if any
745 	int32 bytes = 0,bytesBefore = 0,bytesAfter = 0;
746 
747 	size_t size = fNodeSize >> 1;
748 	int32 out,in;
749 	for (in = out = 0;in < node->all_key_count + 1;) {
750 		if (!bytes)
751 			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;
752 		if (in == keyIndex && !bytes) {
753 			bytes = *_keyLength;
754 		} else {
755 			if (keyIndex < out) {
756 				bytesAfter = inKeyLengths[in] - bytesBefore;
757 				// fix the key lengths for the new node
758 				inKeyLengths[in] = bytesAfter + bytesBefore + bytes;
759 			} //else
760 			in++;
761 		}
762 		out++;
763 
764 		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
765 						out * (sizeof(uint16) + sizeof(off_t)) >= size) {
766 			// we have found the number of keys in the new node!
767 			break;
768 		}
769 	}
770 
771 	// if the new key was not inserted, set the length of the keys
772 	// that can be copied directly
773 	if (keyIndex >= out && in > 0)
774 		bytesBefore = inKeyLengths[in - 1];
775 
776 	if (bytesBefore < 0 || bytesAfter < 0)
777 		return B_BAD_DATA;
778 
779 	//printf("put %ld keys in the other node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
780 
781 	other->left_link = node->left_link;
782 	other->right_link = nodeOffset;
783 	other->all_key_length = bytes + bytesBefore + bytesAfter;
784 	other->all_key_count = out;
785 	//printf("-> used = %ld (bplustree_node = %ld bytes)\n",other->Used(),sizeof(bplustree_node));
786 
787 	uint16 *outKeyLengths = other->KeyLengths();
788 	off_t *outKeyValues = other->Values();
789 	int32 keys = out > keyIndex ? keyIndex : out;
790 
791 	if (bytesBefore) {
792 		// copy the keys
793 		memcpy(outKeys,inKeys,bytesBefore);
794 		memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16));
795 		memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t));
796 	}
797 	if (bytes) {
798 		// copy the newly inserted key
799 		memcpy(outKeys + bytesBefore,key,bytes);
800 		outKeyLengths[keyIndex] = bytes + bytesBefore;
801 		outKeyValues[keyIndex] = *_value;
802 
803 		if (bytesAfter) {
804 			// copy the keys after the new key
805 			memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter);
806 			keys = out - keyIndex - 1;
807 			memcpy(outKeyLengths + keyIndex + 1,inKeyLengths + keyIndex,keys * sizeof(uint16));
808 			memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t));
809 		}
810 	}
811 
812 	// if the new key was already inserted, we shouldn't use it again
813 	if (in != out)
814 		keyIndex--;
815 
816 	int32 total = bytesBefore + bytesAfter;
817 
818 	// if we have split an index node, we have to drop the first key
819 	// of the next node (which can also be the new key to insert)
820 	if (node->overflow_link != BPLUSTREE_NULL) {
821 		if (in == keyIndex) {
822 			other->overflow_link = *_value;
823 			keyIndex--;
824 		} else {
825 			other->overflow_link = inKeyValues[in];
826 			total = inKeyLengths[in++];
827 		}
828 	}
829 
830 	// and now the same game for the other page and the rest of the keys
831 	// (but with memmove() instead of memcpy(), because they may overlap)
832 
833 	bytesBefore = bytesAfter = bytes = 0;
834 	out = 0;
835 	int32 skip = in;
836 	while (in < node->all_key_count + 1) {
837 		//printf("in = %ld, keyIndex = %d, bytes = %ld\n",in,keyIndex,bytes);
838 		if (in == keyIndex && !bytes) {
839 			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
840 			//printf("bytesBefore = %ld\n",bytesBefore);
841 			bytes = *_keyLength;
842 		} else if (in < node->all_key_count) {
843 			//printf("1.inKeyLength[%ld] = %u\n",in,inKeyLengths[in]);
844 			inKeyLengths[in] -= total;
845 			if (bytes) {
846 				inKeyLengths[in] += bytes;
847 				bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
848 				//printf("2.inKeyLength[%ld] = %u, bytesAfter = %ld, bytesBefore = %ld\n",in,inKeyLengths[in],bytesAfter,bytesBefore);
849 			}
850 
851 			in++;
852 		} else
853 			in++;
854 
855 		out++;
856 
857 		//printf("-> out = %ld, keylen = %ld, %ld bytes total\n",out,bytesBefore,round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
858 		//				out * (sizeof(uint16) + sizeof(off_t)));
859 		if (in > node->all_key_count && keyIndex < in)
860 			break;
861 	}
862 
863 	//printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
864 
865 	if (keyIndex >= in && keyIndex - skip < out) {
866 		bytesAfter = inKeyLengths[in] - bytesBefore - total;
867 	} else if (keyIndex < skip)
868 		bytesBefore = node->all_key_length - total;
869 
870 	//printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total);
871 
872 	if (bytesBefore < 0 || bytesAfter < 0)
873 		return B_BAD_DATA;
874 
875 	node->left_link = otherOffset;
876 		// right link, and overflow link can stay the same
877 	node->all_key_length = bytes + bytesBefore + bytesAfter;
878 	node->all_key_count = out - 1;
879 
880 	// array positions have changed
881 	outKeyLengths = node->KeyLengths();
882 	outKeyValues = node->Values();
883 
884 	//printf("put %ld keys in the first node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in);
885 
886 	// move the keys in the old node: the order is important here,
887 	// because we don't want to overwrite any contents
888 
889 	keys = keyIndex <= skip ? out : keyIndex - skip;
890 	keyIndex -= skip;
891 
892 	if (bytesBefore)
893 		memmove(inKeys,inKeys + total,bytesBefore);
894 	if (bytesAfter)
895 		memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter);
896 
897 	if (bytesBefore)
898 		memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16));
899 	in = out - keyIndex - 1;
900 	if (bytesAfter)
901 		memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16));
902 
903 	if (bytesBefore)
904 		memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t));
905 	if (bytesAfter)
906 		memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t));
907 
908 	if (bytes) {
909 		// finally, copy the newly inserted key (don't overwrite anything)
910 		memcpy(inKeys + bytesBefore,key,bytes);
911 		outKeyLengths[keyIndex] = bytes + bytesBefore;
912 		outKeyValues[keyIndex] = *_value;
913 	}
914 
915 	//puts("\n!!!!!!!!!!!!!!! after: !!!!!!!!!!!!!!!\n");
916 	//dump_bplustree_node(other,fHeader);
917 	//hexdump(other,fNodeSize);
918 	//puts("\n");
919 	//dump_bplustree_node(node,fHeader);
920 	//hexdump(node,fNodeSize);
921 
922 	// write the updated nodes back
923 
924 	fCache.SetDirty(otherOffset,true);
925 	fCache.SetDirty(nodeOffset,true);
926 
927 	// update the right link of the node in the left of the new node
928 	if (other->left_link != BPLUSTREE_NULL) {
929 		bplustree_node *left = fCache.Get(other->left_link);
930 		if (left != NULL) {
931 			left->right_link = otherOffset;
932 			fCache.SetDirty(other->left_link,true);
933 		}
934 	}
935 
936 	// prepare key to insert in the parent node
937 
938 	//printf("skip: %ld, count-1: %u\n",skip,other->all_key_count - 1);
939 	uint16 length;
940 	uint8 *lastKey = other->KeyAt(other->all_key_count - 1,&length);
941 	memcpy(key,lastKey,length);
942 	*_keyLength = length;
943 	*_value = otherOffset;
944 
945 	return B_OK;
946 }
947 
948 
949 status_t BPlusTree::Insert(uint8 *key,uint16 keyLength,off_t value)
950 {
951 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
952 		return B_BAD_VALUE;
953 
954 	Stack<node_and_key> stack;
955 	if (SeekDown(stack,key,keyLength) != B_OK)
956 		return B_ERROR;
957 
958 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
959 
960 	memcpy(keyBuffer,key,keyLength);
961 	keyBuffer[keyLength] = 0;
962 
963 	off_t valueToInsert = value;
964 
965 	fCurrentNodeOffset = BPLUSTREE_NULL;
966 
967 	node_and_key nodeAndKey;
968 	bplustree_node *node;
969 	uint32 count = 0;
970 
971 	while (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node))
972 	{
973 		if (count++ == 0)	// first round, check for duplicate entries
974 		{
975 			status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
976 			if (status == B_ERROR)
977 				return B_ERROR;
978 
979 			// is this a duplicate entry?
980 			if (status == B_OK && node->overflow_link == BPLUSTREE_NULL)
981 			{
982 				if (fAllowDuplicates)
983 					return InsertDuplicate(node,nodeAndKey.keyIndex);
984 				else
985 					return B_NAME_IN_USE;
986 			}
987 		}
988 
989 		// is the node big enough to hold the pair?
990 		if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength)
991 			+ (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
992 		{
993 			InsertKey(node,keyBuffer,keyLength,valueToInsert,nodeAndKey.keyIndex);
994 			fCache.SetDirty(nodeAndKey.nodeOffset,true);
995 
996 			return B_OK;
997 		}
998 		else
999 		{
1000 			// do we need to allocate a new root node? if so, then do
1001 			// it now
1002 			bplustree_node *rootNode = NULL;
1003 			off_t newRoot = BPLUSTREE_NULL;
1004 			if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) {
1005 				rootNode = fCache.Get(newRoot = fHeader->maximum_size);
1006 				if (rootNode == NULL) {
1007 					// the tree is most likely dead
1008 					return B_NO_MEMORY;
1009 				}
1010 			}
1011 
1012 			if (SplitNode(node,nodeAndKey.nodeOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&valueToInsert) < B_OK) {
1013 				if (newRoot != BPLUSTREE_NULL) {
1014 					// free root node
1015 				}
1016 				return B_ERROR;
1017 			}
1018 
1019 			if (newRoot != BPLUSTREE_NULL) {
1020 				rootNode->overflow_link = nodeAndKey.nodeOffset;
1021 
1022 				InsertKey(rootNode,keyBuffer,keyLength,node->left_link,0);
1023 
1024 				fHeader->root_node_pointer = newRoot;
1025 				fHeader->max_number_of_levels++;
1026 				// write header
1027 
1028 				fCache.SetDirty(newRoot,true);
1029 
1030 				return B_OK;
1031 			}
1032 		}
1033 	}
1034 
1035 	return B_ERROR;
1036 }
1037 
1038 
1039 status_t BPlusTree::Find(uint8 *key,uint16 keyLength,off_t *value)
1040 {
1041 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1042 		return B_BAD_VALUE;
1043 
1044 	Stack<node_and_key> stack;
1045 	if (SeekDown(stack,key,keyLength) != B_OK)
1046 		return B_ERROR;
1047 
1048 	fCurrentNodeOffset = BPLUSTREE_NULL;
1049 
1050 	node_and_key nodeAndKey;
1051 	bplustree_node *node;
1052 
1053 	if (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node))
1054 	{
1055 		status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
1056 		if (status == B_ERROR)
1057 			return B_ERROR;
1058 
1059 		SetCurrentNode(node,nodeAndKey.nodeOffset);
1060 
1061 		if (status == B_OK && node->overflow_link == BPLUSTREE_NULL)
1062 		{
1063 			*value = node->Values()[nodeAndKey.keyIndex];
1064 			return B_OK;
1065 		}
1066 	}
1067 	return B_ENTRY_NOT_FOUND;
1068 }
1069 
1070 
1071 //	#pragma mark -
1072 
1073 
1074 bool
1075 BPlusTree::CheckNode(bplustree_node *node)
1076 {
1077 	if (!fHeader->IsValidLink(node->left_link)
1078 		|| !fHeader->IsValidLink(node->right_link)
1079 		|| !fHeader->IsValidLink(node->overflow_link)
1080 		|| (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize)
1081 		return false;
1082 
1083 	return true;
1084 }
1085 
1086 
1087 bplustree_node *BPlusTree::Node(off_t nodeOffset,bool check)
1088 {
1089 /*printf("1: %d,%d,%d\n",
1090 	nodeOffset > fHeader->maximum_size - fNodeSize,
1091 	nodeOffset < 0 && nodeOffset != BPLUSTREE_NULL,
1092 	(nodeOffset % fNodeSize) != 0);
1093 */
1094 	// the super node is always in memory, and shouldn't
1095 	// never be taken out of the cache
1096 	if (nodeOffset > fHeader->maximum_size /*- fNodeSize*/
1097 		|| nodeOffset <= 0 && nodeOffset != BPLUSTREE_NULL
1098 		|| (nodeOffset % fNodeSize) != 0)
1099 		return NULL;
1100 
1101 	bplustree_node *node = (bplustree_node *)malloc(fNodeSize);
1102 	if (node == NULL)
1103 		return NULL;
1104 
1105 	if (nodeOffset == BPLUSTREE_NULL || !fStream)
1106 	{
1107 	 	node->left_link = BPLUSTREE_NULL;
1108 	 	node->right_link = BPLUSTREE_NULL;
1109 	 	node->overflow_link = BPLUSTREE_NULL;
1110 	 	node->all_key_count = 0;
1111 	 	node->all_key_length = 0;
1112 	}
1113 	else if (fStream && fStream->ReadAt(nodeOffset,node,fNodeSize) < fNodeSize)
1114 	{
1115 		free(node);
1116 		return NULL;
1117 	}
1118 
1119 	if (check && node != NULL)
1120 	{
1121 		// sanity checks (links, all_key_count)
1122 		if (!fHeader->IsValidLink(node->left_link)
1123 			|| !fHeader->IsValidLink(node->right_link)
1124 			|| !fHeader->IsValidLink(node->overflow_link)
1125 			|| (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize)
1126 			return NULL;
1127 	}
1128 
1129 	if (!fStream && nodeOffset > fHeader->maximum_size - fNodeSize) {
1130 		// do some hacks here
1131 		fHeader->maximum_size += fNodeSize;
1132 	}
1133 
1134 	return node;
1135 }
1136 
1137 
1138 void BPlusTree::SetHoldChanges(bool hold)
1139 {
1140 	fCache.SetHoldChanges(hold);
1141 }
1142 
1143 
1144 //	#pragma mark -
1145 
1146 
1147 uint8 *bplustree_node::KeyAt(int32 index,uint16 *keyLength) const
1148 {
1149 	if (index < 0 || index > all_key_count)
1150 		return NULL;
1151 
1152 	uint8 *keyStart = Keys();
1153 	uint16 *keyLengths = KeyLengths();
1154 
1155 	*keyLength = keyLengths[index] - (index != 0 ? keyLengths[index - 1] : 0);
1156 	if (index > 0)
1157 		keyStart += keyLengths[index - 1];
1158 
1159 	return keyStart;
1160 }
1161 
1162 
1163 uint8 bplustree_node::CountDuplicates(off_t offset,bool isFragment) const
1164 {
1165 	// the duplicate fragment handling is currently hard-coded to a node size
1166 	// of 1024 bytes - with future versions of BFS, this may be a problem
1167 
1168 	if (isFragment)
1169 	{
1170 		uint32 fragment = 8 * ((uint64)offset & 0x3ff);
1171 
1172 		return ((off_t *)this)[fragment];
1173 	}
1174 	return overflow_link;
1175 }
1176 
1177 
1178 off_t bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const
1179 {
1180 	uint32 start;
1181 	if (isFragment)
1182 		start = 8 * ((uint64)offset & 0x3ff);
1183 	else
1184 		start = 2;
1185 
1186 	return ((off_t *)this)[start + 1 + index];
1187 }
1188 
1189