/* BPlusTree - BFS B+Tree implementation
**
** Initial version by Axel Dörfler, axeld@pinc-software.de
** Roughly based on 'btlib' written by Marcus J. Ranum
**
** Copyright (c) 2001-2004 pinc Software. All Rights Reserved.
** This file may be used under the terms of the OpenBeOS License.
*/


#include "Debug.h"
#include "BPlusTree.h"
#include "Inode.h"
#include "Utility.h"
#include "Stack.h"

#include <kernel_cpp.h>
#include <TypeConstants.h>

#include <string.h>
#include <stdlib.h>
#include <stdio.h>


#ifdef DEBUG
class NodeChecker {
	public:
		NodeChecker(bplustree_node *node, int32 nodeSize)
			:
			fNode(node),
			fSize(nodeSize)
		{
		}

		~NodeChecker()
		{
			fNode->CheckIntegrity(fSize);
		}

	private:
		bplustree_node *fNode;
		int32	fSize;
};
#endif

// Node Caching for the BPlusTree class
//
// With write support, there is the need for a function that allocates new
// nodes by either returning empty nodes, or by growing the file's data stream
//
// !! The CachedNode class assumes that you have properly locked the stream
// !! before asking for nodes.
//
// Note: This code will fail if the block size is smaller than the node size!
// Since BFS supports block sizes of 1024 bytes or greater, and the node size
// is hard-coded to 1024 bytes, that's not an issue now.

void 
CachedNode::Unset()
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fBlock != NULL) {
		release_block(fTree->fStream->GetVolume()->Device(), fBlockNumber);
	
		fBlock = NULL;
		fNode = NULL;
	}
}


bplustree_node *
CachedNode::SetTo(off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		bplustree_header *header = fTree->fHeader;
		if (!header->IsValidLink(fNode->LeftLink())
			|| !header->IsValidLink(fNode->RightLink())
			|| !header->IsValidLink(fNode->OverflowLink())
			|| (int8 *)fNode->Values() + fNode->NumKeys() * sizeof(off_t) >
					(int8 *)fNode + fTree->fNodeSize) {
			FATAL(("invalid node read from offset %Ld, inode at %Ld\n",
					offset, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


bplustree_header *
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();
	
	InternalSetTo(0LL);
	return (bplustree_header *)fNode;
}


bplustree_node *
CachedNode::InternalSetTo(off_t offset)
{
	fNode = NULL;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume *volume = fTree->fStream->GetVolume();

		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;

		fBlock = (uint8 *)get_block(volume->Device(), fBlockNumber, volume->BlockSize());
		if (fBlock) {
			// the node is somewhere in that block... (confusing offset calculation)
			fNode = (bplustree_node *)(fBlock + offset -
						(fileOffset + blockOffset * volume->BlockSize()));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}


status_t
CachedNode::Free(Transaction *transaction, off_t offset)
{
	if (transaction == NULL || fTree == NULL || fTree->fStream == NULL
		|| offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// ToDo: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done everytime that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = fTree->fHeader->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		fTree->fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);

		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		return fTree->fCachedHeader.WriteBack(transaction);
	}

	// add the node to the free nodes list
	fNode->left_link = fTree->fHeader->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	if (WriteBack(transaction) == B_OK) {
		fTree->fHeader->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
		return fTree->fCachedHeader.WriteBack(transaction);
	}
	return B_ERROR;
}


status_t
CachedNode::Allocate(Transaction *transaction, bplustree_node **_node, off_t *_offset)
{
	if (transaction == NULL || fTree == NULL || fTree->fHeader == NULL
		|| fTree->fStream == NULL) {
		RETURN_ERROR(B_BAD_VALUE);
	}

	status_t status;

	// if there are any free nodes, recycle them
	if (SetTo(fTree->fHeader->FreeNode(), false) != NULL) {
		*_offset = fTree->fHeader->FreeNode();
		
		// set new free node pointer
		fTree->fHeader->free_node_pointer = fNode->left_link;
		if ((status = fTree->fCachedHeader.WriteBack(transaction)) == B_OK) {
			fNode->Initialize();
			*_node = fNode;
			return B_OK;
		}
		return status;
	}
	// allocate space for a new node
	Inode *stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = fTree->fHeader->MaximumSize();
	fTree->fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(fTree->fHeader->MaximumSize() + fTree->fNodeSize);

	if (SetTo(offset, false) != NULL) {
		*_offset = offset;

		if (fTree->fCachedHeader.WriteBack(transaction) >= B_OK) {
			fNode->Initialize();
			*_node = fNode;
			return B_OK;
		}
	}
	RETURN_ERROR(B_ERROR);
}


status_t 
CachedNode::WriteBack(Transaction *transaction)
{
	if (transaction == NULL || fTree == NULL || fTree->fStream == NULL || fNode == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	return transaction->WriteBlocks(fBlockNumber, fBlock);
}


//	#pragma mark -


BPlusTree::BPlusTree(Transaction *transaction, Inode *stream, int32 nodeSize)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(transaction, stream);
}


BPlusTree::BPlusTree(Inode *stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(stream);
}


BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
}


BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	if (fIteratorLock.Lock() < B_OK)
		return;

	TreeIterator *iterator = NULL;
	while ((iterator = fIterators.Next(iterator)) != NULL)
		iterator->Stop();

	fIteratorLock.Unlock();
}


/** Create a new B+Tree on the specified stream */

status_t
BPlusTree::SetTo(Transaction *transaction, Inode *stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);
		
		fHeader = fCachedHeader.SetToHeader();
		if (fHeader == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
						&& stream->BlockRun() != stream->Parent())
						|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
 	fHeader->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
 	fHeader->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
 	fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
 	fHeader->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
 	fHeader->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
 	fHeader->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
 	fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	if (fCachedHeader.WriteBack(transaction) < B_OK)
		RETURN_ERROR(fStatus = B_ERROR);

	// initialize b+tree root node
	CachedNode cached(this, fHeader->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_ERROR);

	cached.Node()->Initialize();
	return fStatus = cached.WriteBack(transaction);
}


status_t
BPlusTree::SetTo(Inode *stream)
{
	if (stream == NULL || stream->Node() == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);
	
	// is header valid?

	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| fHeader->MaximumSize() != stream->Size()
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode()))
		RETURN_ERROR(fStatus = B_BAD_DATA);

	fNodeSize = fHeader->NodeSize();

	{
		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
	
		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->DataType()] != mode
			|| !stream->IsContainer()) {
			D(	dump_bplustree_header(fHeader);
				dump_inode(stream->Node());
			);
			RETURN_ERROR(fStatus = B_BAD_TYPE);
		}

		 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
		 // in the original BFS code - we will honour it nevertheless
		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
							&& stream->BlockRun() != stream->Parent())
							|| (stream->Mode() & S_ALLOW_DUPS) != 0;
	}

	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


status_t
BPlusTree::InitCheck()
{
	return fStatus;
}


int32 
BPlusTree::TypeCodeToKeyType(type_code code)
{
	switch (code) {
		case B_STRING_TYPE:
			return BPLUSTREE_STRING_TYPE;
		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return BPLUSTREE_INT32_TYPE;
		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			return BPLUSTREE_UINT32_TYPE;
		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			return BPLUSTREE_INT64_TYPE;
		case B_UINT64_TYPE:
			return BPLUSTREE_UINT64_TYPE;
		case B_FLOAT_TYPE:
			return BPLUSTREE_FLOAT_TYPE;
		case B_DOUBLE_TYPE:
			return BPLUSTREE_DOUBLE_TYPE;
	}
	return -1;
}


int32 
BPlusTree::ModeToKeyType(mode_t mode)
{
	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
				   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
		case S_INT_INDEX:
			return BPLUSTREE_INT32_TYPE;
		case S_UINT_INDEX:
			return BPLUSTREE_UINT32_TYPE;
		case S_LONG_LONG_INDEX:
			return BPLUSTREE_INT64_TYPE;
		case S_ULONG_LONG_INDEX:
			return BPLUSTREE_UINT64_TYPE;
		case S_FLOAT_INDEX:
			return BPLUSTREE_FLOAT_TYPE;
		case S_DOUBLE_INDEX:
			return BPLUSTREE_DOUBLE_TYPE;
		case S_STR_INDEX:
		default:
			// default is for standard directories
			return BPLUSTREE_STRING_TYPE;
	}
}


//	#pragma mark -


void
BPlusTree::UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt,
	int8 change)
{
	// Although every iterator which is affected by this update currently
	// waits on a semaphore, other iterators could be added/removed at
	// any time, so we need to protect this loop
	if (fIteratorLock.Lock() < B_OK)
		return;

	TreeIterator *iterator = NULL;
	while ((iterator = fIterators.Next(iterator)) != NULL)
		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);

	fIteratorLock.Unlock();
}


void
BPlusTree::AddIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Add(iterator);

	fIteratorLock.Unlock();
}


void 
BPlusTree::RemoveIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Remove(iterator);

	fIteratorLock.Unlock();
}


int32
BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
{
	type_code type = 0;
	switch (fHeader->data_type) {
	    case BPLUSTREE_STRING_TYPE:
	    	type = B_STRING_TYPE;
	    	break;
		case BPLUSTREE_INT32_TYPE:
	    	type = B_INT32_TYPE;
	    	break;
		case BPLUSTREE_UINT32_TYPE:
	    	type = B_UINT32_TYPE;
	    	break;
		case BPLUSTREE_INT64_TYPE:
	    	type = B_INT64_TYPE;
	    	break;
		case BPLUSTREE_UINT64_TYPE:
	    	type = B_UINT64_TYPE;
	    	break;
		case BPLUSTREE_FLOAT_TYPE:
	    	type = B_FLOAT_TYPE;
	    	break;
		case BPLUSTREE_DOUBLE_TYPE:
	    	type = B_DOUBLE_TYPE;
	    	break;
	}
   	return compareKeys(type, key1, keyLength1, key2, keyLength2);
}


status_t
BPlusTree::FindKey(bplustree_node *node, const uint8 *key, uint16 keyLength, uint16 *index,
	off_t *next)
{
	if (node->all_key_count == 0) {
		if (index)
			*index = 0;
		if (next)
			*next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t *values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8 *searchKey = node->KeyAt(i, &searchLength);
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (index)
				*index = i;
			if (next)
				*next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (index)
		*index = saveIndex;
	if (next) {
		if (saveIndex == node->NumKeys())
			*next = node->OverflowLink();
		else
			*next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}


/**	Prepares the stack to contain all nodes that were passed while
 *	following the key, from the root node to the leaf node that could
 *	or should contain that key.
 */

status_t
BPlusTree::SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	bplustree_node *node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// node that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset);
		
		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


status_t
BPlusTree::FindFreeDuplicateFragment(bplustree_node *node, CachedNode *cached, off_t *_offset,
	bplustree_node **_fragment, uint32 *_index)
{
	off_t *values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(values[i]) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		bplustree_node *fragment = cached->SetTo(bplustree_node::FragmentOffset(values[i]), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", values[i]));
			continue;
		}
		
		// see if there is some space left for us
		int32 num = (fNodeSize >> 3) / (NUM_FRAGMENT_VALUES + 1);
		for (int32 j = 0;j < num;j++) {
			duplicate_array *array = fragment->FragmentAt(j);

			if (array->count == 0) {
				*_offset = bplustree_node::FragmentOffset(values[i]);
				*_fragment = fragment;
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}


status_t
BPlusTree::InsertDuplicate(Transaction *transaction, CachedNode *cached, bplustree_node *node,
	uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t *values = node->Values();
	off_t oldValue = values[index];
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		//
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node
		//
		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node *duplicate = cachedDuplicate.SetTo(bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
					bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue),
					array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// test if the fragment will be empty if we remove this key's values			
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// the node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// create a new duplicate node
					CachedNode cachedNewDuplicate(this);
					bplustree_node *newDuplicate;
					status = cachedNewDuplicate.Allocate(transaction, &newDuplicate, &offset);
					if (status < B_OK)
						return status;

					// copy the array from the fragment node to the duplicate node
					// and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
	
					array = newDuplicate->DuplicateArray();
					array->Insert(value);
					
					// if this fails, the old fragments node will contain wrong
					// data... (but since it couldn't be written, it shouldn't
					// be fatal)
					if ((status = cachedNewDuplicate.WriteBack(transaction)) < B_OK)
						return status;
				}

				// update the main pointer to link to a duplicate node
				values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE, offset);
				if ((status = cached->WriteBack(transaction)) < B_OK)
					return status;
			}

			return cachedDuplicate.WriteBack(transaction);
		}

		//
		// Put the value into a dedicated duplicate node
		//

		// search for free space in the duplicate nodes of that key
		duplicate_array *array;
		bplustree_node *duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
					duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		if (array->count < NUM_DUPLICATE_VALUES) {
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			CachedNode cachedNewDuplicate(this);
			bplustree_node *newDuplicate;
			status = cachedNewDuplicate.Allocate(transaction, &newDuplicate, &offset);
			if (status < B_OK)
				return status;

			// link the two nodes together
			duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
			
			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
			
			status = cachedNewDuplicate.WriteBack(transaction);
			if (status < B_OK)
				return status;
		}
		return cachedDuplicate.WriteBack(transaction);
	}

	//
	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into
	//

	uint32 fragmentIndex = 0;
	bplustree_node *fragment;
	if (FindFreeDuplicateFragment(node, &cachedDuplicate, &offset, &fragment, &fragmentIndex) < B_OK) {
		// allocate a new duplicate fragment node
		if ((status = cachedDuplicate.Allocate(transaction, &fragment, &offset)) < B_OK)
			return status;

		memset(fragment, 0, fNodeSize);
	}
	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if ((status = cachedDuplicate.WriteBack(transaction)) < B_OK)
		return status;

	values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex);

	return cached->WriteBack(transaction);
}


void
BPlusTree::InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength,
	off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t *values = node->Values();
	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength);

	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = value;

	// move and update key length index
	for (uint16 i = node->NumKeys(); i-- > index + 1;)
		newKeyLengths[i] = keyLengths[i - 1] + keyLength;
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0);

	// move keys and copy new key into them
	int32 size = node->AllKeyLength() - newKeyLengths[index];
	if (size > 0)
		memmove(keys + newKeyLengths[index], keys + newKeyLengths[index] - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}


status_t
BPlusTree::SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other,
	off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value)
{
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	uint16 *inKeyLengths = node->KeyLengths();
	off_t *inKeyValues = node->Values();
	uint8 *inKeys = node->Keys();
	uint8 *outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	size_t size = fNodeSize >> 1;
	int32 out, in;
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes)
			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;

		if (in == keyIndex && !bytes) {
			bytes = *_keyLength;
		} else {
			if (keyIndex < out)
				bytesAfter = inKeyLengths[in] - bytesBefore;

			in++;
		}
		out++;

		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
						out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = inKeyLengths[in - 1];

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16 *outKeyLengths = other->KeyLengths();
	off_t *outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = bytes + bytesBefore;
		outKeyValues[keyIndex] = *_value;

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter);
			keys = out - keyIndex - 1;
			for (int32 i = 0;i < keys;i++)
				outKeyLengths[keyIndex + i + 1] = inKeyLengths[keyIndex + i] + bytes;
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8 *newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8 *droppedKey = node->KeyAt(in, &newLength);
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8 *)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = inKeyLengths[in++];
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
			bytes = *_keyLength;
		} else {
			if (in < node->NumKeys()) {
				inKeyLengths[in] -= total;
				if (bytes) {
					inKeyLengths[in] += bytes;
					bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
				}
			}
			in++;
		}

		out++;

		// break out when all keys are done
		if (in > node->NumKeys() && keyIndex < in)
			break;
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex >= in && keyIndex - skip < out)
		bytesAfter = inKeyLengths[in] - bytesBefore - total;
	else if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter)
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter);

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	in = out - keyIndex - 1;
	if (bytesAfter)
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16));

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter)
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t));

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = bytes + bytesBefore;
		outKeyValues[keyIndex] = *_value;
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}


status_t
BPlusTree::Insert(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	Stack<node_and_key> stack;
	if (SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	bplustree_node *node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize);
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates)
					return InsertDuplicate(transaction, &cached, node, nodeAndKey.keyIndex, value);

				RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		// is the node big enough to hold the pair?
		if (int32(round_up(sizeof(bplustree_node) + node->AllKeyLength() + keyLength)
			+ (node->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
		{
			InsertKey(node, nodeAndKey.keyIndex, keyBuffer, keyLength, value);
			UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1);

			return cached.WriteBack(transaction);
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node *root;
				status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node *other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other, &otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			if (SplitNode(node, nodeAndKey.nodeOffset, other, otherOffset,
					&nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) < B_OK) {
				// free root node & other node here
				cachedNewRoot.Free(transaction, newRoot);
				cachedOther.Free(transaction, otherOffset);					

				RETURN_ERROR(B_ERROR);
			}

			// write the updated nodes back
		
			if (cached.WriteBack(transaction) < B_OK
				|| cachedOther.WriteBack(transaction) < B_OK)
				RETURN_ERROR(B_ERROR);

			UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex,
				node->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetTo(other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
				if (cachedOther.WriteBack(transaction) < B_OK)
					RETURN_ERROR(B_ERROR);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node *root = cachedNewRoot.Node();

				InsertKey(root, 0, keyBuffer, keyLength, node->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(nodeAndKey.nodeOffset);

				if (cachedNewRoot.WriteBack(transaction) < B_OK)
					RETURN_ERROR(B_ERROR);

				// finally, update header to point to the new root
				fHeader->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(fHeader->MaxNumberOfLevels() + 1);

				return fCachedHeader.WriteBack(transaction);
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}


status_t
BPlusTree::RemoveDuplicate(Transaction *transaction, bplustree_node *node, CachedNode *cached,
	uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t *values = node->Values();
	off_t oldValue = values[index];
	status_t status;

	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node *duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));

		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
				bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value))
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));

		// remove the array from the fragment node if it is empty
		if (array->count == 1) {
			// set the link to the remaining value
			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free the array and write the changes back
			if (duplicate->FragmentsUsed(fNodeSize) == 1)
				status = cachedDuplicate.Free(transaction, duplicateOffset);
			else {
				array->count = 0;
				status = cachedDuplicate.WriteBack(transaction);
			}
			if (status < B_OK)
				return status;

			return cached->WriteBack(transaction);
		}
		return cachedDuplicate.WriteBack(transaction);
	}

	//
	// Remove value from a duplicate node!
	//

	duplicate_array *array;

	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n", duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);
		
		duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
	
		if (isLast && array->count == 1 || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (which should not be, if
			// we are the only one working on that tree...)
	
			if (duplicateOffset == bplustree_node::FragmentOffset(oldValue)
				|| array->count == 1) {
				if (array->count == 1 && isLast)
					values[index] = array->values[0];
				else if (isLast) {
					FATAL(("removed last value from duplicate!\n"));
				} else
					values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE, right);
	
				if ((status = cached->WriteBack(transaction)) < B_OK)
					return status;
			}

			if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetTo(left, false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
					&& duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}

				status = cachedDuplicate.WriteBack(transaction);
				if (status < B_OK)
					return status;
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetTo(right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
					&& duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}

				return cachedDuplicate.WriteBack(transaction);
			}
			return status;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);
	
			bplustree_node *fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (FindFreeDuplicateFragment(node, &cachedOther, &offset,
					&fragment, &fragmentIndex) < B_OK) {
				// convert node
				memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			} else {
				// move to other node
				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
	
				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			}
			values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT,
								duplicateOffset, fragmentIndex);

			if ((status = cached->WriteBack(transaction)) < B_OK)
				return status;

			if (fragment != NULL)
				return cachedOther.WriteBack(transaction);
		}
		return cachedDuplicate.WriteBack(transaction);
	}
}


/** Removes the key with the given index from the specified node.
 *	Since it has to get the key from the node anyway (to obtain it's
 *	pointer), it's not needed to pass the key & its length, although
 *	the calling method (BPlusTree::Remove()) have this data.
 */

void
BPlusTree::RemoveKey(bplustree_node *node, uint16 index)
{
	// should never happen, but who knows?
	if (index > node->NumKeys() && node->NumKeys() > 0) {
		FATAL(("Asked me to remove key outer limits: %u\n", index));
		return;
	}

	off_t *values = node->Values();

	// if we would have to drop the overflow link, drop
	// the last key instead and update the overflow link
	// to the value of that one
	if (!node->IsLeaf() && index == node->NumKeys())
		node->overflow_link = values[--index];

	uint16 length;
	uint8 *key = node->KeyAt(index, &length);
	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		FATAL(("Key length to long: %s, %u (inode at %ld,%u)\n", key, length,
			fStream->BlockRun().allocation_group, fStream->BlockRun().start));
		fStream->GetVolume()->Panic();
		return;
	}

	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length);

	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move key data
	memmove(key, key + length, node->AllKeyLength() - (key - keys));

	// move and update key lengths
	if (index > 0 && newKeyLengths != keyLengths)
		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
	for (uint16 i = index; i < node->NumKeys(); i++)
		newKeyLengths[i] = keyLengths[i + 1] - length;

	// move values
	if (index > 0)
		memmove(newValues, values, index * sizeof(off_t));
	if (node->NumKeys() > index)
		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
}


/**	Removes the specified key from the tree. The "value" parameter is only used
 *	for trees which allow duplicates, so you may safely ignore it.
 *	It's not an optional parameter, so at least you have to think about it.
 */

status_t
BPlusTree::Remove(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	Stack<node_and_key> stack;
	if (SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	node_and_key nodeAndKey;
	bplustree_node *node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize);
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
			if (status < B_OK)
				RETURN_ERROR(status);

			// If we will remove the last key, the iterator will be set
			// to the next node after the current - if there aren't any
			// more nodes, we need a way to prevent the TreeIterators to
			// touch the old node again, we use BPLUSTREE_FREE for this
			off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink();
			UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ?
								next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);

			// is this a duplicate entry?
			if (bplustree_node::IsDuplicate(node->Values()[nodeAndKey.keyIndex])) {
				if (fAllowDuplicates)
					return RemoveDuplicate(transaction, node, &cached, nodeAndKey.keyIndex, value);
				else
					RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		// if it's an empty root node, we have to convert it
		// to a leaf node by dropping the overflow link, or,
		// if it's a leaf node, just empty it
		if (nodeAndKey.nodeOffset == fHeader->RootNode()
			&& node->NumKeys() == 0
			|| node->NumKeys() == 1 && node->IsLeaf()) {
			node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
			node->all_key_count = 0;
			node->all_key_length = 0;

			if (cached.WriteBack(transaction) < B_OK)
				return B_IO_ERROR;

			// if we've cleared the root node, reset the maximum
			// number of levels in the header
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
				return fCachedHeader.WriteBack(transaction);
			}
			return B_OK;
		}

		// if there is only one key left, we don't have to remove
		// it, we can just dump the node (index nodes still have
		// the overflow link, so we have to drop the last key)
		if (node->NumKeys() > 1
			|| !node->IsLeaf() && node->NumKeys() == 1) {
			RemoveKey(node, nodeAndKey.keyIndex);
			return cached.WriteBack(transaction);
		}

		// when we are here, we can just free the node, but
		// we have to update the right/left link of the
		// siblings first
		CachedNode otherCached(this);
		bplustree_node *other = otherCached.SetTo(node->LeftLink());
		if (other != NULL) {
			other->right_link = node->right_link;
			if (otherCached.WriteBack(transaction) < B_OK)
				return B_IO_ERROR;
		}

		if ((other = otherCached.SetTo(node->RightLink())) != NULL) {
			other->left_link = node->left_link;
			if (otherCached.WriteBack(transaction) < B_OK)
				return B_IO_ERROR;
		}

		cached.Free(transaction, nodeAndKey.nodeOffset);
	}
	RETURN_ERROR(B_ERROR);
}


/**	Replaces the value for the key in the tree.
 *	Returns B_OK if the key could be found and its value replaced,
 *	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
 *	to indicate that something went terribly wrong.
 *	Note that this doesn't work with duplicates - it will just
 *	return B_BAD_TYPE if you call this function on a tree where
 *	duplicates are allowed.
 */

status_t
BPlusTree::Replace(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	// lock access to stream (a read lock is okay for this purpose)
	ReadLocked locked(fStream->Lock());

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	bplustree_node *node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			if (status == B_OK) {
				node->Values()[keyIndex] = value;
				return cached.WriteBack(transaction);
			}

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


/**	Searches the key in the tree, and stores the offset found in
 *	_value, if successful.
 *	It's very similar to BPlusTree::SeekDown(), but doesn't fill
 *	a stack while it descends the tree.
 *	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
 *	if not. It can also return other errors to indicate that
 *	something went wrong.
 *	Note that this doesn't work with duplicates - it will just
 *	return B_BAD_TYPE if you call this function on a tree where
 *	duplicates are allowed.
 */

status_t
BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	// lock access to stream
	ReadLocked locked(fStream->Lock());

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	bplustree_node *node;

#ifdef DEBUG
	int32 levels = 0;
#endif

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);

#ifdef DEBUG
		levels++;
#endif
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			if (status == B_OK && _value != NULL)
				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);

#ifdef DEBUG
			if (levels != (int32)fHeader->MaxNumberOfLevels())
				DEBUGGER(("levels don't match"));
#endif
			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
	RETURN_ERROR(B_ERROR);
}


//	#pragma mark -


TreeIterator::TreeIterator(BPlusTree *tree)
	:
	fTree(tree),
	fCurrentNodeOffset(BPLUSTREE_NULL),
	fNext(NULL)
{
	tree->AddIterator(this);
}


TreeIterator::~TreeIterator()
{
	if (fTree)
		fTree->RemoveIterator(this);
}


status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fHeader == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	off_t nodeOffset = fTree->fHeader->RootNode();
	CachedNode cached(fTree);
	bplustree_node *node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}


/**	Iterates through the tree in the specified direction.
 *	When it iterates through duplicates, the "key" is only updated for the
 *	first entry - if you need to know when this happens, use the "duplicate"
 *	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
 *	the other duplicates.
 *	That's not too nice, but saves the 256 bytes that would be needed to
 *	store the last key - if this will ever become an issue, it will be
 *	easy to change.
 *	The other advantage of this is, that the queries can skip all duplicates
 *	at once when they are not relevant to them.
 */

status_t
TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength,
	off_t *value, uint16 *duplicate)
{
	if (fTree == NULL)
		return B_INTERRUPTED;
	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 
		RETURN_ERROR(B_ERROR);

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	CachedNode cached(fTree);
	bplustree_node *node;

	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented in
		// the same order; since they are all considered as equal, this shouldn't
		// cause any problems

		if (!fIsFragment || fDuplicate < fNumDuplicates)
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// if the node is out of duplicates, we go directly to the next one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode, false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		fDuplicateNode = BPLUSTREE_NULL;
	}

	off_t savedNodeOffset = fCurrentNodeOffset;
	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	fCurrentKey += direction;
	
	// is the current key in the current node?
	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->NumKeys())
			|| (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) {
		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->NumKeys() : -1;

			return B_ENTRY_NOT_FOUND;
		}
	}

	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?

	uint16 length;
	uint8 *keyStart = node->KeyAt(fCurrentKey, &length);
	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);
	
	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	{
		// terminate string type
		if (length == maxLength)
			length--;
		((char *)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// shouldn't happen, but we're dealing here with potentially corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}


/**	This is more or less a copy of BPlusTree::Find() - but it just
 *	sets the current position in the iterator, regardless of if the
 *	key could be found or not.
 */

status_t 
TreeIterator::Find(const uint8 *key, uint16 keyLength)
{
	if (fTree == NULL)
		return B_INTERRUPTED;
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	off_t nodeOffset = fTree->fHeader->RootNode();

	CachedNode cached(fTree);
	bplustree_node *node;
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = fTree->FindKey(node, key, keyLength, &keyIndex, &nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = keyIndex - 1;
			fDuplicateNode = BPLUSTREE_NULL;

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


void 
TreeIterator::SkipDuplicates()
{
	fDuplicateNode = BPLUSTREE_NULL;
}


void 
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
{
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// ToDo: duplicate handling!
}


void 
TreeIterator::Stop()
{
	fTree = NULL;
}


#ifdef DEBUG
void 
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif


//	#pragma mark -


void 
bplustree_node::Initialize()
{
	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	all_key_count = 0;
	all_key_length = 0;
}


uint8 *
bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
{
	if (index < 0 || index > NumKeys())
		return NULL;

	uint8 *keyStart = Keys();
	uint16 *keyLengths = KeyLengths();

	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
	if (index > 0)
		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);

	return keyStart;
}


uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t *)this)[fragment];
	}
	return OverflowLink();
}


off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
	else
		start = 2;

	return ((off_t *)this)[start + 1 + index];
}


/**	Although the name suggests it, this function doesn't return the real
 *	used fragment count; at least, it can only count to two: it returns
 *	0, if there is no fragment used, 1 if there is only one fragment
 *	used, and 2 if there are at least 2 fragments used.
 */

int32
bplustree_node::FragmentsUsed(uint32 nodeSize)
{
	uint32 used = 0;
	for (uint32 i = 0; i < nodeSize / ((NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); i++) {
		duplicate_array *array = FragmentAt(i);
		if (array->count > 0 && ++used > 1)
			return used;
	}
	return used;
}


#ifdef DEBUG
void 
bplustree_node::CheckIntegrity(uint32 nodeSize)
{
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8 *key = KeyAt(i, &length);
		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH)
			DEBUGGER(("invalid node: keys corrupted"));
	}
}
#endif


//	#pragma mark -


int32
compareKeys(type_code type, const void *key1, int keyLength1, const void *key2, int keyLength2)
{
	// if one of the keys is NULL, bail out gracefully
	if (key1 == NULL || key2 == NULL) {
//#if USER
//		// that's here to see if it's reasonable to handle this case nicely at all
//		DEBUGGER(("compareKeys() got NULL key!"));
//#endif
		// even if it's most probably a bug in the calling code, we try to
		// give a meaningful result
		if (key1 == NULL && key2 != NULL)
			return 1;
		else if (key2 == NULL && key1 != NULL)
			return -1;

		return 0;
	}

	switch (type) {
	    case B_STRING_TYPE:
    	{
			int len = min_c(keyLength1, keyLength2);
			int result = strncmp((const char *)key1, (const char *)key2, len);

			if (result == 0
				&& !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0'))
				result = keyLength1 - keyLength2;

			return result;
		}

		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return *(int32 *)key1 - *(int32 *)key2;

		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			if (*(uint32 *)key1 == *(uint32 *)key2)
				return 0;
			else if (*(uint32 *)key1 > *(uint32 *)key2)
				return 1;

			return -1;

		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			if (*(int64 *)key1 == *(int64 *)key2)
				return 0;
			else if (*(int64 *)key1 > *(int64 *)key2)
				return 1;

			return -1;

		case B_UINT64_TYPE:
			if (*(uint64 *)key1 == *(uint64 *)key2)
				return 0;
			else if (*(uint64 *)key1 > *(uint64 *)key2)
				return 1;

			return -1;

		case B_FLOAT_TYPE:
		{
			float result = *(float *)key1 - *(float *)key2;
			if (result == 0.0f)
				return 0;

			return (result < 0.0f) ? -1 : 1;
		}

		case B_DOUBLE_TYPE:
		{
			double result = *(double *)key1 - *(double *)key2;
			if (result == 0.0)
				return 0;

			return (result < 0.0) ? -1 : 1;
		}
	}

	// if the type is unknown, the entries don't match...
	return -1;
}