/* * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de. * This file may be used under the terms of the MIT License. * * Roughly based on 'btlib' written by Marcus J. Ranum - it shares * no code but achieves binary compatibility with the on disk format. */ //! B+Tree implementation #include "BPlusTree.h" #include #include "Debug.h" #include "Utility.h" #if !_BOOT_MODE # include "Inode.h" #else # include "Stream.h" // BFS::Stream from the bootloader has the same API as Inode. # define Inode BFS::Stream # define strerror(x) "error message unavailable" namespace BFS { #endif /*! Simple array used for the duplicate handling in the B+Tree. This is an on disk structure. */ struct duplicate_array { off_t count; off_t values[0]; inline bool IsEmpty() const { return count == 0; } inline int32 Count() const { return (int32)BFS_ENDIAN_TO_HOST_INT64(count); } inline off_t ValueAt(uint32 index) const { return BFS_ENDIAN_TO_HOST_INT64(values[index]); } inline void SetValueAt(uint32 index, off_t value) { values[index] = HOST_ENDIAN_TO_BFS_INT64(value); } inline int32 Find(off_t value) const { int32 i; return _FindInternal(value, i) ? i : -1; } void Insert(off_t value); bool Remove(off_t value); private: bool _FindInternal(off_t value, int32& index) const; } _PACKED; #ifdef DEBUG class NodeChecker { public: NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text) : fNode(node), fSize(nodeSize), fText(text) { Check("integrity check failed on construction."); } ~NodeChecker() { Check("integrity check failed on destruction."); } void Check(const char* message) { if (fNode->CheckIntegrity(fSize) != B_OK) { dprintf("%s: %s\n", fText, message); DEBUGGER(("NodeChecker integrity check failed!")); } } private: const bplustree_node* fNode; int32 fSize; const char* fText; }; #endif // DEBUG #if !_BOOT_MODE class BitmapArray { public: BitmapArray(size_t numBits); ~BitmapArray(); status_t InitCheck() const; bool IsSet(size_t index) const; void Set(size_t index, bool set); size_t CountSet() const { return fCountSet; } private: uint8* fBitmap; size_t fSize; size_t fCountSet; }; struct TreeCheck { TreeCheck(BPlusTree* tree) : fLevelCount(0), fFreeCount(0), fNodeSize(tree->NodeSize()), fMaxLevels(tree->fHeader.MaxNumberOfLevels()), fFoundErrors(0), fVisited(tree->Stream()->Size() / tree->NodeSize()), fVisitedFragment(tree->Stream()->Size() / tree->NodeSize()) { fPreviousOffsets = (off_t*)malloc( sizeof(off_t) * tree->fHeader.MaxNumberOfLevels()); if (fPreviousOffsets != NULL) { for (size_t i = 0; i < fMaxLevels; i++) fPreviousOffsets[i] = BPLUSTREE_NULL; } } ~TreeCheck() { free(fPreviousOffsets); } status_t InitCheck() const { if (fPreviousOffsets == NULL) return B_NO_MEMORY; status_t status = fVisited.InitCheck(); if (status != B_OK) return status; return fVisitedFragment.InitCheck(); } bool Visited(off_t offset) const { return fVisited.IsSet(offset / fNodeSize); } void SetVisited(off_t offset) { fVisited.Set(offset / fNodeSize, true); } size_t VisitedCount() const { return fVisited.CountSet(); } bool VisitedFragment(off_t offset) const { return fVisitedFragment.IsSet(offset / fNodeSize); } void SetVisitedFragment(off_t offset) { fVisitedFragment.Set(offset / fNodeSize, true); } uint32 MaxLevels() const { return fLevelCount; } void SetLevel(uint32 level) { if (fLevelCount < level) fLevelCount = level; } off_t PreviousOffset(uint32 level) { return fPreviousOffsets[level]; } void SetPreviousOffset(uint32 level, off_t offset) { fPreviousOffsets[level] = offset; } void FoundError() { fFoundErrors++; } bool ErrorsFound() { return fFoundErrors != 0; } private: uint32 fLevelCount; uint32 fFreeCount; uint32 fNodeSize; uint32 fMaxLevels; uint32 fFoundErrors; BitmapArray fVisited; BitmapArray fVisitedFragment; off_t* fPreviousOffsets; }; // #pragma mark - // Node Caching for the BPlusTree class // // With write support, there is the need for a function that allocates new // nodes by either returning empty nodes, or by growing the file's data stream // // !! The CachedNode class assumes that you have properly locked the stream // !! before asking for nodes. // // Note: This code will fail if the block size is smaller than the node size! // Since BFS supports block sizes of 1024 bytes or greater, and the node size // is hard-coded to 1024 bytes, that's not an issue now. void CachedNode::UnsetUnchanged(Transaction& transaction) { if (fTree == NULL || fTree->fStream == NULL) return; if (fNode != NULL) { void* cache = fTree->fStream->GetVolume()->BlockCache(); block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID()); block_cache_put(cache, fBlockNumber); fNode = NULL; } } #endif // !_BOOT_MODE void CachedNode::Unset() { if (fTree == NULL || fTree->fStream == NULL) return; if (fNode != NULL) { #if !_BOOT_MODE if (fWritable && fOffset == 0) { // The B+tree header has been updated - we need to update the // BPlusTrees copy of it, as well. memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header)); } block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber); #endif // !_BOOT_MODE fNode = NULL; } } const bplustree_node* CachedNode::SetTo(off_t offset, bool check) { const bplustree_node* node; if (SetTo(offset, &node, check) == B_OK) return node; return NULL; } status_t CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check) { if (fTree == NULL || fTree->fStream == NULL) RETURN_ERROR(B_BAD_VALUE); Unset(); // You can only ask for nodes at valid positions - you can't // even access the b+tree header with this method (use SetToHeader() // instead) if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize || offset <= 0 || (offset % fTree->fNodeSize) != 0) { RETURN_ERROR(B_BAD_VALUE); } if (InternalSetTo(NULL, offset) != NULL && check) { // sanity checks (links, all_key_count) if (!fTree->fHeader.CheckNode(fNode)) { FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, fBlockNumber, fTree->fStream->ID())); return B_BAD_DATA; } } *_node = fNode; return B_OK; } #if !_BOOT_MODE bplustree_node* CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check) { if (fTree == NULL || fTree->fStream == NULL) { REPORT_ERROR(B_BAD_VALUE); return NULL; } Unset(); // You can only ask for nodes at valid positions - you can't // even access the b+tree header with this method (use SetToHeader() // instead) if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize || offset <= 0 || (offset % fTree->fNodeSize) != 0) return NULL; if (InternalSetTo(&transaction, offset) != NULL && check) { // sanity checks (links, all_key_count) if (!fTree->fHeader.CheckNode(fNode)) { FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, fBlockNumber, fTree->fStream->ID())); return NULL; } } return fNode; } bplustree_node* CachedNode::MakeWritable(Transaction& transaction) { if (fNode == NULL) return NULL; if (block_cache_make_writable(transaction.GetVolume()->BlockCache(), fBlockNumber, transaction.ID()) == B_OK) { return fNode; } return NULL; } #endif // !_BOOT_MODE const bplustree_header* CachedNode::SetToHeader() { if (fTree == NULL || fTree->fStream == NULL) { REPORT_ERROR(B_BAD_VALUE); return NULL; } Unset(); InternalSetTo(NULL, 0LL); return (bplustree_header*)fNode; } #if !_BOOT_MODE bplustree_header* CachedNode::SetToWritableHeader(Transaction& transaction) { if (fTree == NULL || fTree->fStream == NULL) { REPORT_ERROR(B_BAD_VALUE); return NULL; } Unset(); InternalSetTo(&transaction, 0LL); if (fNode != NULL && !fTree->fInTransaction) { transaction.AddListener(fTree); fTree->fInTransaction = true; if (!transaction.GetVolume()->IsInitializing()) { acquire_vnode(transaction.GetVolume()->FSVolume(), fTree->fStream->ID()); } } return (bplustree_header*)fNode; } #endif // !_BOOT_MODE bplustree_node* CachedNode::InternalSetTo(Transaction* transaction, off_t offset) { fNode = NULL; fOffset = offset; off_t fileOffset; block_run run; if (offset < fTree->fStream->Size() && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) { #if !_BOOT_MODE Volume* volume = fTree->fStream->GetVolume(); #else Volume* volume = &fTree->fStream->GetVolume(); #endif int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); fBlockNumber = volume->ToBlock(run) + blockOffset; uint8* block = NULL; #if !_BOOT_MODE if (transaction != NULL) { block = (uint8*)block_cache_get_writable(volume->BlockCache(), fBlockNumber, transaction->ID()); fWritable = true; } else { block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber); fWritable = false; } #else // !_BOOT_MODE if (fBlock == NULL) { fBlock = (uint8*)malloc(volume->BlockSize()); if (fBlock == NULL) return NULL; } if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(), fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) { block = fBlock; } fWritable = false; #endif // _BOOT_MODE if (block != NULL) { // The node is somewhere in that block... // (confusing offset calculation) fNode = (bplustree_node*)(block + offset - (fileOffset + (blockOffset << volume->BlockShift()))); } else REPORT_ERROR(B_IO_ERROR); } return fNode; } #if !_BOOT_MODE status_t CachedNode::Free(Transaction& transaction, off_t offset) { if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL) RETURN_ERROR(B_BAD_VALUE); // TODO: scan the free nodes list and remove all nodes at the end // of the tree - perhaps that shouldn't be done everytime that // function is called, perhaps it should be done when the directory // inode is closed or based on some calculation or whatever... CachedNode cached(fTree); bplustree_header* header = cached.SetToWritableHeader(transaction); if (header == NULL) return B_IO_ERROR; #if 0 // TODO: temporarily disabled because CheckNode() doesn't like this... // Also, it's such an edge case that it's almost useless, anyway. // if the node is the last one in the tree, we shrink // the tree and file size by one node off_t lastOffset = header->MaximumSize() - fTree->fNodeSize; if (offset == lastOffset) { status_t status = fTree->fStream->SetFileSize(transaction, lastOffset); if (status != B_OK) return status; header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset); return B_OK; } #endif // add the node to the free nodes list fNode->left_link = header->free_node_pointer; fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset); return B_OK; } status_t CachedNode::Allocate(Transaction& transaction, bplustree_node** _node, off_t* _offset) { if (fTree == NULL || fTree->fStream == NULL) RETURN_ERROR(B_BAD_VALUE); Unset(); bplustree_header* header; status_t status; // if there are any free nodes, recycle them if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) { CachedNode cached(fTree); header = cached.SetToWritableHeader(transaction); if (header == NULL) return B_IO_ERROR; *_offset = header->FreeNode(); *_node = fNode; // set new free node pointer header->free_node_pointer = fNode->left_link; fNode->Initialize(); return B_OK; } // allocate space for a new node Inode* stream = fTree->fStream; if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK) return status; CachedNode cached(fTree); header = cached.SetToWritableHeader(transaction); if (header == NULL) return B_IO_ERROR; // the maximum_size has to be changed before the call to SetTo() - or // else it will fail because the requested node is out of bounds off_t offset = header->MaximumSize(); header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() + fTree->fNodeSize); cached.Unset(); // SetToWritable() below needs the new values in the tree's header if (SetToWritable(transaction, offset, false) == NULL) RETURN_ERROR(B_ERROR); fNode->Initialize(); *_offset = offset; *_node = fNode; return B_OK; } // #pragma mark - BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize) : fStream(NULL), fInTransaction(false) { mutex_init(&fIteratorLock, "bfs b+tree iterator"); SetTo(transaction, stream); } #endif // !_BOOT_MODE BPlusTree::BPlusTree(Inode* stream) : fStream(NULL), fInTransaction(false) { #if !_BOOT_MODE mutex_init(&fIteratorLock, "bfs b+tree iterator"); #endif SetTo(stream); } BPlusTree::BPlusTree() : fStream(NULL), fNodeSize(BPLUSTREE_NODE_SIZE), fAllowDuplicates(true), fInTransaction(false), fStatus(B_NO_INIT) { #if !_BOOT_MODE mutex_init(&fIteratorLock, "bfs b+tree iterator"); #endif } BPlusTree::~BPlusTree() { #if !_BOOT_MODE // if there are any TreeIterators left, we need to stop them // (can happen when the tree's inode gets deleted while // traversing the tree - a TreeIterator doesn't lock the inode) mutex_lock(&fIteratorLock); SinglyLinkedList::Iterator iterator = fIterators.GetIterator(); while (iterator.HasNext()) iterator.Next()->Stop(); mutex_destroy(&fIteratorLock); ASSERT(!fInTransaction); #endif // !_BOOT_MODE } #if !_BOOT_MODE /*! Create a new B+Tree on the specified stream */ status_t BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize) { // initializes in-memory B+Tree fStream = stream; CachedNode cached(this); bplustree_header* header = cached.SetToWritableHeader(transaction); if (header == NULL) { // allocate space for new header + node! fStatus = stream->SetFileSize(transaction, nodeSize * 2); if (fStatus != B_OK) RETURN_ERROR(fStatus); header = cached.SetToWritableHeader(transaction); if (header == NULL) RETURN_ERROR(fStatus = B_ERROR); } fAllowDuplicates = stream->IsIndex() || (stream->Mode() & S_ALLOW_DUPS) != 0; fNodeSize = nodeSize; // initialize b+tree header header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC); header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize); header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode())); header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize); header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2); cached.Unset(); // initialize b+tree root node cached.SetToWritable(transaction, fHeader.RootNode(), false); if (cached.Node() == NULL) RETURN_ERROR(B_IO_ERROR); cached.Node()->Initialize(); return fStatus = B_OK; } #endif // !_BOOT_MODE status_t BPlusTree::SetTo(Inode* stream) { if (stream == NULL) RETURN_ERROR(fStatus = B_BAD_VALUE); fStream = stream; // get on-disk B+Tree header CachedNode cached(this); const bplustree_header* header = cached.SetToHeader(); if (header != NULL) memcpy(&fHeader, header, sizeof(bplustree_header)); else RETURN_ERROR(fStatus = B_IO_ERROR); // is header valid? if (fHeader.MaximumSize() != stream->Size()) { dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %" B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size()); // we can't change the header since we don't have a transaction //fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size()); } if (!fHeader.IsValid()) { #ifdef DEBUG dump_bplustree_header(&fHeader); dump_block((const char*)&fHeader, 128); #endif RETURN_ERROR(fStatus = B_BAD_DATA); } fNodeSize = fHeader.NodeSize(); // validity check static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX}; uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX); if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE || ((stream->Mode() & S_INDEX_DIR) != 0 && kToMode[fHeader.DataType()] != mode) || !stream->IsContainer()) { D( dump_bplustree_header(&fHeader); dump_inode(&stream->Node()); ); RETURN_ERROR(fStatus = B_BAD_TYPE); } // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused // in the original BFS code - we will honour it nevertheless fAllowDuplicates = stream->IsIndex() || (stream->Mode() & S_ALLOW_DUPS) != 0; cached.SetTo(fHeader.RootNode()); RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA); } status_t BPlusTree::InitCheck() { return fStatus; } #if !_BOOT_MODE status_t BPlusTree::Validate(bool repair, bool& _errorsFound) { TreeCheck check(this); if (check.InitCheck() != B_OK) return B_NO_MEMORY; check.SetVisited(0); // Walk the free nodes CachedNode cached(this); off_t freeOffset = fHeader.FreeNode(); while (freeOffset > 0) { const bplustree_node* node; status_t status = cached.SetTo(freeOffset, &node, false); if (status != B_OK) { if (status == B_IO_ERROR) return B_IO_ERROR; dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could " "not be read: %s\n", fStream->ID(), freeOffset, strerror(status)); check.FoundError(); break; } if (check.Visited(freeOffset)) { dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " circular!\n", fStream->ID(), freeOffset); // TODO: if 'repair' is true, we could collect all unvisited nodes // at the end, and put the into the free list check.FoundError(); break; } check.SetVisited(freeOffset); if (node->OverflowLink() != BPLUSTREE_FREE) { dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " misses free mark!\n", fStream->ID(), freeOffset); } freeOffset = node->LeftLink(); } // Iterate over the complete tree recursively const bplustree_node* root; status_t status = cached.SetTo(fHeader.RootNode(), &root, true); if (status != B_OK) return status; status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root); if (check.ErrorsFound()) _errorsFound = true; if (status != B_OK) return status; if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) { dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, " "declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(), fHeader.MaxNumberOfLevels()); } if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) { dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF " nodes.\n", fStream->ID(), check.VisitedCount(), fHeader.MaximumSize() / fNodeSize); } return B_OK; } status_t BPlusTree::MakeEmpty() { // Put all nodes into the free list in order Transaction transaction(fStream->GetVolume(), fStream->BlockNumber()); // Reset the header, and root node CachedNode cached(this); bplustree_header* header = cached.SetToWritableHeader(transaction); if (header == NULL) return B_IO_ERROR; header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize()); if (fStream->Size() > (off_t)NodeSize() * 2) header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize()); else { header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); } bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false); if (node == NULL) return B_IO_ERROR; node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); node->all_key_count = 0; node->all_key_length = 0; for (off_t offset = 2 * NodeSize(); offset < fStream->Size(); offset += NodeSize()) { bplustree_node* node = cached.SetToWritable(transaction, offset, false); if (node == NULL) { dprintf("--> could not open %" B_PRIdOFF "\n", offset); return B_IO_ERROR; } if (offset < fStream->Size() - (off_t)NodeSize()) node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize()); else node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); // It's not important to write it out in a single transaction; just // make sure it doesn't get too large if (offset % (1024 * 1024) == 0) { transaction.Done(); transaction.Start(fStream->GetVolume(), fStream->BlockNumber()); } } return transaction.Done(); } int32 BPlusTree::TypeCodeToKeyType(type_code code) { switch (code) { case B_STRING_TYPE: return BPLUSTREE_STRING_TYPE; case B_SSIZE_T_TYPE: case B_INT32_TYPE: return BPLUSTREE_INT32_TYPE; case B_SIZE_T_TYPE: case B_UINT32_TYPE: return BPLUSTREE_UINT32_TYPE; case B_OFF_T_TYPE: case B_INT64_TYPE: return BPLUSTREE_INT64_TYPE; case B_UINT64_TYPE: return BPLUSTREE_UINT64_TYPE; case B_FLOAT_TYPE: return BPLUSTREE_FLOAT_TYPE; case B_DOUBLE_TYPE: return BPLUSTREE_DOUBLE_TYPE; } return -1; } int32 BPlusTree::ModeToKeyType(mode_t mode) { switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { case S_INT_INDEX: return BPLUSTREE_INT32_TYPE; case S_UINT_INDEX: return BPLUSTREE_UINT32_TYPE; case S_LONG_LONG_INDEX: return BPLUSTREE_INT64_TYPE; case S_ULONG_LONG_INDEX: return BPLUSTREE_UINT64_TYPE; case S_FLOAT_INDEX: return BPLUSTREE_FLOAT_TYPE; case S_DOUBLE_INDEX: return BPLUSTREE_DOUBLE_TYPE; case S_STR_INDEX: default: // default is for standard directories return BPLUSTREE_STRING_TYPE; } } #endif // !_BOOT_MODE // #pragma mark - TransactionListener implementation #if !_BOOT_MODE void BPlusTree::TransactionDone(bool success) { if (!success) { // update header from disk CachedNode cached(this); const bplustree_header* header = cached.SetToHeader(); if (header != NULL) memcpy(&fHeader, header, sizeof(bplustree_header)); } } void BPlusTree::RemovedFromTransaction() { fInTransaction = false; if (!fStream->GetVolume()->IsInitializing()) put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID()); } // #pragma mark - void BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change) { // Although every iterator which is affected by this update currently // waits on a semaphore, other iterators could be added/removed at // any time, so we need to protect this loop MutexLocker _(fIteratorLock); SinglyLinkedList::Iterator iterator = fIterators.GetIterator(); while (iterator.HasNext()) iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change); } void BPlusTree::_AddIterator(TreeIterator* iterator) { MutexLocker _(fIteratorLock); fIterators.Add(iterator); } void BPlusTree::_RemoveIterator(TreeIterator* iterator) { MutexLocker _(fIteratorLock); fIterators.Remove(iterator); } #endif // !_BOOT_MODE int32 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2, int keyLength2) { type_code type = 0; switch (fHeader.DataType()) { case BPLUSTREE_STRING_TYPE: type = B_STRING_TYPE; break; case BPLUSTREE_INT32_TYPE: type = B_INT32_TYPE; break; case BPLUSTREE_UINT32_TYPE: type = B_UINT32_TYPE; break; case BPLUSTREE_INT64_TYPE: type = B_INT64_TYPE; break; case BPLUSTREE_UINT64_TYPE: type = B_UINT64_TYPE; break; case BPLUSTREE_FLOAT_TYPE: type = B_FLOAT_TYPE; break; case BPLUSTREE_DOUBLE_TYPE: type = B_DOUBLE_TYPE; break; } return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2); } status_t BPlusTree::_FindKey(const bplustree_node* node, const uint8* key, uint16 keyLength, uint16* _index, off_t* _next) { #ifdef DEBUG NodeChecker checker(node, fNodeSize, "find"); #endif if (node->all_key_count == 0) { if (_index) *_index = 0; if (_next) *_next = node->OverflowLink(); return B_ENTRY_NOT_FOUND; } Unaligned* values = node->Values(); int16 saveIndex = -1; // binary search in the key array for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { uint16 i = (first + last) >> 1; uint16 searchLength = 0; uint8* searchKey = node->KeyAt(i, &searchLength); if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { #if !_BOOT_MODE fStream->GetVolume()->Panic(); #endif RETURN_ERROR(B_BAD_DATA); } int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength); if (cmp < 0) { last = i - 1; saveIndex = i; } else if (cmp > 0) { saveIndex = first = i + 1; } else { if (_index) *_index = i; if (_next) *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]); return B_OK; } } if (_index) *_index = saveIndex; if (_next) { if (saveIndex == node->NumKeys()) *_next = node->OverflowLink(); else *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); } return B_ENTRY_NOT_FOUND; } #if !_BOOT_MODE /*! Prepares the stack to contain all nodes that were passed while following the key, from the root node to the leaf node that could or should contain that key. */ status_t BPlusTree::_SeekDown(Stack& stack, const uint8* key, uint16 keyLength) { // set the root node to begin with node_and_key nodeAndKey; nodeAndKey.nodeOffset = fHeader.RootNode(); CachedNode cached(this); const bplustree_node* node; while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { // if we are already on leaf level, we're done if (node->OverflowLink() == BPLUSTREE_NULL) { // node that the keyIndex is not properly set here (but it's not // needed in the calling functions anyway)! nodeAndKey.keyIndex = 0; stack.Push(nodeAndKey); return B_OK; } off_t nextOffset; status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset); if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) RETURN_ERROR(B_ERROR); if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) { dprintf("BPlusTree::_SeekDown() node walked too deep.\n"); break; } // put the node offset & the correct keyIndex on the stack stack.Push(nodeAndKey); nodeAndKey.nodeOffset = nextOffset; } FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %" B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID())); return B_ERROR; } /*! This will find a free duplicate fragment in the given bplustree_node. The CachedNode will be set to the writable fragment on success. */ status_t BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction, const bplustree_node* node, CachedNode& cached, off_t* _offset, bplustree_node** _fragment, uint32* _index) { Unaligned* values = node->Values(); for (int32 i = 0; i < node->NumKeys(); i++) { off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]); // does the value link to a duplicate fragment? if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT) continue; const bplustree_node* fragment = cached.SetTo( bplustree_node::FragmentOffset(value), false); if (fragment == NULL) { FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %" B_PRIdOFF "\n", value, fStream->ID())); continue; } // see if there is some space left for us uint32 num = bplustree_node::MaxFragments(fNodeSize); for (uint32 j = 0; j < num; j++) { duplicate_array* array = fragment->FragmentAt(j); if (array->IsEmpty()) { // found an unused fragment *_fragment = cached.MakeWritable(transaction); if (*_fragment == NULL) return B_IO_ERROR; *_offset = bplustree_node::FragmentOffset(value); *_index = j; return B_OK; } } } return B_ENTRY_NOT_FOUND; } status_t BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached, const bplustree_node* node, uint16 index, off_t value) { CachedNode cachedDuplicate(this); Unaligned* values = node->Values(); off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); status_t status; off_t offset; if (bplustree_node::IsDuplicate(oldValue)) { // If it's a duplicate fragment, try to insert it into that, or if it // doesn't fit anymore, create a new duplicate node if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { bplustree_node* duplicate = cachedDuplicate.SetToWritable( transaction, bplustree_node::FragmentOffset(oldValue), false); if (duplicate == NULL) return B_IO_ERROR; duplicate_array* array = duplicate->FragmentAt( bplustree_node::FragmentIndex(oldValue)); int32 arrayCount = array->Count(); if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) { FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment " "%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", (int)bplustree_node::FragmentIndex(oldValue), bplustree_node::FragmentOffset(oldValue), arrayCount, fStream->ID())); return B_BAD_DATA; } if (arrayCount < NUM_FRAGMENT_VALUES) { array->Insert(value); } else { // Test if the fragment will be empty if we remove this key's // values if (duplicate->FragmentsUsed(fNodeSize) < 2) { // The node will be empty without our values, so let us // reuse it as a duplicate node offset = bplustree_node::FragmentOffset(oldValue); memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); array = duplicate->DuplicateArray(); array->Insert(value); } else { // Create a new duplicate node cachedDuplicate.UnsetUnchanged(transaction); // The old duplicate has not been touched, so we can // reuse it bplustree_node* newDuplicate; status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset); if (status != B_OK) RETURN_ERROR(status); // Copy the array from the fragment node to the duplicate // node and free the old entry (by zero'ing all values) newDuplicate->overflow_link = array->count; memcpy(&newDuplicate->all_key_count, &array->values[0], array->Count() * sizeof(off_t)); memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); array = newDuplicate->DuplicateArray(); array->Insert(value); } // Update the main pointer to link to a duplicate node if (cached.MakeWritable(transaction) == NULL) return B_IO_ERROR; values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( BPLUSTREE_DUPLICATE_NODE, offset)); } return B_OK; } // Put the value into a dedicated duplicate node // search for free space in the duplicate nodes of that key duplicate_array* array; int32 arrayCount; const bplustree_node* duplicate; off_t duplicateOffset; do { duplicateOffset = bplustree_node::FragmentOffset(oldValue); duplicate = cachedDuplicate.SetTo(duplicateOffset, false); if (duplicate == NULL) return B_IO_ERROR; array = duplicate->DuplicateArray(); arrayCount =array->Count(); if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { FATAL(("_InsertDuplicate: Invalid array size in duplicate %" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", duplicateOffset, arrayCount, fStream->ID())); return B_BAD_DATA; } } while (arrayCount >= NUM_DUPLICATE_VALUES && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); bplustree_node* writableDuplicate = cachedDuplicate.MakeWritable(transaction); if (writableDuplicate == NULL) return B_IO_ERROR; if (arrayCount < NUM_DUPLICATE_VALUES) { array = writableDuplicate->DuplicateArray(); array->Insert(value); } else { // no space left - add a new duplicate node bplustree_node* newDuplicate; status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset); if (status != B_OK) RETURN_ERROR(status); // link the two nodes together writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); array = newDuplicate->DuplicateArray(); array->count = 0; array->Insert(value); } return B_OK; } // Search for a free duplicate fragment or create a new one // to insert the duplicate value into uint32 fragmentIndex = 0; bplustree_node* fragment; if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate, &offset, &fragment, &fragmentIndex) != B_OK) { // allocate a new duplicate fragment node status = cachedDuplicate.Allocate(transaction, &fragment, &offset); if (status != B_OK) RETURN_ERROR(status); memset(fragment, 0, fNodeSize); } duplicate_array* array = fragment->FragmentAt(fragmentIndex); array->Insert(oldValue); array->Insert(value); if (cached.MakeWritable(transaction) == NULL) return B_IO_ERROR; values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex)); return B_OK; } void BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key, uint16 keyLength, off_t value) { // should never happen, but who knows? if (index > node->NumKeys()) return; Unaligned* values = node->Values(); Unaligned* keyLengths = node->KeyLengths(); uint8* keys = node->Keys(); node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength); Unaligned* newValues = node->Values(); Unaligned* newKeyLengths = node->KeyLengths(); // move values and copy new value into them memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index)); memmove(newValues, values, sizeof(off_t) * index); newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value); // move and update key length index for (uint16 i = node->NumKeys(); i-- > index + 1;) { newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength); } memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); int32 keyStart; newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength + (keyStart = index > 0 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0)); // move keys and copy new key into them uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]); int32 size = node->AllKeyLength() - length; if (size > 0) memmove(keys + length, keys + length - keyLength, size); memcpy(keys + keyStart, key, keyLength); } /*! Splits the \a node into two halves - the other half will be put into \a other. It also takes care to create a new overflow link if the node to split is an index node. */ status_t BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset, bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key, uint16* _keyLength, off_t* _value) { if (*_keyIndex > node->NumKeys() + 1) return B_BAD_VALUE; Unaligned* inKeyLengths = node->KeyLengths(); Unaligned* inKeyValues = node->Values(); uint8* inKeys = node->Keys(); uint8* outKeys = other->Keys(); int32 keyIndex = *_keyIndex; // can become less than zero! if (keyIndex > node->NumKeys()) { FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF "\n", (int)keyIndex, node->NumKeys(), fStream->ID())); return B_BAD_VALUE; } // How many keys will fit in one (half) page? // The following loop will find the answer to this question and // change the key lengths indices for their new home // "bytes" is the number of bytes written for the new key, // "bytesBefore" are the bytes before that key // "bytesAfter" are the bytes after the new key, if any int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; size_t size = fNodeSize >> 1; int32 out, in; size_t keyLengths = 0; for (in = out = 0; in < node->NumKeys() + 1;) { keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]); if (in == keyIndex && !bytes) { bytes = *_keyLength; bytesBefore = in > 0 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; } else { if (keyIndex < out) bytesAfter = keyLengths - bytesBefore; in++; } out++; if (key_align(sizeof(bplustree_node) + bytes + keyLengths) + out * (sizeof(uint16) + sizeof(off_t)) >= size) { // we have found the number of keys in the new node! break; } } // if the new key was not inserted, set the length of the keys // that can be copied directly if (keyIndex >= out && in > 0) bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); else if (keyIndex + 1 < out) bytesAfter = keyLengths - bytesBefore; if (bytesBefore < 0 || bytesAfter < 0) return B_BAD_DATA; other->left_link = node->left_link; other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter); other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); Unaligned* outKeyLengths = other->KeyLengths(); Unaligned* outKeyValues = other->Values(); int32 keys = out > keyIndex ? keyIndex : out; if (bytesBefore) { // copy the keys memcpy(outKeys, inKeys, bytesBefore); memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); } if (bytes) { // copy the newly inserted key memcpy(outKeys + bytesBefore, key, bytes); outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); if (bytesAfter) { // copy the keys after the new key memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter); keys = out - keyIndex - 1; for (int32 i = 0;i < keys;i++) { outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes); } memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t)); } } // if the new key was already inserted, we shouldn't use it again if (in != out) keyIndex--; int32 total = bytesBefore + bytesAfter; // these variables are for the key that will be returned // to the parent node uint8* newKey = NULL; uint16 newLength; bool newAllocated = false; // If we have split an index node, we have to drop the first key // of the next node (which can also be the new key to insert). // The dropped key is also the one which has to be inserted in // the parent node, so we will set the "newKey" already here. if (node->OverflowLink() != BPLUSTREE_NULL) { if (in == keyIndex) { newKey = key; newLength = *_keyLength; other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); keyIndex--; } else { // If a key is dropped (is not the new key), we have to copy // it, because it would be lost if not. uint8* droppedKey = node->KeyAt(in, &newLength); if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize || newLength > BPLUSTREE_MAX_KEY_LENGTH) { fStream->GetVolume()->Panic(); RETURN_ERROR(B_BAD_DATA); } newKey = (uint8*)malloc(newLength); if (newKey == NULL) return B_NO_MEMORY; newAllocated = true; memcpy(newKey, droppedKey, newLength); other->overflow_link = inKeyValues[in]; total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); } } // and now the same game for the other page and the rest of the keys // (but with memmove() instead of memcpy(), because they may overlap) bytesBefore = bytesAfter = bytes = 0; out = 0; int32 skip = in; while (in < node->NumKeys() + 1) { if (in == keyIndex && !bytes) { // it's enough to set bytesBefore once here, because we do // not need to know the exact length of all keys in this // loop bytesBefore = in > skip ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; bytes = *_keyLength; out++; } else { if (in < node->NumKeys()) { inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); if (bytes) { inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - bytesBefore - bytes; } out++; } in++; } } // adjust the byte counts (since we were a bit lazy in the loop) if (keyIndex < skip) bytesBefore = node->AllKeyLength() - total; if (bytesBefore < 0 || bytesAfter < 0) { if (newAllocated) free(newKey); return B_BAD_DATA; } node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); // right link, and overflow link can stay the same node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter); node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); // array positions have changed outKeyLengths = node->KeyLengths(); outKeyValues = node->Values(); // move the keys in the old node: the order is important here, // because we don't want to overwrite any contents keys = keyIndex <= skip ? out : keyIndex - skip; keyIndex -= skip; in = out - keyIndex - 1; // Note: keyIndex and in will contain invalid values when the new key // went to the other node. But in this case bytes and bytesAfter are // 0 and subsequently we never use keyIndex and in. if (bytesBefore) memmove(inKeys, inKeys + total, bytesBefore); if (bytesAfter) { memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter); } if (bytesBefore) memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); if (bytesAfter) { // if byteAfter is > 0, keyIndex is larger than skip memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16)); } if (bytesBefore) memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); if (bytesAfter) { memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t)); } if (bytes) { // finally, copy the newly inserted key (don't overwrite anything) memcpy(inKeys + bytesBefore, key, bytes); outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); } // Prepare the key that will be inserted in the parent node which // is either the dropped key or the last of the other node. // If it's the dropped key, "newKey" was already set earlier. if (newKey == NULL) newKey = other->KeyAt(other->NumKeys() - 1, &newLength); memcpy(key, newKey, newLength); *_keyLength = newLength; *_value = otherOffset; if (newAllocated) free(newKey); return B_OK; } /*! This inserts a key into the tree. The changes made to the tree will all be part of the \a transaction. You need to have the inode write locked. */ status_t BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength, off_t value) { if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) RETURN_ERROR(B_BAD_VALUE); #ifdef DEBUG if (value < 0) panic("tried to insert invalid value %" B_PRId64 "!\n", value); #endif ASSERT_WRITE_LOCKED_INODE(fStream); Stack stack; if (_SeekDown(stack, key, keyLength) != B_OK) RETURN_ERROR(B_ERROR); uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH]; memcpy(keyBuffer, key, keyLength); node_and_key nodeAndKey; const bplustree_node* node; CachedNode cached(this); while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { #ifdef DEBUG NodeChecker checker(node, fNodeSize, "insert"); #endif if (node->IsLeaf()) { // first round, check for duplicate entries status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex); // is this a duplicate entry? if (status == B_OK) { if (fAllowDuplicates) { status = _InsertDuplicate(transaction, cached, node, nodeAndKey.keyIndex, value); if (status != B_OK) RETURN_ERROR(status); return B_OK; } return B_NAME_IN_USE; } } bplustree_node* writableNode = cached.MakeWritable(transaction); if (writableNode == NULL) return B_IO_ERROR; // is the node big enough to hold the pair? if (int32(key_align(sizeof(bplustree_node) + writableNode->AllKeyLength() + keyLength) + (writableNode->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize) { _InsertKey(writableNode, nodeAndKey.keyIndex, keyBuffer, keyLength, value); _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1); return B_OK; } else { CachedNode cachedNewRoot(this); CachedNode cachedOther(this); // do we need to allocate a new root node? if so, then do // it now off_t newRoot = BPLUSTREE_NULL; if (nodeAndKey.nodeOffset == fHeader.RootNode()) { bplustree_node* root; status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot); if (status != B_OK) { // The tree is most likely corrupted! // But it's still sane at leaf level - we could set // a flag in the header that forces the tree to be // rebuild next time... // But since we will have journaling, that's not a big // problem anyway. RETURN_ERROR(status); } } // reserve space for the other node bplustree_node* other; off_t otherOffset; status_t status = cachedOther.Allocate(transaction, &other, &otherOffset); if (status != B_OK) { cachedNewRoot.Free(transaction, newRoot); RETURN_ERROR(status); } if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other, otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) != B_OK) { // free root node & other node here cachedOther.Free(transaction, otherOffset); cachedNewRoot.Free(transaction, newRoot); RETURN_ERROR(B_ERROR); } #ifdef DEBUG checker.Check("insert split"); NodeChecker otherChecker(other, fNodeSize, "insert split other"); #endif _UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex, writableNode->NumKeys(), 1); // update the right link of the node in the left of the new node if ((other = cachedOther.SetToWritable(transaction, other->LeftLink())) != NULL) { other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); } // create a new root if necessary if (newRoot != BPLUSTREE_NULL) { bplustree_node* root = cachedNewRoot.Node(); _InsertKey(root, 0, keyBuffer, keyLength, writableNode->LeftLink()); root->overflow_link = HOST_ENDIAN_TO_BFS_INT64( nodeAndKey.nodeOffset); CachedNode cached(this); bplustree_header* header = cached.SetToWritableHeader(transaction); if (header == NULL) return B_IO_ERROR; // finally, update header to point to the new root header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32( header->MaxNumberOfLevels() + 1); return B_OK; } } } RETURN_ERROR(B_ERROR); } /*! Removes the duplicate index/value pair from the tree. It's part of the private tree interface. */ status_t BPlusTree::_RemoveDuplicate(Transaction& transaction, const bplustree_node* node, CachedNode& cached, uint16 index, off_t value) { Unaligned* values = node->Values(); off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); CachedNode cachedDuplicate(this); off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false); if (duplicate == NULL) return B_IO_ERROR; // if it's a duplicate fragment, remove the entry from there if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { duplicate_array* array = duplicate->FragmentAt( bplustree_node::FragmentIndex(oldValue)); int32 arrayCount = array->Count(); if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) { FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, arrayCount, fStream->ID())); return B_BAD_DATA; } if (!array->Remove(value)) { FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node " "%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value, duplicateOffset, fStream->ID())); return B_ENTRY_NOT_FOUND; } // remove the array from the fragment node if it is empty if (--arrayCount == 1) { // set the link to the remaining value if (cached.MakeWritable(transaction) == NULL) return B_IO_ERROR; values[index] = array->values[0]; // Remove the whole fragment node, if this was the only array, // otherwise free just the array if (duplicate->FragmentsUsed(fNodeSize) == 1) { status_t status = cachedDuplicate.Free(transaction, duplicateOffset); if (status != B_OK) return status; } else array->count = 0; } return B_OK; } // Remove value from a duplicate node! duplicate_array* array = NULL; int32 arrayCount = 0; if (duplicate->LeftLink() != BPLUSTREE_NULL) { FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF ", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID())); return B_BAD_DATA; } // Search the duplicate nodes until the entry could be found (and removed) while (duplicate != NULL) { array = duplicate->DuplicateArray(); arrayCount = array->Count(); if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { FATAL(("_RemoveDuplicate: Invalid array size in duplicate %" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", duplicateOffset, arrayCount, fStream->ID())); return B_BAD_DATA; } if (array->Remove(value)) { arrayCount--; break; } if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) RETURN_ERROR(B_ENTRY_NOT_FOUND); cachedDuplicate.UnsetUnchanged(transaction); duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false); } if (duplicate == NULL) RETURN_ERROR(B_IO_ERROR); // The entry got removed from the duplicate node, but we might want to free // it now in case it's empty while (true) { off_t left = duplicate->LeftLink(); off_t right = duplicate->RightLink(); bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; if ((isLast && arrayCount == 1) || arrayCount == 0) { // Free empty duplicate page, link their siblings together, and // update the duplicate link if needed (ie. when we either remove // the last duplicate node or have a new first one) if (left == BPLUSTREE_NULL) { // the duplicate link points to us if (cached.MakeWritable(transaction) == NULL) return B_IO_ERROR; if (arrayCount == 1) { // This is the last node, and there is only one value left; // replace the duplicate link with that value, it's no // duplicate anymore values[index] = array->values[0]; } else { // Move the duplicate link to the next node values[index] = HOST_ENDIAN_TO_BFS_INT64( bplustree_node::MakeLink( BPLUSTREE_DUPLICATE_NODE, right)); } } status_t status = cachedDuplicate.Free(transaction, duplicateOffset); if (status != B_OK) return status; if (left != BPLUSTREE_NULL && (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) { duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); // If the next node is the last node, we need to free that node // and convert the duplicate entry back into a normal entry array = duplicate->DuplicateArray(); arrayCount = array->Count(); if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL && arrayCount <= NUM_FRAGMENT_VALUES) { duplicateOffset = left; continue; } } if (right != BPLUSTREE_NULL && (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) { duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); // Again, we may need to turn the duplicate entry back into a // normal entry array = duplicate->DuplicateArray(); arrayCount = array->Count(); if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL && arrayCount <= NUM_FRAGMENT_VALUES) { duplicateOffset = right; continue; } } return B_OK; } if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) { // If the number of entries fits in a duplicate fragment, then // either find a free fragment node, or convert this node to a // fragment node. CachedNode cachedOther(this); bplustree_node* fragment = NULL; uint32 fragmentIndex = 0; off_t offset; if (_FindFreeDuplicateFragment(transaction, node, cachedOther, &offset, &fragment, &fragmentIndex) == B_OK) { // move to other node duplicate_array* target = fragment->FragmentAt(fragmentIndex); memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); cachedDuplicate.Free(transaction, duplicateOffset); duplicateOffset = offset; } else { // convert node memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0, fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); } if (cached.MakeWritable(transaction) == NULL) return B_IO_ERROR; values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); } return B_OK; } } /*! Removes the key with the given index from the specified node. Since it has to get the key from the node anyway (to obtain it's pointer), it's not needed to pass the key & its length, although the calling method (BPlusTree::Remove()) have this data. */ void BPlusTree::_RemoveKey(bplustree_node* node, uint16 index) { // should never happen, but who knows? if (index > node->NumKeys() && node->NumKeys() > 0) { FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF "\n", index, fStream->ID())); return; } Unaligned* values = node->Values(); // if we would have to drop the overflow link, drop // the last key instead and update the overflow link // to the value of that one if (!node->IsLeaf() && index == node->NumKeys()) node->overflow_link = values[--index]; uint16 length; uint8* key = node->KeyAt(index, &length); if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize || length > BPLUSTREE_MAX_KEY_LENGTH) { FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length, fStream->ID())); fStream->GetVolume()->Panic(); return; } Unaligned* keyLengths = node->KeyLengths(); uint8* keys = node->Keys(); node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); node->all_key_length = HOST_ENDIAN_TO_BFS_INT64( node->AllKeyLength() - length); Unaligned* newValues = node->Values(); Unaligned* newKeyLengths = node->KeyLengths(); // move key data memmove(key, key + length, node->AllKeyLength() - (key - keys)); // move and update key lengths if (index > 0 && newKeyLengths != keyLengths) memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); for (uint16 i = index; i < node->NumKeys(); i++) { newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); } // move values if (index > 0) memmove(newValues, values, index * sizeof(off_t)); if (node->NumKeys() > index) { memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t)); } } /*! Removes the specified key from the tree. The "value" parameter is only used for trees which allow duplicates, so you may safely ignore it. It's not an optional parameter, so at least you have to think about it. You need to have the inode write locked. */ status_t BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength, off_t value) { if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) RETURN_ERROR(B_BAD_VALUE); ASSERT_WRITE_LOCKED_INODE(fStream); Stack stack; if (_SeekDown(stack, key, keyLength) != B_OK) RETURN_ERROR(B_ERROR); node_and_key nodeAndKey; const bplustree_node* node; CachedNode cached(this); while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { #ifdef DEBUG NodeChecker checker(node, fNodeSize, "remove"); #endif if (node->IsLeaf()) { // first round, check for duplicate entries status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex); if (status != B_OK) RETURN_ERROR(status); // Is this a duplicate entry? if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( node->Values()[nodeAndKey.keyIndex]))) { if (fAllowDuplicates) { return _RemoveDuplicate(transaction, node, cached, nodeAndKey.keyIndex, value); } FATAL(("dupliate node found where no duplicates are " "allowed, inode %" B_PRIdOFF "!\n", fStream->ID())); RETURN_ERROR(B_ERROR); } else { if (node->Values()[nodeAndKey.keyIndex] != value) return B_ENTRY_NOT_FOUND; // If we will remove the last key, the iterator will be set // to the next node after the current - if there aren't any // more nodes, we need a way to prevent the TreeIterators to // touch the old node again, we use BPLUSTREE_FREE for this off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink(); _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); } } bplustree_node* writableNode = cached.MakeWritable(transaction); if (writableNode == NULL) return B_IO_ERROR; // if it's an empty root node, we have to convert it // to a leaf node by dropping the overflow link, or, // if it's already a leaf node, just empty it if (nodeAndKey.nodeOffset == fHeader.RootNode() && (node->NumKeys() == 0 || (node->NumKeys() == 1 && node->IsLeaf()))) { writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); writableNode->all_key_count = 0; writableNode->all_key_length = 0; // if we've made a leaf node out of the root node, we need // to reset the maximum number of levels in the header if (fHeader.MaxNumberOfLevels() != 1) { CachedNode cached(this); bplustree_header* header = cached.SetToWritableHeader(transaction); if (header == NULL) return B_IO_ERROR; header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); } return B_OK; } // if there is only one key left, we don't have to remove // it, we can just dump the node (index nodes still have // the overflow link, so we have to drop the last key) if (writableNode->NumKeys() > 1 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { _RemoveKey(writableNode, nodeAndKey.keyIndex); return B_OK; } // when we are here, we can just free the node, but // we have to update the right/left link of the // siblings first CachedNode otherCached(this); bplustree_node* other = otherCached.SetToWritable(transaction, writableNode->LeftLink()); if (other != NULL) other->right_link = writableNode->right_link; if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL) { other->left_link = writableNode->left_link; } cached.Free(transaction, nodeAndKey.nodeOffset); } RETURN_ERROR(B_ERROR); } /*! Replaces the value for the key in the tree. Returns B_OK if the key could be found and its value replaced, B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors to indicate that something went terribly wrong. Note that this doesn't work with duplicates - it will just return B_BAD_TYPE if you call this function on a tree where duplicates are allowed. You need to have the inode write locked. */ status_t BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, off_t value) { if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH || key == NULL) RETURN_ERROR(B_BAD_VALUE); if (fAllowDuplicates) RETURN_ERROR(B_BAD_TYPE); ASSERT_WRITE_LOCKED_INODE(fStream); off_t nodeOffset = fHeader.RootNode(); CachedNode cached(this); const bplustree_node* node; while ((node = cached.SetTo(nodeOffset)) != NULL) { uint16 keyIndex = 0; off_t nextOffset; status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset); if (node->OverflowLink() == BPLUSTREE_NULL) { if (status == B_OK) { bplustree_node* writableNode = cached.MakeWritable(transaction); if (writableNode != NULL) { writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value); } else status = B_IO_ERROR; } return status; } else if (nextOffset == nodeOffset) RETURN_ERROR(B_ERROR); nodeOffset = nextOffset; } RETURN_ERROR(B_ERROR); } #endif // !_BOOT_MODE /*! Searches the key in the tree, and stores the offset found in _value, if successful. It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack while it descends the tree. Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not. It can also return other errors to indicate that something went wrong. Note that this doesn't work with duplicates - it will just return B_BAD_TYPE if you call this function on a tree where duplicates are allowed. You need to have the inode read or write locked. */ status_t BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value) { if (key == NULL || keyLength < BPLUSTREE_MIN_KEY_LENGTH) RETURN_ERROR(B_BAD_VALUE); if (keyLength > BPLUSTREE_MAX_KEY_LENGTH) RETURN_ERROR(B_NAME_TOO_LONG); if (fAllowDuplicates) RETURN_ERROR(B_BAD_TYPE); #if !_BOOT_MODE ASSERT_READ_LOCKED_INODE(fStream); #endif off_t nodeOffset = fHeader.RootNode(); CachedNode cached(this); const bplustree_node* node; #ifdef DEBUG int32 levels = 0; #endif while ((node = cached.SetTo(nodeOffset)) != NULL) { uint16 keyIndex = 0; off_t nextOffset; status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset); #ifdef DEBUG levels++; #endif if (node->OverflowLink() == BPLUSTREE_NULL) { if (status == B_OK && _value != NULL) *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); #ifdef DEBUG if (levels != (int32)fHeader.MaxNumberOfLevels()) DEBUGGER(("levels don't match")); #endif return status; } else if (nextOffset == nodeOffset) RETURN_ERROR(B_ERROR); nodeOffset = nextOffset; } FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %" B_PRIdOFF "\n", nodeOffset, fStream->ID())); RETURN_ERROR(B_ERROR); } #if !_BOOT_MODE status_t BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset, const uint8* largestKey, uint16 largestKeyLength, const bplustree_node* parent) { if (parent->CheckIntegrity(fNodeSize) != B_OK) { dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check " "failed!\n", fStream->ID(), offset); check.FoundError(); return B_OK; } if (level >= fHeader.MaxNumberOfLevels()) { dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF "!\n", fStream->ID(), offset); check.FoundError(); return B_OK; } check.SetLevel(level); if (check.Visited(offset)) { dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n", fStream->ID(), offset); check.FoundError(); return B_OK; } check.SetVisited(offset); uint32 count = parent->NumKeys(); Unaligned* values = parent->Values(); off_t lastOffset = check.PreviousOffset(level); CachedNode cached(this); for (uint32 i = 0; i < count; i++) { uint16 keyLength; uint8* key = parent->KeyAt(i, &keyLength); if (largestKey != NULL) { int result = _CompareKeys(key, keyLength, largestKey, largestKeyLength); if (result > 0 || (result == 0 && i != count - 1)) { dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %" B_PRIu32 " larger than it should!\n", fStream->ID(), offset, i); check.FoundError(); } } off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]); if (bplustree_node::IsDuplicate(childOffset)) { // Walk the duplicate nodes off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset); off_t lastDuplicateOffset = BPLUSTREE_NULL; while (duplicateOffset != BPLUSTREE_NULL) { const bplustree_node* node; status_t status = cached.SetTo(duplicateOffset, &node, false); if (status != B_OK) { if (status == B_IO_ERROR) return B_IO_ERROR; dprintf("inode %" B_PRIdOFF ": duplicate node at %" B_PRIdOFF " could not be read: %s\n", fStream->ID(), duplicateOffset, strerror(status)); check.FoundError(); break; } bool isFragmentNode = bplustree_node::LinkType(childOffset) == BPLUSTREE_DUPLICATE_FRAGMENT; bool isKnownFragment = isFragmentNode && check.VisitedFragment(duplicateOffset); if (!isKnownFragment && check.Visited(duplicateOffset)) { dprintf("inode %" B_PRIdOFF ": %s node at %" B_PRIdOFF " already visited, referenced from %" B_PRIdOFF "!\n", fStream->ID(), isFragmentNode ? "fragment" : "duplicate", duplicateOffset, offset); check.FoundError(); break; } // Fragment nodes may be visited more than once from different // places if (!check.Visited(duplicateOffset)) check.SetVisited(duplicateOffset); if (!isKnownFragment && isFragmentNode) check.SetVisitedFragment(duplicateOffset); duplicate_array* array; int32 minSize; int32 maxSize; if (isFragmentNode) { array = node->FragmentAt( bplustree_node::FragmentIndex(childOffset)); minSize = 2; maxSize = NUM_FRAGMENT_VALUES; } else { array = node->DuplicateArray(); minSize = 1; maxSize = NUM_DUPLICATE_VALUES; } int32 arrayCount = array->Count(); if (arrayCount < minSize || arrayCount > maxSize) { dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF " has invalid array size %" B_PRId32 "!\n", fStream->ID(), duplicateOffset, arrayCount); check.FoundError(); } else { // Simple check if the values in the array may be valid for (int32 j = 0; j < arrayCount; j++) { if (!fStream->GetVolume()->IsValidInodeBlock( array->ValueAt(j))) { dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF " contains invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n", fStream->ID(), duplicateOffset, array->ValueAt(j), j); check.FoundError(); break; } } } // A fragment node is not linked (and does not have valid links) if (isFragmentNode) break; if (node->LeftLink() != lastDuplicateOffset) { dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF " has wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF "!\n", fStream->ID(), duplicateOffset, node->LeftLink(), lastDuplicateOffset); check.FoundError(); } lastDuplicateOffset = duplicateOffset; duplicateOffset = node->RightLink(); } } else if (!parent->IsLeaf()) { // Test a regular child node recursively off_t nextOffset = parent->OverflowLink(); if (i < count - 1) nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]); if (i == 0 && lastOffset != BPLUSTREE_NULL) { // Test right link of the previous node const bplustree_node* previous = cached.SetTo(lastOffset, true); if (previous == NULL) return B_IO_ERROR; if (previous->RightLink() != childOffset) { dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF "!\n", fStream->ID(), lastOffset, previous->RightLink(), childOffset); check.FoundError(); } } status_t status = _ValidateChild(check, cached, level, childOffset, lastOffset, nextOffset, key, keyLength); if (status != B_OK) return status; } else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) { dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains " "invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n", fStream->ID(), offset, childOffset, i); check.FoundError(); } lastOffset = childOffset; } if (parent->OverflowLink() != BPLUSTREE_NULL) { off_t childOffset = parent->OverflowLink(); status_t status = _ValidateChild(check, cached, level, childOffset, lastOffset, 0, NULL, 0); if (status != B_OK) return status; lastOffset = childOffset; } check.SetPreviousOffset(level, lastOffset); return B_OK; } status_t BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level, off_t offset, off_t lastOffset, off_t nextOffset, const uint8* key, uint16 keyLength) { const bplustree_node* node; status_t status = cached.SetTo(offset, &node, true); if (status != B_OK) { if (status == B_IO_ERROR) return B_IO_ERROR; dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be " "read: %s\n", fStream->ID(), offset, strerror(status)); check.FoundError(); return B_OK; } if (node->LeftLink() != lastOffset) { dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " "wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF "!\n", fStream->ID(), offset, node->LeftLink(), lastOffset); check.FoundError(); } if (nextOffset != 0 && node->RightLink() != nextOffset) { dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF "!\n", fStream->ID(), offset, node->RightLink(), nextOffset); check.FoundError(); } return _ValidateChildren(check, level + 1, offset, key, keyLength, node); } #endif // !_BOOT_MODE // #pragma mark - TreeIterator::TreeIterator(BPlusTree* tree) : fTree(tree), fCurrentNodeOffset(BPLUSTREE_NULL) { #if !_BOOT_MODE tree->_AddIterator(this); #endif } TreeIterator::~TreeIterator() { #if !_BOOT_MODE if (fTree) fTree->_RemoveIterator(this); #endif } status_t TreeIterator::Goto(int8 to) { if (fTree == NULL || fTree->fStream == NULL) RETURN_ERROR(B_BAD_VALUE); #if !_BOOT_MODE // lock access to stream InodeReadLocker locker(fTree->fStream); #endif off_t nodeOffset = fTree->fHeader.RootNode(); CachedNode cached(fTree); const bplustree_node* node; while ((node = cached.SetTo(nodeOffset)) != NULL) { // is the node a leaf node? if (node->OverflowLink() == BPLUSTREE_NULL) { fCurrentNodeOffset = nodeOffset; fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); fDuplicateNode = BPLUSTREE_NULL; return B_OK; } // get the next node offset depending on the direction (and if there // are any keys in that node at all) off_t nextOffset; if (to == BPLUSTREE_END || node->all_key_count == 0) nextOffset = node->OverflowLink(); else { if (node->AllKeyLength() > fTree->fNodeSize || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize - 8 * node->NumKeys()) RETURN_ERROR(B_ERROR); nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); } if (nextOffset == nodeOffset) break; nodeOffset = nextOffset; } FATAL(("%s fails\n", __FUNCTION__)); RETURN_ERROR(B_ERROR); } /*! Iterates through the tree in the specified direction. When it iterates through duplicates, the "key" is only updated for the first entry - if you need to know when this happens, use the "duplicate" parameter which is 0 for no duplicate, 1 for the first, and 2 for all the other duplicates. That's not too nice, but saves the 256 bytes that would be needed to store the last key - if this will ever become an issue, it will be easy to change. The other advantage of this is, that the queries can skip all duplicates at once when they are not relevant to them. */ status_t TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength, uint16 maxLength, off_t* value, uint16* duplicate) { if (fTree == NULL) return B_INTERRUPTED; bool forward = direction == BPLUSTREE_FORWARD; if (fCurrentNodeOffset == BPLUSTREE_NULL && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK) RETURN_ERROR(B_ERROR); // if the tree was emptied since the last call if (fCurrentNodeOffset == BPLUSTREE_FREE) return B_ENTRY_NOT_FOUND; #if !_BOOT_MODE // lock access to stream InodeReadLocker locker(fTree->fStream); #endif CachedNode cached(fTree); const bplustree_node* node; if (fDuplicateNode != BPLUSTREE_NULL) { // regardless of traverse direction the duplicates are always presented // in the same order; since they are all considered as equal, this // shouldn't cause any problems if (!fIsFragment || fDuplicate < fNumDuplicates) { node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); } else node = NULL; if (node != NULL) { if (!fIsFragment && fDuplicate >= fNumDuplicates) { // If the node is out of duplicates, we go directly to the next // one fDuplicateNode = node->RightLink(); if (fDuplicateNode != BPLUSTREE_NULL && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { fNumDuplicates = node->CountDuplicates(fDuplicateNode, false); fDuplicate = 0; } } if (fDuplicate < fNumDuplicates) { *value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++); if (duplicate) *duplicate = 2; return B_OK; } } fDuplicateNode = BPLUSTREE_NULL; } off_t savedNodeOffset = fCurrentNodeOffset; int32 savedKey = fCurrentKey; if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) RETURN_ERROR(B_ERROR); if (duplicate) *duplicate = 0; fCurrentKey += direction; // is the current key in the current node? while ((forward && fCurrentKey >= node->NumKeys()) || (!forward && fCurrentKey < 0)) { fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); // are there any more nodes? if (fCurrentNodeOffset != BPLUSTREE_NULL) { node = cached.SetTo(fCurrentNodeOffset); if (!node) RETURN_ERROR(B_ERROR); // reset current key fCurrentKey = forward ? 0 : node->NumKeys() - 1; } else { // there are no nodes left, so turn back to the last key fCurrentNodeOffset = savedNodeOffset; fCurrentKey = savedKey; return B_ENTRY_NOT_FOUND; } } if (node->all_key_count == 0) RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? uint16 length = 0; uint8* keyStart = node->KeyAt(fCurrentKey, &length); if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fTree->fNodeSize || length > BPLUSTREE_MAX_KEY_LENGTH) { #if !_BOOT_MODE fTree->fStream->GetVolume()->Panic(); #endif RETURN_ERROR(B_BAD_DATA); } // include the termination for string types bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE; if (length + (needsTermination ? 1 : 0) > maxLength) { if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) { // The buffer is too small, restore the last key and return // an error fCurrentNodeOffset = savedNodeOffset; fCurrentKey = savedKey; return B_BUFFER_OVERFLOW; } // Always cut off strings at the maximum buffer size, and leave // room for a terminating null byte. // This allows to handle larger key sizes gracefully. length = maxLength - 1; } memcpy(key, keyStart, length); if (needsTermination) ((char*)key)[length] = '\0'; *keyLength = length; off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); // duplicate fragments? uint8 type = bplustree_node::LinkType(offset); if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) { fDuplicateNode = offset; node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); if (node == NULL) RETURN_ERROR(B_ERROR); fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; fNumDuplicates = node->CountDuplicates(offset, fIsFragment); if (fNumDuplicates) { offset = node->DuplicateAt(offset, fIsFragment, 0); fDuplicate = 1; if (duplicate) *duplicate = 1; } else { // Shouldn't happen, but we're dealing here with potentially // corrupt disks... fDuplicateNode = BPLUSTREE_NULL; offset = 0; } } *value = offset; return B_OK; } /*! This is more or less a copy of BPlusTree::Find() - but it just sets the current position in the iterator, regardless of if the key could be found or not. */ status_t TreeIterator::Find(const uint8* key, uint16 keyLength) { if (fTree == NULL) return B_INTERRUPTED; if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH || key == NULL) RETURN_ERROR(B_BAD_VALUE); #if !_BOOT_MODE // lock access to stream InodeReadLocker locker(fTree->fStream); #endif off_t nodeOffset = fTree->fHeader.RootNode(); CachedNode cached(fTree); const bplustree_node* node; while ((node = cached.SetTo(nodeOffset)) != NULL) { uint16 keyIndex = 0; off_t nextOffset; status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, &nextOffset); if (node->OverflowLink() == BPLUSTREE_NULL) { fCurrentNodeOffset = nodeOffset; fCurrentKey = keyIndex - 1; fDuplicateNode = BPLUSTREE_NULL; return status; } else if (nextOffset == nodeOffset) RETURN_ERROR(B_ERROR); nodeOffset = nextOffset; } RETURN_ERROR(B_ERROR); } void TreeIterator::SkipDuplicates() { fDuplicateNode = BPLUSTREE_NULL; } void TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change) { if (offset != fCurrentNodeOffset) return; if (nextOffset != BPLUSTREE_NULL) { fCurrentNodeOffset = nextOffset; if (splitAt <= fCurrentKey) { fCurrentKey -= splitAt; keyIndex -= splitAt; } } // Adjust fCurrentKey to point to the same key as before. // Note, that if a key is inserted at the current position // it won't be included in this tree transition. if (keyIndex <= fCurrentKey) fCurrentKey += change; // TODO: duplicate handling! } void TreeIterator::Stop() { fTree = NULL; } #ifdef DEBUG void TreeIterator::Dump() { __out("TreeIterator at %p:\n", this); __out("\tfTree = %p\n", fTree); __out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset); __out("\tfCurrentKey = %d\n", (int)fCurrentKey); __out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode); __out("\tfDuplicate = %u\n", fDuplicate); __out("\tfNumDuplicates = %u\n", fNumDuplicates); __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); } #endif // #pragma mark - bool bplustree_header::IsValid() const { return Magic() == BPLUSTREE_MAGIC && (RootNode() % NodeSize()) == 0 && IsValidLink(RootNode()) && IsValidLink(FreeNode()); } // #pragma mark - void bplustree_node::Initialize() { left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); all_key_count = 0; all_key_length = 0; } uint8* bplustree_node::KeyAt(int32 index, uint16* keyLength) const { if (index < 0 || index > NumKeys()) return NULL; uint8* keyStart = Keys(); Unaligned* keyLengths = KeyLengths(); *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); if (index > 0) keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); return keyStart; } uint8 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const { // the duplicate fragment handling is currently hard-coded to a node size // of 1024 bytes - with future versions of BFS, this may be a problem if (isFragment) { uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); return ((off_t*)this)[fragment]; } return OverflowLink(); } off_t bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const { uint32 start; if (isFragment) start = 8 * ((uint64)offset & 0x3ff); else start = 2; return ((off_t*)this)[start + 1 + index]; } /*! Although the name suggests it, this function doesn't return the real used fragment count; at least, it can only count to two: it returns 0, if there is no fragment used, 1 if there is only one fragment used, and 2 if there are at least 2 fragments used. */ uint32 bplustree_node::FragmentsUsed(uint32 nodeSize) const { uint32 used = 0; for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { duplicate_array* array = FragmentAt(i); if (array->Count() > 0 && ++used > 1) return used; } return used; } status_t bplustree_node::CheckIntegrity(uint32 nodeSize) const { if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) DEBUGGER(("invalid node: key/length count")); for (int32 i = 0; i < NumKeys(); i++) { uint16 length = 0; uint8* key = KeyAt(i, &length); if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)this + nodeSize || length > BPLUSTREE_MAX_KEY_LENGTH) { dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i); return B_BAD_DATA; } if (Values()[i] == -1) { dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values " "corrupted\n", this, (int)i, Values()[i].value); return B_BAD_DATA; } } return B_OK; } // #pragma mark - #if !_BOOT_MODE BitmapArray::BitmapArray(size_t numBits) { fSize = (numBits + 7) / 8; fBitmap = (uint8*)calloc(fSize, 1); fCountSet = 0; } BitmapArray::~BitmapArray() { free(fBitmap); } status_t BitmapArray::InitCheck() const { return fBitmap != NULL ? B_OK : B_NO_MEMORY; } bool BitmapArray::IsSet(size_t index) const { uint32 byteIndex = index / 8; if (byteIndex >= fSize) return false; return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0; } void BitmapArray::Set(size_t index, bool set) { uint32 byteIndex = index / 8; if (byteIndex >= fSize) return; if (set) { fBitmap[byteIndex] |= 1UL << (index & 0x7); fCountSet++; } else { fBitmap[byteIndex] &= ~(1UL << (index & 0x7)); fCountSet--; } } #endif // !_BOOT_MODE // #pragma mark - bool duplicate_array::_FindInternal(off_t value, int32& index) const { int32 min = 0, max = Count() - 1; off_t cmp; while (min <= max) { index = (min + max) / 2; cmp = ValueAt(index) - value; if (cmp < 0) min = index + 1; else if (cmp > 0) max = index - 1; else return true; } return false; } void duplicate_array::Insert(off_t value) { // if there are more than 8 values in this array, use a // binary search, if not, just iterate linearly to find // the insertion point int32 size = Count(); int32 i = 0; if (size > 8 ) { if (!_FindInternal(value, i) && ValueAt(i) <= value) i++; } else { for (i = 0; i < size; i++) { if (ValueAt(i) > value) break; } } memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t)); values[i] = HOST_ENDIAN_TO_BFS_INT64(value); count = HOST_ENDIAN_TO_BFS_INT64(size + 1); } bool duplicate_array::Remove(off_t value) { int32 index = Find(value); if (index == -1) return false; int32 newSize = Count() - 1; memmove(&values[index], &values[index + 1], (newSize - index) * sizeof(off_t)); count = HOST_ENDIAN_TO_BFS_INT64(newSize); return true; } #if _BOOT_MODE } // namespace BFS #endif