1 /* 2 * Copyright 2001-2012, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 * 5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares 6 * no code but achieves binary compatibility with the on disk format. 7 */ 8 9 10 //! B+Tree implementation 11 12 13 #include "Debug.h" 14 #include "BPlusTree.h" 15 #include "Inode.h" 16 #include "Utility.h" 17 18 19 /*! Simple array used for the duplicate handling in the B+Tree. This is an 20 on disk structure. 21 */ 22 struct duplicate_array { 23 off_t count; 24 off_t values[0]; 25 26 inline bool IsEmpty() const 27 { 28 return count == 0; 29 } 30 31 inline int32 Count() const 32 { 33 return (int32)BFS_ENDIAN_TO_HOST_INT64(count); 34 } 35 36 inline off_t ValueAt(uint32 index) const 37 { 38 return BFS_ENDIAN_TO_HOST_INT64(values[index]); 39 } 40 41 inline void SetValueAt(uint32 index, off_t value) 42 { 43 values[index] = HOST_ENDIAN_TO_BFS_INT64(value); 44 } 45 46 inline int32 Find(off_t value) const 47 { 48 int32 i; 49 return _FindInternal(value, i) ? i : -1; 50 } 51 52 void Insert(off_t value); 53 bool Remove(off_t value); 54 55 private: 56 bool _FindInternal(off_t value, int32& index) const; 57 } _PACKED; 58 59 60 #ifdef DEBUG 61 class NodeChecker { 62 public: 63 NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text) 64 : 65 fNode(node), 66 fSize(nodeSize), 67 fText(text) 68 { 69 Check("integrity check failed on construction."); 70 } 71 72 ~NodeChecker() 73 { 74 Check("integrity check failed on destruction."); 75 } 76 77 void 78 Check(const char* message) 79 { 80 if (fNode->CheckIntegrity(fSize) != B_OK) { 81 dprintf("%s: %s\n", fText, message); 82 DEBUGGER(("NodeChecker integrity check failed!")); 83 } 84 } 85 86 private: 87 const bplustree_node* fNode; 88 int32 fSize; 89 const char* fText; 90 }; 91 #endif 92 93 94 class BitmapArray { 95 public: 96 BitmapArray(size_t numBits); 97 ~BitmapArray(); 98 99 status_t InitCheck() const; 100 101 bool IsSet(size_t index) const; 102 void Set(size_t index, bool set); 103 104 size_t CountSet() const { return fCountSet; } 105 106 private: 107 uint8* fBitmap; 108 size_t fSize; 109 size_t fCountSet; 110 }; 111 112 113 struct TreeCheck { 114 TreeCheck(BPlusTree* tree) 115 : 116 fLevelCount(0), 117 fFreeCount(0), 118 fNodeSize(tree->NodeSize()), 119 fMaxLevels(tree->fHeader.MaxNumberOfLevels()), 120 fFoundErrors(0), 121 fVisited(tree->Stream()->Size() / tree->NodeSize()), 122 fVisitedFragment(tree->Stream()->Size() / tree->NodeSize()) 123 { 124 fPreviousOffsets = (off_t*)malloc( 125 sizeof(off_t) * tree->fHeader.MaxNumberOfLevels()); 126 if (fPreviousOffsets != NULL) { 127 for (size_t i = 0; i < fMaxLevels; i++) 128 fPreviousOffsets[i] = BPLUSTREE_NULL; 129 } 130 } 131 132 ~TreeCheck() 133 { 134 free(fPreviousOffsets); 135 } 136 137 status_t InitCheck() const 138 { 139 if (fPreviousOffsets == NULL) 140 return B_NO_MEMORY; 141 142 status_t status = fVisited.InitCheck(); 143 if (status != B_OK) 144 return status; 145 146 return fVisitedFragment.InitCheck(); 147 } 148 149 bool Visited(off_t offset) const 150 { 151 return fVisited.IsSet(offset / fNodeSize); 152 } 153 154 void SetVisited(off_t offset) 155 { 156 fVisited.Set(offset / fNodeSize, true); 157 } 158 159 size_t VisitedCount() const 160 { 161 return fVisited.CountSet(); 162 } 163 164 bool VisitedFragment(off_t offset) const 165 { 166 return fVisitedFragment.IsSet(offset / fNodeSize); 167 } 168 169 void SetVisitedFragment(off_t offset) 170 { 171 fVisitedFragment.Set(offset / 
fNodeSize, true); 172 } 173 174 uint32 MaxLevels() const 175 { 176 return fLevelCount; 177 } 178 179 void SetLevel(uint32 level) 180 { 181 if (fLevelCount < level) 182 fLevelCount = level; 183 } 184 185 off_t PreviousOffset(uint32 level) 186 { 187 return fPreviousOffsets[level]; 188 } 189 190 void SetPreviousOffset(uint32 level, off_t offset) 191 { 192 fPreviousOffsets[level] = offset; 193 } 194 195 void FoundError() 196 { 197 fFoundErrors++; 198 } 199 200 bool ErrorsFound() 201 { 202 return fFoundErrors != 0; 203 } 204 205 private: 206 uint32 fLevelCount; 207 uint32 fFreeCount; 208 uint32 fNodeSize; 209 uint32 fMaxLevels; 210 uint32 fFoundErrors; 211 BitmapArray fVisited; 212 BitmapArray fVisitedFragment; 213 off_t* fPreviousOffsets; 214 }; 215 216 217 // #pragma mark - 218 219 220 // Node Caching for the BPlusTree class 221 // 222 // With write support, there is the need for a function that allocates new 223 // nodes by either returning empty nodes, or by growing the file's data stream 224 // 225 // !! The CachedNode class assumes that you have properly locked the stream 226 // !! before asking for nodes. 227 // 228 // Note: This code will fail if the block size is smaller than the node size! 229 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 230 // is hard-coded to 1024 bytes, that's not an issue now. 231 232 void 233 CachedNode::UnsetUnchanged(Transaction& transaction) 234 { 235 if (fTree == NULL || fTree->fStream == NULL) 236 return; 237 238 if (fNode != NULL) { 239 void* cache = fTree->fStream->GetVolume()->BlockCache(); 240 241 block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID()); 242 block_cache_put(cache, fBlockNumber); 243 fNode = NULL; 244 } 245 } 246 247 248 void 249 CachedNode::Unset() 250 { 251 if (fTree == NULL || fTree->fStream == NULL) 252 return; 253 254 if (fNode != NULL) { 255 if (fWritable && fOffset == 0) { 256 // The B+tree header has been updated - we need to update the 257 // BPlusTrees copy of it, as well. 
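			// (fTree->fHeader is the in-memory mirror of the on-disk header;
			// read paths use it directly without going through the block
			// cache, so it has to be refreshed whenever the cached header
			// node - fOffset == 0 - was written.)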
258 memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header)); 259 } 260 261 block_cache_put(fTree->fStream->GetVolume()->BlockCache(), 262 fBlockNumber); 263 fNode = NULL; 264 } 265 } 266 267 268 const bplustree_node* 269 CachedNode::SetTo(off_t offset, bool check) 270 { 271 const bplustree_node* node; 272 if (SetTo(offset, &node, check) == B_OK) 273 return node; 274 275 return NULL; 276 } 277 278 279 status_t 280 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check) 281 { 282 if (fTree == NULL || fTree->fStream == NULL) 283 RETURN_ERROR(B_BAD_VALUE); 284 285 Unset(); 286 287 // You can only ask for nodes at valid positions - you can't 288 // even access the b+tree header with this method (use SetToHeader() 289 // instead) 290 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize 291 || offset <= 0 292 || (offset % fTree->fNodeSize) != 0) 293 RETURN_ERROR(B_BAD_VALUE); 294 295 if (InternalSetTo(NULL, offset) != NULL && check) { 296 // sanity checks (links, all_key_count) 297 if (!fTree->fHeader.CheckNode(fNode)) { 298 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" 299 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, 300 fBlockNumber, fTree->fStream->ID())); 301 return B_BAD_DATA; 302 } 303 } 304 305 *_node = fNode; 306 return B_OK; 307 } 308 309 310 bplustree_node* 311 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check) 312 { 313 if (fTree == NULL || fTree->fStream == NULL) { 314 REPORT_ERROR(B_BAD_VALUE); 315 return NULL; 316 } 317 318 Unset(); 319 320 // You can only ask for nodes at valid positions - you can't 321 // even access the b+tree header with this method (use SetToHeader() 322 // instead) 323 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize 324 || offset <= 0 325 || (offset % fTree->fNodeSize) != 0) 326 return NULL; 327 328 if (InternalSetTo(&transaction, offset) != NULL && check) { 329 // sanity checks (links, all_key_count) 330 if (!fTree->fHeader.CheckNode(fNode)) { 331 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" 332 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, 333 fBlockNumber, fTree->fStream->ID())); 334 return NULL; 335 } 336 } 337 return fNode; 338 } 339 340 341 bplustree_node* 342 CachedNode::MakeWritable(Transaction& transaction) 343 { 344 if (fNode == NULL) 345 return NULL; 346 347 if (block_cache_make_writable(transaction.GetVolume()->BlockCache(), 348 fBlockNumber, transaction.ID()) == B_OK) 349 return fNode; 350 351 return NULL; 352 } 353 354 355 const bplustree_header* 356 CachedNode::SetToHeader() 357 { 358 if (fTree == NULL || fTree->fStream == NULL) { 359 REPORT_ERROR(B_BAD_VALUE); 360 return NULL; 361 } 362 363 Unset(); 364 365 InternalSetTo(NULL, 0LL); 366 return (bplustree_header*)fNode; 367 } 368 369 370 bplustree_header* 371 CachedNode::SetToWritableHeader(Transaction& transaction) 372 { 373 if (fTree == NULL || fTree->fStream == NULL) { 374 REPORT_ERROR(B_BAD_VALUE); 375 return NULL; 376 } 377 378 Unset(); 379 380 InternalSetTo(&transaction, 0LL); 381 382 if (fNode != NULL && !fTree->fInTransaction) { 383 transaction.AddListener(fTree); 384 fTree->fInTransaction = true; 385 386 if (!transaction.GetVolume()->IsInitializing()) { 387 acquire_vnode(transaction.GetVolume()->FSVolume(), 388 fTree->fStream->ID()); 389 } 390 } 391 392 return (bplustree_header*)fNode; 393 } 394 395 396 bplustree_node* 397 CachedNode::InternalSetTo(Transaction* transaction, off_t offset) 398 { 399 fNode = NULL; 400 fOffset = offset; 401 402 off_t 
fileOffset; 403 block_run run; 404 if (offset < fTree->fStream->Size() 405 && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) { 406 Volume* volume = fTree->fStream->GetVolume(); 407 408 int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); 409 fBlockNumber = volume->ToBlock(run) + blockOffset; 410 uint8* block; 411 412 if (transaction != NULL) { 413 block = (uint8*)block_cache_get_writable(volume->BlockCache(), 414 fBlockNumber, transaction->ID()); 415 fWritable = true; 416 } else { 417 block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber); 418 fWritable = false; 419 } 420 421 if (block != NULL) { 422 // The node is somewhere in that block... 423 // (confusing offset calculation) 424 fNode = (bplustree_node*)(block + offset 425 - (fileOffset + (blockOffset << volume->BlockShift()))); 426 } else 427 REPORT_ERROR(B_IO_ERROR); 428 } 429 return fNode; 430 } 431 432 433 status_t 434 CachedNode::Free(Transaction& transaction, off_t offset) 435 { 436 if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL) 437 RETURN_ERROR(B_BAD_VALUE); 438 439 // TODO: scan the free nodes list and remove all nodes at the end 440 // of the tree - perhaps that shouldn't be done everytime that 441 // function is called, perhaps it should be done when the directory 442 // inode is closed or based on some calculation or whatever... 443 444 CachedNode cached(fTree); 445 bplustree_header* header = cached.SetToWritableHeader(transaction); 446 if (header == NULL) 447 return B_IO_ERROR; 448 449 #if 0 450 // TODO: temporarily disabled because CheckNode() doesn't like this... 451 // Also, it's such an edge case that it's almost useless, anyway. 452 // if the node is the last one in the tree, we shrink 453 // the tree and file size by one node 454 off_t lastOffset = header->MaximumSize() - fTree->fNodeSize; 455 if (offset == lastOffset) { 456 status_t status = fTree->fStream->SetFileSize(transaction, lastOffset); 457 if (status != B_OK) 458 return status; 459 460 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset); 461 return B_OK; 462 } 463 #endif 464 465 // add the node to the free nodes list 466 fNode->left_link = header->free_node_pointer; 467 fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 468 469 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset); 470 return B_OK; 471 } 472 473 474 status_t 475 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node, 476 off_t* _offset) 477 { 478 if (fTree == NULL || fTree->fStream == NULL) 479 RETURN_ERROR(B_BAD_VALUE); 480 481 Unset(); 482 483 bplustree_header* header; 484 status_t status; 485 486 // if there are any free nodes, recycle them 487 if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) { 488 CachedNode cached(fTree); 489 header = cached.SetToWritableHeader(transaction); 490 if (header == NULL) 491 return B_IO_ERROR; 492 493 *_offset = header->FreeNode(); 494 *_node = fNode; 495 496 // set new free node pointer 497 header->free_node_pointer = fNode->left_link; 498 fNode->Initialize(); 499 return B_OK; 500 } 501 502 // allocate space for a new node 503 Inode* stream = fTree->fStream; 504 if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK) 505 return status; 506 507 CachedNode cached(fTree); 508 header = cached.SetToWritableHeader(transaction); 509 if (header == NULL) 510 return B_IO_ERROR; 511 512 // the maximum_size has to be changed before the call to SetTo() - or 513 // else it will fail because the requested node is out 
of bounds 514 off_t offset = header->MaximumSize(); 515 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() 516 + fTree->fNodeSize); 517 518 cached.Unset(); 519 // SetToWritable() below needs the new values in the tree's header 520 521 if (SetToWritable(transaction, offset, false) == NULL) 522 RETURN_ERROR(B_ERROR); 523 524 fNode->Initialize(); 525 526 *_offset = offset; 527 *_node = fNode; 528 return B_OK; 529 } 530 531 532 // #pragma mark - 533 534 535 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize) 536 : 537 fStream(NULL), 538 fInTransaction(false) 539 { 540 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 541 SetTo(transaction, stream); 542 } 543 544 545 BPlusTree::BPlusTree(Inode* stream) 546 : 547 fStream(NULL), 548 fInTransaction(false) 549 { 550 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 551 SetTo(stream); 552 } 553 554 555 BPlusTree::BPlusTree() 556 : 557 fStream(NULL), 558 fNodeSize(BPLUSTREE_NODE_SIZE), 559 fAllowDuplicates(true), 560 fInTransaction(false), 561 fStatus(B_NO_INIT) 562 { 563 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 564 } 565 566 567 BPlusTree::~BPlusTree() 568 { 569 // if there are any TreeIterators left, we need to stop them 570 // (can happen when the tree's inode gets deleted while 571 // traversing the tree - a TreeIterator doesn't lock the inode) 572 mutex_lock(&fIteratorLock); 573 574 SinglyLinkedList<TreeIterator>::Iterator iterator 575 = fIterators.GetIterator(); 576 while (iterator.HasNext()) 577 iterator.Next()->Stop(); 578 579 mutex_destroy(&fIteratorLock); 580 581 ASSERT(!fInTransaction); 582 } 583 584 585 /*! Create a new B+Tree on the specified stream */ 586 status_t 587 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize) 588 { 589 // initializes in-memory B+Tree 590 591 fStream = stream; 592 593 CachedNode cached(this); 594 bplustree_header* header = cached.SetToWritableHeader(transaction); 595 if (header == NULL) { 596 // allocate space for new header + node! 
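		// A rough sketch of the stream layout being created here (offsets in
		// units of nodeSize):
		//
		//   offset 0           offset nodeSize
		//   [ tree header ]    [ root node ]
		//
		// which is why the stream is grown to nodeSize * 2 below.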
597 fStatus = stream->SetFileSize(transaction, nodeSize * 2); 598 if (fStatus != B_OK) 599 RETURN_ERROR(fStatus); 600 601 header = cached.SetToWritableHeader(transaction); 602 if (header == NULL) 603 RETURN_ERROR(fStatus = B_ERROR); 604 } 605 606 fAllowDuplicates = stream->IsIndex() 607 || (stream->Mode() & S_ALLOW_DUPS) != 0; 608 609 fNodeSize = nodeSize; 610 611 // initialize b+tree header 612 header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC); 613 header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize); 614 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 615 header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode())); 616 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize); 617 header->free_node_pointer 618 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 619 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2); 620 621 cached.Unset(); 622 623 // initialize b+tree root node 624 cached.SetToWritable(transaction, fHeader.RootNode(), false); 625 if (cached.Node() == NULL) 626 RETURN_ERROR(B_IO_ERROR); 627 628 cached.Node()->Initialize(); 629 630 return fStatus = B_OK; 631 } 632 633 634 status_t 635 BPlusTree::SetTo(Inode* stream) 636 { 637 if (stream == NULL) 638 RETURN_ERROR(fStatus = B_BAD_VALUE); 639 640 fStream = stream; 641 642 // get on-disk B+Tree header 643 644 CachedNode cached(this); 645 const bplustree_header* header = cached.SetToHeader(); 646 if (header != NULL) 647 memcpy(&fHeader, header, sizeof(bplustree_header)); 648 else 649 RETURN_ERROR(fStatus = B_IO_ERROR); 650 651 // is header valid? 652 653 if (fHeader.MaximumSize() != stream->Size()) { 654 dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %" 655 B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size()); 656 // we can't change the header since we don't have a transaction 657 //fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size()); 658 } 659 if (!fHeader.IsValid()) { 660 #ifdef DEBUG 661 dump_bplustree_header(&fHeader); 662 dump_block((const char*)&fHeader, 128); 663 #endif 664 RETURN_ERROR(fStatus = B_BAD_DATA); 665 } 666 667 fNodeSize = fHeader.NodeSize(); 668 669 // validity check 670 static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, 671 S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX, 672 S_DOUBLE_INDEX}; 673 uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX 674 | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX 675 | S_FLOAT_INDEX | S_DOUBLE_INDEX); 676 677 if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE 678 || ((stream->Mode() & S_INDEX_DIR) != 0 679 && kToMode[fHeader.DataType()] != mode) 680 || !stream->IsContainer()) { 681 D( dump_bplustree_header(&fHeader); 682 dump_inode(&stream->Node()); 683 ); 684 RETURN_ERROR(fStatus = B_BAD_TYPE); 685 } 686 687 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 688 // in the original BFS code - we will honour it nevertheless 689 fAllowDuplicates = stream->IsIndex() 690 || (stream->Mode() & S_ALLOW_DUPS) != 0; 691 692 cached.SetTo(fHeader.RootNode()); 693 RETURN_ERROR(fStatus = cached.Node() ? 
B_OK : B_BAD_DATA); 694 } 695 696 697 status_t 698 BPlusTree::InitCheck() 699 { 700 return fStatus; 701 } 702 703 704 status_t 705 BPlusTree::Validate(bool repair, bool& _errorsFound) 706 { 707 TreeCheck check(this); 708 if (check.InitCheck() != B_OK) 709 return B_NO_MEMORY; 710 711 check.SetVisited(0); 712 713 // Walk the free nodes 714 715 CachedNode cached(this); 716 off_t freeOffset = fHeader.FreeNode(); 717 while (freeOffset > 0) { 718 const bplustree_node* node; 719 status_t status = cached.SetTo(freeOffset, &node, false); 720 if (status != B_OK) { 721 if (status == B_IO_ERROR) 722 return B_IO_ERROR; 723 724 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could " 725 "not be read: %s\n", fStream->ID(), freeOffset, 726 strerror(status)); 727 check.FoundError(); 728 break; 729 } 730 731 if (check.Visited(freeOffset)) { 732 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF 733 " circular!\n", fStream->ID(), freeOffset); 734 // TODO: if 'repair' is true, we could collect all unvisited nodes 735 // at the end, and put the into the free list 736 check.FoundError(); 737 break; 738 } 739 740 check.SetVisited(freeOffset); 741 742 if (node->OverflowLink() != BPLUSTREE_FREE) { 743 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF 744 " misses free mark!\n", fStream->ID(), freeOffset); 745 } 746 freeOffset = node->LeftLink(); 747 } 748 749 // Iterate over the complete tree recursively 750 751 const bplustree_node* root; 752 status_t status = cached.SetTo(fHeader.RootNode(), &root, true); 753 if (status != B_OK) 754 return status; 755 756 status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root); 757 758 if (check.ErrorsFound()) 759 _errorsFound = true; 760 761 if (status != B_OK) 762 return status; 763 764 if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) { 765 dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, " 766 "declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(), 767 fHeader.MaxNumberOfLevels()); 768 } 769 770 if (check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) { 771 dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF 772 " nodes.\n", fStream->ID(), check.VisitedCount(), 773 fHeader.MaximumSize() / fNodeSize); 774 } 775 776 return B_OK; 777 } 778 779 780 status_t 781 BPlusTree::MakeEmpty() 782 { 783 // Put all nodes into the free list in order 784 Transaction transaction(fStream->GetVolume(), fStream->BlockNumber()); 785 786 // Reset the header, and root node 787 CachedNode cached(this); 788 bplustree_header* header = cached.SetToWritableHeader(transaction); 789 if (header == NULL) 790 return B_IO_ERROR; 791 792 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 793 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize()); 794 if (fStream->Size() > NodeSize() * 2) 795 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize()); 796 else { 797 header->free_node_pointer 798 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 799 } 800 801 bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false); 802 if (node == NULL) 803 return B_IO_ERROR; 804 805 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 806 node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 807 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 808 node->all_key_count = 0; 809 node->all_key_length = 0; 810 811 for (off_t offset = 2 * NodeSize(); offset < fStream->Size(); 812 offset += NodeSize()) { 813 bplustree_node* node = 
cached.SetToWritable(transaction, offset, false); 814 if (node == NULL) { 815 dprintf("--> could not open %lld\n", offset); 816 return B_IO_ERROR; 817 } 818 if (offset < fStream->Size() - NodeSize()) 819 node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize()); 820 else 821 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 822 823 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 824 825 // It's not important to write it out in a single transaction; just 826 // make sure it doesn't get too large 827 if (offset % (1024 * 1024) == 0) { 828 transaction.Done(); 829 transaction.Start(fStream->GetVolume(), fStream->BlockNumber()); 830 } 831 } 832 833 return transaction.Done(); 834 } 835 836 837 int32 838 BPlusTree::TypeCodeToKeyType(type_code code) 839 { 840 switch (code) { 841 case B_STRING_TYPE: 842 return BPLUSTREE_STRING_TYPE; 843 case B_SSIZE_T_TYPE: 844 case B_INT32_TYPE: 845 return BPLUSTREE_INT32_TYPE; 846 case B_SIZE_T_TYPE: 847 case B_UINT32_TYPE: 848 return BPLUSTREE_UINT32_TYPE; 849 case B_OFF_T_TYPE: 850 case B_INT64_TYPE: 851 return BPLUSTREE_INT64_TYPE; 852 case B_UINT64_TYPE: 853 return BPLUSTREE_UINT64_TYPE; 854 case B_FLOAT_TYPE: 855 return BPLUSTREE_FLOAT_TYPE; 856 case B_DOUBLE_TYPE: 857 return BPLUSTREE_DOUBLE_TYPE; 858 } 859 return -1; 860 } 861 862 863 int32 864 BPlusTree::ModeToKeyType(mode_t mode) 865 { 866 switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 867 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { 868 case S_INT_INDEX: 869 return BPLUSTREE_INT32_TYPE; 870 case S_UINT_INDEX: 871 return BPLUSTREE_UINT32_TYPE; 872 case S_LONG_LONG_INDEX: 873 return BPLUSTREE_INT64_TYPE; 874 case S_ULONG_LONG_INDEX: 875 return BPLUSTREE_UINT64_TYPE; 876 case S_FLOAT_INDEX: 877 return BPLUSTREE_FLOAT_TYPE; 878 case S_DOUBLE_INDEX: 879 return BPLUSTREE_DOUBLE_TYPE; 880 case S_STR_INDEX: 881 default: 882 // default is for standard directories 883 return BPLUSTREE_STRING_TYPE; 884 } 885 } 886 887 888 // #pragma mark - TransactionListener implementation 889 890 891 void 892 BPlusTree::TransactionDone(bool success) 893 { 894 if (!success) { 895 // update header from disk 896 CachedNode cached(this); 897 const bplustree_header* header = cached.SetToHeader(); 898 if (header != NULL) 899 memcpy(&fHeader, header, sizeof(bplustree_header)); 900 } 901 } 902 903 904 void 905 BPlusTree::RemovedFromTransaction() 906 { 907 fInTransaction = false; 908 909 if (!fStream->GetVolume()->IsInitializing()) 910 put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID()); 911 } 912 913 914 // #pragma mark - 915 916 917 void 918 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, 919 uint16 splitAt, int8 change) 920 { 921 // Although every iterator which is affected by this update currently 922 // waits on a semaphore, other iterators could be added/removed at 923 // any time, so we need to protect this loop 924 MutexLocker _(fIteratorLock); 925 926 SinglyLinkedList<TreeIterator>::Iterator iterator 927 = fIterators.GetIterator(); 928 while (iterator.HasNext()) 929 iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change); 930 } 931 932 933 void 934 BPlusTree::_AddIterator(TreeIterator* iterator) 935 { 936 MutexLocker _(fIteratorLock); 937 fIterators.Add(iterator); 938 } 939 940 941 void 942 BPlusTree::_RemoveIterator(TreeIterator* iterator) 943 { 944 MutexLocker _(fIteratorLock); 945 fIterators.Remove(iterator); 946 } 947 948 949 int32 950 BPlusTree::_CompareKeys(const void* key1, int 
keyLength1, const void* key2, 951 keyLength2) 952 { 953 type_code type = 0; 954 switch (fHeader.DataType()) { 955 case BPLUSTREE_STRING_TYPE: 956 type = B_STRING_TYPE; 957 break; 958 case BPLUSTREE_INT32_TYPE: 959 type = B_INT32_TYPE; 960 break; 961 case BPLUSTREE_UINT32_TYPE: 962 type = B_UINT32_TYPE; 963 break; 964 case BPLUSTREE_INT64_TYPE: 965 type = B_INT64_TYPE; 966 break; 967 case BPLUSTREE_UINT64_TYPE: 968 type = B_UINT64_TYPE; 969 break; 970 case BPLUSTREE_FLOAT_TYPE: 971 type = B_FLOAT_TYPE; 972 break; 973 case BPLUSTREE_DOUBLE_TYPE: 974 type = B_DOUBLE_TYPE; 975 break; 976 } 977 return compareKeys(type, key1, keyLength1, key2, keyLength2); 978 } 979 980 981 status_t 982 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key, 983 uint16 keyLength, uint16* _index, off_t* _next) 984 { 985 #ifdef DEBUG 986 NodeChecker checker(node, fNodeSize, "find"); 987 #endif 988 989 if (node->all_key_count == 0) { 990 if (_index) 991 *_index = 0; 992 if (_next) 993 *_next = node->OverflowLink(); 994 return B_ENTRY_NOT_FOUND; 995 } 996 997 off_t* values = node->Values(); 998 int16 saveIndex = -1; 999 1000 // binary search in the key array 1001 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { 1002 uint16 i = (first + last) >> 1; 1003 1004 uint16 searchLength; 1005 uint8* searchKey = node->KeyAt(i, &searchLength); 1006 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) 1007 > (uint8*)node + fNodeSize 1008 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { 1009 fStream->GetVolume()->Panic(); 1010 RETURN_ERROR(B_BAD_DATA); 1011 } 1012 1013 int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength); 1014 if (cmp < 0) { 1015 last = i - 1; 1016 saveIndex = i; 1017 } else if (cmp > 0) { 1018 saveIndex = first = i + 1; 1019 } else { 1020 if (_index) 1021 *_index = i; 1022 if (_next) 1023 *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]); 1024 return B_OK; 1025 } 1026 } 1027 1028 if (_index) 1029 *_index = saveIndex; 1030 if (_next) { 1031 if (saveIndex == node->NumKeys()) 1032 *_next = node->OverflowLink(); 1033 else 1034 *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); 1035 } 1036 return B_ENTRY_NOT_FOUND; 1037 } 1038 1039 1040 /*! Prepares the stack to contain all nodes that were passed while 1041 following the key, from the root node to the leaf node that could 1042 or should contain that key. 1043 */ 1044 status_t 1045 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key, 1046 uint16 keyLength) 1047 { 1048 // set the root node to begin with 1049 node_and_key nodeAndKey; 1050 nodeAndKey.nodeOffset = fHeader.RootNode(); 1051 1052 CachedNode cached(this); 1053 const bplustree_node* node; 1054 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1055 // if we are already on leaf level, we're done 1056 if (node->OverflowLink() == BPLUSTREE_NULL) { 1057 // note that the keyIndex is not properly set here (but it's not 1058 // needed in the calling functions anyway)!
1059 nodeAndKey.keyIndex = 0; 1060 stack.Push(nodeAndKey); 1061 return B_OK; 1062 } 1063 1064 off_t nextOffset; 1065 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex, 1066 &nextOffset); 1067 1068 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 1069 RETURN_ERROR(B_ERROR); 1070 1071 if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) { 1072 dprintf("BPlusTree::_SeekDown() node walked too deep.\n"); 1073 break; 1074 } 1075 1076 // put the node offset & the correct keyIndex on the stack 1077 stack.Push(nodeAndKey); 1078 1079 nodeAndKey.nodeOffset = nextOffset; 1080 } 1081 1082 FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %" 1083 B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID())); 1084 return B_ERROR; 1085 } 1086 1087 1088 /*! This will find a free duplicate fragment in the given bplustree_node. 1089 The CachedNode will be set to the writable fragment on success. 1090 */ 1091 status_t 1092 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction, 1093 const bplustree_node* node, CachedNode& cached, 1094 off_t* _offset, bplustree_node** _fragment, uint32* _index) 1095 { 1096 off_t* values = node->Values(); 1097 for (int32 i = 0; i < node->NumKeys(); i++) { 1098 off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]); 1099 1100 // does the value link to a duplicate fragment? 1101 if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT) 1102 continue; 1103 1104 const bplustree_node* fragment = cached.SetTo( 1105 bplustree_node::FragmentOffset(value), false); 1106 if (fragment == NULL) { 1107 FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %" 1108 B_PRIdOFF "\n", value, fStream->ID())); 1109 continue; 1110 } 1111 1112 // see if there is some space left for us 1113 uint32 num = bplustree_node::MaxFragments(fNodeSize); 1114 for (uint32 j = 0; j < num; j++) { 1115 duplicate_array* array = fragment->FragmentAt(j); 1116 1117 if (array->IsEmpty()) { 1118 // found an unused fragment 1119 *_fragment = cached.MakeWritable(transaction); 1120 if (*_fragment == NULL) 1121 return B_IO_ERROR; 1122 1123 *_offset = bplustree_node::FragmentOffset(value); 1124 *_index = j; 1125 return B_OK; 1126 } 1127 } 1128 } 1129 return B_ENTRY_NOT_FOUND; 1130 } 1131 1132 1133 status_t 1134 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached, 1135 const bplustree_node* node, uint16 index, off_t value) 1136 { 1137 CachedNode cachedDuplicate(this); 1138 off_t* values = node->Values(); 1139 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1140 status_t status; 1141 off_t offset; 1142 1143 if (bplustree_node::IsDuplicate(oldValue)) { 1144 // If it's a duplicate fragment, try to insert it into that, or if it 1145 // doesn't fit anymore, create a new duplicate node 1146 1147 if (bplustree_node::LinkType(oldValue) 1148 == BPLUSTREE_DUPLICATE_FRAGMENT) { 1149 bplustree_node* duplicate = cachedDuplicate.SetToWritable( 1150 transaction, bplustree_node::FragmentOffset(oldValue), false); 1151 if (duplicate == NULL) 1152 return B_IO_ERROR; 1153 1154 duplicate_array* array = duplicate->FragmentAt( 1155 bplustree_node::FragmentIndex(oldValue)); 1156 int32 arrayCount = array->Count(); 1157 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) { 1158 FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment " 1159 "%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1160 (int)bplustree_node::FragmentIndex(oldValue), 1161 bplustree_node::FragmentOffset(oldValue), arrayCount, 1162 
fStream->ID())); 1163 return B_BAD_DATA; 1164 } 1165 1166 if (arrayCount < NUM_FRAGMENT_VALUES) { 1167 array->Insert(value); 1168 } else { 1169 // Test if the fragment will be empty if we remove this key's 1170 // values 1171 if (duplicate->FragmentsUsed(fNodeSize) < 2) { 1172 // The node will be empty without our values, so let us 1173 // reuse it as a duplicate node 1174 offset = bplustree_node::FragmentOffset(oldValue); 1175 1176 memmove(duplicate->DuplicateArray(), array, 1177 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1178 duplicate->left_link = duplicate->right_link 1179 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1180 1181 array = duplicate->DuplicateArray(); 1182 array->Insert(value); 1183 } else { 1184 // Create a new duplicate node 1185 1186 cachedDuplicate.UnsetUnchanged(transaction); 1187 // The old duplicate has not been touched, so we can 1188 // reuse it 1189 1190 bplustree_node* newDuplicate; 1191 status = cachedDuplicate.Allocate(transaction, 1192 &newDuplicate, &offset); 1193 if (status != B_OK) 1194 RETURN_ERROR(status); 1195 1196 // Copy the array from the fragment node to the duplicate 1197 // node and free the old entry (by zero'ing all values) 1198 newDuplicate->overflow_link = array->count; 1199 memcpy(&newDuplicate->all_key_count, &array->values[0], 1200 array->Count() * sizeof(off_t)); 1201 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1202 1203 array = newDuplicate->DuplicateArray(); 1204 array->Insert(value); 1205 } 1206 1207 // Update the main pointer to link to a duplicate node 1208 if (cached.MakeWritable(transaction) == NULL) 1209 return B_IO_ERROR; 1210 1211 values[index] 1212 = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1213 BPLUSTREE_DUPLICATE_NODE, offset)); 1214 } 1215 1216 return B_OK; 1217 } 1218 1219 // Put the value into a dedicated duplicate node 1220 1221 // search for free space in the duplicate nodes of that key 1222 duplicate_array* array; 1223 int32 arrayCount; 1224 const bplustree_node* duplicate; 1225 off_t duplicateOffset; 1226 do { 1227 duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1228 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 1229 if (duplicate == NULL) 1230 return B_IO_ERROR; 1231 1232 array = duplicate->DuplicateArray(); 1233 arrayCount =array->Count(); 1234 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { 1235 FATAL(("_InsertDuplicate: Invalid array size in duplicate %" 1236 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1237 duplicateOffset, arrayCount, fStream->ID())); 1238 return B_BAD_DATA; 1239 } 1240 } while (arrayCount >= NUM_DUPLICATE_VALUES 1241 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); 1242 1243 bplustree_node* writableDuplicate 1244 = cachedDuplicate.MakeWritable(transaction); 1245 if (writableDuplicate == NULL) 1246 return B_IO_ERROR; 1247 1248 if (arrayCount < NUM_DUPLICATE_VALUES) { 1249 array = writableDuplicate->DuplicateArray(); 1250 array->Insert(value); 1251 } else { 1252 // no space left - add a new duplicate node 1253 1254 bplustree_node* newDuplicate; 1255 status = cachedDuplicate.Allocate(transaction, &newDuplicate, 1256 &offset); 1257 if (status != B_OK) 1258 RETURN_ERROR(status); 1259 1260 // link the two nodes together 1261 writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); 1262 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); 1263 1264 array = newDuplicate->DuplicateArray(); 1265 array->count = 0; 1266 array->Insert(value); 1267 } 1268 return B_OK; 1269 } 1270 1271 // Search 
for a free duplicate fragment or create a new one 1272 // to insert the duplicate value into 1273 1274 uint32 fragmentIndex = 0; 1275 bplustree_node* fragment; 1276 if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate, 1277 &offset, &fragment, &fragmentIndex) != B_OK) { 1278 // allocate a new duplicate fragment node 1279 status = cachedDuplicate.Allocate(transaction, &fragment, &offset); 1280 if (status != B_OK) 1281 RETURN_ERROR(status); 1282 1283 memset(fragment, 0, fNodeSize); 1284 } 1285 1286 duplicate_array* array = fragment->FragmentAt(fragmentIndex); 1287 array->Insert(oldValue); 1288 array->Insert(value); 1289 1290 if (cached.MakeWritable(transaction) == NULL) 1291 return B_IO_ERROR; 1292 1293 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1294 BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex)); 1295 1296 return B_OK; 1297 } 1298 1299 1300 void 1301 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key, 1302 uint16 keyLength, off_t value) 1303 { 1304 // should never happen, but who knows? 1305 if (index > node->NumKeys()) 1306 return; 1307 1308 off_t* values = node->Values(); 1309 uint16* keyLengths = node->KeyLengths(); 1310 uint8* keys = node->Keys(); 1311 1312 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); 1313 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() 1314 + keyLength); 1315 1316 off_t* newValues = node->Values(); 1317 uint16* newKeyLengths = node->KeyLengths(); 1318 1319 // move values and copy new value into them 1320 memmove(newValues + index + 1, values + index, 1321 sizeof(off_t) * (node->NumKeys() - 1 - index)); 1322 memmove(newValues, values, sizeof(off_t) * index); 1323 1324 newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value); 1325 1326 // move and update key length index 1327 for (uint16 i = node->NumKeys(); i-- > index + 1;) { 1328 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1329 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength); 1330 } 1331 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); 1332 1333 int32 keyStart; 1334 newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength 1335 + (keyStart = index > 0 1336 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0)); 1337 1338 // move keys and copy new key into them 1339 uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]); 1340 int32 size = node->AllKeyLength() - length; 1341 if (size > 0) 1342 memmove(keys + length, keys + length - keyLength, size); 1343 1344 memcpy(keys + keyStart, key, keyLength); 1345 } 1346 1347 1348 /*! Splits the \a node into two halves - the other half will be put into 1349 \a other. It also takes care to create a new overflow link if the node 1350 to split is an index node. 1351 */ 1352 status_t 1353 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset, 1354 bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key, 1355 uint16* _keyLength, off_t* _value) 1356 { 1357 if (*_keyIndex > node->NumKeys() + 1) 1358 return B_BAD_VALUE; 1359 1360 uint16* inKeyLengths = node->KeyLengths(); 1361 off_t* inKeyValues = node->Values(); 1362 uint8* inKeys = node->Keys(); 1363 uint8* outKeys = other->Keys(); 1364 int32 keyIndex = *_keyIndex; // can become less than zero! 1365 1366 if (keyIndex > node->NumKeys()) { 1367 FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF 1368 "\n", (int)keyIndex, node->NumKeys(), fStream->ID())); 1369 return B_BAD_VALUE; 1370 } 1371 1372 // how many keys will fit in one (half) page? 
1373 // that loop will find the answer to this question and 1374 // change the key lengths indices for their new home 1375 1376 // "bytes" is the number of bytes written for the new key, 1377 // "bytesBefore" are the bytes before that key 1378 // "bytesAfter" are the bytes after the new key, if any 1379 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 1380 1381 size_t size = fNodeSize >> 1; 1382 int32 out, in; 1383 for (in = out = 0; in < node->NumKeys() + 1;) { 1384 if (!bytes) { 1385 bytesBefore = in > 0 1386 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1387 } 1388 1389 if (in == keyIndex && !bytes) { 1390 bytes = *_keyLength; 1391 } else { 1392 if (keyIndex < out) { 1393 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1394 - bytesBefore; 1395 } 1396 1397 in++; 1398 } 1399 out++; 1400 1401 if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) 1402 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1403 // we have found the number of keys in the new node! 1404 break; 1405 } 1406 } 1407 1408 // if the new key was not inserted, set the length of the keys 1409 // that can be copied directly 1410 if (keyIndex >= out && in > 0) 1411 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); 1412 1413 if (bytesBefore < 0 || bytesAfter < 0) 1414 return B_BAD_DATA; 1415 1416 other->left_link = node->left_link; 1417 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1418 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1419 + bytesAfter); 1420 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1421 1422 uint16* outKeyLengths = other->KeyLengths(); 1423 off_t* outKeyValues = other->Values(); 1424 int32 keys = out > keyIndex ? keyIndex : out; 1425 1426 if (bytesBefore) { 1427 // copy the keys 1428 memcpy(outKeys, inKeys, bytesBefore); 1429 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1430 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1431 } 1432 if (bytes) { 1433 // copy the newly inserted key 1434 memcpy(outKeys + bytesBefore, key, bytes); 1435 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1436 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1437 1438 if (bytesAfter) { 1439 // copy the keys after the new key 1440 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, 1441 bytesAfter); 1442 keys = out - keyIndex - 1; 1443 for (int32 i = 0;i < keys;i++) { 1444 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( 1445 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) 1446 + bytes); 1447 } 1448 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, 1449 keys * sizeof(off_t)); 1450 } 1451 } 1452 1453 // if the new key was already inserted, we shouldn't use it again 1454 if (in != out) 1455 keyIndex--; 1456 1457 int32 total = bytesBefore + bytesAfter; 1458 1459 // these variables are for the key that will be returned 1460 // to the parent node 1461 uint8* newKey = NULL; 1462 uint16 newLength; 1463 bool newAllocated = false; 1464 1465 // If we have split an index node, we have to drop the first key 1466 // of the next node (which can also be the new key to insert). 1467 // The dropped key is also the one which has to be inserted in 1468 // the parent node, so we will set the "newKey" already here. 
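	// A rough sketch of that index-node case (illustrative only, with
	// made-up keys):
	//
	//     [ B | D | F | H ]   ==>   [ B | D ]   +   [ H ]
	//
	// "F" is the dropped key: it is kept in neither half, but is handed
	// back to the caller to be inserted into the parent node as the
	// separator between the two halves; its child pointer becomes the
	// overflow link of the new left node.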
1469 if (node->OverflowLink() != BPLUSTREE_NULL) { 1470 if (in == keyIndex) { 1471 newKey = key; 1472 newLength = *_keyLength; 1473 1474 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1475 keyIndex--; 1476 } else { 1477 // If a key is dropped (is not the new key), we have to copy 1478 // it, because it would be lost if not. 1479 uint8* droppedKey = node->KeyAt(in, &newLength); 1480 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) 1481 > (uint8*)node + fNodeSize 1482 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1483 fStream->GetVolume()->Panic(); 1484 RETURN_ERROR(B_BAD_DATA); 1485 } 1486 newKey = (uint8*)malloc(newLength); 1487 if (newKey == NULL) 1488 return B_NO_MEMORY; 1489 1490 newAllocated = true; 1491 memcpy(newKey, droppedKey, newLength); 1492 1493 other->overflow_link = inKeyValues[in]; 1494 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); 1495 } 1496 } 1497 1498 // and now the same game for the other page and the rest of the keys 1499 // (but with memmove() instead of memcpy(), because they may overlap) 1500 1501 bytesBefore = bytesAfter = bytes = 0; 1502 out = 0; 1503 int32 skip = in; 1504 while (in < node->NumKeys() + 1) { 1505 if (in == keyIndex && !bytes) { 1506 // it's enough to set bytesBefore once here, because we do 1507 // not need to know the exact length of all keys in this 1508 // loop 1509 bytesBefore = in > skip 1510 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1511 bytes = *_keyLength; 1512 out++; 1513 } else { 1514 if (in < node->NumKeys()) { 1515 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1516 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); 1517 1518 if (bytes) { 1519 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1520 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); 1521 1522 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1523 - bytesBefore - bytes; 1524 } 1525 out++; 1526 } 1527 in++; 1528 } 1529 } 1530 1531 // adjust the byte counts (since we were a bit lazy in the loop) 1532 if (keyIndex < skip) 1533 bytesBefore = node->AllKeyLength() - total; 1534 1535 if (bytesBefore < 0 || bytesAfter < 0) { 1536 if (newAllocated) 1537 free(newKey); 1538 return B_BAD_DATA; 1539 } 1540 1541 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1542 // right link, and overflow link can stay the same 1543 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1544 + bytesAfter); 1545 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1546 1547 // array positions have changed 1548 outKeyLengths = node->KeyLengths(); 1549 outKeyValues = node->Values(); 1550 1551 // move the keys in the old node: the order is important here, 1552 // because we don't want to overwrite any contents 1553 1554 keys = keyIndex <= skip ? out : keyIndex - skip; 1555 keyIndex -= skip; 1556 in = out - keyIndex - 1; 1557 // Note: keyIndex and in will contain invalid values when the new key 1558 // went to the other node. But in this case bytes and bytesAfter are 1559 // 0 and subsequently we never use keyIndex and in. 
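	// From here on the old node is compacted in place: first the key bytes
	// that stay are slid to the front, then the key-length index and the
	// value array are moved to their new positions, and only then is the
	// new key copied in - if it belongs in this half at all.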
1560 1561 if (bytesBefore) 1562 memmove(inKeys, inKeys + total, bytesBefore); 1563 if (bytesAfter) { 1564 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, 1565 bytesAfter); 1566 } 1567 1568 if (bytesBefore) 1569 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1570 if (bytesAfter) { 1571 // if byteAfter is > 0, keyIndex is larger than skip 1572 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, 1573 in * sizeof(uint16)); 1574 } 1575 1576 if (bytesBefore) 1577 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1578 if (bytesAfter) { 1579 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, 1580 in * sizeof(off_t)); 1581 } 1582 1583 if (bytes) { 1584 // finally, copy the newly inserted key (don't overwrite anything) 1585 memcpy(inKeys + bytesBefore, key, bytes); 1586 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1587 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1588 } 1589 1590 // Prepare the key that will be inserted in the parent node which 1591 // is either the dropped key or the last of the other node. 1592 // If it's the dropped key, "newKey" was already set earlier. 1593 1594 if (newKey == NULL) 1595 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1596 1597 memcpy(key, newKey, newLength); 1598 *_keyLength = newLength; 1599 *_value = otherOffset; 1600 1601 if (newAllocated) 1602 free(newKey); 1603 1604 return B_OK; 1605 } 1606 1607 1608 /*! This inserts a key into the tree. The changes made to the tree will 1609 all be part of the \a transaction. 1610 You need to have the inode write locked. 1611 */ 1612 status_t 1613 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength, 1614 off_t value) 1615 { 1616 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1617 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1618 RETURN_ERROR(B_BAD_VALUE); 1619 #ifdef DEBUG 1620 if (value < 0) 1621 panic("tried to insert invalid value %Ld!\n", value); 1622 #endif 1623 1624 ASSERT_WRITE_LOCKED_INODE(fStream); 1625 1626 Stack<node_and_key> stack; 1627 if (_SeekDown(stack, key, keyLength) != B_OK) 1628 RETURN_ERROR(B_ERROR); 1629 1630 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 1631 1632 memcpy(keyBuffer, key, keyLength); 1633 keyBuffer[keyLength] = 0; 1634 1635 node_and_key nodeAndKey; 1636 const bplustree_node* node; 1637 1638 CachedNode cached(this); 1639 while (stack.Pop(&nodeAndKey) 1640 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1641 #ifdef DEBUG 1642 NodeChecker checker(node, fNodeSize, "insert"); 1643 #endif 1644 if (node->IsLeaf()) { 1645 // first round, check for duplicate entries 1646 status_t status = _FindKey(node, key, keyLength, 1647 &nodeAndKey.keyIndex); 1648 1649 // is this a duplicate entry? 1650 if (status == B_OK) { 1651 if (fAllowDuplicates) { 1652 status = _InsertDuplicate(transaction, cached, node, 1653 nodeAndKey.keyIndex, value); 1654 if (status != B_OK) 1655 RETURN_ERROR(status); 1656 return B_OK; 1657 } 1658 1659 return B_NAME_IN_USE; 1660 } 1661 } 1662 1663 bplustree_node* writableNode = cached.MakeWritable(transaction); 1664 if (writableNode == NULL) 1665 return B_IO_ERROR; 1666 1667 // is the node big enough to hold the pair? 
1668 if (int32(key_align(sizeof(bplustree_node) 1669 + writableNode->AllKeyLength() + keyLength) 1670 + (writableNode->NumKeys() + 1) * (sizeof(uint16) 1671 + sizeof(off_t))) < fNodeSize) { 1672 _InsertKey(writableNode, nodeAndKey.keyIndex, 1673 keyBuffer, keyLength, value); 1674 _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, 1675 nodeAndKey.keyIndex, 0, 1); 1676 1677 return B_OK; 1678 } else { 1679 CachedNode cachedNewRoot(this); 1680 CachedNode cachedOther(this); 1681 1682 // do we need to allocate a new root node? if so, then do 1683 // it now 1684 off_t newRoot = BPLUSTREE_NULL; 1685 if (nodeAndKey.nodeOffset == fHeader.RootNode()) { 1686 bplustree_node* root; 1687 status_t status = cachedNewRoot.Allocate(transaction, &root, 1688 &newRoot); 1689 if (status != B_OK) { 1690 // The tree is most likely corrupted! 1691 // But it's still sane at leaf level - we could set 1692 // a flag in the header that forces the tree to be 1693 // rebuild next time... 1694 // But since we will have journaling, that's not a big 1695 // problem anyway. 1696 RETURN_ERROR(status); 1697 } 1698 } 1699 1700 // reserve space for the other node 1701 bplustree_node* other; 1702 off_t otherOffset; 1703 status_t status = cachedOther.Allocate(transaction, &other, 1704 &otherOffset); 1705 if (status != B_OK) { 1706 cachedNewRoot.Free(transaction, newRoot); 1707 RETURN_ERROR(status); 1708 } 1709 1710 if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other, 1711 otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength, 1712 &value) != B_OK) { 1713 // free root node & other node here 1714 cachedOther.Free(transaction, otherOffset); 1715 cachedNewRoot.Free(transaction, newRoot); 1716 1717 RETURN_ERROR(B_ERROR); 1718 } 1719 #ifdef DEBUG 1720 checker.Check("insert split"); 1721 NodeChecker otherChecker(other, fNodeSize, "insert split other"); 1722 #endif 1723 1724 _UpdateIterators(nodeAndKey.nodeOffset, otherOffset, 1725 nodeAndKey.keyIndex, writableNode->NumKeys(), 1); 1726 1727 // update the right link of the node in the left of the new node 1728 if ((other = cachedOther.SetToWritable(transaction, 1729 other->LeftLink())) != NULL) { 1730 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1731 } 1732 1733 // create a new root if necessary 1734 if (newRoot != BPLUSTREE_NULL) { 1735 bplustree_node* root = cachedNewRoot.Node(); 1736 1737 _InsertKey(root, 0, keyBuffer, keyLength, 1738 writableNode->LeftLink()); 1739 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 1740 nodeAndKey.nodeOffset); 1741 1742 CachedNode cached(this); 1743 bplustree_header* header 1744 = cached.SetToWritableHeader(transaction); 1745 if (header == NULL) 1746 return B_IO_ERROR; 1747 1748 // finally, update header to point to the new root 1749 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1750 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32( 1751 header->MaxNumberOfLevels() + 1); 1752 1753 return B_OK; 1754 } 1755 } 1756 } 1757 RETURN_ERROR(B_ERROR); 1758 } 1759 1760 1761 /*! Removes the duplicate index/value pair from the tree. 1762 It's part of the private tree interface. 
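	The value may be stored in a single duplicate fragment or in a chain of
	dedicated duplicate nodes; both cases are handled here, and duplicate
	nodes that become empty are freed again.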
1763 */ 1764 status_t 1765 BPlusTree::_RemoveDuplicate(Transaction& transaction, 1766 const bplustree_node* node, CachedNode& cached, uint16 index, 1767 off_t value) 1768 { 1769 off_t* values = node->Values(); 1770 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1771 1772 CachedNode cachedDuplicate(this); 1773 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1774 bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction, 1775 duplicateOffset, false); 1776 if (duplicate == NULL) 1777 return B_IO_ERROR; 1778 1779 // if it's a duplicate fragment, remove the entry from there 1780 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1781 duplicate_array* array = duplicate->FragmentAt( 1782 bplustree_node::FragmentIndex(oldValue)); 1783 int32 arrayCount = array->Count(); 1784 1785 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) { 1786 FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %" 1787 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1788 (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, 1789 arrayCount, fStream->ID())); 1790 return B_BAD_DATA; 1791 } 1792 if (!array->Remove(value)) { 1793 FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node " 1794 "%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value, 1795 duplicateOffset, fStream->ID())); 1796 return B_ENTRY_NOT_FOUND; 1797 } 1798 1799 // remove the array from the fragment node if it is empty 1800 if (--arrayCount == 1) { 1801 // set the link to the remaining value 1802 if (cached.MakeWritable(transaction) == NULL) 1803 return B_IO_ERROR; 1804 1805 values[index] = array->values[0]; 1806 1807 // Remove the whole fragment node, if this was the only array, 1808 // otherwise free just the array 1809 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1810 status_t status = cachedDuplicate.Free(transaction, 1811 duplicateOffset); 1812 if (status != B_OK) 1813 return status; 1814 } else 1815 array->count = 0; 1816 } 1817 return B_OK; 1818 } 1819 1820 // Remove value from a duplicate node! 
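	// Dedicated duplicate nodes are chained together via their left/right
	// links, and each of them carries a single duplicate_array as payload.
	// The loop below walks that chain until the value has been found and
	// removed.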
1821 1822 duplicate_array* array = NULL; 1823 int32 arrayCount = 0; 1824 1825 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1826 FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF 1827 ", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID())); 1828 return B_BAD_DATA; 1829 } 1830 1831 // Search the duplicate nodes until the entry could be found (and removed) 1832 while (duplicate != NULL) { 1833 array = duplicate->DuplicateArray(); 1834 arrayCount = array->Count(); 1835 1836 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { 1837 FATAL(("_RemoveDuplicate: Invalid array size in duplicate %" 1838 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1839 duplicateOffset, arrayCount, fStream->ID())); 1840 return B_BAD_DATA; 1841 } 1842 1843 if (array->Remove(value)) { 1844 arrayCount--; 1845 break; 1846 } 1847 1848 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1849 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1850 1851 cachedDuplicate.UnsetUnchanged(transaction); 1852 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, 1853 false); 1854 } 1855 if (duplicate == NULL) 1856 RETURN_ERROR(B_IO_ERROR); 1857 1858 // The entry got removed from the duplicate node, but we might want to free 1859 // it now in case it's empty 1860 1861 while (true) { 1862 off_t left = duplicate->LeftLink(); 1863 off_t right = duplicate->RightLink(); 1864 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1865 1866 if ((isLast && arrayCount == 1) || arrayCount == 0) { 1867 // Free empty duplicate page, link their siblings together, and 1868 // update the duplicate link if needed (ie. when we either remove 1869 // the last duplicate node or have a new first one) 1870 1871 if (left == BPLUSTREE_NULL) { 1872 // the duplicate link points to us 1873 if (cached.MakeWritable(transaction) == NULL) 1874 return B_IO_ERROR; 1875 1876 if (arrayCount == 1) { 1877 // This is the last node, and there is only one value left; 1878 // replace the duplicate link with that value, it's no 1879 // duplicate anymore 1880 values[index] = array->values[0]; 1881 } else { 1882 // Move the duplicate link to the next node 1883 values[index] = HOST_ENDIAN_TO_BFS_INT64( 1884 bplustree_node::MakeLink( 1885 BPLUSTREE_DUPLICATE_NODE, right)); 1886 } 1887 } 1888 1889 status_t status = cachedDuplicate.Free(transaction, 1890 duplicateOffset); 1891 if (status != B_OK) 1892 return status; 1893 1894 if (left != BPLUSTREE_NULL 1895 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, 1896 false)) != NULL) { 1897 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1898 1899 // If the next node is the last node, we need to free that node 1900 // and convert the duplicate entry back into a normal entry 1901 array = duplicate->DuplicateArray(); 1902 arrayCount = array->Count(); 1903 if (right == BPLUSTREE_NULL 1904 && duplicate->LeftLink() == BPLUSTREE_NULL 1905 && arrayCount <= NUM_FRAGMENT_VALUES) { 1906 duplicateOffset = left; 1907 continue; 1908 } 1909 } 1910 if (right != BPLUSTREE_NULL 1911 && (duplicate = cachedDuplicate.SetToWritable(transaction, 1912 right, false)) != NULL) { 1913 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1914 1915 // Again, we may need to turn the duplicate entry back into a 1916 // normal entry 1917 array = duplicate->DuplicateArray(); 1918 arrayCount = array->Count(); 1919 if (left == BPLUSTREE_NULL 1920 && duplicate->RightLink() == BPLUSTREE_NULL 1921 && arrayCount <= NUM_FRAGMENT_VALUES) { 1922 duplicateOffset = right; 1923 
continue; 1924 } 1925 } 1926 return B_OK; 1927 } 1928 if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) { 1929 // If the number of entries fits in a duplicate fragment, then 1930 // either find a free fragment node, or convert this node to a 1931 // fragment node. 1932 CachedNode cachedOther(this); 1933 1934 bplustree_node* fragment = NULL; 1935 uint32 fragmentIndex = 0; 1936 off_t offset; 1937 if (_FindFreeDuplicateFragment(transaction, node, cachedOther, 1938 &offset, &fragment, &fragmentIndex) == B_OK) { 1939 // move to other node 1940 duplicate_array* target = fragment->FragmentAt(fragmentIndex); 1941 memcpy(target, array, 1942 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1943 1944 cachedDuplicate.Free(transaction, duplicateOffset); 1945 duplicateOffset = offset; 1946 } else { 1947 // convert node 1948 memmove(duplicate, array, 1949 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1950 memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 1951 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1952 } 1953 1954 if (cached.MakeWritable(transaction) == NULL) 1955 return B_IO_ERROR; 1956 1957 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1958 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 1959 } 1960 return B_OK; 1961 } 1962 } 1963 1964 1965 /*! Removes the key with the given index from the specified node. 1966 Since it has to get the key from the node anyway (to obtain its 1967 pointer), there is no need to pass the key & its length, although 1968 the calling method (BPlusTree::Remove()) has this data. 1969 */ 1970 void 1971 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index) 1972 { 1973 // should never happen, but who knows? 1974 if (index > node->NumKeys() && node->NumKeys() > 0) { 1975 FATAL(("Asked to remove key outside limits: %u, inode %" B_PRIdOFF 1976 "\n", index, fStream->ID())); 1977 return; 1978 } 1979 1980 off_t* values = node->Values(); 1981 1982 // if we would have to drop the overflow link, drop 1983 // the last key instead and update the overflow link 1984 // to the value of that one 1985 if (!node->IsLeaf() && index == node->NumKeys()) 1986 node->overflow_link = values[--index]; 1987 1988 uint16 length; 1989 uint8* key = node->KeyAt(index, &length); 1990 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize 1991 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1992 FATAL(("Key length too long: %s, %u, inode %" B_PRIdOFF "\n", key, length, 1993 fStream->ID())); 1994 fStream->GetVolume()->Panic(); 1995 return; 1996 } 1997 1998 uint16* keyLengths = node->KeyLengths(); 1999 uint8* keys = node->Keys(); 2000 2001 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 2002 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16( 2003 node->AllKeyLength() - length); 2004 2005 off_t* newValues = node->Values(); 2006 uint16* newKeyLengths = node->KeyLengths(); 2007 2008 // move key data 2009 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 2010 2011 // move and update key lengths 2012 if (index > 0 && newKeyLengths != keyLengths) 2013 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 2014 for (uint16 i = index; i < node->NumKeys(); i++) { 2015 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 2016 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 2017 } 2018 2019 // move values 2020 if (index > 0) 2021 memmove(newValues, values, index * sizeof(off_t)); 2022 if (node->NumKeys() > index) { 2023 memmove(newValues + index, values + index + 1, 2024 (node->NumKeys() - index) * sizeof(off_t)); 2025 }
2026 } 2027 2028 2029 /*! Removes the specified key from the tree. The "value" parameter is only used 2030 for trees which allow duplicates, so you may safely ignore it. 2031 It's not an optional parameter, so at least you have to think about it. 2032 You need to have the inode write locked. 2033 */ 2034 status_t 2035 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength, 2036 off_t value) 2037 { 2038 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2039 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 2040 RETURN_ERROR(B_BAD_VALUE); 2041 2042 ASSERT_WRITE_LOCKED_INODE(fStream); 2043 2044 Stack<node_and_key> stack; 2045 if (_SeekDown(stack, key, keyLength) != B_OK) 2046 RETURN_ERROR(B_ERROR); 2047 2048 node_and_key nodeAndKey; 2049 const bplustree_node* node; 2050 2051 CachedNode cached(this); 2052 while (stack.Pop(&nodeAndKey) 2053 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 2054 #ifdef DEBUG 2055 NodeChecker checker(node, fNodeSize, "remove"); 2056 #endif 2057 if (node->IsLeaf()) { 2058 // first round, check for duplicate entries 2059 status_t status = _FindKey(node, key, keyLength, 2060 &nodeAndKey.keyIndex); 2061 if (status != B_OK) 2062 RETURN_ERROR(status); 2063 2064 // Is this a duplicate entry? 2065 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 2066 node->Values()[nodeAndKey.keyIndex]))) { 2067 if (fAllowDuplicates) { 2068 return _RemoveDuplicate(transaction, node, cached, 2069 nodeAndKey.keyIndex, value); 2070 } 2071 2072 FATAL(("duplicate node found where no duplicates are " 2073 "allowed, inode %" B_PRIdOFF "!\n", fStream->ID())); 2074 RETURN_ERROR(B_ERROR); 2075 } else { 2076 if (node->Values()[nodeAndKey.keyIndex] != value) 2077 return B_ENTRY_NOT_FOUND; 2078 2079 // If we remove the last key, the iterators will be moved 2080 // to the node after the current one - if there aren't any 2081 // more nodes, we need a way to prevent the TreeIterators from 2082 // touching the old node again; we use BPLUSTREE_FREE for this 2083 off_t next = node->RightLink() == BPLUSTREE_NULL 2084 ? BPLUSTREE_FREE : node->RightLink(); 2085 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 2086 ?
next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 2087 } 2088 } 2089 2090 bplustree_node* writableNode = cached.MakeWritable(transaction); 2091 if (writableNode == NULL) 2092 return B_IO_ERROR; 2093 2094 // if it's an empty root node, we have to convert it 2095 // to a leaf node by dropping the overflow link, or, 2096 // if it's already a leaf node, just empty it 2097 if (nodeAndKey.nodeOffset == fHeader.RootNode() 2098 && (node->NumKeys() == 0 2099 || (node->NumKeys() == 1 && node->IsLeaf()))) { 2100 writableNode->overflow_link 2101 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2102 writableNode->all_key_count = 0; 2103 writableNode->all_key_length = 0; 2104 2105 // if we've made a leaf node out of the root node, we need 2106 // to reset the maximum number of levels in the header 2107 if (fHeader.MaxNumberOfLevels() != 1) { 2108 CachedNode cached(this); 2109 bplustree_header* header 2110 = cached.SetToWritableHeader(transaction); 2111 if (header == NULL) 2112 return B_IO_ERROR; 2113 2114 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 2115 } 2116 return B_OK; 2117 } 2118 2119 // if there is only one key left, we don't have to remove 2120 // it, we can just dump the node (index nodes still have 2121 // the overflow link, so we have to drop the last key) 2122 if (writableNode->NumKeys() > 1 2123 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { 2124 _RemoveKey(writableNode, nodeAndKey.keyIndex); 2125 return B_OK; 2126 } 2127 2128 // when we are here, we can just free the node, but 2129 // we have to update the right/left link of the 2130 // siblings first 2131 CachedNode otherCached(this); 2132 bplustree_node* other = otherCached.SetToWritable(transaction, 2133 writableNode->LeftLink()); 2134 if (other != NULL) 2135 other->right_link = writableNode->right_link; 2136 2137 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) 2138 != NULL) { 2139 other->left_link = writableNode->left_link; 2140 } 2141 2142 cached.Free(transaction, nodeAndKey.nodeOffset); 2143 } 2144 RETURN_ERROR(B_ERROR); 2145 } 2146 2147 2148 /*! Replaces the value for the key in the tree. 2149 Returns B_OK if the key could be found and its value replaced, 2150 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 2151 to indicate that something went terribly wrong. 2152 Note that this doesn't work with duplicates - it will just 2153 return B_BAD_TYPE if you call this function on a tree where 2154 duplicates are allowed. 2155 You need to have the inode write locked. 
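	A minimal usage sketch (hypothetical names; it assumes a directory-style
	tree that maps file names to inode block numbers):
		const char* name = "shopping-list";
		status_t status = tree->Replace(transaction, (const uint8*)name,
			(uint16)strlen(name), newInodeBlock);
	Here "tree", "transaction", and "newInodeBlock" are illustrative only.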
2156 */ 2157 status_t 2158 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, 2159 off_t value) 2160 { 2161 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2162 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2163 || key == NULL) 2164 RETURN_ERROR(B_BAD_VALUE); 2165 2166 if (fAllowDuplicates) 2167 RETURN_ERROR(B_BAD_TYPE); 2168 2169 ASSERT_WRITE_LOCKED_INODE(fStream); 2170 2171 off_t nodeOffset = fHeader.RootNode(); 2172 CachedNode cached(this); 2173 const bplustree_node* node; 2174 2175 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2176 uint16 keyIndex = 0; 2177 off_t nextOffset; 2178 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2179 &nextOffset); 2180 2181 if (node->OverflowLink() == BPLUSTREE_NULL) { 2182 if (status == B_OK) { 2183 bplustree_node* writableNode = cached.MakeWritable(transaction); 2184 if (writableNode != NULL) { 2185 writableNode->Values()[keyIndex] 2186 = HOST_ENDIAN_TO_BFS_INT64(value); 2187 } else 2188 status = B_IO_ERROR; 2189 } 2190 2191 return status; 2192 } else if (nextOffset == nodeOffset) 2193 RETURN_ERROR(B_ERROR); 2194 2195 nodeOffset = nextOffset; 2196 } 2197 RETURN_ERROR(B_ERROR); 2198 } 2199 2200 2201 /*! Searches the key in the tree, and stores the offset found in _value, 2202 if successful. 2203 It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack 2204 while it descends the tree. 2205 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not. 2206 It can also return other errors to indicate that something went wrong. 2207 Note that this doesn't work with duplicates - it will just return 2208 B_BAD_TYPE if you call this function on a tree where duplicates are 2209 allowed. 2210 You need to have the inode read or write locked. 2211 */ 2212 status_t 2213 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value) 2214 { 2215 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2216 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2217 || key == NULL) 2218 RETURN_ERROR(B_BAD_VALUE); 2219 2220 if (fAllowDuplicates) 2221 RETURN_ERROR(B_BAD_TYPE); 2222 2223 ASSERT_READ_LOCKED_INODE(fStream); 2224 2225 off_t nodeOffset = fHeader.RootNode(); 2226 CachedNode cached(this); 2227 const bplustree_node* node; 2228 2229 #ifdef DEBUG 2230 int32 levels = 0; 2231 #endif 2232 2233 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2234 uint16 keyIndex = 0; 2235 off_t nextOffset; 2236 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2237 &nextOffset); 2238 2239 #ifdef DEBUG 2240 levels++; 2241 #endif 2242 if (node->OverflowLink() == BPLUSTREE_NULL) { 2243 if (status == B_OK && _value != NULL) 2244 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 2245 2246 #ifdef DEBUG 2247 if (levels != (int32)fHeader.MaxNumberOfLevels()) 2248 DEBUGGER(("levels don't match")); 2249 #endif 2250 return status; 2251 } else if (nextOffset == nodeOffset) 2252 RETURN_ERROR(B_ERROR); 2253 2254 nodeOffset = nextOffset; 2255 } 2256 FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %" 2257 B_PRIdOFF "\n", nodeOffset, fStream->ID())); 2258 RETURN_ERROR(B_ERROR); 2259 } 2260 2261 2262 status_t 2263 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset, 2264 const uint8* largestKey, uint16 largestKeyLength, 2265 const bplustree_node* parent) 2266 { 2267 if (parent->CheckIntegrity(fNodeSize) != B_OK) { 2268 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check " 2269 "failed!\n", fStream->ID(), offset); 2270 check.FoundError(); 2271 return B_OK; 2272 } 2273 if (level >= 
fHeader.MaxNumberOfLevels()) { 2274 dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF 2275 "!\n", fStream->ID(), offset); 2276 check.FoundError(); 2277 return B_OK; 2278 } 2279 2280 check.SetLevel(level); 2281 2282 if (check.Visited(offset)) { 2283 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n", 2284 fStream->ID(), offset); 2285 check.FoundError(); 2286 return B_OK; 2287 } 2288 2289 check.SetVisited(offset); 2290 2291 uint32 count = parent->NumKeys(); 2292 off_t* values = parent->Values(); 2293 off_t lastOffset = check.PreviousOffset(level); 2294 CachedNode cached(this); 2295 2296 for (uint32 i = 0; i < count; i++) { 2297 uint16 keyLength; 2298 uint8* key = parent->KeyAt(i, &keyLength); 2299 if (largestKey != NULL) { 2300 int result = compareKeys(fHeader.DataType(), key, keyLength, 2301 largestKey, largestKeyLength); 2302 if (result >= 0) { 2303 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %" 2304 B_PRIu32 " is larger than it should be!\n", 2305 fStream->ID(), offset, i); 2306 check.FoundError(); 2307 } 2308 } 2309 2310 off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]); 2311 if (bplustree_node::IsDuplicate(childOffset)) { 2312 // Walk the duplicate nodes 2313 off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset); 2314 off_t lastDuplicateOffset = BPLUSTREE_NULL; 2315 2316 while (duplicateOffset != BPLUSTREE_NULL) { 2317 const bplustree_node* node; 2318 status_t status = cached.SetTo(duplicateOffset, &node, false); 2319 if (status != B_OK) { 2320 if (status == B_IO_ERROR) 2321 return B_IO_ERROR; 2322 2323 dprintf("inode %" B_PRIdOFF ": duplicate node at %" 2324 B_PRIdOFF " could not be read: %s\n", fStream->ID(), 2325 duplicateOffset, strerror(status)); 2326 check.FoundError(); 2327 break; 2328 } 2329 2330 bool isFragmentNode = bplustree_node::LinkType(childOffset) 2331 == BPLUSTREE_DUPLICATE_FRAGMENT; 2332 bool isKnownFragment = isFragmentNode 2333 && check.VisitedFragment(duplicateOffset); 2334 2335 if (!isKnownFragment && check.Visited(duplicateOffset)) { 2336 dprintf("inode %" B_PRIdOFF ": %s node at %" 2337 B_PRIdOFF " already visited, referenced from %" 2338 B_PRIdOFF "!\n", fStream->ID(), 2339 isFragmentNode ?
"fragment" : "duplicate", 2340 duplicateOffset, offset); 2341 check.FoundError(); 2342 break; 2343 } 2344 2345 // Fragment nodes may be visited more than once from different 2346 // places 2347 if (!check.Visited(duplicateOffset)) 2348 check.SetVisited(duplicateOffset); 2349 if (!isKnownFragment && isFragmentNode) 2350 check.SetVisitedFragment(duplicateOffset); 2351 2352 duplicate_array* array; 2353 int32 minSize; 2354 int32 maxSize; 2355 if (isFragmentNode) { 2356 array = node->FragmentAt( 2357 bplustree_node::FragmentIndex(childOffset)); 2358 minSize = 2; 2359 maxSize = NUM_FRAGMENT_VALUES; 2360 } else { 2361 array = node->DuplicateArray(); 2362 minSize = 1; 2363 maxSize = NUM_DUPLICATE_VALUES; 2364 } 2365 int32 arrayCount = array->Count(); 2366 2367 if (arrayCount < minSize || arrayCount > maxSize) { 2368 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2369 " has invalid array size %" B_PRId32 "!\n", 2370 fStream->ID(), duplicateOffset, arrayCount); 2371 check.FoundError(); 2372 } else { 2373 // Simple check if the values in the array may be valid 2374 for (int32 j = 0; j < arrayCount; j++) { 2375 if (!fStream->GetVolume()->IsValidInodeBlock( 2376 array->ValueAt(j))) { 2377 dprintf("inode %" B_PRIdOFF ": duplicate at %" 2378 B_PRIdOFF " contains invalid block %" B_PRIdOFF 2379 " at %" B_PRId32 "!\n", fStream->ID(), 2380 duplicateOffset, array->ValueAt(j), j); 2381 check.FoundError(); 2382 break; 2383 } 2384 } 2385 } 2386 2387 // A fragment node is not linked (and does not have valid links) 2388 if (isFragmentNode) 2389 break; 2390 2391 if (node->LeftLink() != lastDuplicateOffset) { 2392 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2393 " has wrong left link %" B_PRIdOFF ", expected %" 2394 B_PRIdOFF "!\n", fStream->ID(), duplicateOffset, 2395 node->LeftLink(), lastDuplicateOffset); 2396 check.FoundError(); 2397 } 2398 2399 lastDuplicateOffset = duplicateOffset; 2400 duplicateOffset = node->RightLink(); 2401 } 2402 } else if (!parent->IsLeaf()) { 2403 // Test a regular child node recursively 2404 off_t nextOffset = parent->OverflowLink(); 2405 if (i < count - 1) 2406 nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]); 2407 2408 if (i == 0 && lastOffset != BPLUSTREE_NULL) { 2409 // Test right link of the previous node 2410 const bplustree_node* previous = cached.SetTo(lastOffset, true); 2411 if (previous == NULL) 2412 return B_IO_ERROR; 2413 2414 if (previous->RightLink() != childOffset) { 2415 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2416 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2417 "!\n", fStream->ID(), lastOffset, previous->RightLink(), 2418 childOffset); 2419 check.FoundError(); 2420 } 2421 } 2422 2423 status_t status = _ValidateChild(check, cached, level, childOffset, 2424 lastOffset, nextOffset, key, keyLength); 2425 if (status != B_OK) 2426 return status; 2427 } else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) { 2428 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains " 2429 "invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n", 2430 fStream->ID(), offset, childOffset, i); 2431 check.FoundError(); 2432 } 2433 2434 lastOffset = childOffset; 2435 } 2436 2437 if (parent->OverflowLink() != BPLUSTREE_NULL) { 2438 off_t childOffset = parent->OverflowLink(); 2439 status_t status = _ValidateChild(check, cached, level, childOffset, 2440 lastOffset, 0, NULL, 0); 2441 if (status != B_OK) 2442 return status; 2443 2444 lastOffset = childOffset; 2445 } 2446 2447 check.SetPreviousOffset(level, lastOffset); 2448 
return B_OK; 2449 } 2450 2451 2452 status_t 2453 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level, 2454 off_t offset, off_t lastOffset, off_t nextOffset, 2455 const uint8* key, uint16 keyLength) 2456 { 2457 const bplustree_node* node; 2458 status_t status = cached.SetTo(offset, &node, true); 2459 if (status != B_OK) { 2460 if (status == B_IO_ERROR) 2461 return B_IO_ERROR; 2462 2463 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be " 2464 "read: %s\n", fStream->ID(), offset, strerror(status)); 2465 check.FoundError(); 2466 return B_OK; 2467 } 2468 2469 if (node->LeftLink() != lastOffset) { 2470 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2471 "wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF 2472 "!\n", fStream->ID(), offset, node->LeftLink(), lastOffset); 2473 check.FoundError(); 2474 } 2475 2476 if (nextOffset != 0 && node->RightLink() != nextOffset) { 2477 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2478 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2479 "!\n", fStream->ID(), offset, node->RightLink(), nextOffset); 2480 check.FoundError(); 2481 } 2482 2483 return _ValidateChildren(check, level + 1, offset, key, keyLength, node); 2484 } 2485 2486 2487 // #pragma mark - 2488 2489 2490 TreeIterator::TreeIterator(BPlusTree* tree) 2491 : 2492 fTree(tree), 2493 fCurrentNodeOffset(BPLUSTREE_NULL) 2494 { 2495 tree->_AddIterator(this); 2496 } 2497 2498 2499 TreeIterator::~TreeIterator() 2500 { 2501 if (fTree) 2502 fTree->_RemoveIterator(this); 2503 } 2504 2505 2506 status_t 2507 TreeIterator::Goto(int8 to) 2508 { 2509 if (fTree == NULL || fTree->fStream == NULL) 2510 RETURN_ERROR(B_BAD_VALUE); 2511 2512 // lock access to stream 2513 InodeReadLocker locker(fTree->fStream); 2514 2515 off_t nodeOffset = fTree->fHeader.RootNode(); 2516 CachedNode cached(fTree); 2517 const bplustree_node* node; 2518 2519 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2520 // is the node a leaf node? 2521 if (node->OverflowLink() == BPLUSTREE_NULL) { 2522 fCurrentNodeOffset = nodeOffset; 2523 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 2524 fDuplicateNode = BPLUSTREE_NULL; 2525 2526 return B_OK; 2527 } 2528 2529 // get the next node offset depending on the direction (and if there 2530 // are any keys in that node at all) 2531 off_t nextOffset; 2532 if (to == BPLUSTREE_END || node->all_key_count == 0) 2533 nextOffset = node->OverflowLink(); 2534 else { 2535 if (node->AllKeyLength() > fTree->fNodeSize 2536 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 2537 - 8 * node->NumKeys()) 2538 RETURN_ERROR(B_ERROR); 2539 2540 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 2541 } 2542 if (nextOffset == nodeOffset) 2543 break; 2544 2545 nodeOffset = nextOffset; 2546 } 2547 FATAL(("%s fails\n", __FUNCTION__)); 2548 2549 RETURN_ERROR(B_ERROR); 2550 } 2551 2552 2553 /*! Iterates through the tree in the specified direction. 2554 When it iterates through duplicates, the "key" is only updated for the 2555 first entry - if you need to know when this happens, use the "duplicate" 2556 parameter, which is 0 for no duplicate, 1 for the first, and 2 for all 2557 the other duplicates. 2558 That's not too nice, but it saves the 256 bytes that would be needed to 2559 store the last key - if this ever becomes an issue, it will be 2560 easy to change. 2561 The other advantage of this is that queries can skip all duplicates 2562 at once when they are not relevant to them.
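	A typical forward iteration (an illustrative sketch, not a literal copy
	of any caller) looks like this:
		char buffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
		uint16 length;
		off_t value;
		while (iterator.Traverse(BPLUSTREE_FORWARD, buffer, &length,
				sizeof(buffer), &value, NULL) == B_OK) {
			// use buffer/length and value
		}
	The loop ends with B_ENTRY_NOT_FOUND once the last key has been returned.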
2563 */ 2564 status_t 2565 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength, 2566 uint16 maxLength, off_t* value, uint16* duplicate) 2567 { 2568 if (fTree == NULL) 2569 return B_INTERRUPTED; 2570 2571 bool forward = direction == BPLUSTREE_FORWARD; 2572 2573 if (fCurrentNodeOffset == BPLUSTREE_NULL 2574 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK) 2575 RETURN_ERROR(B_ERROR); 2576 2577 // if the tree was emptied since the last call 2578 if (fCurrentNodeOffset == BPLUSTREE_FREE) 2579 return B_ENTRY_NOT_FOUND; 2580 2581 // lock access to stream 2582 InodeReadLocker locker(fTree->fStream); 2583 2584 CachedNode cached(fTree); 2585 const bplustree_node* node; 2586 2587 if (fDuplicateNode != BPLUSTREE_NULL) { 2588 // regardless of traverse direction the duplicates are always presented 2589 // in the same order; since they are all considered as equal, this 2590 // shouldn't cause any problems 2591 2592 if (!fIsFragment || fDuplicate < fNumDuplicates) { 2593 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2594 false); 2595 } else 2596 node = NULL; 2597 2598 if (node != NULL) { 2599 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 2600 // If the node is out of duplicates, we go directly to the next 2601 // one 2602 fDuplicateNode = node->RightLink(); 2603 if (fDuplicateNode != BPLUSTREE_NULL 2604 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 2605 fNumDuplicates = node->CountDuplicates(fDuplicateNode, 2606 false); 2607 fDuplicate = 0; 2608 } 2609 } 2610 if (fDuplicate < fNumDuplicates) { 2611 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, 2612 fDuplicate++); 2613 if (duplicate) 2614 *duplicate = 2; 2615 return B_OK; 2616 } 2617 } 2618 fDuplicateNode = BPLUSTREE_NULL; 2619 } 2620 2621 off_t savedNodeOffset = fCurrentNodeOffset; 2622 int32 savedKey = fCurrentKey; 2623 2624 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 2625 RETURN_ERROR(B_ERROR); 2626 2627 if (duplicate) 2628 *duplicate = 0; 2629 2630 fCurrentKey += direction; 2631 2632 // is the current key in the current node? 2633 while ((forward && fCurrentKey >= node->NumKeys()) 2634 || (!forward && fCurrentKey < 0)) { 2635 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2636 2637 // are there any more nodes? 2638 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2639 node = cached.SetTo(fCurrentNodeOffset); 2640 if (!node) 2641 RETURN_ERROR(B_ERROR); 2642 2643 // reset current key 2644 fCurrentKey = forward ? 0 : node->NumKeys(); 2645 } else { 2646 // there are no nodes left, so turn back to the last key 2647 fCurrentNodeOffset = savedNodeOffset; 2648 fCurrentKey = savedKey; 2649 2650 return B_ENTRY_NOT_FOUND; 2651 } 2652 } 2653 2654 if (node->all_key_count == 0) 2655 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2656 2657 uint16 length; 2658 uint8* keyStart = node->KeyAt(fCurrentKey, &length); 2659 if (keyStart + length + sizeof(off_t) + sizeof(uint16) 2660 > (uint8*)node + fTree->fNodeSize 2661 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2662 fTree->fStream->GetVolume()->Panic(); 2663 RETURN_ERROR(B_BAD_DATA); 2664 } 2665 2666 length = min_c(length, maxLength); 2667 memcpy(key, keyStart, length); 2668 2669 if (fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE) { 2670 // terminate string type 2671 if (length == maxLength) 2672 length--; 2673 ((char*)key)[length] = '\0'; 2674 } 2675 *keyLength = length; 2676 2677 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2678 2679 // duplicate fragments? 
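	// A duplicate link stored in a value slot packs three pieces of
	// information into a single off_t: the link type
	// (BPLUSTREE_DUPLICATE_FRAGMENT or BPLUSTREE_DUPLICATE_NODE) in its most
	// significant bits, the block offset of the duplicate node, and - for
	// fragments - the index of the fragment within that node in the low bits
	// (which are otherwise unused, since nodes are aligned to the node size).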
uint8 type = bplustree_node::LinkType(offset); 2681 if (type == BPLUSTREE_DUPLICATE_FRAGMENT 2682 || type == BPLUSTREE_DUPLICATE_NODE) { 2683 fDuplicateNode = offset; 2684 2685 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2686 false); 2687 if (node == NULL) 2688 RETURN_ERROR(B_ERROR); 2689 2690 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 2691 2692 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 2693 if (fNumDuplicates) { 2694 offset = node->DuplicateAt(offset, fIsFragment, 0); 2695 fDuplicate = 1; 2696 if (duplicate) 2697 *duplicate = 1; 2698 } else { 2699 // Shouldn't happen, but we're dealing here with potentially 2700 // corrupt disks... 2701 fDuplicateNode = BPLUSTREE_NULL; 2702 offset = 0; 2703 } 2704 } 2705 *value = offset; 2706 2707 return B_OK; 2708 } 2709 2710 2711 /*! This is more or less a copy of BPlusTree::Find() - but it just 2712 sets the current position in the iterator, regardless of whether the 2713 key could be found or not. 2714 */ 2715 status_t 2716 TreeIterator::Find(const uint8* key, uint16 keyLength) 2717 { 2718 if (fTree == NULL) 2719 return B_INTERRUPTED; 2720 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2721 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2722 || key == NULL) 2723 RETURN_ERROR(B_BAD_VALUE); 2724 2725 // lock access to stream 2726 InodeReadLocker locker(fTree->fStream); 2727 2728 off_t nodeOffset = fTree->fHeader.RootNode(); 2729 2730 CachedNode cached(fTree); 2731 const bplustree_node* node; 2732 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2733 uint16 keyIndex = 0; 2734 off_t nextOffset; 2735 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, 2736 &nextOffset); 2737 2738 if (node->OverflowLink() == BPLUSTREE_NULL) { 2739 fCurrentNodeOffset = nodeOffset; 2740 fCurrentKey = keyIndex - 1; 2741 fDuplicateNode = BPLUSTREE_NULL; 2742 2743 return status; 2744 } else if (nextOffset == nodeOffset) 2745 RETURN_ERROR(B_ERROR); 2746 2747 nodeOffset = nextOffset; 2748 } 2749 RETURN_ERROR(B_ERROR); 2750 } 2751 2752 2753 void 2754 TreeIterator::SkipDuplicates() 2755 { 2756 fDuplicateNode = BPLUSTREE_NULL; 2757 } 2758 2759 2760 void 2761 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, 2762 uint16 splitAt, int8 change) 2763 { 2764 if (offset != fCurrentNodeOffset) 2765 return; 2766 2767 if (nextOffset != BPLUSTREE_NULL) { 2768 fCurrentNodeOffset = nextOffset; 2769 if (splitAt <= fCurrentKey) { 2770 fCurrentKey -= splitAt; 2771 keyIndex -= splitAt; 2772 } 2773 } 2774 2775 // Adjust fCurrentKey to point to the same key as before. 2776 // Note that if a key is inserted at the current position, 2777 // it won't be included in this tree transition. 2778 if (keyIndex <= fCurrentKey) 2779 fCurrentKey += change; 2780 2781 // TODO: duplicate handling! 2782 } 2783 2784 2785 void 2786 TreeIterator::Stop() 2787 { 2788 fTree = NULL; 2789 } 2790 2791 2792 #ifdef DEBUG 2793 void 2794 TreeIterator::Dump() 2795 { 2796 __out("TreeIterator at %p:\n", this); 2797 __out("\tfTree = %p\n", fTree); 2798 __out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset); 2799 __out("\tfCurrentKey = %d\n", (int)fCurrentKey); 2800 __out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", 2801 bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, 2802 fDuplicateNode); 2803 __out("\tfDuplicate = %u\n", fDuplicate); 2804 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2805 __out("\tfIsFragment = %s\n", fIsFragment ?
"true" : "false"); 2806 } 2807 #endif 2808 2809 2810 // #pragma mark - 2811 2812 2813 bool 2814 bplustree_header::IsValid() const 2815 { 2816 return Magic() == BPLUSTREE_MAGIC 2817 && (RootNode() % NodeSize()) == 0 2818 && IsValidLink(RootNode()) 2819 && IsValidLink(FreeNode()); 2820 } 2821 2822 2823 // #pragma mark - 2824 2825 2826 void 2827 bplustree_node::Initialize() 2828 { 2829 left_link = right_link = overflow_link 2830 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2831 all_key_count = 0; 2832 all_key_length = 0; 2833 } 2834 2835 2836 uint8* 2837 bplustree_node::KeyAt(int32 index, uint16* keyLength) const 2838 { 2839 if (index < 0 || index > NumKeys()) 2840 return NULL; 2841 2842 uint8* keyStart = Keys(); 2843 uint16* keyLengths = KeyLengths(); 2844 2845 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2846 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2847 if (index > 0) 2848 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2849 2850 return keyStart; 2851 } 2852 2853 2854 uint8 2855 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2856 { 2857 // the duplicate fragment handling is currently hard-coded to a node size 2858 // of 1024 bytes - with future versions of BFS, this may be a problem 2859 2860 if (isFragment) { 2861 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2862 2863 return ((off_t*)this)[fragment]; 2864 } 2865 return OverflowLink(); 2866 } 2867 2868 2869 off_t 2870 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2871 { 2872 uint32 start; 2873 if (isFragment) 2874 start = 8 * ((uint64)offset & 0x3ff); 2875 else 2876 start = 2; 2877 2878 return ((off_t*)this)[start + 1 + index]; 2879 } 2880 2881 2882 /*! Although the name suggests it, this function doesn't return the real 2883 used fragment count; at least, it can only count to two: it returns 2884 0, if there is no fragment used, 1 if there is only one fragment 2885 used, and 2 if there are at least 2 fragments used. 2886 */ 2887 uint32 2888 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2889 { 2890 uint32 used = 0; 2891 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2892 duplicate_array* array = FragmentAt(i); 2893 if (array->Count() > 0 && ++used > 1) 2894 return used; 2895 } 2896 return used; 2897 } 2898 2899 2900 status_t 2901 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2902 { 2903 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2904 DEBUGGER(("invalid node: key/length count")); 2905 2906 for (int32 i = 0; i < NumKeys(); i++) { 2907 uint16 length; 2908 uint8* key = KeyAt(i, &length); 2909 if (key + length + sizeof(off_t) + sizeof(uint16) 2910 > (uint8*)this + nodeSize 2911 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2912 dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i); 2913 return B_BAD_DATA; 2914 } 2915 if (Values()[i] == -1) { 2916 dprintf("invalid node %p, value %d: %lld: values corrupted\n", 2917 this, (int)i, Values()[i]); 2918 return B_BAD_DATA; 2919 } 2920 } 2921 return B_OK; 2922 } 2923 2924 2925 // #pragma mark - 2926 2927 2928 BitmapArray::BitmapArray(size_t numBits) 2929 { 2930 fSize = (numBits + 7) / 8; 2931 fBitmap = (uint8*)calloc(fSize, 1); 2932 fCountSet = 0; 2933 } 2934 2935 2936 BitmapArray::~BitmapArray() 2937 { 2938 free(fBitmap); 2939 } 2940 2941 2942 status_t 2943 BitmapArray::InitCheck() const 2944 { 2945 return fBitmap != NULL ? 
B_OK : B_NO_MEMORY; 2946 } 2947 2948 2949 bool 2950 BitmapArray::IsSet(size_t index) const 2951 { 2952 uint32 byteIndex = index / 8; 2953 if (byteIndex >= fSize) 2954 return false; 2955 2956 return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0; 2957 } 2958 2959 2960 void 2961 BitmapArray::Set(size_t index, bool set) 2962 { 2963 uint32 byteIndex = index / 8; 2964 if (byteIndex >= fSize) 2965 return; 2966 2967 if (set) { 2968 fBitmap[byteIndex] |= 1UL << (index & 0x7); 2969 fCountSet++; 2970 } else { 2971 fBitmap[byteIndex] &= ~(1UL << (index & 0x7)); 2972 fCountSet--; 2973 } 2974 } 2975 2976 2977 // #pragma mark - 2978 2979 2980 bool 2981 duplicate_array::_FindInternal(off_t value, int32& index) const 2982 { 2983 int32 min = 0, max = Count() - 1; 2984 off_t cmp; 2985 while (min <= max) { 2986 index = (min + max) / 2; 2987 2988 cmp = ValueAt(index) - value; 2989 if (cmp < 0) 2990 min = index + 1; 2991 else if (cmp > 0) 2992 max = index - 1; 2993 else 2994 return true; 2995 } 2996 return false; 2997 } 2998 2999 3000 void 3001 duplicate_array::Insert(off_t value) 3002 { 3003 // if there are more than 8 values in this array, use a 3004 // binary search, if not, just iterate linearly to find 3005 // the insertion point 3006 int32 size = Count(); 3007 int32 i; 3008 if (size > 8 ) { 3009 if (!_FindInternal(value, i) && ValueAt(i) <= value) 3010 i++; 3011 } else { 3012 for (i = 0; i < size; i++) { 3013 if (ValueAt(i) > value) 3014 break; 3015 } 3016 } 3017 3018 memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t)); 3019 values[i] = HOST_ENDIAN_TO_BFS_INT64(value); 3020 count = HOST_ENDIAN_TO_BFS_INT64(size + 1); 3021 } 3022 3023 3024 bool 3025 duplicate_array::Remove(off_t value) 3026 { 3027 int32 index = Find(value); 3028 if (index == -1) 3029 return false; 3030 3031 int32 newSize = Count() - 1; 3032 memmove(&values[index], &values[index + 1], 3033 (newSize - index) * sizeof(off_t)); 3034 count = HOST_ENDIAN_TO_BFS_INT64(newSize); 3035 3036 return true; 3037 } 3038 3039 3040 // #pragma mark - 3041 3042 3043 int32 3044 compareKeys(type_code type, const void* key1, int keyLength1, 3045 const void* key2, int keyLength2) 3046 { 3047 // if one of the keys is NULL, bail out gracefully 3048 if (key1 == NULL || key2 == NULL) { 3049 // even if it's most probably a bug in the calling code, we try to 3050 // give a meaningful result 3051 if (key1 == NULL && key2 != NULL) 3052 return 1; 3053 if (key2 == NULL && key1 != NULL) 3054 return -1; 3055 3056 return 0; 3057 } 3058 3059 switch (type) { 3060 case B_STRING_TYPE: 3061 { 3062 int result = memcmp(key1, key2, min_c(keyLength1, keyLength2)); 3063 if (result == 0) { 3064 // ignore trailing null bytes 3065 if ((keyLength1 == keyLength2 + 1 3066 && ((uint8*)key1)[keyLength2] == '\0') 3067 || (keyLength2 == keyLength1 + 1 3068 && ((uint8*)key2)[keyLength1] == '\0')) 3069 return 0; 3070 3071 result = keyLength1 - keyLength2; 3072 } 3073 3074 return result; 3075 } 3076 3077 case B_SSIZE_T_TYPE: 3078 case B_INT32_TYPE: 3079 return *(int32*)key1 - *(int32*)key2; 3080 3081 case B_SIZE_T_TYPE: 3082 case B_UINT32_TYPE: 3083 if (*(uint32*)key1 == *(uint32*)key2) 3084 return 0; 3085 if (*(uint32*)key1 > *(uint32*)key2) 3086 return 1; 3087 3088 return -1; 3089 3090 case B_OFF_T_TYPE: 3091 case B_INT64_TYPE: 3092 if (*(int64*)key1 == *(int64*)key2) 3093 return 0; 3094 if (*(int64*)key1 > *(int64*)key2) 3095 return 1; 3096 3097 return -1; 3098 3099 case B_UINT64_TYPE: 3100 if (*(uint64*)key1 == *(uint64*)key2) 3101 return 0; 3102 if (*(uint64*)key1 > 
*(uint64*)key2) 3103 return 1; 3104 3105 return -1; 3106 3107 case B_FLOAT_TYPE: 3108 { 3109 float result = *(float*)key1 - *(float*)key2; 3110 if (result == 0.0f) 3111 return 0; 3112 3113 return (result < 0.0f) ? -1 : 1; 3114 } 3115 3116 case B_DOUBLE_TYPE: 3117 { 3118 double result = *(double*)key1 - *(double*)key2; 3119 if (result == 0.0) 3120 return 0; 3121 3122 return (result < 0.0) ? -1 : 1; 3123 } 3124 } 3125 3126 // if the type is unknown, the entries don't match... 3127 return -1; 3128 } 3129 3130
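// Illustrative examples of the comparison semantics implemented above (the
// values follow directly from the code): for B_STRING_TYPE,
// compareKeys(B_STRING_TYPE, "foo", 4, "foo", 3) returns 0, since a single
// trailing '\0' is ignored, while compareKeys(B_STRING_TYPE, "bar", 3, "foo",
// 3) returns a negative value. For B_INT32_TYPE the result is the signed
// difference of the two values; the other numeric types return -1, 0, or 1.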