/*
 * Copyright 2001-2012, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 *
 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
 * no code but achieves binary compatibility with the on disk format.
 */


//! B+Tree implementation


#include "Debug.h"
#include "BPlusTree.h"
#include "Utility.h"

#if !_BOOT_MODE
#include "Inode.h"
#else
#include "Stream.h"

// BFS::Stream from the bootloader has the same API as Inode.
#define Inode BFS::Stream

#define strerror(x) "error message unavailable"

namespace BFS {
#endif


/*! Simple array used for the duplicate handling in the B+Tree. This is an
    on disk structure.
*/
struct duplicate_array {
    off_t count;
    off_t values[0];

    inline bool IsEmpty() const
    {
        return count == 0;
    }

    inline int32 Count() const
    {
        return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
    }

    inline off_t ValueAt(uint32 index) const
    {
        return BFS_ENDIAN_TO_HOST_INT64(values[index]);
    }

    inline void SetValueAt(uint32 index, off_t value)
    {
        values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
    }

    inline int32 Find(off_t value) const
    {
        int32 i;
        return _FindInternal(value, i) ? i : -1;
    }

    void Insert(off_t value);
    bool Remove(off_t value);

private:
    bool _FindInternal(off_t value, int32& index) const;
} _PACKED;


#ifdef DEBUG
class NodeChecker {
public:
    NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
        :
        fNode(node),
        fSize(nodeSize),
        fText(text)
    {
        Check("integrity check failed on construction.");
    }

    ~NodeChecker()
    {
        Check("integrity check failed on destruction.");
    }

    void
    Check(const char* message)
    {
        if (fNode->CheckIntegrity(fSize) != B_OK) {
            dprintf("%s: %s\n", fText, message);
            DEBUGGER(("NodeChecker integrity check failed!"));
        }
    }

private:
    const bplustree_node* fNode;
    int32 fSize;
    const char* fText;
};
#endif // DEBUG


#if !_BOOT_MODE
class BitmapArray {
public:
    BitmapArray(size_t numBits);
    ~BitmapArray();

    status_t InitCheck() const;

    bool IsSet(size_t index) const;
    void Set(size_t index, bool set);

    size_t CountSet() const { return fCountSet; }

private:
    uint8* fBitmap;
    size_t fSize;
    size_t fCountSet;
};


struct TreeCheck {
    TreeCheck(BPlusTree* tree)
        :
        fLevelCount(0),
        fFreeCount(0),
        fNodeSize(tree->NodeSize()),
        fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
        fFoundErrors(0),
        fVisited(tree->Stream()->Size() / tree->NodeSize()),
        fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
    {
        fPreviousOffsets = (off_t*)malloc(
            sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
        if (fPreviousOffsets != NULL) {
            for (size_t i = 0; i < fMaxLevels; i++)
                fPreviousOffsets[i] = BPLUSTREE_NULL;
        }
    }

    ~TreeCheck()
    {
        free(fPreviousOffsets);
    }

    status_t InitCheck() const
    {
        if (fPreviousOffsets == NULL)
            return B_NO_MEMORY;

        status_t status = fVisited.InitCheck();
        if (status != B_OK)
            return status;

        return fVisitedFragment.InitCheck();
    }

    bool Visited(off_t offset) const
    {
        return fVisited.IsSet(offset / fNodeSize);
    }

    void SetVisited(off_t offset)
    {
        fVisited.Set(offset / fNodeSize, true);
    }

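    // For reference: both bitmaps are indexed by node position, i.e.
    // offset / fNodeSize. With the default 1024 byte node size, the node
    // starting at stream offset 4096 would map to bit 4, for example.
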
    size_t VisitedCount() const
    {
        return fVisited.CountSet();
    }

    bool VisitedFragment(off_t offset) const
    {
        return fVisitedFragment.IsSet(offset / fNodeSize);
    }

    void SetVisitedFragment(off_t offset)
    {
        fVisitedFragment.Set(offset / fNodeSize, true);
    }

    uint32 MaxLevels() const
    {
        return fLevelCount;
    }

    void SetLevel(uint32 level)
    {
        if (fLevelCount < level)
            fLevelCount = level;
    }

    off_t PreviousOffset(uint32 level)
    {
        return fPreviousOffsets[level];
    }

    void SetPreviousOffset(uint32 level, off_t offset)
    {
        fPreviousOffsets[level] = offset;
    }

    void FoundError()
    {
        fFoundErrors++;
    }

    bool ErrorsFound()
    {
        return fFoundErrors != 0;
    }

private:
    uint32 fLevelCount;
    uint32 fFreeCount;
    uint32 fNodeSize;
    uint32 fMaxLevels;
    uint32 fFoundErrors;
    BitmapArray fVisited;
    BitmapArray fVisitedFragment;
    off_t* fPreviousOffsets;
};


// #pragma mark -


// Node Caching for the BPlusTree class
//
// With write support, there is the need for a function that allocates new
// nodes by either returning empty nodes, or by growing the file's data stream
//
// !! The CachedNode class assumes that you have properly locked the stream
// !! before asking for nodes.
//
// Note: This code will fail if the block size is smaller than the node size!
// Since BFS supports block sizes of 1024 bytes or greater, and the node size
// is hard-coded to 1024 bytes, that's not an issue now.

void
CachedNode::UnsetUnchanged(Transaction& transaction)
{
    if (fTree == NULL || fTree->fStream == NULL)
        return;

    if (fNode != NULL) {
        void* cache = fTree->fStream->GetVolume()->BlockCache();

        block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
        block_cache_put(cache, fBlockNumber);
        fNode = NULL;
    }
}
#endif // !_BOOT_MODE


void
CachedNode::Unset()
{
    if (fTree == NULL || fTree->fStream == NULL)
        return;

    if (fNode != NULL) {
#if !_BOOT_MODE
        if (fWritable && fOffset == 0) {
            // The B+tree header has been updated - we need to update the
            // BPlusTree's copy of it, as well.
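            // (The header always occupies the node at stream offset 0,
            // which is why "fOffset == 0" identifies it here.)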
            memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
        }

        block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
            fBlockNumber);
#endif // !_BOOT_MODE

        fNode = NULL;
    }
}


const bplustree_node*
CachedNode::SetTo(off_t offset, bool check)
{
    const bplustree_node* node;
    if (SetTo(offset, &node, check) == B_OK)
        return node;

    return NULL;
}


status_t
CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
{
    if (fTree == NULL || fTree->fStream == NULL)
        RETURN_ERROR(B_BAD_VALUE);

    Unset();

    // You can only ask for nodes at valid positions - you can't
    // even access the b+tree header with this method (use SetToHeader()
    // instead)
    if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
        || offset <= 0
        || (offset % fTree->fNodeSize) != 0) {
        RETURN_ERROR(B_BAD_VALUE);
    }

    if (InternalSetTo(NULL, offset) != NULL && check) {
        // sanity checks (links, all_key_count)
        if (!fTree->fHeader.CheckNode(fNode)) {
            FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
                B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
                fBlockNumber, fTree->fStream->ID()));
            return B_BAD_DATA;
        }
    }

    *_node = fNode;
    return B_OK;
}


#if !_BOOT_MODE
bplustree_node*
CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
{
    if (fTree == NULL || fTree->fStream == NULL) {
        REPORT_ERROR(B_BAD_VALUE);
        return NULL;
    }

    Unset();

    // You can only ask for nodes at valid positions - you can't
    // even access the b+tree header with this method (use SetToHeader()
    // instead)
    if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
        || offset <= 0
        || (offset % fTree->fNodeSize) != 0)
        return NULL;

    if (InternalSetTo(&transaction, offset) != NULL && check) {
        // sanity checks (links, all_key_count)
        if (!fTree->fHeader.CheckNode(fNode)) {
            FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
                B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
                fBlockNumber, fTree->fStream->ID()));
            return NULL;
        }
    }
    return fNode;
}


bplustree_node*
CachedNode::MakeWritable(Transaction& transaction)
{
    if (fNode == NULL)
        return NULL;

    if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
            fBlockNumber, transaction.ID()) == B_OK) {
        return fNode;
    }

    return NULL;
}
#endif // !_BOOT_MODE


const bplustree_header*
CachedNode::SetToHeader()
{
    if (fTree == NULL || fTree->fStream == NULL) {
        REPORT_ERROR(B_BAD_VALUE);
        return NULL;
    }

    Unset();

    InternalSetTo(NULL, 0LL);
    return (bplustree_header*)fNode;
}


#if !_BOOT_MODE
bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
    if (fTree == NULL || fTree->fStream == NULL) {
        REPORT_ERROR(B_BAD_VALUE);
        return NULL;
    }

    Unset();

    InternalSetTo(&transaction, 0LL);

    if (fNode != NULL && !fTree->fInTransaction) {
        transaction.AddListener(fTree);
        fTree->fInTransaction = true;

        if (!transaction.GetVolume()->IsInitializing()) {
            acquire_vnode(transaction.GetVolume()->FSVolume(),
                fTree->fStream->ID());
        }
    }

    return (bplustree_header*)fNode;
}
#endif // !_BOOT_MODE


bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
    fNode = NULL;
    fOffset = offset;

    off_t fileOffset;
    block_run run;
    if (offset < fTree->fStream->Size()
        && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {

#if !_BOOT_MODE
        Volume* volume = fTree->fStream->GetVolume();
#else
        Volume* volume = &fTree->fStream->GetVolume();
#endif

        int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
        fBlockNumber = volume->ToBlock(run) + blockOffset;
        uint8* block = NULL;

#if !_BOOT_MODE
        if (transaction != NULL) {
            block = (uint8*)block_cache_get_writable(volume->BlockCache(),
                fBlockNumber, transaction->ID());
            fWritable = true;
        } else {
            block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
            fWritable = false;
        }
#else // !_BOOT_MODE
        if (fBlock == NULL) {
            fBlock = (uint8*)malloc(volume->BlockSize());
            if (fBlock == NULL)
                return NULL;
        }

        if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
                fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
            block = fBlock;
        }

        fWritable = false;
#endif // _BOOT_MODE

        if (block != NULL) {
            // The node is somewhere in that block...
            // (confusing offset calculation)
            fNode = (bplustree_node*)(block + offset
                - (fileOffset + (blockOffset << volume->BlockShift())));
        } else
            REPORT_ERROR(B_IO_ERROR);
    }
    return fNode;
}


#if !_BOOT_MODE
status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
    if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
        RETURN_ERROR(B_BAD_VALUE);

    // TODO: scan the free nodes list and remove all nodes at the end
    // of the tree - perhaps that shouldn't be done every time this
    // function is called, perhaps it should be done when the directory
    // inode is closed or based on some calculation or whatever...

    CachedNode cached(fTree);
    bplustree_header* header = cached.SetToWritableHeader(transaction);
    if (header == NULL)
        return B_IO_ERROR;

#if 0
    // TODO: temporarily disabled because CheckNode() doesn't like this...
    // Also, it's such an edge case that it's almost useless, anyway.
    // if the node is the last one in the tree, we shrink
    // the tree and file size by one node
    off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
    if (offset == lastOffset) {
        status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
        if (status != B_OK)
            return status;

        header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
        return B_OK;
    }
#endif

    // add the node to the free nodes list
    fNode->left_link = header->free_node_pointer;
    fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

    header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
    return B_OK;
}


status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
    off_t* _offset)
{
    if (fTree == NULL || fTree->fStream == NULL)
        RETURN_ERROR(B_BAD_VALUE);

    Unset();

    bplustree_header* header;
    status_t status;

    // if there are any free nodes, recycle them
    if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
        CachedNode cached(fTree);
        header = cached.SetToWritableHeader(transaction);
        if (header == NULL)
            return B_IO_ERROR;

        *_offset = header->FreeNode();
        *_node = fNode;

        // set new free node pointer
        header->free_node_pointer = fNode->left_link;
        fNode->Initialize();
        return B_OK;
    }

    // allocate space for a new node
    Inode* stream = fTree->fStream;
    if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
        return status;

    CachedNode cached(fTree);
    header = cached.SetToWritableHeader(transaction);
    if (header == NULL)
        return B_IO_ERROR;

    // the maximum_size has to be changed before the call to SetTo() - or
    // else it will fail because the requested node is out of bounds
    off_t offset = header->MaximumSize();
    header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
        + fTree->fNodeSize);

    cached.Unset();
        // SetToWritable() below needs the new values in the tree's header

    if (SetToWritable(transaction, offset, false) == NULL)
        RETURN_ERROR(B_ERROR);

    fNode->Initialize();

    *_offset = offset;
    *_node = fNode;
    return B_OK;
}


// #pragma mark -


BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
    :
    fStream(NULL),
    fInTransaction(false)
{
    mutex_init(&fIteratorLock, "bfs b+tree iterator");
    SetTo(transaction, stream);
}
#endif // !_BOOT_MODE


BPlusTree::BPlusTree(Inode* stream)
    :
    fStream(NULL),
    fInTransaction(false)
{
#if !_BOOT_MODE
    mutex_init(&fIteratorLock, "bfs b+tree iterator");
#endif

    SetTo(stream);
}


BPlusTree::BPlusTree()
    :
    fStream(NULL),
    fNodeSize(BPLUSTREE_NODE_SIZE),
    fAllowDuplicates(true),
    fInTransaction(false),
    fStatus(B_NO_INIT)
{
#if !_BOOT_MODE
    mutex_init(&fIteratorLock, "bfs b+tree iterator");
#endif
}


BPlusTree::~BPlusTree()
{
#if !_BOOT_MODE
    // if there are any TreeIterators left, we need to stop them
    // (can happen when the tree's inode gets deleted while
    // traversing the tree - a TreeIterator doesn't lock the inode)
    mutex_lock(&fIteratorLock);

    SinglyLinkedList<TreeIterator>::Iterator iterator
        = fIterators.GetIterator();
    while (iterator.HasNext())
        iterator.Next()->Stop();

    mutex_destroy(&fIteratorLock);

    ASSERT(!fInTransaction);
#endif // !_BOOT_MODE
}


#if !_BOOT_MODE
/*! Create a new B+Tree on the specified stream */
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
    // initializes in-memory B+Tree

    fStream = stream;

    CachedNode cached(this);
    bplustree_header* header = cached.SetToWritableHeader(transaction);
    if (header == NULL) {
        // allocate space for new header + node!
        fStatus = stream->SetFileSize(transaction, nodeSize * 2);
        if (fStatus != B_OK)
            RETURN_ERROR(fStatus);

        header = cached.SetToWritableHeader(transaction);
        if (header == NULL)
            RETURN_ERROR(fStatus = B_ERROR);
    }

    fAllowDuplicates = stream->IsIndex()
        || (stream->Mode() & S_ALLOW_DUPS) != 0;

    fNodeSize = nodeSize;

    // initialize b+tree header
    header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
    header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
    header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
    header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
    header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
    header->free_node_pointer
        = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
    header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

    cached.Unset();

    // initialize b+tree root node
    cached.SetToWritable(transaction, fHeader.RootNode(), false);
    if (cached.Node() == NULL)
        RETURN_ERROR(B_IO_ERROR);

    cached.Node()->Initialize();

    return fStatus = B_OK;
}
#endif // !_BOOT_MODE


status_t
BPlusTree::SetTo(Inode* stream)
{
    if (stream == NULL)
        RETURN_ERROR(fStatus = B_BAD_VALUE);

    fStream = stream;

    // get on-disk B+Tree header

    CachedNode cached(this);
    const bplustree_header* header = cached.SetToHeader();
    if (header != NULL)
        memcpy(&fHeader, header, sizeof(bplustree_header));
    else
        RETURN_ERROR(fStatus = B_IO_ERROR);

    // is header valid?

    if (fHeader.MaximumSize() != stream->Size()) {
        dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
            B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
        // we can't change the header since we don't have a transaction
        //fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
    }
    if (!fHeader.IsValid()) {
#ifdef DEBUG
        dump_bplustree_header(&fHeader);
        dump_block((const char*)&fHeader, 128);
#endif
        RETURN_ERROR(fStatus = B_BAD_DATA);
    }

    fNodeSize = fHeader.NodeSize();

    // validity check
    static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
        S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
        S_DOUBLE_INDEX};
    uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
        | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
        | S_FLOAT_INDEX | S_DOUBLE_INDEX);

    if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
        || ((stream->Mode() & S_INDEX_DIR) != 0
            && kToMode[fHeader.DataType()] != mode)
        || !stream->IsContainer()) {
        D(  dump_bplustree_header(&fHeader);
            dump_inode(&stream->Node());
        );
        RETURN_ERROR(fStatus = B_BAD_TYPE);
    }

    // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
    // in the original BFS code - we will honour it nevertheless
    fAllowDuplicates = stream->IsIndex()
        || (stream->Mode() & S_ALLOW_DUPS) != 0;

    cached.SetTo(fHeader.RootNode());
    RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


status_t
BPlusTree::InitCheck()
{
    return fStatus;
}


#if !_BOOT_MODE
status_t
BPlusTree::Validate(bool repair, bool& _errorsFound)
{
    TreeCheck check(this);
    if (check.InitCheck() != B_OK)
        return B_NO_MEMORY;

    check.SetVisited(0);

    // Walk the free nodes

    CachedNode cached(this);
    off_t freeOffset = fHeader.FreeNode();
    while (freeOffset > 0) {
        const bplustree_node* node;
        status_t status = cached.SetTo(freeOffset, &node, false);
        if (status != B_OK) {
            if (status == B_IO_ERROR)
                return B_IO_ERROR;

            dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
                "not be read: %s\n", fStream->ID(), freeOffset,
                strerror(status));
            check.FoundError();
            break;
        }

        if (check.Visited(freeOffset)) {
            dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
                " circular!\n", fStream->ID(), freeOffset);
            // TODO: if 'repair' is true, we could collect all unvisited nodes
            // at the end, and put them into the free list
            check.FoundError();
            break;
        }

        check.SetVisited(freeOffset);

        if (node->OverflowLink() != BPLUSTREE_FREE) {
            dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
                " misses free mark!\n", fStream->ID(), freeOffset);
        }
        freeOffset = node->LeftLink();
    }

    // Iterate over the complete tree recursively

    const bplustree_node* root;
    status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
    if (status != B_OK)
        return status;

    status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);

    if (check.ErrorsFound())
        _errorsFound = true;

    if (status != B_OK)
        return status;

    if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
        dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
            "declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
            fHeader.MaxNumberOfLevels());
    }

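    // Every node in the stream should have been reached either by the tree
    // walk above or via the free list; for example, a 10240 byte stream with
    // 1024 byte nodes has 10 nodes to account for, including the header node
    // at offset 0.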
    if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
        dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
            " nodes.\n", fStream->ID(), check.VisitedCount(),
            fHeader.MaximumSize() / fNodeSize);
    }

    return B_OK;
}


status_t
BPlusTree::MakeEmpty()
{
    // Put all nodes into the free list in order
    Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());

    // Reset the header, and root node
    CachedNode cached(this);
    bplustree_header* header = cached.SetToWritableHeader(transaction);
    if (header == NULL)
        return B_IO_ERROR;

    header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
    header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
    if (fStream->Size() > (off_t)NodeSize() * 2)
        header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
    else {
        header->free_node_pointer
            = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
    }

    bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
    if (node == NULL)
        return B_IO_ERROR;

    node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
    node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
    node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
    node->all_key_count = 0;
    node->all_key_length = 0;

    for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
            offset += NodeSize()) {
        bplustree_node* node = cached.SetToWritable(transaction, offset, false);
        if (node == NULL) {
            dprintf("--> could not open %" B_PRIdOFF "\n", offset);
            return B_IO_ERROR;
        }
        if (offset < fStream->Size() - (off_t)NodeSize())
            node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
        else
            node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

        node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

        // It's not important to write it out in a single transaction; just
        // make sure it doesn't get too large
        if (offset % (1024 * 1024) == 0) {
            transaction.Done();
            transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
        }
    }

    return transaction.Done();
}


int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
    switch (code) {
        case B_STRING_TYPE:
            return BPLUSTREE_STRING_TYPE;
        case B_SSIZE_T_TYPE:
        case B_INT32_TYPE:
            return BPLUSTREE_INT32_TYPE;
        case B_SIZE_T_TYPE:
        case B_UINT32_TYPE:
            return BPLUSTREE_UINT32_TYPE;
        case B_OFF_T_TYPE:
        case B_INT64_TYPE:
            return BPLUSTREE_INT64_TYPE;
        case B_UINT64_TYPE:
            return BPLUSTREE_UINT64_TYPE;
        case B_FLOAT_TYPE:
            return BPLUSTREE_FLOAT_TYPE;
        case B_DOUBLE_TYPE:
            return BPLUSTREE_DOUBLE_TYPE;
    }
    return -1;
}


int32
BPlusTree::ModeToKeyType(mode_t mode)
{
    switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
            | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
        case S_INT_INDEX:
            return BPLUSTREE_INT32_TYPE;
        case S_UINT_INDEX:
            return BPLUSTREE_UINT32_TYPE;
        case S_LONG_LONG_INDEX:
            return BPLUSTREE_INT64_TYPE;
        case S_ULONG_LONG_INDEX:
            return BPLUSTREE_UINT64_TYPE;
        case S_FLOAT_INDEX:
            return BPLUSTREE_FLOAT_TYPE;
        case S_DOUBLE_INDEX:
            return BPLUSTREE_DOUBLE_TYPE;
        case S_STR_INDEX:
        default:
            // default is for standard directories
            return BPLUSTREE_STRING_TYPE;
    }
}
#endif // !_BOOT_MODE


// #pragma mark - TransactionListener implementation


#if !_BOOT_MODE
void
BPlusTree::TransactionDone(bool success)
{
    if (!success) {
        // update header from disk
        CachedNode cached(this);
        const bplustree_header* header = cached.SetToHeader();
        if (header != NULL)
            memcpy(&fHeader, header, sizeof(bplustree_header));
    }
}


void
BPlusTree::RemovedFromTransaction()
{
    fInTransaction = false;

    if (!fStream->GetVolume()->IsInitializing())
        put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
}


// #pragma mark -


void
BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
    uint16 splitAt, int8 change)
{
    // Although every iterator which is affected by this update currently
    // waits on a semaphore, other iterators could be added/removed at
    // any time, so we need to protect this loop
    MutexLocker _(fIteratorLock);

    SinglyLinkedList<TreeIterator>::Iterator iterator
        = fIterators.GetIterator();
    while (iterator.HasNext())
        iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
}


void
BPlusTree::_AddIterator(TreeIterator* iterator)
{
    MutexLocker _(fIteratorLock);
    fIterators.Add(iterator);
}


void
BPlusTree::_RemoveIterator(TreeIterator* iterator)
{
    MutexLocker _(fIteratorLock);
    fIterators.Remove(iterator);
}
#endif // !_BOOT_MODE


int32
BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
    int keyLength2)
{
    type_code type = 0;
    switch (fHeader.DataType()) {
        case BPLUSTREE_STRING_TYPE:
            type = B_STRING_TYPE;
            break;
        case BPLUSTREE_INT32_TYPE:
            type = B_INT32_TYPE;
            break;
        case BPLUSTREE_UINT32_TYPE:
            type = B_UINT32_TYPE;
            break;
        case BPLUSTREE_INT64_TYPE:
            type = B_INT64_TYPE;
            break;
        case BPLUSTREE_UINT64_TYPE:
            type = B_UINT64_TYPE;
            break;
        case BPLUSTREE_FLOAT_TYPE:
            type = B_FLOAT_TYPE;
            break;
        case BPLUSTREE_DOUBLE_TYPE:
            type = B_DOUBLE_TYPE;
            break;
    }
    return compareKeys(type, key1, keyLength1, key2, keyLength2);
}


status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
    uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
    NodeChecker checker(node, fNodeSize, "find");
#endif

    if (node->all_key_count == 0) {
        if (_index)
            *_index = 0;
        if (_next)
            *_next = node->OverflowLink();
        return B_ENTRY_NOT_FOUND;
    }

    off_t* values = node->Values();
    int16 saveIndex = -1;

    // binary search in the key array
    for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
        uint16 i = (first + last) >> 1;

        uint16 searchLength;
        uint8* searchKey = node->KeyAt(i, &searchLength);
        if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
                > (uint8*)node + fNodeSize
            || searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
#if !_BOOT_MODE
            fStream->GetVolume()->Panic();
#endif
            RETURN_ERROR(B_BAD_DATA);
        }

        int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
        if (cmp < 0) {
            last = i - 1;
            saveIndex = i;
        } else if (cmp > 0) {
            saveIndex = first = i + 1;
        } else {
            if (_index)
                *_index = i;
            if (_next)
                *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
            return B_OK;
        }
    }

    if (_index)
        *_index = saveIndex;
    if (_next) {
        if (saveIndex == node->NumKeys())
            *_next = node->OverflowLink();
        else
            *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
    }
    return B_ENTRY_NOT_FOUND;
}


#if !_BOOT_MODE
/*! Prepares the stack to contain all nodes that were passed while
    following the key, from the root node to the leaf node that could
    or should contain that key.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
    uint16 keyLength)
{
    // set the root node to begin with
    node_and_key nodeAndKey;
    nodeAndKey.nodeOffset = fHeader.RootNode();

    CachedNode cached(this);
    const bplustree_node* node;
    while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
        // if we are already on leaf level, we're done
        if (node->OverflowLink() == BPLUSTREE_NULL) {
            // note that the keyIndex is not properly set here (but it's not
            // needed in the calling functions anyway)!
            nodeAndKey.keyIndex = 0;
            stack.Push(nodeAndKey);
            return B_OK;
        }

        off_t nextOffset;
        status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
            &nextOffset);

        if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
            RETURN_ERROR(B_ERROR);

        if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
            dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
            break;
        }

        // put the node offset & the correct keyIndex on the stack
        stack.Push(nodeAndKey);

        nodeAndKey.nodeOffset = nextOffset;
    }

    FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
        B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
    return B_ERROR;
}


/*! This will find a free duplicate fragment in the given bplustree_node.
    The CachedNode will be set to the writable fragment on success.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
    const bplustree_node* node, CachedNode& cached,
    off_t* _offset, bplustree_node** _fragment, uint32* _index)
{
    off_t* values = node->Values();
    for (int32 i = 0; i < node->NumKeys(); i++) {
        off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

        // does the value link to a duplicate fragment?
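        // (For index trees, a value is either a plain inode offset or a
        // "duplicate" link; the link type - and, for fragments, the fragment
        // index - is encoded into the value via bplustree_node::MakeLink()
        // and decoded again by LinkType(), FragmentOffset() and
        // FragmentIndex().)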
        if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
            continue;

        const bplustree_node* fragment = cached.SetTo(
            bplustree_node::FragmentOffset(value), false);
        if (fragment == NULL) {
            FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
                B_PRIdOFF "\n", value, fStream->ID()));
            continue;
        }

        // see if there is some space left for us
        uint32 num = bplustree_node::MaxFragments(fNodeSize);
        for (uint32 j = 0; j < num; j++) {
            duplicate_array* array = fragment->FragmentAt(j);

            if (array->IsEmpty()) {
                // found an unused fragment
                *_fragment = cached.MakeWritable(transaction);
                if (*_fragment == NULL)
                    return B_IO_ERROR;

                *_offset = bplustree_node::FragmentOffset(value);
                *_index = j;
                return B_OK;
            }
        }
    }
    return B_ENTRY_NOT_FOUND;
}


status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
    const bplustree_node* node, uint16 index, off_t value)
{
    CachedNode cachedDuplicate(this);
    off_t* values = node->Values();
    off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
    status_t status;
    off_t offset;

    if (bplustree_node::IsDuplicate(oldValue)) {
        // If it's a duplicate fragment, try to insert it into that, or if it
        // doesn't fit anymore, create a new duplicate node

        if (bplustree_node::LinkType(oldValue)
                == BPLUSTREE_DUPLICATE_FRAGMENT) {
            bplustree_node* duplicate = cachedDuplicate.SetToWritable(
                transaction, bplustree_node::FragmentOffset(oldValue), false);
            if (duplicate == NULL)
                return B_IO_ERROR;

            duplicate_array* array = duplicate->FragmentAt(
                bplustree_node::FragmentIndex(oldValue));
            int32 arrayCount = array->Count();
            if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
                FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
                    "%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
                    (int)bplustree_node::FragmentIndex(oldValue),
                    bplustree_node::FragmentOffset(oldValue), arrayCount,
                    fStream->ID()));
                return B_BAD_DATA;
            }

            if (arrayCount < NUM_FRAGMENT_VALUES) {
                array->Insert(value);
            } else {
                // Test if the fragment will be empty if we remove this key's
                // values
                if (duplicate->FragmentsUsed(fNodeSize) < 2) {
                    // The node will be empty without our values, so let us
                    // reuse it as a duplicate node
                    offset = bplustree_node::FragmentOffset(oldValue);

                    memmove(duplicate->DuplicateArray(), array,
                        (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
                    duplicate->left_link = duplicate->right_link
                        = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

                    array = duplicate->DuplicateArray();
                    array->Insert(value);
                } else {
                    // Create a new duplicate node

                    cachedDuplicate.UnsetUnchanged(transaction);
                        // The old duplicate has not been touched, so we can
                        // reuse it

                    bplustree_node* newDuplicate;
                    status = cachedDuplicate.Allocate(transaction,
                        &newDuplicate, &offset);
                    if (status != B_OK)
                        RETURN_ERROR(status);

                    // Copy the array from the fragment node to the duplicate
                    // node and free the old entry (by zero'ing all values)
                    newDuplicate->overflow_link = array->count;
                    memcpy(&newDuplicate->all_key_count, &array->values[0],
                        array->Count() * sizeof(off_t));
                    memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

                    array = newDuplicate->DuplicateArray();
                    array->Insert(value);
                }

                // Update the main pointer to link to a duplicate node
                if (cached.MakeWritable(transaction) == NULL)
                    return B_IO_ERROR;

                values[index]
                    = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
                        BPLUSTREE_DUPLICATE_NODE, offset));
            }

            return B_OK;
        }

        // Put the value into a dedicated duplicate node

        // search for free space in the duplicate nodes of that key
        duplicate_array* array;
        int32 arrayCount;
        const bplustree_node* duplicate;
        off_t duplicateOffset;
        do {
            duplicateOffset = bplustree_node::FragmentOffset(oldValue);
            duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
            if (duplicate == NULL)
                return B_IO_ERROR;

            array = duplicate->DuplicateArray();
            arrayCount = array->Count();
            if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
                FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
                    B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
                    duplicateOffset, arrayCount, fStream->ID()));
                return B_BAD_DATA;
            }
        } while (arrayCount >= NUM_DUPLICATE_VALUES
            && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

        bplustree_node* writableDuplicate
            = cachedDuplicate.MakeWritable(transaction);
        if (writableDuplicate == NULL)
            return B_IO_ERROR;

        if (arrayCount < NUM_DUPLICATE_VALUES) {
            array = writableDuplicate->DuplicateArray();
            array->Insert(value);
        } else {
            // no space left - add a new duplicate node

            bplustree_node* newDuplicate;
            status = cachedDuplicate.Allocate(transaction, &newDuplicate,
                &offset);
            if (status != B_OK)
                RETURN_ERROR(status);

            // link the two nodes together
            writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
            newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

            array = newDuplicate->DuplicateArray();
            array->count = 0;
            array->Insert(value);
        }
        return B_OK;
    }

    // Search for a free duplicate fragment or create a new one
    // to insert the duplicate value into

    uint32 fragmentIndex = 0;
    bplustree_node* fragment;
    if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
            &offset, &fragment, &fragmentIndex) != B_OK) {
        // allocate a new duplicate fragment node
        status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
        if (status != B_OK)
            RETURN_ERROR(status);

        memset(fragment, 0, fNodeSize);
    }

    duplicate_array* array = fragment->FragmentAt(fragmentIndex);
    array->Insert(oldValue);
    array->Insert(value);

    if (cached.MakeWritable(transaction) == NULL)
        return B_IO_ERROR;

    values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
        BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

    return B_OK;
}


void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
    uint16 keyLength, off_t value)
{
    // should never happen, but who knows?
    if (index > node->NumKeys())
        return;

    off_t* values = node->Values();
    uint16* keyLengths = node->KeyLengths();
    uint8* keys = node->Keys();

    node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
    node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
        + keyLength);

    off_t* newValues = node->Values();
    uint16* newKeyLengths = node->KeyLengths();

    // move values and copy new value into them
    memmove(newValues + index + 1, values + index,
        sizeof(off_t) * (node->NumKeys() - 1 - index));
    memmove(newValues, values, sizeof(off_t) * index);

    newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

    // move and update key length index
    for (uint16 i = node->NumKeys(); i-- > index + 1;) {
        newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
            BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
    }
    memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

    int32 keyStart;
    newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
        + (keyStart = index > 0
            ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

    // move keys and copy new key into them
    uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
    int32 size = node->AllKeyLength() - length;
    if (size > 0)
        memmove(keys + length, keys + length - keyLength, size);

    memcpy(keys + keyStart, key, keyLength);
}


/*! Splits the \a node into two halves - the other half will be put into
    \a other. It also takes care to create a new overflow link if the node
    to split is an index node.
*/
status_t
BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
    bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
    uint16* _keyLength, off_t* _value)
{
    if (*_keyIndex > node->NumKeys() + 1)
        return B_BAD_VALUE;

    uint16* inKeyLengths = node->KeyLengths();
    off_t* inKeyValues = node->Values();
    uint8* inKeys = node->Keys();
    uint8* outKeys = other->Keys();
    int32 keyIndex = *_keyIndex;    // can become less than zero!

    if (keyIndex > node->NumKeys()) {
        FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
            "\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
        return B_BAD_VALUE;
    }

    // how many keys will fit in one (half) page?
    // that loop will find the answer to this question and
    // change the key lengths indices for their new home

    // "bytes" is the number of bytes written for the new key,
    // "bytesBefore" are the bytes before that key
    // "bytesAfter" are the bytes after the new key, if any
    int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

    size_t size = fNodeSize >> 1;
    int32 out, in;
    for (in = out = 0; in < node->NumKeys() + 1;) {
        if (!bytes) {
            bytesBefore = in > 0
                ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
        }

        if (in == keyIndex && !bytes) {
            bytes = *_keyLength;
        } else {
            if (keyIndex < out) {
                bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
                    - bytesBefore;
            }

            in++;
        }
        out++;

        if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
                + out * (sizeof(uint16) + sizeof(off_t)) >= size) {
            // we have found the number of keys in the new node!
            break;
        }
    }

    // if the new key was not inserted, set the length of the keys
    // that can be copied directly
    if (keyIndex >= out && in > 0)
        bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

    if (bytesBefore < 0 || bytesAfter < 0)
        return B_BAD_DATA;

    other->left_link = node->left_link;
    other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
    other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
        + bytesAfter);
    other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

    uint16* outKeyLengths = other->KeyLengths();
    off_t* outKeyValues = other->Values();
    int32 keys = out > keyIndex ? keyIndex : out;

    if (bytesBefore) {
        // copy the keys
        memcpy(outKeys, inKeys, bytesBefore);
        memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
        memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
    }
    if (bytes) {
        // copy the newly inserted key
        memcpy(outKeys + bytesBefore, key, bytes);
        outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
        outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

        if (bytesAfter) {
            // copy the keys after the new key
            memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
                bytesAfter);
            keys = out - keyIndex - 1;
            for (int32 i = 0; i < keys; i++) {
                outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
                    BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
                    + bytes);
            }
            memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
                keys * sizeof(off_t));
        }
    }

    // if the new key was already inserted, we shouldn't use it again
    if (in != out)
        keyIndex--;

    int32 total = bytesBefore + bytesAfter;

    // these variables are for the key that will be returned
    // to the parent node
    uint8* newKey = NULL;
    uint16 newLength;
    bool newAllocated = false;

    // If we have split an index node, we have to drop the first key
    // of the next node (which can also be the new key to insert).
    // The dropped key is also the one which has to be inserted in
    // the parent node, so we will set the "newKey" already here.
    if (node->OverflowLink() != BPLUSTREE_NULL) {
        if (in == keyIndex) {
            newKey = key;
            newLength = *_keyLength;

            other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
            keyIndex--;
        } else {
            // If a key is dropped (is not the new key), we have to copy
            // it, because it would be lost if not.
            uint8* droppedKey = node->KeyAt(in, &newLength);
            if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
                    > (uint8*)node + fNodeSize
                || newLength > BPLUSTREE_MAX_KEY_LENGTH) {
                fStream->GetVolume()->Panic();
                RETURN_ERROR(B_BAD_DATA);
            }
            newKey = (uint8*)malloc(newLength);
            if (newKey == NULL)
                return B_NO_MEMORY;

            newAllocated = true;
            memcpy(newKey, droppedKey, newLength);

            other->overflow_link = inKeyValues[in];
            total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
        }
    }

    // and now the same game for the other page and the rest of the keys
    // (but with memmove() instead of memcpy(), because they may overlap)

    bytesBefore = bytesAfter = bytes = 0;
    out = 0;
    int32 skip = in;
    while (in < node->NumKeys() + 1) {
        if (in == keyIndex && !bytes) {
            // it's enough to set bytesBefore once here, because we do
            // not need to know the exact length of all keys in this
            // loop
            bytesBefore = in > skip
                ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
            bytes = *_keyLength;
            out++;
        } else {
            if (in < node->NumKeys()) {
                inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
                    BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

                if (bytes) {
                    inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
                        BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

                    bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
                        - bytesBefore - bytes;
                }
                out++;
            }
            in++;
        }
    }

    // adjust the byte counts (since we were a bit lazy in the loop)
    if (keyIndex < skip)
        bytesBefore = node->AllKeyLength() - total;

    if (bytesBefore < 0 || bytesAfter < 0) {
        if (newAllocated)
            free(newKey);
        return B_BAD_DATA;
    }

    node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
        // right link, and overflow link can stay the same
    node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
        + bytesAfter);
    node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

    // array positions have changed
    outKeyLengths = node->KeyLengths();
    outKeyValues = node->Values();

    // move the keys in the old node: the order is important here,
    // because we don't want to overwrite any contents

    keys = keyIndex <= skip ? out : keyIndex - skip;
    keyIndex -= skip;
    in = out - keyIndex - 1;
        // Note: keyIndex and in will contain invalid values when the new key
        // went to the other node. But in this case bytes and bytesAfter are
        // 0 and subsequently we never use keyIndex and in.

    if (bytesBefore)
        memmove(inKeys, inKeys + total, bytesBefore);
    if (bytesAfter) {
        memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
            bytesAfter);
    }

    if (bytesBefore)
        memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
    if (bytesAfter) {
        // if bytesAfter is > 0, keyIndex is larger than skip
        memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
            in * sizeof(uint16));
    }

    if (bytesBefore)
        memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
    if (bytesAfter) {
        memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
            in * sizeof(off_t));
    }

    if (bytes) {
        // finally, copy the newly inserted key (don't overwrite anything)
        memcpy(inKeys + bytesBefore, key, bytes);
        outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
        outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
    }

    // Prepare the key that will be inserted in the parent node which
    // is either the dropped key or the last of the other node.
    // If it's the dropped key, "newKey" was already set earlier.

    if (newKey == NULL)
        newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

    memcpy(key, newKey, newLength);
    *_keyLength = newLength;
    *_value = otherOffset;

    if (newAllocated)
        free(newKey);

    return B_OK;
}


/*! This inserts a key into the tree. The changes made to the tree will
    all be part of the \a transaction.
    You need to have the inode write locked.
*/
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
    off_t value)
{
    if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
        || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
        RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
    if (value < 0)
        panic("tried to insert invalid value %Ld!\n", value);
#endif

    ASSERT_WRITE_LOCKED_INODE(fStream);

    Stack<node_and_key> stack;
    if (_SeekDown(stack, key, keyLength) != B_OK)
        RETURN_ERROR(B_ERROR);

    uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

    memcpy(keyBuffer, key, keyLength);
    keyBuffer[keyLength] = 0;

    node_and_key nodeAndKey;
    const bplustree_node* node;

    CachedNode cached(this);
    while (stack.Pop(&nodeAndKey)
        && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
        NodeChecker checker(node, fNodeSize, "insert");
#endif
        if (node->IsLeaf()) {
            // first round, check for duplicate entries
            status_t status = _FindKey(node, key, keyLength,
                &nodeAndKey.keyIndex);

            // is this a duplicate entry?
            if (status == B_OK) {
                if (fAllowDuplicates) {
                    status = _InsertDuplicate(transaction, cached, node,
                        nodeAndKey.keyIndex, value);
                    if (status != B_OK)
                        RETURN_ERROR(status);
                    return B_OK;
                }

                return B_NAME_IN_USE;
            }
        }

        bplustree_node* writableNode = cached.MakeWritable(transaction);
        if (writableNode == NULL)
            return B_IO_ERROR;

        // is the node big enough to hold the pair?
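        // A node stores its header first, then the packed key bytes, then
        // the array of cumulative key lengths (one uint16 per key), and
        // finally the array of values (one off_t per key); the check below
        // simply asks whether all of that would still fit into fNodeSize
        // with one more key added.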
        if (int32(key_align(sizeof(bplustree_node)
                + writableNode->AllKeyLength() + keyLength)
                + (writableNode->NumKeys() + 1) * (sizeof(uint16)
                + sizeof(off_t))) < fNodeSize) {
            _InsertKey(writableNode, nodeAndKey.keyIndex,
                keyBuffer, keyLength, value);
            _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
                nodeAndKey.keyIndex, 0, 1);

            return B_OK;
        } else {
            CachedNode cachedNewRoot(this);
            CachedNode cachedOther(this);

            // do we need to allocate a new root node? if so, then do
            // it now
            off_t newRoot = BPLUSTREE_NULL;
            if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
                bplustree_node* root;
                status_t status = cachedNewRoot.Allocate(transaction, &root,
                    &newRoot);
                if (status != B_OK) {
                    // The tree is most likely corrupted!
                    // But it's still sane at leaf level - we could set
                    // a flag in the header that forces the tree to be
                    // rebuilt next time...
                    // But since we will have journaling, that's not a big
                    // problem anyway.
                    RETURN_ERROR(status);
                }
            }

            // reserve space for the other node
            bplustree_node* other;
            off_t otherOffset;
            status_t status = cachedOther.Allocate(transaction, &other,
                &otherOffset);
            if (status != B_OK) {
                cachedNewRoot.Free(transaction, newRoot);
                RETURN_ERROR(status);
            }

            if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
                    otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
                    &value) != B_OK) {
                // free root node & other node here
                cachedOther.Free(transaction, otherOffset);
                cachedNewRoot.Free(transaction, newRoot);

                RETURN_ERROR(B_ERROR);
            }
#ifdef DEBUG
            checker.Check("insert split");
            NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

            _UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
                nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

            // update the right link of the node in the left of the new node
            if ((other = cachedOther.SetToWritable(transaction,
                    other->LeftLink())) != NULL) {
                other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
            }

            // create a new root if necessary
            if (newRoot != BPLUSTREE_NULL) {
                bplustree_node* root = cachedNewRoot.Node();

                _InsertKey(root, 0, keyBuffer, keyLength,
                    writableNode->LeftLink());
                root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
                    nodeAndKey.nodeOffset);

                CachedNode cached(this);
                bplustree_header* header
                    = cached.SetToWritableHeader(transaction);
                if (header == NULL)
                    return B_IO_ERROR;

                // finally, update header to point to the new root
                header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
                header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
                    header->MaxNumberOfLevels() + 1);

                return B_OK;
            }
        }
    }
    RETURN_ERROR(B_ERROR);
}


/*! Removes the duplicate index/value pair from the tree.
    It's part of the private tree interface.
*/
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
    const bplustree_node* node, CachedNode& cached, uint16 index,
    off_t value)
{
    off_t* values = node->Values();
    off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

    CachedNode cachedDuplicate(this);
    off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
    bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
        duplicateOffset, false);
    if (duplicate == NULL)
        return B_IO_ERROR;

    // if it's a duplicate fragment, remove the entry from there
    if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
        duplicate_array* array = duplicate->FragmentAt(
            bplustree_node::FragmentIndex(oldValue));
        int32 arrayCount = array->Count();

        if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
            FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
                B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
                (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
                arrayCount, fStream->ID()));
            return B_BAD_DATA;
        }
        if (!array->Remove(value)) {
            FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
                "%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
                duplicateOffset, fStream->ID()));
            return B_ENTRY_NOT_FOUND;
        }

        // remove the array from the fragment node if it is empty
        if (--arrayCount == 1) {
            // set the link to the remaining value
            if (cached.MakeWritable(transaction) == NULL)
                return B_IO_ERROR;

            values[index] = array->values[0];

            // Remove the whole fragment node, if this was the only array,
            // otherwise free just the array
            if (duplicate->FragmentsUsed(fNodeSize) == 1) {
                status_t status = cachedDuplicate.Free(transaction,
                    duplicateOffset);
                if (status != B_OK)
                    return status;
            } else
                array->count = 0;
        }
        return B_OK;
    }

    // Remove value from a duplicate node!
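    // Duplicate nodes form a doubly linked list via left_link/right_link,
    // each holding up to NUM_DUPLICATE_VALUES values in its duplicate_array;
    // the loop below walks that list until it finds (and removes) the value.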

    duplicate_array* array = NULL;
    int32 arrayCount = 0;

    if (duplicate->LeftLink() != BPLUSTREE_NULL) {
        FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
            ", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
        return B_BAD_DATA;
    }

    // Search the duplicate nodes until the entry could be found (and removed)
    while (duplicate != NULL) {
        array = duplicate->DuplicateArray();
        arrayCount = array->Count();

        if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
            FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
                B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
                duplicateOffset, arrayCount, fStream->ID()));
            return B_BAD_DATA;
        }

        if (array->Remove(value)) {
            arrayCount--;
            break;
        }

        if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
            RETURN_ERROR(B_ENTRY_NOT_FOUND);

        cachedDuplicate.UnsetUnchanged(transaction);
        duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
            false);
    }
    if (duplicate == NULL)
        RETURN_ERROR(B_IO_ERROR);

    // The entry got removed from the duplicate node, but we might want to free
    // it now in case it's empty

    while (true) {
        off_t left = duplicate->LeftLink();
        off_t right = duplicate->RightLink();
        bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

        if ((isLast && arrayCount == 1) || arrayCount == 0) {
            // Free the empty duplicate page, link its siblings together, and
            // update the duplicate link if needed (ie. when we either remove
            // the last duplicate node or have a new first one)

            if (left == BPLUSTREE_NULL) {
                // the duplicate link points to us
                if (cached.MakeWritable(transaction) == NULL)
                    return B_IO_ERROR;

                if (arrayCount == 1) {
                    // This is the last node, and there is only one value left;
                    // replace the duplicate link with that value, it's no
                    // duplicate anymore
                    values[index] = array->values[0];
                } else {
                    // Move the duplicate link to the next node
                    values[index] = HOST_ENDIAN_TO_BFS_INT64(
                        bplustree_node::MakeLink(
                            BPLUSTREE_DUPLICATE_NODE, right));
                }
            }

            status_t status = cachedDuplicate.Free(transaction,
                duplicateOffset);
            if (status != B_OK)
                return status;

            if (left != BPLUSTREE_NULL
                && (duplicate = cachedDuplicate.SetToWritable(transaction, left,
                        false)) != NULL) {
                duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

                // If the next node is the last node, we need to free that node
                // and convert the duplicate entry back into a normal entry
                array = duplicate->DuplicateArray();
                arrayCount = array->Count();
                if (right == BPLUSTREE_NULL
                    && duplicate->LeftLink() == BPLUSTREE_NULL
                    && arrayCount <= NUM_FRAGMENT_VALUES) {
                    duplicateOffset = left;
                    continue;
                }
            }
            if (right != BPLUSTREE_NULL
                && (duplicate = cachedDuplicate.SetToWritable(transaction,
                        right, false)) != NULL) {
                duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

                // Again, we may need to turn the duplicate entry back into a
                // normal entry
                array = duplicate->DuplicateArray();
                arrayCount = array->Count();
                if (left == BPLUSTREE_NULL
                    && duplicate->RightLink() == BPLUSTREE_NULL
                    && arrayCount <= NUM_FRAGMENT_VALUES) {
                    duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		}
		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node* fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}


/*! Removes the key with the given index from the specified node.
	Since it has to get the key from the node anyway (to obtain its
	pointer), there is no need to pass the key and its length, even though
	the calling method (BPlusTree::Remove()) has this data.
*/
void
BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
{
	// should never happen, but who knows?
	if (index > node->NumKeys() && node->NumKeys() > 0) {
		FATAL(("Asked to remove a key outside the limits: %u, inode %"
			B_PRIdOFF "\n", index, fStream->ID()));
		return;
	}

	off_t* values = node->Values();

	// if we would have to drop the overflow link, drop
	// the last key instead and update the overflow link
	// to the value of that one
	if (!node->IsLeaf() && index == node->NumKeys())
		node->overflow_link = values[--index];

	uint16 length;
	uint8* key = node->KeyAt(index, &length);
	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		FATAL(("Key length too long: %s, %u inode %" B_PRIdOFF "\n", key,
			length, fStream->ID()));
		fStream->GetVolume()->Panic();
		return;
	}

	uint16* keyLengths = node->KeyLengths();
	uint8* keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
		node->AllKeyLength() - length);

	off_t* newValues = node->Values();
	uint16* newKeyLengths = node->KeyLengths();

	// move key data
	memmove(key, key + length, node->AllKeyLength() - (key - keys));

	// move and update key lengths
	if (index > 0 && newKeyLengths != keyLengths)
		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
	for (uint16 i = index; i < node->NumKeys(); i++) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
	}

	// move values
	if (index > 0)
		memmove(newValues, values, index * sizeof(off_t));
	if (node->NumKeys() > index) {
		memmove(newValues + index, values + index + 1,
			(node->NumKeys() - index) * sizeof(off_t));
	}
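
	// (Note: the bytes freed at the end of the key data, key length, and
	// value arrays are not cleared here; all_key_count and all_key_length
	// already describe the shrunken region, so whatever is left behind is
	// simply ignored by later readers.)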
}


/*! Removes the specified key from the tree. The "value" parameter is only used
	for trees which allow duplicates, so you may safely ignore it.
	It's not an optional parameter, so at least you have to think about it.
	You need to have the inode write locked.
*/
status_t
BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);

	ASSERT_WRITE_LOCKED_INODE(fStream);

	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	node_and_key nodeAndKey;
	const bplustree_node* node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "remove");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);
			if (status != B_OK)
				RETURN_ERROR(status);

			// Is this a duplicate entry?
			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
					node->Values()[nodeAndKey.keyIndex]))) {
				if (fAllowDuplicates) {
					return _RemoveDuplicate(transaction, node, cached,
						nodeAndKey.keyIndex, value);
				}

				FATAL(("duplicate node found where no duplicates are "
					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
				RETURN_ERROR(B_ERROR);
			} else {
				if (node->Values()[nodeAndKey.keyIndex] != value)
					return B_ENTRY_NOT_FOUND;

				// If we will remove the last key, the iterator will be set
				// to the next node after the current one - if there aren't
				// any more nodes, we need a way to prevent the TreeIterators
				// from touching the old node again; we use BPLUSTREE_FREE
				// for this.
				off_t next = node->RightLink() == BPLUSTREE_NULL
					? BPLUSTREE_FREE : node->RightLink();
				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
					?
next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 2148 } 2149 } 2150 2151 bplustree_node* writableNode = cached.MakeWritable(transaction); 2152 if (writableNode == NULL) 2153 return B_IO_ERROR; 2154 2155 // if it's an empty root node, we have to convert it 2156 // to a leaf node by dropping the overflow link, or, 2157 // if it's already a leaf node, just empty it 2158 if (nodeAndKey.nodeOffset == fHeader.RootNode() 2159 && (node->NumKeys() == 0 2160 || (node->NumKeys() == 1 && node->IsLeaf()))) { 2161 writableNode->overflow_link 2162 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2163 writableNode->all_key_count = 0; 2164 writableNode->all_key_length = 0; 2165 2166 // if we've made a leaf node out of the root node, we need 2167 // to reset the maximum number of levels in the header 2168 if (fHeader.MaxNumberOfLevels() != 1) { 2169 CachedNode cached(this); 2170 bplustree_header* header 2171 = cached.SetToWritableHeader(transaction); 2172 if (header == NULL) 2173 return B_IO_ERROR; 2174 2175 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 2176 } 2177 return B_OK; 2178 } 2179 2180 // if there is only one key left, we don't have to remove 2181 // it, we can just dump the node (index nodes still have 2182 // the overflow link, so we have to drop the last key) 2183 if (writableNode->NumKeys() > 1 2184 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { 2185 _RemoveKey(writableNode, nodeAndKey.keyIndex); 2186 return B_OK; 2187 } 2188 2189 // when we are here, we can just free the node, but 2190 // we have to update the right/left link of the 2191 // siblings first 2192 CachedNode otherCached(this); 2193 bplustree_node* other = otherCached.SetToWritable(transaction, 2194 writableNode->LeftLink()); 2195 if (other != NULL) 2196 other->right_link = writableNode->right_link; 2197 2198 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) 2199 != NULL) { 2200 other->left_link = writableNode->left_link; 2201 } 2202 2203 cached.Free(transaction, nodeAndKey.nodeOffset); 2204 } 2205 RETURN_ERROR(B_ERROR); 2206 } 2207 2208 2209 /*! Replaces the value for the key in the tree. 2210 Returns B_OK if the key could be found and its value replaced, 2211 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 2212 to indicate that something went terribly wrong. 2213 Note that this doesn't work with duplicates - it will just 2214 return B_BAD_TYPE if you call this function on a tree where 2215 duplicates are allowed. 2216 You need to have the inode write locked. 
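
	A minimal usage sketch (illustrative only; it assumes an existing tree,
	a running transaction, and a key that is already stored in the tree;
	"newBlockNumber" is just a placeholder for the off_t value to store):

		const char* name = "some entry";
		status_t status = tree->Replace(transaction, (const uint8*)name,
			(uint16)strlen(name), newBlockNumber);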
2217 */ 2218 status_t 2219 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, 2220 off_t value) 2221 { 2222 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2223 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2224 || key == NULL) 2225 RETURN_ERROR(B_BAD_VALUE); 2226 2227 if (fAllowDuplicates) 2228 RETURN_ERROR(B_BAD_TYPE); 2229 2230 ASSERT_WRITE_LOCKED_INODE(fStream); 2231 2232 off_t nodeOffset = fHeader.RootNode(); 2233 CachedNode cached(this); 2234 const bplustree_node* node; 2235 2236 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2237 uint16 keyIndex = 0; 2238 off_t nextOffset; 2239 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2240 &nextOffset); 2241 2242 if (node->OverflowLink() == BPLUSTREE_NULL) { 2243 if (status == B_OK) { 2244 bplustree_node* writableNode = cached.MakeWritable(transaction); 2245 if (writableNode != NULL) { 2246 writableNode->Values()[keyIndex] 2247 = HOST_ENDIAN_TO_BFS_INT64(value); 2248 } else 2249 status = B_IO_ERROR; 2250 } 2251 2252 return status; 2253 } else if (nextOffset == nodeOffset) 2254 RETURN_ERROR(B_ERROR); 2255 2256 nodeOffset = nextOffset; 2257 } 2258 RETURN_ERROR(B_ERROR); 2259 } 2260 #endif // !_BOOT_MODE 2261 2262 2263 /*! Searches the key in the tree, and stores the offset found in _value, 2264 if successful. 2265 It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack 2266 while it descends the tree. 2267 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not. 2268 It can also return other errors to indicate that something went wrong. 2269 Note that this doesn't work with duplicates - it will just return 2270 B_BAD_TYPE if you call this function on a tree where duplicates are 2271 allowed. 2272 You need to have the inode read or write locked. 2273 */ 2274 status_t 2275 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value) 2276 { 2277 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2278 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2279 || key == NULL) 2280 RETURN_ERROR(B_BAD_VALUE); 2281 2282 if (fAllowDuplicates) 2283 RETURN_ERROR(B_BAD_TYPE); 2284 2285 #if !_BOOT_MODE 2286 ASSERT_READ_LOCKED_INODE(fStream); 2287 #endif 2288 2289 off_t nodeOffset = fHeader.RootNode(); 2290 CachedNode cached(this); 2291 const bplustree_node* node; 2292 2293 #ifdef DEBUG 2294 int32 levels = 0; 2295 #endif 2296 2297 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2298 uint16 keyIndex = 0; 2299 off_t nextOffset; 2300 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2301 &nextOffset); 2302 2303 #ifdef DEBUG 2304 levels++; 2305 #endif 2306 if (node->OverflowLink() == BPLUSTREE_NULL) { 2307 if (status == B_OK && _value != NULL) 2308 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 2309 2310 #ifdef DEBUG 2311 if (levels != (int32)fHeader.MaxNumberOfLevels()) 2312 DEBUGGER(("levels don't match")); 2313 #endif 2314 return status; 2315 } else if (nextOffset == nodeOffset) 2316 RETURN_ERROR(B_ERROR); 2317 2318 nodeOffset = nextOffset; 2319 } 2320 FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %" 2321 B_PRIdOFF "\n", nodeOffset, fStream->ID())); 2322 RETURN_ERROR(B_ERROR); 2323 } 2324 2325 2326 #if !_BOOT_MODE 2327 status_t 2328 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset, 2329 const uint8* largestKey, uint16 largestKeyLength, 2330 const bplustree_node* parent) 2331 { 2332 if (parent->CheckIntegrity(fNodeSize) != B_OK) { 2333 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check " 2334 "failed!\n", fStream->ID(), offset); 2335 
check.FoundError(); 2336 return B_OK; 2337 } 2338 if (level >= fHeader.MaxNumberOfLevels()) { 2339 dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF 2340 "!\n", fStream->ID(), offset); 2341 check.FoundError(); 2342 return B_OK; 2343 } 2344 2345 check.SetLevel(level); 2346 2347 if (check.Visited(offset)) { 2348 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n", 2349 fStream->ID(), offset); 2350 check.FoundError(); 2351 return B_OK; 2352 } 2353 2354 check.SetVisited(offset); 2355 2356 uint32 count = parent->NumKeys(); 2357 off_t* values = parent->Values(); 2358 off_t lastOffset = check.PreviousOffset(level); 2359 CachedNode cached(this); 2360 2361 for (uint32 i = 0; i < count; i++) { 2362 uint16 keyLength; 2363 uint8* key = parent->KeyAt(i, &keyLength); 2364 if (largestKey != NULL) { 2365 int result = compareKeys(fHeader.DataType(), key, keyLength, 2366 largestKey, largestKeyLength); 2367 if (result >= 0) { 2368 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %" 2369 B_PRIu32 " larger than it should!\n", 2370 fStream->ID(), offset, i); 2371 check.FoundError(); 2372 } 2373 } 2374 2375 off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]); 2376 if (bplustree_node::IsDuplicate(childOffset)) { 2377 // Walk the duplicate nodes 2378 off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset); 2379 off_t lastDuplicateOffset = BPLUSTREE_NULL; 2380 2381 while (duplicateOffset != BPLUSTREE_NULL) { 2382 const bplustree_node* node; 2383 status_t status = cached.SetTo(duplicateOffset, &node, false); 2384 if (status != B_OK) { 2385 if (status == B_IO_ERROR) 2386 return B_IO_ERROR; 2387 2388 dprintf("inode %" B_PRIdOFF ": duplicate node at %" 2389 B_PRIdOFF " could not be read: %s\n", fStream->ID(), 2390 duplicateOffset, strerror(status)); 2391 check.FoundError(); 2392 break; 2393 } 2394 2395 bool isFragmentNode = bplustree_node::LinkType(childOffset) 2396 == BPLUSTREE_DUPLICATE_FRAGMENT; 2397 bool isKnownFragment = isFragmentNode 2398 && check.VisitedFragment(duplicateOffset); 2399 2400 if (!isKnownFragment && check.Visited(duplicateOffset)) { 2401 dprintf("inode %" B_PRIdOFF ": %s node at %" 2402 B_PRIdOFF " already visited, referenced from %" 2403 B_PRIdOFF "!\n", fStream->ID(), 2404 isFragmentNode ? 
"fragment" : "duplicate", 2405 duplicateOffset, offset); 2406 check.FoundError(); 2407 break; 2408 } 2409 2410 // Fragment nodes may be visited more than once from different 2411 // places 2412 if (!check.Visited(duplicateOffset)) 2413 check.SetVisited(duplicateOffset); 2414 if (!isKnownFragment && isFragmentNode) 2415 check.SetVisitedFragment(duplicateOffset); 2416 2417 duplicate_array* array; 2418 int32 minSize; 2419 int32 maxSize; 2420 if (isFragmentNode) { 2421 array = node->FragmentAt( 2422 bplustree_node::FragmentIndex(childOffset)); 2423 minSize = 2; 2424 maxSize = NUM_FRAGMENT_VALUES; 2425 } else { 2426 array = node->DuplicateArray(); 2427 minSize = 1; 2428 maxSize = NUM_DUPLICATE_VALUES; 2429 } 2430 int32 arrayCount = array->Count(); 2431 2432 if (arrayCount < minSize || arrayCount > maxSize) { 2433 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2434 " has invalid array size %" B_PRId32 "!\n", 2435 fStream->ID(), duplicateOffset, arrayCount); 2436 check.FoundError(); 2437 } else { 2438 // Simple check if the values in the array may be valid 2439 for (int32 j = 0; j < arrayCount; j++) { 2440 if (!fStream->GetVolume()->IsValidInodeBlock( 2441 array->ValueAt(j))) { 2442 dprintf("inode %" B_PRIdOFF ": duplicate at %" 2443 B_PRIdOFF " contains invalid block %" B_PRIdOFF 2444 " at %" B_PRId32 "!\n", fStream->ID(), 2445 duplicateOffset, array->ValueAt(j), j); 2446 check.FoundError(); 2447 break; 2448 } 2449 } 2450 } 2451 2452 // A fragment node is not linked (and does not have valid links) 2453 if (isFragmentNode) 2454 break; 2455 2456 if (node->LeftLink() != lastDuplicateOffset) { 2457 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2458 " has wrong left link %" B_PRIdOFF ", expected %" 2459 B_PRIdOFF "!\n", fStream->ID(), duplicateOffset, 2460 node->LeftLink(), lastDuplicateOffset); 2461 check.FoundError(); 2462 } 2463 2464 lastDuplicateOffset = duplicateOffset; 2465 duplicateOffset = node->RightLink(); 2466 } 2467 } else if (!parent->IsLeaf()) { 2468 // Test a regular child node recursively 2469 off_t nextOffset = parent->OverflowLink(); 2470 if (i < count - 1) 2471 nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]); 2472 2473 if (i == 0 && lastOffset != BPLUSTREE_NULL) { 2474 // Test right link of the previous node 2475 const bplustree_node* previous = cached.SetTo(lastOffset, true); 2476 if (previous == NULL) 2477 return B_IO_ERROR; 2478 2479 if (previous->RightLink() != childOffset) { 2480 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2481 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2482 "!\n", fStream->ID(), lastOffset, previous->RightLink(), 2483 childOffset); 2484 check.FoundError(); 2485 } 2486 } 2487 2488 status_t status = _ValidateChild(check, cached, level, childOffset, 2489 lastOffset, nextOffset, key, keyLength); 2490 if (status != B_OK) 2491 return status; 2492 } else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) { 2493 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains " 2494 "invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n", 2495 fStream->ID(), offset, childOffset, i); 2496 check.FoundError(); 2497 } 2498 2499 lastOffset = childOffset; 2500 } 2501 2502 if (parent->OverflowLink() != BPLUSTREE_NULL) { 2503 off_t childOffset = parent->OverflowLink(); 2504 status_t status = _ValidateChild(check, cached, level, childOffset, 2505 lastOffset, 0, NULL, 0); 2506 if (status != B_OK) 2507 return status; 2508 2509 lastOffset = childOffset; 2510 } 2511 2512 check.SetPreviousOffset(level, lastOffset); 2513 
return B_OK; 2514 } 2515 2516 2517 status_t 2518 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level, 2519 off_t offset, off_t lastOffset, off_t nextOffset, 2520 const uint8* key, uint16 keyLength) 2521 { 2522 const bplustree_node* node; 2523 status_t status = cached.SetTo(offset, &node, true); 2524 if (status != B_OK) { 2525 if (status == B_IO_ERROR) 2526 return B_IO_ERROR; 2527 2528 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be " 2529 "read: %s\n", fStream->ID(), offset, strerror(status)); 2530 check.FoundError(); 2531 return B_OK; 2532 } 2533 2534 if (node->LeftLink() != lastOffset) { 2535 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2536 "wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF 2537 "!\n", fStream->ID(), offset, node->LeftLink(), lastOffset); 2538 check.FoundError(); 2539 } 2540 2541 if (nextOffset != 0 && node->RightLink() != nextOffset) { 2542 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2543 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2544 "!\n", fStream->ID(), offset, node->RightLink(), nextOffset); 2545 check.FoundError(); 2546 } 2547 2548 return _ValidateChildren(check, level + 1, offset, key, keyLength, node); 2549 } 2550 #endif // !_BOOT_MODE 2551 2552 2553 // #pragma mark - 2554 2555 2556 TreeIterator::TreeIterator(BPlusTree* tree) 2557 : 2558 fTree(tree), 2559 fCurrentNodeOffset(BPLUSTREE_NULL) 2560 { 2561 #if !_BOOT_MODE 2562 tree->_AddIterator(this); 2563 #endif 2564 } 2565 2566 2567 TreeIterator::~TreeIterator() 2568 { 2569 #if !_BOOT_MODE 2570 if (fTree) 2571 fTree->_RemoveIterator(this); 2572 #endif 2573 } 2574 2575 2576 status_t 2577 TreeIterator::Goto(int8 to) 2578 { 2579 if (fTree == NULL || fTree->fStream == NULL) 2580 RETURN_ERROR(B_BAD_VALUE); 2581 2582 #if !_BOOT_MODE 2583 // lock access to stream 2584 InodeReadLocker locker(fTree->fStream); 2585 #endif 2586 2587 off_t nodeOffset = fTree->fHeader.RootNode(); 2588 CachedNode cached(fTree); 2589 const bplustree_node* node; 2590 2591 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2592 // is the node a leaf node? 2593 if (node->OverflowLink() == BPLUSTREE_NULL) { 2594 fCurrentNodeOffset = nodeOffset; 2595 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 2596 fDuplicateNode = BPLUSTREE_NULL; 2597 2598 return B_OK; 2599 } 2600 2601 // get the next node offset depending on the direction (and if there 2602 // are any keys in that node at all) 2603 off_t nextOffset; 2604 if (to == BPLUSTREE_END || node->all_key_count == 0) 2605 nextOffset = node->OverflowLink(); 2606 else { 2607 if (node->AllKeyLength() > fTree->fNodeSize 2608 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 2609 - 8 * node->NumKeys()) 2610 RETURN_ERROR(B_ERROR); 2611 2612 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 2613 } 2614 if (nextOffset == nodeOffset) 2615 break; 2616 2617 nodeOffset = nextOffset; 2618 } 2619 FATAL(("%s fails\n", __FUNCTION__)); 2620 2621 RETURN_ERROR(B_ERROR); 2622 } 2623 2624 2625 /*! Iterates through the tree in the specified direction. 2626 When it iterates through duplicates, the "key" is only updated for the 2627 first entry - if you need to know when this happens, use the "duplicate" 2628 parameter which is 0 for no duplicate, 1 for the first, and 2 for all 2629 the other duplicates. 2630 That's not too nice, but saves the 256 bytes that would be needed to 2631 store the last key - if this will ever become an issue, it will be 2632 easy to change. 
2633 The other advantage of this is, that the queries can skip all duplicates 2634 at once when they are not relevant to them. 2635 */ 2636 status_t 2637 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength, 2638 uint16 maxLength, off_t* value, uint16* duplicate) 2639 { 2640 if (fTree == NULL) 2641 return B_INTERRUPTED; 2642 2643 bool forward = direction == BPLUSTREE_FORWARD; 2644 2645 if (fCurrentNodeOffset == BPLUSTREE_NULL 2646 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK) 2647 RETURN_ERROR(B_ERROR); 2648 2649 // if the tree was emptied since the last call 2650 if (fCurrentNodeOffset == BPLUSTREE_FREE) 2651 return B_ENTRY_NOT_FOUND; 2652 2653 #if !_BOOT_MODE 2654 // lock access to stream 2655 InodeReadLocker locker(fTree->fStream); 2656 #endif 2657 2658 CachedNode cached(fTree); 2659 const bplustree_node* node; 2660 2661 if (fDuplicateNode != BPLUSTREE_NULL) { 2662 // regardless of traverse direction the duplicates are always presented 2663 // in the same order; since they are all considered as equal, this 2664 // shouldn't cause any problems 2665 2666 if (!fIsFragment || fDuplicate < fNumDuplicates) { 2667 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2668 false); 2669 } else 2670 node = NULL; 2671 2672 if (node != NULL) { 2673 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 2674 // If the node is out of duplicates, we go directly to the next 2675 // one 2676 fDuplicateNode = node->RightLink(); 2677 if (fDuplicateNode != BPLUSTREE_NULL 2678 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 2679 fNumDuplicates = node->CountDuplicates(fDuplicateNode, 2680 false); 2681 fDuplicate = 0; 2682 } 2683 } 2684 if (fDuplicate < fNumDuplicates) { 2685 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, 2686 fDuplicate++); 2687 if (duplicate) 2688 *duplicate = 2; 2689 return B_OK; 2690 } 2691 } 2692 fDuplicateNode = BPLUSTREE_NULL; 2693 } 2694 2695 off_t savedNodeOffset = fCurrentNodeOffset; 2696 int32 savedKey = fCurrentKey; 2697 2698 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 2699 RETURN_ERROR(B_ERROR); 2700 2701 if (duplicate) 2702 *duplicate = 0; 2703 2704 fCurrentKey += direction; 2705 2706 // is the current key in the current node? 2707 while ((forward && fCurrentKey >= node->NumKeys()) 2708 || (!forward && fCurrentKey < 0)) { 2709 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2710 2711 // are there any more nodes? 2712 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2713 node = cached.SetTo(fCurrentNodeOffset); 2714 if (!node) 2715 RETURN_ERROR(B_ERROR); 2716 2717 // reset current key 2718 fCurrentKey = forward ? 0 : node->NumKeys() - 1; 2719 } else { 2720 // there are no nodes left, so turn back to the last key 2721 fCurrentNodeOffset = savedNodeOffset; 2722 fCurrentKey = savedKey; 2723 2724 return B_ENTRY_NOT_FOUND; 2725 } 2726 } 2727 2728 if (node->all_key_count == 0) 2729 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2730 2731 uint16 length; 2732 uint8* keyStart = node->KeyAt(fCurrentKey, &length); 2733 if (keyStart + length + sizeof(off_t) + sizeof(uint16) 2734 > (uint8*)node + fTree->fNodeSize 2735 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2736 #if !_BOOT_MODE 2737 fTree->fStream->GetVolume()->Panic(); 2738 #endif 2739 RETURN_ERROR(B_BAD_DATA); 2740 } 2741 2742 // include the termination for string types 2743 bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE; 2744 if (length + (needsTermination ? 
1 : 0) > maxLength) { 2745 // the buffer is too small, restore the last key and return an error 2746 fCurrentNodeOffset = savedNodeOffset; 2747 fCurrentKey = savedKey; 2748 return B_BUFFER_OVERFLOW; 2749 } 2750 2751 memcpy(key, keyStart, length); 2752 2753 if (needsTermination) 2754 ((char*)key)[length] = '\0'; 2755 2756 *keyLength = length; 2757 2758 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2759 2760 // duplicate fragments? 2761 uint8 type = bplustree_node::LinkType(offset); 2762 if (type == BPLUSTREE_DUPLICATE_FRAGMENT 2763 || type == BPLUSTREE_DUPLICATE_NODE) { 2764 fDuplicateNode = offset; 2765 2766 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2767 false); 2768 if (node == NULL) 2769 RETURN_ERROR(B_ERROR); 2770 2771 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 2772 2773 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 2774 if (fNumDuplicates) { 2775 offset = node->DuplicateAt(offset, fIsFragment, 0); 2776 fDuplicate = 1; 2777 if (duplicate) 2778 *duplicate = 1; 2779 } else { 2780 // Shouldn't happen, but we're dealing here with potentially 2781 // corrupt disks... 2782 fDuplicateNode = BPLUSTREE_NULL; 2783 offset = 0; 2784 } 2785 } 2786 *value = offset; 2787 2788 return B_OK; 2789 } 2790 2791 2792 /*! This is more or less a copy of BPlusTree::Find() - but it just 2793 sets the current position in the iterator, regardless of if the 2794 key could be found or not. 2795 */ 2796 status_t 2797 TreeIterator::Find(const uint8* key, uint16 keyLength) 2798 { 2799 if (fTree == NULL) 2800 return B_INTERRUPTED; 2801 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2802 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2803 || key == NULL) 2804 RETURN_ERROR(B_BAD_VALUE); 2805 2806 #if !_BOOT_MODE 2807 // lock access to stream 2808 InodeReadLocker locker(fTree->fStream); 2809 #endif 2810 2811 off_t nodeOffset = fTree->fHeader.RootNode(); 2812 2813 CachedNode cached(fTree); 2814 const bplustree_node* node; 2815 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2816 uint16 keyIndex = 0; 2817 off_t nextOffset; 2818 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, 2819 &nextOffset); 2820 2821 if (node->OverflowLink() == BPLUSTREE_NULL) { 2822 fCurrentNodeOffset = nodeOffset; 2823 fCurrentKey = keyIndex - 1; 2824 fDuplicateNode = BPLUSTREE_NULL; 2825 2826 return status; 2827 } else if (nextOffset == nodeOffset) 2828 RETURN_ERROR(B_ERROR); 2829 2830 nodeOffset = nextOffset; 2831 } 2832 RETURN_ERROR(B_ERROR); 2833 } 2834 2835 2836 void 2837 TreeIterator::SkipDuplicates() 2838 { 2839 fDuplicateNode = BPLUSTREE_NULL; 2840 } 2841 2842 2843 void 2844 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, 2845 uint16 splitAt, int8 change) 2846 { 2847 if (offset != fCurrentNodeOffset) 2848 return; 2849 2850 if (nextOffset != BPLUSTREE_NULL) { 2851 fCurrentNodeOffset = nextOffset; 2852 if (splitAt <= fCurrentKey) { 2853 fCurrentKey -= splitAt; 2854 keyIndex -= splitAt; 2855 } 2856 } 2857 2858 // Adjust fCurrentKey to point to the same key as before. 2859 // Note, that if a key is inserted at the current position 2860 // it won't be included in this tree transition. 2861 if (keyIndex <= fCurrentKey) 2862 fCurrentKey += change; 2863 2864 // TODO: duplicate handling! 
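	// (As written, fDuplicateNode, fDuplicate and fNumDuplicates are left
	// untouched here, so an iterator that currently sits inside a duplicate
	// chain is not repositioned when that chain changes; that is what the
	// TODO above refers to.)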
2865 } 2866 2867 2868 void 2869 TreeIterator::Stop() 2870 { 2871 fTree = NULL; 2872 } 2873 2874 2875 #ifdef DEBUG 2876 void 2877 TreeIterator::Dump() 2878 { 2879 __out("TreeIterator at %p:\n", this); 2880 __out("\tfTree = %p\n", fTree); 2881 __out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset); 2882 __out("\tfCurrentKey = %d\n", (int)fCurrentKey); 2883 __out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", 2884 bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, 2885 fDuplicateNode); 2886 __out("\tfDuplicate = %u\n", fDuplicate); 2887 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2888 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); 2889 } 2890 #endif 2891 2892 2893 // #pragma mark - 2894 2895 2896 bool 2897 bplustree_header::IsValid() const 2898 { 2899 return Magic() == BPLUSTREE_MAGIC 2900 && (RootNode() % NodeSize()) == 0 2901 && IsValidLink(RootNode()) 2902 && IsValidLink(FreeNode()); 2903 } 2904 2905 2906 // #pragma mark - 2907 2908 2909 void 2910 bplustree_node::Initialize() 2911 { 2912 left_link = right_link = overflow_link 2913 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2914 all_key_count = 0; 2915 all_key_length = 0; 2916 } 2917 2918 2919 uint8* 2920 bplustree_node::KeyAt(int32 index, uint16* keyLength) const 2921 { 2922 if (index < 0 || index > NumKeys()) 2923 return NULL; 2924 2925 uint8* keyStart = Keys(); 2926 uint16* keyLengths = KeyLengths(); 2927 2928 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2929 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2930 if (index > 0) 2931 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2932 2933 return keyStart; 2934 } 2935 2936 2937 uint8 2938 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2939 { 2940 // the duplicate fragment handling is currently hard-coded to a node size 2941 // of 1024 bytes - with future versions of BFS, this may be a problem 2942 2943 if (isFragment) { 2944 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2945 2946 return ((off_t*)this)[fragment]; 2947 } 2948 return OverflowLink(); 2949 } 2950 2951 2952 off_t 2953 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2954 { 2955 uint32 start; 2956 if (isFragment) 2957 start = 8 * ((uint64)offset & 0x3ff); 2958 else 2959 start = 2; 2960 2961 return ((off_t*)this)[start + 1 + index]; 2962 } 2963 2964 2965 /*! Although the name suggests it, this function doesn't return the real 2966 used fragment count; at least, it can only count to two: it returns 2967 0, if there is no fragment used, 1 if there is only one fragment 2968 used, and 2 if there are at least 2 fragments used. 
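	Callers such as _RemoveDuplicate() only need to know whether the fragment
	they are about to empty was the last one used in the node, so counting
	any further would be wasted effort.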
2969 */ 2970 uint32 2971 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2972 { 2973 uint32 used = 0; 2974 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2975 duplicate_array* array = FragmentAt(i); 2976 if (array->Count() > 0 && ++used > 1) 2977 return used; 2978 } 2979 return used; 2980 } 2981 2982 2983 status_t 2984 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2985 { 2986 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2987 DEBUGGER(("invalid node: key/length count")); 2988 2989 for (int32 i = 0; i < NumKeys(); i++) { 2990 uint16 length; 2991 uint8* key = KeyAt(i, &length); 2992 if (key + length + sizeof(off_t) + sizeof(uint16) 2993 > (uint8*)this + nodeSize 2994 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2995 dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i); 2996 return B_BAD_DATA; 2997 } 2998 if (Values()[i] == -1) { 2999 dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values " 3000 "corrupted\n", this, (int)i, Values()[i]); 3001 return B_BAD_DATA; 3002 } 3003 } 3004 return B_OK; 3005 } 3006 3007 3008 // #pragma mark - 3009 3010 3011 #if !_BOOT_MODE 3012 BitmapArray::BitmapArray(size_t numBits) 3013 { 3014 fSize = (numBits + 7) / 8; 3015 fBitmap = (uint8*)calloc(fSize, 1); 3016 fCountSet = 0; 3017 } 3018 3019 3020 BitmapArray::~BitmapArray() 3021 { 3022 free(fBitmap); 3023 } 3024 3025 3026 status_t 3027 BitmapArray::InitCheck() const 3028 { 3029 return fBitmap != NULL ? B_OK : B_NO_MEMORY; 3030 } 3031 3032 3033 bool 3034 BitmapArray::IsSet(size_t index) const 3035 { 3036 uint32 byteIndex = index / 8; 3037 if (byteIndex >= fSize) 3038 return false; 3039 3040 return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0; 3041 } 3042 3043 3044 void 3045 BitmapArray::Set(size_t index, bool set) 3046 { 3047 uint32 byteIndex = index / 8; 3048 if (byteIndex >= fSize) 3049 return; 3050 3051 if (set) { 3052 fBitmap[byteIndex] |= 1UL << (index & 0x7); 3053 fCountSet++; 3054 } else { 3055 fBitmap[byteIndex] &= ~(1UL << (index & 0x7)); 3056 fCountSet--; 3057 } 3058 } 3059 #endif // !_BOOT_MODE 3060 3061 3062 // #pragma mark - 3063 3064 3065 bool 3066 duplicate_array::_FindInternal(off_t value, int32& index) const 3067 { 3068 int32 min = 0, max = Count() - 1; 3069 off_t cmp; 3070 while (min <= max) { 3071 index = (min + max) / 2; 3072 3073 cmp = ValueAt(index) - value; 3074 if (cmp < 0) 3075 min = index + 1; 3076 else if (cmp > 0) 3077 max = index - 1; 3078 else 3079 return true; 3080 } 3081 return false; 3082 } 3083 3084 3085 void 3086 duplicate_array::Insert(off_t value) 3087 { 3088 // if there are more than 8 values in this array, use a 3089 // binary search, if not, just iterate linearly to find 3090 // the insertion point 3091 int32 size = Count(); 3092 int32 i = 0; 3093 if (size > 8 ) { 3094 if (!_FindInternal(value, i) && ValueAt(i) <= value) 3095 i++; 3096 } else { 3097 for (i = 0; i < size; i++) { 3098 if (ValueAt(i) > value) 3099 break; 3100 } 3101 } 3102 3103 memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t)); 3104 values[i] = HOST_ENDIAN_TO_BFS_INT64(value); 3105 count = HOST_ENDIAN_TO_BFS_INT64(size + 1); 3106 } 3107 3108 3109 bool 3110 duplicate_array::Remove(off_t value) 3111 { 3112 int32 index = Find(value); 3113 if (index == -1) 3114 return false; 3115 3116 int32 newSize = Count() - 1; 3117 memmove(&values[index], &values[index + 1], 3118 (newSize - index) * sizeof(off_t)); 3119 count = HOST_ENDIAN_TO_BFS_INT64(newSize); 3120 3121 return true; 3122 } 3123 3124 3125 // #pragma mark - 3126 3127 3128 int32 3129 
compareKeys(type_code type, const void* key1, int keyLength1, 3130 const void* key2, int keyLength2) 3131 { 3132 // if one of the keys is NULL, bail out gracefully 3133 if (key1 == NULL || key2 == NULL) { 3134 // even if it's most probably a bug in the calling code, we try to 3135 // give a meaningful result 3136 if (key1 == NULL && key2 != NULL) 3137 return 1; 3138 if (key2 == NULL && key1 != NULL) 3139 return -1; 3140 3141 return 0; 3142 } 3143 3144 switch (type) { 3145 case B_STRING_TYPE: 3146 { 3147 int result = memcmp(key1, key2, min_c(keyLength1, keyLength2)); 3148 if (result == 0) { 3149 // ignore trailing null bytes 3150 if ((keyLength1 == keyLength2 + 1 3151 && ((uint8*)key1)[keyLength2] == '\0') 3152 || (keyLength2 == keyLength1 + 1 3153 && ((uint8*)key2)[keyLength1] == '\0')) 3154 return 0; 3155 3156 result = keyLength1 - keyLength2; 3157 } 3158 3159 return result; 3160 } 3161 3162 case B_SSIZE_T_TYPE: 3163 case B_INT32_TYPE: 3164 return *(int32*)key1 - *(int32*)key2; 3165 3166 case B_SIZE_T_TYPE: 3167 case B_UINT32_TYPE: 3168 if (*(uint32*)key1 == *(uint32*)key2) 3169 return 0; 3170 if (*(uint32*)key1 > *(uint32*)key2) 3171 return 1; 3172 3173 return -1; 3174 3175 case B_OFF_T_TYPE: 3176 case B_INT64_TYPE: 3177 if (*(int64*)key1 == *(int64*)key2) 3178 return 0; 3179 if (*(int64*)key1 > *(int64*)key2) 3180 return 1; 3181 3182 return -1; 3183 3184 case B_UINT64_TYPE: 3185 if (*(uint64*)key1 == *(uint64*)key2) 3186 return 0; 3187 if (*(uint64*)key1 > *(uint64*)key2) 3188 return 1; 3189 3190 return -1; 3191 3192 case B_FLOAT_TYPE: 3193 { 3194 float result = *(float*)key1 - *(float*)key2; 3195 if (result == 0.0f) 3196 return 0; 3197 3198 return (result < 0.0f) ? -1 : 1; 3199 } 3200 3201 case B_DOUBLE_TYPE: 3202 { 3203 double result = *(double*)key1 - *(double*)key2; 3204 if (result == 0.0) 3205 return 0; 3206 3207 return (result < 0.0) ? -1 : 1; 3208 } 3209 } 3210 3211 // if the type is unknown, the entries don't match... 3212 return -1; 3213 } 3214 3215 3216 #if _BOOT_MODE 3217 } // namespace BFS 3218 #endif 3219