1 /* 2 * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 * 5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares 6 * no code but achieves binary compatibility with the on disk format. 7 */ 8 9 10 //! B+Tree implementation 11 12 13 #include "BPlusTree.h" 14 15 #include <file_systems/QueryParserUtils.h> 16 17 #include "Debug.h" 18 #include "Utility.h" 19 20 #if !_BOOT_MODE 21 # include "Inode.h" 22 #else 23 # include "Stream.h" 24 25 // BFS::Stream from the bootloader has the same API as Inode. 26 # define Inode BFS::Stream 27 28 # define strerror(x) "error message unavailable" 29 30 namespace BFS { 31 #endif 32 33 34 /*! Simple array used for the duplicate handling in the B+Tree. This is an 35 on disk structure. 36 */ 37 struct duplicate_array { 38 off_t count; 39 off_t values[0]; 40 41 inline bool IsEmpty() const 42 { 43 return count == 0; 44 } 45 46 inline int32 Count() const 47 { 48 return (int32)BFS_ENDIAN_TO_HOST_INT64(count); 49 } 50 51 inline off_t ValueAt(uint32 index) const 52 { 53 return BFS_ENDIAN_TO_HOST_INT64(values[index]); 54 } 55 56 inline void SetValueAt(uint32 index, off_t value) 57 { 58 values[index] = HOST_ENDIAN_TO_BFS_INT64(value); 59 } 60 61 inline int32 Find(off_t value) const 62 { 63 int32 i; 64 return _FindInternal(value, i) ? i : -1; 65 } 66 67 void Insert(off_t value); 68 bool Remove(off_t value); 69 70 private: 71 bool _FindInternal(off_t value, int32& index) const; 72 } _PACKED; 73 74 75 #ifdef DEBUG 76 class NodeChecker { 77 public: 78 NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text) 79 : 80 fNode(node), 81 fSize(nodeSize), 82 fText(text) 83 { 84 Check("integrity check failed on construction."); 85 } 86 87 ~NodeChecker() 88 { 89 Check("integrity check failed on destruction."); 90 } 91 92 void 93 Check(const char* message) 94 { 95 if (fNode->CheckIntegrity(fSize) != B_OK) { 96 dprintf("%s: %s\n", fText, message); 97 DEBUGGER(("NodeChecker integrity check failed!")); 98 } 99 } 100 101 private: 102 const bplustree_node* fNode; 103 int32 fSize; 104 const char* fText; 105 }; 106 #endif // DEBUG 107 108 109 #if !_BOOT_MODE 110 class BitmapArray { 111 public: 112 BitmapArray(size_t numBits); 113 ~BitmapArray(); 114 115 status_t InitCheck() const; 116 117 bool IsSet(size_t index) const; 118 void Set(size_t index, bool set); 119 120 size_t CountSet() const { return fCountSet; } 121 122 private: 123 uint8* fBitmap; 124 size_t fSize; 125 size_t fCountSet; 126 }; 127 128 129 struct TreeCheck { 130 TreeCheck(BPlusTree* tree) 131 : 132 fLevelCount(0), 133 fFreeCount(0), 134 fNodeSize(tree->NodeSize()), 135 fMaxLevels(tree->fHeader.MaxNumberOfLevels()), 136 fFoundErrors(0), 137 fVisited(tree->Stream()->Size() / tree->NodeSize()), 138 fVisitedFragment(tree->Stream()->Size() / tree->NodeSize()) 139 { 140 fPreviousOffsets = (off_t*)malloc( 141 sizeof(off_t) * tree->fHeader.MaxNumberOfLevels()); 142 if (fPreviousOffsets != NULL) { 143 for (size_t i = 0; i < fMaxLevels; i++) 144 fPreviousOffsets[i] = BPLUSTREE_NULL; 145 } 146 } 147 148 ~TreeCheck() 149 { 150 free(fPreviousOffsets); 151 } 152 153 status_t InitCheck() const 154 { 155 if (fPreviousOffsets == NULL) 156 return B_NO_MEMORY; 157 158 status_t status = fVisited.InitCheck(); 159 if (status != B_OK) 160 return status; 161 162 return fVisitedFragment.InitCheck(); 163 } 164 165 bool Visited(off_t offset) const 166 { 167 return fVisited.IsSet(offset / fNodeSize); 168 } 169 170 void SetVisited(off_t offset) 171 { 172 fVisited.Set(offset / fNodeSize, true); 173 } 174 175 size_t VisitedCount() const 176 { 177 return fVisited.CountSet(); 178 } 179 180 bool VisitedFragment(off_t offset) const 181 { 182 return fVisitedFragment.IsSet(offset / fNodeSize); 183 } 184 185 void SetVisitedFragment(off_t offset) 186 { 187 fVisitedFragment.Set(offset / fNodeSize, true); 188 } 189 190 uint32 MaxLevels() const 191 { 192 return fLevelCount; 193 } 194 195 void SetLevel(uint32 level) 196 { 197 if (fLevelCount < level) 198 fLevelCount = level; 199 } 200 201 off_t PreviousOffset(uint32 level) 202 { 203 return fPreviousOffsets[level]; 204 } 205 206 void SetPreviousOffset(uint32 level, off_t offset) 207 { 208 fPreviousOffsets[level] = offset; 209 } 210 211 void FoundError() 212 { 213 fFoundErrors++; 214 } 215 216 bool ErrorsFound() 217 { 218 return fFoundErrors != 0; 219 } 220 221 private: 222 uint32 fLevelCount; 223 uint32 fFreeCount; 224 uint32 fNodeSize; 225 uint32 fMaxLevels; 226 uint32 fFoundErrors; 227 BitmapArray fVisited; 228 BitmapArray fVisitedFragment; 229 off_t* fPreviousOffsets; 230 }; 231 232 233 // #pragma mark - 234 235 236 // Node Caching for the BPlusTree class 237 // 238 // With write support, there is the need for a function that allocates new 239 // nodes by either returning empty nodes, or by growing the file's data stream 240 // 241 // !! The CachedNode class assumes that you have properly locked the stream 242 // !! before asking for nodes. 243 // 244 // Note: This code will fail if the block size is smaller than the node size! 245 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 246 // is hard-coded to 1024 bytes, that's not an issue now. 247 248 void 249 CachedNode::UnsetUnchanged(Transaction& transaction) 250 { 251 if (fTree == NULL || fTree->fStream == NULL) 252 return; 253 254 if (fNode != NULL) { 255 void* cache = fTree->fStream->GetVolume()->BlockCache(); 256 257 block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID()); 258 block_cache_put(cache, fBlockNumber); 259 fNode = NULL; 260 } 261 } 262 #endif // !_BOOT_MODE 263 264 265 void 266 CachedNode::Unset() 267 { 268 if (fTree == NULL || fTree->fStream == NULL) 269 return; 270 271 if (fNode != NULL) { 272 #if !_BOOT_MODE 273 if (fWritable && fOffset == 0) { 274 // The B+tree header has been updated - we need to update the 275 // BPlusTrees copy of it, as well. 276 memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header)); 277 } 278 279 block_cache_put(fTree->fStream->GetVolume()->BlockCache(), 280 fBlockNumber); 281 #endif // !_BOOT_MODE 282 283 fNode = NULL; 284 } 285 } 286 287 288 const bplustree_node* 289 CachedNode::SetTo(off_t offset, bool check) 290 { 291 const bplustree_node* node; 292 if (SetTo(offset, &node, check) == B_OK) 293 return node; 294 295 return NULL; 296 } 297 298 299 status_t 300 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check) 301 { 302 if (fTree == NULL || fTree->fStream == NULL) 303 RETURN_ERROR(B_BAD_VALUE); 304 305 Unset(); 306 307 // You can only ask for nodes at valid positions - you can't 308 // even access the b+tree header with this method (use SetToHeader() 309 // instead) 310 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize 311 || offset <= 0 312 || (offset % fTree->fNodeSize) != 0) { 313 RETURN_ERROR(B_BAD_VALUE); 314 } 315 316 if (InternalSetTo(NULL, offset) != NULL && check) { 317 // sanity checks (links, all_key_count) 318 if (!fTree->fHeader.CheckNode(fNode)) { 319 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" 320 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, 321 fBlockNumber, fTree->fStream->ID())); 322 return B_BAD_DATA; 323 } 324 } 325 326 *_node = fNode; 327 return B_OK; 328 } 329 330 331 #if !_BOOT_MODE 332 bplustree_node* 333 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check) 334 { 335 if (fTree == NULL || fTree->fStream == NULL) { 336 REPORT_ERROR(B_BAD_VALUE); 337 return NULL; 338 } 339 340 Unset(); 341 342 // You can only ask for nodes at valid positions - you can't 343 // even access the b+tree header with this method (use SetToHeader() 344 // instead) 345 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize 346 || offset <= 0 347 || (offset % fTree->fNodeSize) != 0) 348 return NULL; 349 350 if (InternalSetTo(&transaction, offset) != NULL && check) { 351 // sanity checks (links, all_key_count) 352 if (!fTree->fHeader.CheckNode(fNode)) { 353 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" 354 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, 355 fBlockNumber, fTree->fStream->ID())); 356 return NULL; 357 } 358 } 359 return fNode; 360 } 361 362 363 bplustree_node* 364 CachedNode::MakeWritable(Transaction& transaction) 365 { 366 if (fNode == NULL) 367 return NULL; 368 369 if (block_cache_make_writable(transaction.GetVolume()->BlockCache(), 370 fBlockNumber, transaction.ID()) == B_OK) { 371 return fNode; 372 } 373 374 return NULL; 375 } 376 #endif // !_BOOT_MODE 377 378 379 const bplustree_header* 380 CachedNode::SetToHeader() 381 { 382 if (fTree == NULL || fTree->fStream == NULL) { 383 REPORT_ERROR(B_BAD_VALUE); 384 return NULL; 385 } 386 387 Unset(); 388 389 InternalSetTo(NULL, 0LL); 390 return (bplustree_header*)fNode; 391 } 392 393 394 #if !_BOOT_MODE 395 bplustree_header* 396 CachedNode::SetToWritableHeader(Transaction& transaction) 397 { 398 if (fTree == NULL || fTree->fStream == NULL) { 399 REPORT_ERROR(B_BAD_VALUE); 400 return NULL; 401 } 402 403 Unset(); 404 405 InternalSetTo(&transaction, 0LL); 406 407 if (fNode != NULL && !fTree->fInTransaction) { 408 transaction.AddListener(fTree); 409 fTree->fInTransaction = true; 410 411 if (!transaction.GetVolume()->IsInitializing()) { 412 acquire_vnode(transaction.GetVolume()->FSVolume(), 413 fTree->fStream->ID()); 414 } 415 } 416 417 return (bplustree_header*)fNode; 418 } 419 #endif // !_BOOT_MODE 420 421 422 bplustree_node* 423 CachedNode::InternalSetTo(Transaction* transaction, off_t offset) 424 { 425 fNode = NULL; 426 fOffset = offset; 427 428 off_t fileOffset; 429 block_run run; 430 if (offset < fTree->fStream->Size() 431 && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) { 432 433 #if !_BOOT_MODE 434 Volume* volume = fTree->fStream->GetVolume(); 435 #else 436 Volume* volume = &fTree->fStream->GetVolume(); 437 #endif 438 439 int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); 440 fBlockNumber = volume->ToBlock(run) + blockOffset; 441 uint8* block = NULL; 442 443 #if !_BOOT_MODE 444 if (transaction != NULL) { 445 block = (uint8*)block_cache_get_writable(volume->BlockCache(), 446 fBlockNumber, transaction->ID()); 447 fWritable = true; 448 } else { 449 block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber); 450 fWritable = false; 451 } 452 #else // !_BOOT_MODE 453 if (fBlock == NULL) { 454 fBlock = (uint8*)malloc(volume->BlockSize()); 455 if (fBlock == NULL) 456 return NULL; 457 } 458 459 if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(), 460 fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) { 461 block = fBlock; 462 } 463 464 fWritable = false; 465 #endif // _BOOT_MODE 466 467 if (block != NULL) { 468 // The node is somewhere in that block... 469 // (confusing offset calculation) 470 fNode = (bplustree_node*)(block + offset 471 - (fileOffset + (blockOffset << volume->BlockShift()))); 472 } else 473 REPORT_ERROR(B_IO_ERROR); 474 } 475 return fNode; 476 } 477 478 479 #if !_BOOT_MODE 480 status_t 481 CachedNode::Free(Transaction& transaction, off_t offset) 482 { 483 if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL) 484 RETURN_ERROR(B_BAD_VALUE); 485 486 // TODO: scan the free nodes list and remove all nodes at the end 487 // of the tree - perhaps that shouldn't be done everytime that 488 // function is called, perhaps it should be done when the directory 489 // inode is closed or based on some calculation or whatever... 490 491 CachedNode cached(fTree); 492 bplustree_header* header = cached.SetToWritableHeader(transaction); 493 if (header == NULL) 494 return B_IO_ERROR; 495 496 #if 0 497 // TODO: temporarily disabled because CheckNode() doesn't like this... 498 // Also, it's such an edge case that it's almost useless, anyway. 499 // if the node is the last one in the tree, we shrink 500 // the tree and file size by one node 501 off_t lastOffset = header->MaximumSize() - fTree->fNodeSize; 502 if (offset == lastOffset) { 503 status_t status = fTree->fStream->SetFileSize(transaction, lastOffset); 504 if (status != B_OK) 505 return status; 506 507 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset); 508 return B_OK; 509 } 510 #endif 511 512 // add the node to the free nodes list 513 fNode->left_link = header->free_node_pointer; 514 fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 515 516 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset); 517 return B_OK; 518 } 519 520 521 status_t 522 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node, 523 off_t* _offset) 524 { 525 if (fTree == NULL || fTree->fStream == NULL) 526 RETURN_ERROR(B_BAD_VALUE); 527 528 Unset(); 529 530 bplustree_header* header; 531 status_t status; 532 533 // if there are any free nodes, recycle them 534 if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) { 535 CachedNode cached(fTree); 536 header = cached.SetToWritableHeader(transaction); 537 if (header == NULL) 538 return B_IO_ERROR; 539 540 *_offset = header->FreeNode(); 541 *_node = fNode; 542 543 // set new free node pointer 544 header->free_node_pointer = fNode->left_link; 545 fNode->Initialize(); 546 return B_OK; 547 } 548 549 // allocate space for a new node 550 Inode* stream = fTree->fStream; 551 if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK) 552 return status; 553 554 CachedNode cached(fTree); 555 header = cached.SetToWritableHeader(transaction); 556 if (header == NULL) 557 return B_IO_ERROR; 558 559 // the maximum_size has to be changed before the call to SetTo() - or 560 // else it will fail because the requested node is out of bounds 561 off_t offset = header->MaximumSize(); 562 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() 563 + fTree->fNodeSize); 564 565 cached.Unset(); 566 // SetToWritable() below needs the new values in the tree's header 567 568 if (SetToWritable(transaction, offset, false) == NULL) 569 RETURN_ERROR(B_ERROR); 570 571 fNode->Initialize(); 572 573 *_offset = offset; 574 *_node = fNode; 575 return B_OK; 576 } 577 578 579 // #pragma mark - 580 581 582 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize) 583 : 584 fStream(NULL), 585 fInTransaction(false) 586 { 587 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 588 SetTo(transaction, stream); 589 } 590 #endif // !_BOOT_MODE 591 592 593 BPlusTree::BPlusTree(Inode* stream) 594 : 595 fStream(NULL), 596 fInTransaction(false) 597 { 598 #if !_BOOT_MODE 599 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 600 #endif 601 602 SetTo(stream); 603 } 604 605 606 BPlusTree::BPlusTree() 607 : 608 fStream(NULL), 609 fNodeSize(BPLUSTREE_NODE_SIZE), 610 fAllowDuplicates(true), 611 fInTransaction(false), 612 fStatus(B_NO_INIT) 613 { 614 #if !_BOOT_MODE 615 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 616 #endif 617 } 618 619 620 BPlusTree::~BPlusTree() 621 { 622 #if !_BOOT_MODE 623 // if there are any TreeIterators left, we need to stop them 624 // (can happen when the tree's inode gets deleted while 625 // traversing the tree - a TreeIterator doesn't lock the inode) 626 mutex_lock(&fIteratorLock); 627 628 SinglyLinkedList<TreeIterator>::Iterator iterator 629 = fIterators.GetIterator(); 630 while (iterator.HasNext()) 631 iterator.Next()->Stop(); 632 633 mutex_destroy(&fIteratorLock); 634 635 ASSERT(!fInTransaction); 636 #endif // !_BOOT_MODE 637 } 638 639 640 #if !_BOOT_MODE 641 /*! Create a new B+Tree on the specified stream */ 642 status_t 643 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize) 644 { 645 // initializes in-memory B+Tree 646 647 fStream = stream; 648 649 CachedNode cached(this); 650 bplustree_header* header = cached.SetToWritableHeader(transaction); 651 if (header == NULL) { 652 // allocate space for new header + node! 653 fStatus = stream->SetFileSize(transaction, nodeSize * 2); 654 if (fStatus != B_OK) 655 RETURN_ERROR(fStatus); 656 657 header = cached.SetToWritableHeader(transaction); 658 if (header == NULL) 659 RETURN_ERROR(fStatus = B_ERROR); 660 } 661 662 fAllowDuplicates = stream->IsIndex() 663 || (stream->Mode() & S_ALLOW_DUPS) != 0; 664 665 fNodeSize = nodeSize; 666 667 // initialize b+tree header 668 header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC); 669 header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize); 670 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 671 header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode())); 672 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize); 673 header->free_node_pointer 674 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 675 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2); 676 677 cached.Unset(); 678 679 // initialize b+tree root node 680 cached.SetToWritable(transaction, fHeader.RootNode(), false); 681 if (cached.Node() == NULL) 682 RETURN_ERROR(B_IO_ERROR); 683 684 cached.Node()->Initialize(); 685 686 return fStatus = B_OK; 687 } 688 #endif // !_BOOT_MODE 689 690 691 status_t 692 BPlusTree::SetTo(Inode* stream) 693 { 694 if (stream == NULL) 695 RETURN_ERROR(fStatus = B_BAD_VALUE); 696 697 fStream = stream; 698 699 // get on-disk B+Tree header 700 701 CachedNode cached(this); 702 const bplustree_header* header = cached.SetToHeader(); 703 if (header != NULL) 704 memcpy(&fHeader, header, sizeof(bplustree_header)); 705 else 706 RETURN_ERROR(fStatus = B_IO_ERROR); 707 708 // is header valid? 709 710 if (fHeader.MaximumSize() != stream->Size()) { 711 dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %" 712 B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size()); 713 // we can't change the header since we don't have a transaction 714 //fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size()); 715 } 716 if (!fHeader.IsValid()) { 717 #ifdef DEBUG 718 dump_bplustree_header(&fHeader); 719 dump_block((const char*)&fHeader, 128); 720 #endif 721 RETURN_ERROR(fStatus = B_BAD_DATA); 722 } 723 724 fNodeSize = fHeader.NodeSize(); 725 726 // validity check 727 static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, 728 S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX, 729 S_DOUBLE_INDEX}; 730 uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX 731 | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX 732 | S_FLOAT_INDEX | S_DOUBLE_INDEX); 733 734 if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE 735 || ((stream->Mode() & S_INDEX_DIR) != 0 736 && kToMode[fHeader.DataType()] != mode) 737 || !stream->IsContainer()) { 738 D( dump_bplustree_header(&fHeader); 739 dump_inode(&stream->Node()); 740 ); 741 RETURN_ERROR(fStatus = B_BAD_TYPE); 742 } 743 744 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 745 // in the original BFS code - we will honour it nevertheless 746 fAllowDuplicates = stream->IsIndex() 747 || (stream->Mode() & S_ALLOW_DUPS) != 0; 748 749 cached.SetTo(fHeader.RootNode()); 750 RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA); 751 } 752 753 754 status_t 755 BPlusTree::InitCheck() 756 { 757 return fStatus; 758 } 759 760 761 #if !_BOOT_MODE 762 status_t 763 BPlusTree::Validate(bool repair, bool& _errorsFound) 764 { 765 TreeCheck check(this); 766 if (check.InitCheck() != B_OK) 767 return B_NO_MEMORY; 768 769 check.SetVisited(0); 770 771 // Walk the free nodes 772 773 CachedNode cached(this); 774 off_t freeOffset = fHeader.FreeNode(); 775 while (freeOffset > 0) { 776 const bplustree_node* node; 777 status_t status = cached.SetTo(freeOffset, &node, false); 778 if (status != B_OK) { 779 if (status == B_IO_ERROR) 780 return B_IO_ERROR; 781 782 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could " 783 "not be read: %s\n", fStream->ID(), freeOffset, 784 strerror(status)); 785 check.FoundError(); 786 break; 787 } 788 789 if (check.Visited(freeOffset)) { 790 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF 791 " circular!\n", fStream->ID(), freeOffset); 792 // TODO: if 'repair' is true, we could collect all unvisited nodes 793 // at the end, and put the into the free list 794 check.FoundError(); 795 break; 796 } 797 798 check.SetVisited(freeOffset); 799 800 if (node->OverflowLink() != BPLUSTREE_FREE) { 801 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF 802 " misses free mark!\n", fStream->ID(), freeOffset); 803 } 804 freeOffset = node->LeftLink(); 805 } 806 807 // Iterate over the complete tree recursively 808 809 const bplustree_node* root; 810 status_t status = cached.SetTo(fHeader.RootNode(), &root, true); 811 if (status != B_OK) 812 return status; 813 814 status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root); 815 816 if (check.ErrorsFound()) 817 _errorsFound = true; 818 819 if (status != B_OK) 820 return status; 821 822 if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) { 823 dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, " 824 "declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(), 825 fHeader.MaxNumberOfLevels()); 826 } 827 828 if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) { 829 dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF 830 " nodes.\n", fStream->ID(), check.VisitedCount(), 831 fHeader.MaximumSize() / fNodeSize); 832 } 833 834 return B_OK; 835 } 836 837 838 status_t 839 BPlusTree::MakeEmpty() 840 { 841 // Put all nodes into the free list in order 842 Transaction transaction(fStream->GetVolume(), fStream->BlockNumber()); 843 844 // Reset the header, and root node 845 CachedNode cached(this); 846 bplustree_header* header = cached.SetToWritableHeader(transaction); 847 if (header == NULL) 848 return B_IO_ERROR; 849 850 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 851 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize()); 852 if (fStream->Size() > (off_t)NodeSize() * 2) 853 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize()); 854 else { 855 header->free_node_pointer 856 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 857 } 858 859 bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false); 860 if (node == NULL) 861 return B_IO_ERROR; 862 863 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 864 node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 865 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 866 node->all_key_count = 0; 867 node->all_key_length = 0; 868 869 for (off_t offset = 2 * NodeSize(); offset < fStream->Size(); 870 offset += NodeSize()) { 871 bplustree_node* node = cached.SetToWritable(transaction, offset, false); 872 if (node == NULL) { 873 dprintf("--> could not open %" B_PRIdOFF "\n", offset); 874 return B_IO_ERROR; 875 } 876 if (offset < fStream->Size() - (off_t)NodeSize()) 877 node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize()); 878 else 879 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 880 881 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 882 883 // It's not important to write it out in a single transaction; just 884 // make sure it doesn't get too large 885 if (offset % (1024 * 1024) == 0) { 886 transaction.Done(); 887 transaction.Start(fStream->GetVolume(), fStream->BlockNumber()); 888 } 889 } 890 891 return transaction.Done(); 892 } 893 894 895 int32 896 BPlusTree::TypeCodeToKeyType(type_code code) 897 { 898 switch (code) { 899 case B_STRING_TYPE: 900 return BPLUSTREE_STRING_TYPE; 901 case B_SSIZE_T_TYPE: 902 case B_INT32_TYPE: 903 return BPLUSTREE_INT32_TYPE; 904 case B_SIZE_T_TYPE: 905 case B_UINT32_TYPE: 906 return BPLUSTREE_UINT32_TYPE; 907 case B_OFF_T_TYPE: 908 case B_INT64_TYPE: 909 return BPLUSTREE_INT64_TYPE; 910 case B_UINT64_TYPE: 911 return BPLUSTREE_UINT64_TYPE; 912 case B_FLOAT_TYPE: 913 return BPLUSTREE_FLOAT_TYPE; 914 case B_DOUBLE_TYPE: 915 return BPLUSTREE_DOUBLE_TYPE; 916 } 917 return -1; 918 } 919 920 921 int32 922 BPlusTree::ModeToKeyType(mode_t mode) 923 { 924 switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 925 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { 926 case S_INT_INDEX: 927 return BPLUSTREE_INT32_TYPE; 928 case S_UINT_INDEX: 929 return BPLUSTREE_UINT32_TYPE; 930 case S_LONG_LONG_INDEX: 931 return BPLUSTREE_INT64_TYPE; 932 case S_ULONG_LONG_INDEX: 933 return BPLUSTREE_UINT64_TYPE; 934 case S_FLOAT_INDEX: 935 return BPLUSTREE_FLOAT_TYPE; 936 case S_DOUBLE_INDEX: 937 return BPLUSTREE_DOUBLE_TYPE; 938 case S_STR_INDEX: 939 default: 940 // default is for standard directories 941 return BPLUSTREE_STRING_TYPE; 942 } 943 } 944 #endif // !_BOOT_MODE 945 946 947 // #pragma mark - TransactionListener implementation 948 949 950 #if !_BOOT_MODE 951 void 952 BPlusTree::TransactionDone(bool success) 953 { 954 if (!success) { 955 // update header from disk 956 CachedNode cached(this); 957 const bplustree_header* header = cached.SetToHeader(); 958 if (header != NULL) 959 memcpy(&fHeader, header, sizeof(bplustree_header)); 960 } 961 } 962 963 964 void 965 BPlusTree::RemovedFromTransaction() 966 { 967 fInTransaction = false; 968 969 if (!fStream->GetVolume()->IsInitializing()) 970 put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID()); 971 } 972 973 974 // #pragma mark - 975 976 977 void 978 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, 979 uint16 splitAt, int8 change) 980 { 981 // Although every iterator which is affected by this update currently 982 // waits on a semaphore, other iterators could be added/removed at 983 // any time, so we need to protect this loop 984 MutexLocker _(fIteratorLock); 985 986 SinglyLinkedList<TreeIterator>::Iterator iterator 987 = fIterators.GetIterator(); 988 while (iterator.HasNext()) 989 iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change); 990 } 991 992 993 void 994 BPlusTree::_AddIterator(TreeIterator* iterator) 995 { 996 MutexLocker _(fIteratorLock); 997 fIterators.Add(iterator); 998 } 999 1000 1001 void 1002 BPlusTree::_RemoveIterator(TreeIterator* iterator) 1003 { 1004 MutexLocker _(fIteratorLock); 1005 fIterators.Remove(iterator); 1006 } 1007 #endif // !_BOOT_MODE 1008 1009 1010 int32 1011 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2, 1012 int keyLength2) 1013 { 1014 type_code type = 0; 1015 switch (fHeader.DataType()) { 1016 case BPLUSTREE_STRING_TYPE: 1017 type = B_STRING_TYPE; 1018 break; 1019 case BPLUSTREE_INT32_TYPE: 1020 type = B_INT32_TYPE; 1021 break; 1022 case BPLUSTREE_UINT32_TYPE: 1023 type = B_UINT32_TYPE; 1024 break; 1025 case BPLUSTREE_INT64_TYPE: 1026 type = B_INT64_TYPE; 1027 break; 1028 case BPLUSTREE_UINT64_TYPE: 1029 type = B_UINT64_TYPE; 1030 break; 1031 case BPLUSTREE_FLOAT_TYPE: 1032 type = B_FLOAT_TYPE; 1033 break; 1034 case BPLUSTREE_DOUBLE_TYPE: 1035 type = B_DOUBLE_TYPE; 1036 break; 1037 } 1038 return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2); 1039 } 1040 1041 1042 status_t 1043 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key, 1044 uint16 keyLength, uint16* _index, off_t* _next) 1045 { 1046 #ifdef DEBUG 1047 NodeChecker checker(node, fNodeSize, "find"); 1048 #endif 1049 1050 if (node->all_key_count == 0) { 1051 if (_index) 1052 *_index = 0; 1053 if (_next) 1054 *_next = node->OverflowLink(); 1055 return B_ENTRY_NOT_FOUND; 1056 } 1057 1058 off_t* values = node->Values(); 1059 int16 saveIndex = -1; 1060 1061 // binary search in the key array 1062 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { 1063 uint16 i = (first + last) >> 1; 1064 1065 uint16 searchLength = 0; 1066 uint8* searchKey = node->KeyAt(i, &searchLength); 1067 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) 1068 > (uint8*)node + fNodeSize 1069 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { 1070 #if !_BOOT_MODE 1071 fStream->GetVolume()->Panic(); 1072 #endif 1073 RETURN_ERROR(B_BAD_DATA); 1074 } 1075 1076 int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength); 1077 if (cmp < 0) { 1078 last = i - 1; 1079 saveIndex = i; 1080 } else if (cmp > 0) { 1081 saveIndex = first = i + 1; 1082 } else { 1083 if (_index) 1084 *_index = i; 1085 if (_next) 1086 *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]); 1087 return B_OK; 1088 } 1089 } 1090 1091 if (_index) 1092 *_index = saveIndex; 1093 if (_next) { 1094 if (saveIndex == node->NumKeys()) 1095 *_next = node->OverflowLink(); 1096 else 1097 *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); 1098 } 1099 return B_ENTRY_NOT_FOUND; 1100 } 1101 1102 1103 #if !_BOOT_MODE 1104 /*! Prepares the stack to contain all nodes that were passed while 1105 following the key, from the root node to the leaf node that could 1106 or should contain that key. 1107 */ 1108 status_t 1109 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key, 1110 uint16 keyLength) 1111 { 1112 // set the root node to begin with 1113 node_and_key nodeAndKey; 1114 nodeAndKey.nodeOffset = fHeader.RootNode(); 1115 1116 CachedNode cached(this); 1117 const bplustree_node* node; 1118 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1119 // if we are already on leaf level, we're done 1120 if (node->OverflowLink() == BPLUSTREE_NULL) { 1121 // node that the keyIndex is not properly set here (but it's not 1122 // needed in the calling functions anyway)! 1123 nodeAndKey.keyIndex = 0; 1124 stack.Push(nodeAndKey); 1125 return B_OK; 1126 } 1127 1128 off_t nextOffset; 1129 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex, 1130 &nextOffset); 1131 1132 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 1133 RETURN_ERROR(B_ERROR); 1134 1135 if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) { 1136 dprintf("BPlusTree::_SeekDown() node walked too deep.\n"); 1137 break; 1138 } 1139 1140 // put the node offset & the correct keyIndex on the stack 1141 stack.Push(nodeAndKey); 1142 1143 nodeAndKey.nodeOffset = nextOffset; 1144 } 1145 1146 FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %" 1147 B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID())); 1148 return B_ERROR; 1149 } 1150 1151 1152 /*! This will find a free duplicate fragment in the given bplustree_node. 1153 The CachedNode will be set to the writable fragment on success. 1154 */ 1155 status_t 1156 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction, 1157 const bplustree_node* node, CachedNode& cached, 1158 off_t* _offset, bplustree_node** _fragment, uint32* _index) 1159 { 1160 off_t* values = node->Values(); 1161 for (int32 i = 0; i < node->NumKeys(); i++) { 1162 off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]); 1163 1164 // does the value link to a duplicate fragment? 1165 if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT) 1166 continue; 1167 1168 const bplustree_node* fragment = cached.SetTo( 1169 bplustree_node::FragmentOffset(value), false); 1170 if (fragment == NULL) { 1171 FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %" 1172 B_PRIdOFF "\n", value, fStream->ID())); 1173 continue; 1174 } 1175 1176 // see if there is some space left for us 1177 uint32 num = bplustree_node::MaxFragments(fNodeSize); 1178 for (uint32 j = 0; j < num; j++) { 1179 duplicate_array* array = fragment->FragmentAt(j); 1180 1181 if (array->IsEmpty()) { 1182 // found an unused fragment 1183 *_fragment = cached.MakeWritable(transaction); 1184 if (*_fragment == NULL) 1185 return B_IO_ERROR; 1186 1187 *_offset = bplustree_node::FragmentOffset(value); 1188 *_index = j; 1189 return B_OK; 1190 } 1191 } 1192 } 1193 return B_ENTRY_NOT_FOUND; 1194 } 1195 1196 1197 status_t 1198 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached, 1199 const bplustree_node* node, uint16 index, off_t value) 1200 { 1201 CachedNode cachedDuplicate(this); 1202 off_t* values = node->Values(); 1203 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1204 status_t status; 1205 off_t offset; 1206 1207 if (bplustree_node::IsDuplicate(oldValue)) { 1208 // If it's a duplicate fragment, try to insert it into that, or if it 1209 // doesn't fit anymore, create a new duplicate node 1210 1211 if (bplustree_node::LinkType(oldValue) 1212 == BPLUSTREE_DUPLICATE_FRAGMENT) { 1213 bplustree_node* duplicate = cachedDuplicate.SetToWritable( 1214 transaction, bplustree_node::FragmentOffset(oldValue), false); 1215 if (duplicate == NULL) 1216 return B_IO_ERROR; 1217 1218 duplicate_array* array = duplicate->FragmentAt( 1219 bplustree_node::FragmentIndex(oldValue)); 1220 int32 arrayCount = array->Count(); 1221 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) { 1222 FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment " 1223 "%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1224 (int)bplustree_node::FragmentIndex(oldValue), 1225 bplustree_node::FragmentOffset(oldValue), arrayCount, 1226 fStream->ID())); 1227 return B_BAD_DATA; 1228 } 1229 1230 if (arrayCount < NUM_FRAGMENT_VALUES) { 1231 array->Insert(value); 1232 } else { 1233 // Test if the fragment will be empty if we remove this key's 1234 // values 1235 if (duplicate->FragmentsUsed(fNodeSize) < 2) { 1236 // The node will be empty without our values, so let us 1237 // reuse it as a duplicate node 1238 offset = bplustree_node::FragmentOffset(oldValue); 1239 1240 memmove(duplicate->DuplicateArray(), array, 1241 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1242 duplicate->left_link = duplicate->right_link 1243 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1244 1245 array = duplicate->DuplicateArray(); 1246 array->Insert(value); 1247 } else { 1248 // Create a new duplicate node 1249 1250 cachedDuplicate.UnsetUnchanged(transaction); 1251 // The old duplicate has not been touched, so we can 1252 // reuse it 1253 1254 bplustree_node* newDuplicate; 1255 status = cachedDuplicate.Allocate(transaction, 1256 &newDuplicate, &offset); 1257 if (status != B_OK) 1258 RETURN_ERROR(status); 1259 1260 // Copy the array from the fragment node to the duplicate 1261 // node and free the old entry (by zero'ing all values) 1262 newDuplicate->overflow_link = array->count; 1263 memcpy(&newDuplicate->all_key_count, &array->values[0], 1264 array->Count() * sizeof(off_t)); 1265 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1266 1267 array = newDuplicate->DuplicateArray(); 1268 array->Insert(value); 1269 } 1270 1271 // Update the main pointer to link to a duplicate node 1272 if (cached.MakeWritable(transaction) == NULL) 1273 return B_IO_ERROR; 1274 1275 values[index] 1276 = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1277 BPLUSTREE_DUPLICATE_NODE, offset)); 1278 } 1279 1280 return B_OK; 1281 } 1282 1283 // Put the value into a dedicated duplicate node 1284 1285 // search for free space in the duplicate nodes of that key 1286 duplicate_array* array; 1287 int32 arrayCount; 1288 const bplustree_node* duplicate; 1289 off_t duplicateOffset; 1290 do { 1291 duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1292 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 1293 if (duplicate == NULL) 1294 return B_IO_ERROR; 1295 1296 array = duplicate->DuplicateArray(); 1297 arrayCount =array->Count(); 1298 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { 1299 FATAL(("_InsertDuplicate: Invalid array size in duplicate %" 1300 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1301 duplicateOffset, arrayCount, fStream->ID())); 1302 return B_BAD_DATA; 1303 } 1304 } while (arrayCount >= NUM_DUPLICATE_VALUES 1305 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); 1306 1307 bplustree_node* writableDuplicate 1308 = cachedDuplicate.MakeWritable(transaction); 1309 if (writableDuplicate == NULL) 1310 return B_IO_ERROR; 1311 1312 if (arrayCount < NUM_DUPLICATE_VALUES) { 1313 array = writableDuplicate->DuplicateArray(); 1314 array->Insert(value); 1315 } else { 1316 // no space left - add a new duplicate node 1317 1318 bplustree_node* newDuplicate; 1319 status = cachedDuplicate.Allocate(transaction, &newDuplicate, 1320 &offset); 1321 if (status != B_OK) 1322 RETURN_ERROR(status); 1323 1324 // link the two nodes together 1325 writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); 1326 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); 1327 1328 array = newDuplicate->DuplicateArray(); 1329 array->count = 0; 1330 array->Insert(value); 1331 } 1332 return B_OK; 1333 } 1334 1335 // Search for a free duplicate fragment or create a new one 1336 // to insert the duplicate value into 1337 1338 uint32 fragmentIndex = 0; 1339 bplustree_node* fragment; 1340 if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate, 1341 &offset, &fragment, &fragmentIndex) != B_OK) { 1342 // allocate a new duplicate fragment node 1343 status = cachedDuplicate.Allocate(transaction, &fragment, &offset); 1344 if (status != B_OK) 1345 RETURN_ERROR(status); 1346 1347 memset(fragment, 0, fNodeSize); 1348 } 1349 1350 duplicate_array* array = fragment->FragmentAt(fragmentIndex); 1351 array->Insert(oldValue); 1352 array->Insert(value); 1353 1354 if (cached.MakeWritable(transaction) == NULL) 1355 return B_IO_ERROR; 1356 1357 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1358 BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex)); 1359 1360 return B_OK; 1361 } 1362 1363 1364 void 1365 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key, 1366 uint16 keyLength, off_t value) 1367 { 1368 // should never happen, but who knows? 1369 if (index > node->NumKeys()) 1370 return; 1371 1372 off_t* values = node->Values(); 1373 uint16* keyLengths = node->KeyLengths(); 1374 uint8* keys = node->Keys(); 1375 1376 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); 1377 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() 1378 + keyLength); 1379 1380 off_t* newValues = node->Values(); 1381 uint16* newKeyLengths = node->KeyLengths(); 1382 1383 // move values and copy new value into them 1384 memmove(newValues + index + 1, values + index, 1385 sizeof(off_t) * (node->NumKeys() - 1 - index)); 1386 memmove(newValues, values, sizeof(off_t) * index); 1387 1388 newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value); 1389 1390 // move and update key length index 1391 for (uint16 i = node->NumKeys(); i-- > index + 1;) { 1392 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1393 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength); 1394 } 1395 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); 1396 1397 int32 keyStart; 1398 newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength 1399 + (keyStart = index > 0 1400 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0)); 1401 1402 // move keys and copy new key into them 1403 uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]); 1404 int32 size = node->AllKeyLength() - length; 1405 if (size > 0) 1406 memmove(keys + length, keys + length - keyLength, size); 1407 1408 memcpy(keys + keyStart, key, keyLength); 1409 } 1410 1411 1412 /*! Splits the \a node into two halves - the other half will be put into 1413 \a other. It also takes care to create a new overflow link if the node 1414 to split is an index node. 1415 */ 1416 status_t 1417 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset, 1418 bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key, 1419 uint16* _keyLength, off_t* _value) 1420 { 1421 if (*_keyIndex > node->NumKeys() + 1) 1422 return B_BAD_VALUE; 1423 1424 uint16* inKeyLengths = node->KeyLengths(); 1425 off_t* inKeyValues = node->Values(); 1426 uint8* inKeys = node->Keys(); 1427 uint8* outKeys = other->Keys(); 1428 int32 keyIndex = *_keyIndex; // can become less than zero! 1429 1430 if (keyIndex > node->NumKeys()) { 1431 FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF 1432 "\n", (int)keyIndex, node->NumKeys(), fStream->ID())); 1433 return B_BAD_VALUE; 1434 } 1435 1436 // How many keys will fit in one (half) page? 1437 // The following loop will find the answer to this question and 1438 // change the key lengths indices for their new home 1439 1440 // "bytes" is the number of bytes written for the new key, 1441 // "bytesBefore" are the bytes before that key 1442 // "bytesAfter" are the bytes after the new key, if any 1443 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 1444 1445 size_t size = fNodeSize >> 1; 1446 int32 out, in; 1447 size_t keyLengths = 0; 1448 for (in = out = 0; in < node->NumKeys() + 1;) { 1449 keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]); 1450 1451 if (in == keyIndex && !bytes) { 1452 bytes = *_keyLength; 1453 bytesBefore = in > 0 1454 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1455 } else { 1456 if (keyIndex < out) 1457 bytesAfter = keyLengths - bytesBefore; 1458 1459 in++; 1460 } 1461 out++; 1462 1463 if (key_align(sizeof(bplustree_node) + bytes + keyLengths) 1464 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1465 // we have found the number of keys in the new node! 1466 break; 1467 } 1468 } 1469 1470 // if the new key was not inserted, set the length of the keys 1471 // that can be copied directly 1472 1473 if (keyIndex >= out && in > 0) 1474 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); 1475 else if (keyIndex + 1 < out) 1476 bytesAfter = keyLengths - bytesBefore; 1477 1478 if (bytesBefore < 0 || bytesAfter < 0) 1479 return B_BAD_DATA; 1480 1481 other->left_link = node->left_link; 1482 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1483 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1484 + bytesAfter); 1485 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1486 1487 uint16* outKeyLengths = other->KeyLengths(); 1488 off_t* outKeyValues = other->Values(); 1489 int32 keys = out > keyIndex ? keyIndex : out; 1490 1491 if (bytesBefore) { 1492 // copy the keys 1493 memcpy(outKeys, inKeys, bytesBefore); 1494 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1495 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1496 } 1497 if (bytes) { 1498 // copy the newly inserted key 1499 memcpy(outKeys + bytesBefore, key, bytes); 1500 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1501 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1502 1503 if (bytesAfter) { 1504 // copy the keys after the new key 1505 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, 1506 bytesAfter); 1507 keys = out - keyIndex - 1; 1508 for (int32 i = 0;i < keys;i++) { 1509 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( 1510 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) 1511 + bytes); 1512 } 1513 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, 1514 keys * sizeof(off_t)); 1515 } 1516 } 1517 1518 // if the new key was already inserted, we shouldn't use it again 1519 if (in != out) 1520 keyIndex--; 1521 1522 int32 total = bytesBefore + bytesAfter; 1523 1524 // these variables are for the key that will be returned 1525 // to the parent node 1526 uint8* newKey = NULL; 1527 uint16 newLength; 1528 bool newAllocated = false; 1529 1530 // If we have split an index node, we have to drop the first key 1531 // of the next node (which can also be the new key to insert). 1532 // The dropped key is also the one which has to be inserted in 1533 // the parent node, so we will set the "newKey" already here. 1534 if (node->OverflowLink() != BPLUSTREE_NULL) { 1535 if (in == keyIndex) { 1536 newKey = key; 1537 newLength = *_keyLength; 1538 1539 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1540 keyIndex--; 1541 } else { 1542 // If a key is dropped (is not the new key), we have to copy 1543 // it, because it would be lost if not. 1544 uint8* droppedKey = node->KeyAt(in, &newLength); 1545 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) 1546 > (uint8*)node + fNodeSize 1547 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1548 fStream->GetVolume()->Panic(); 1549 RETURN_ERROR(B_BAD_DATA); 1550 } 1551 newKey = (uint8*)malloc(newLength); 1552 if (newKey == NULL) 1553 return B_NO_MEMORY; 1554 1555 newAllocated = true; 1556 memcpy(newKey, droppedKey, newLength); 1557 1558 other->overflow_link = inKeyValues[in]; 1559 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); 1560 } 1561 } 1562 1563 // and now the same game for the other page and the rest of the keys 1564 // (but with memmove() instead of memcpy(), because they may overlap) 1565 1566 bytesBefore = bytesAfter = bytes = 0; 1567 out = 0; 1568 int32 skip = in; 1569 while (in < node->NumKeys() + 1) { 1570 if (in == keyIndex && !bytes) { 1571 // it's enough to set bytesBefore once here, because we do 1572 // not need to know the exact length of all keys in this 1573 // loop 1574 bytesBefore = in > skip 1575 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1576 bytes = *_keyLength; 1577 out++; 1578 } else { 1579 if (in < node->NumKeys()) { 1580 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1581 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); 1582 1583 if (bytes) { 1584 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1585 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); 1586 1587 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1588 - bytesBefore - bytes; 1589 } 1590 out++; 1591 } 1592 in++; 1593 } 1594 } 1595 1596 // adjust the byte counts (since we were a bit lazy in the loop) 1597 if (keyIndex < skip) 1598 bytesBefore = node->AllKeyLength() - total; 1599 1600 if (bytesBefore < 0 || bytesAfter < 0) { 1601 if (newAllocated) 1602 free(newKey); 1603 return B_BAD_DATA; 1604 } 1605 1606 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1607 // right link, and overflow link can stay the same 1608 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1609 + bytesAfter); 1610 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1611 1612 // array positions have changed 1613 outKeyLengths = node->KeyLengths(); 1614 outKeyValues = node->Values(); 1615 1616 // move the keys in the old node: the order is important here, 1617 // because we don't want to overwrite any contents 1618 1619 keys = keyIndex <= skip ? out : keyIndex - skip; 1620 keyIndex -= skip; 1621 in = out - keyIndex - 1; 1622 // Note: keyIndex and in will contain invalid values when the new key 1623 // went to the other node. But in this case bytes and bytesAfter are 1624 // 0 and subsequently we never use keyIndex and in. 1625 1626 if (bytesBefore) 1627 memmove(inKeys, inKeys + total, bytesBefore); 1628 if (bytesAfter) { 1629 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, 1630 bytesAfter); 1631 } 1632 1633 if (bytesBefore) 1634 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1635 if (bytesAfter) { 1636 // if byteAfter is > 0, keyIndex is larger than skip 1637 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, 1638 in * sizeof(uint16)); 1639 } 1640 1641 if (bytesBefore) 1642 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1643 if (bytesAfter) { 1644 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, 1645 in * sizeof(off_t)); 1646 } 1647 1648 if (bytes) { 1649 // finally, copy the newly inserted key (don't overwrite anything) 1650 memcpy(inKeys + bytesBefore, key, bytes); 1651 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1652 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1653 } 1654 1655 // Prepare the key that will be inserted in the parent node which 1656 // is either the dropped key or the last of the other node. 1657 // If it's the dropped key, "newKey" was already set earlier. 1658 1659 if (newKey == NULL) 1660 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1661 1662 memcpy(key, newKey, newLength); 1663 *_keyLength = newLength; 1664 *_value = otherOffset; 1665 1666 if (newAllocated) 1667 free(newKey); 1668 1669 return B_OK; 1670 } 1671 1672 1673 /*! This inserts a key into the tree. The changes made to the tree will 1674 all be part of the \a transaction. 1675 You need to have the inode write locked. 1676 */ 1677 status_t 1678 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength, 1679 off_t value) 1680 { 1681 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1682 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1683 RETURN_ERROR(B_BAD_VALUE); 1684 #ifdef DEBUG 1685 if (value < 0) 1686 panic("tried to insert invalid value %" B_PRId64 "!\n", value); 1687 #endif 1688 1689 ASSERT_WRITE_LOCKED_INODE(fStream); 1690 1691 Stack<node_and_key> stack; 1692 if (_SeekDown(stack, key, keyLength) != B_OK) 1693 RETURN_ERROR(B_ERROR); 1694 1695 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH]; 1696 memcpy(keyBuffer, key, keyLength); 1697 1698 node_and_key nodeAndKey; 1699 const bplustree_node* node; 1700 1701 CachedNode cached(this); 1702 while (stack.Pop(&nodeAndKey) 1703 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1704 #ifdef DEBUG 1705 NodeChecker checker(node, fNodeSize, "insert"); 1706 #endif 1707 if (node->IsLeaf()) { 1708 // first round, check for duplicate entries 1709 status_t status = _FindKey(node, key, keyLength, 1710 &nodeAndKey.keyIndex); 1711 1712 // is this a duplicate entry? 1713 if (status == B_OK) { 1714 if (fAllowDuplicates) { 1715 status = _InsertDuplicate(transaction, cached, node, 1716 nodeAndKey.keyIndex, value); 1717 if (status != B_OK) 1718 RETURN_ERROR(status); 1719 return B_OK; 1720 } 1721 1722 return B_NAME_IN_USE; 1723 } 1724 } 1725 1726 bplustree_node* writableNode = cached.MakeWritable(transaction); 1727 if (writableNode == NULL) 1728 return B_IO_ERROR; 1729 1730 // is the node big enough to hold the pair? 1731 if (int32(key_align(sizeof(bplustree_node) 1732 + writableNode->AllKeyLength() + keyLength) 1733 + (writableNode->NumKeys() + 1) * (sizeof(uint16) 1734 + sizeof(off_t))) < fNodeSize) { 1735 _InsertKey(writableNode, nodeAndKey.keyIndex, 1736 keyBuffer, keyLength, value); 1737 _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, 1738 nodeAndKey.keyIndex, 0, 1); 1739 1740 return B_OK; 1741 } else { 1742 CachedNode cachedNewRoot(this); 1743 CachedNode cachedOther(this); 1744 1745 // do we need to allocate a new root node? if so, then do 1746 // it now 1747 off_t newRoot = BPLUSTREE_NULL; 1748 if (nodeAndKey.nodeOffset == fHeader.RootNode()) { 1749 bplustree_node* root; 1750 status_t status = cachedNewRoot.Allocate(transaction, &root, 1751 &newRoot); 1752 if (status != B_OK) { 1753 // The tree is most likely corrupted! 1754 // But it's still sane at leaf level - we could set 1755 // a flag in the header that forces the tree to be 1756 // rebuild next time... 1757 // But since we will have journaling, that's not a big 1758 // problem anyway. 1759 RETURN_ERROR(status); 1760 } 1761 } 1762 1763 // reserve space for the other node 1764 bplustree_node* other; 1765 off_t otherOffset; 1766 status_t status = cachedOther.Allocate(transaction, &other, 1767 &otherOffset); 1768 if (status != B_OK) { 1769 cachedNewRoot.Free(transaction, newRoot); 1770 RETURN_ERROR(status); 1771 } 1772 1773 if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other, 1774 otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength, 1775 &value) != B_OK) { 1776 // free root node & other node here 1777 cachedOther.Free(transaction, otherOffset); 1778 cachedNewRoot.Free(transaction, newRoot); 1779 1780 RETURN_ERROR(B_ERROR); 1781 } 1782 #ifdef DEBUG 1783 checker.Check("insert split"); 1784 NodeChecker otherChecker(other, fNodeSize, "insert split other"); 1785 #endif 1786 1787 _UpdateIterators(nodeAndKey.nodeOffset, otherOffset, 1788 nodeAndKey.keyIndex, writableNode->NumKeys(), 1); 1789 1790 // update the right link of the node in the left of the new node 1791 if ((other = cachedOther.SetToWritable(transaction, 1792 other->LeftLink())) != NULL) { 1793 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1794 } 1795 1796 // create a new root if necessary 1797 if (newRoot != BPLUSTREE_NULL) { 1798 bplustree_node* root = cachedNewRoot.Node(); 1799 1800 _InsertKey(root, 0, keyBuffer, keyLength, 1801 writableNode->LeftLink()); 1802 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 1803 nodeAndKey.nodeOffset); 1804 1805 CachedNode cached(this); 1806 bplustree_header* header 1807 = cached.SetToWritableHeader(transaction); 1808 if (header == NULL) 1809 return B_IO_ERROR; 1810 1811 // finally, update header to point to the new root 1812 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1813 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32( 1814 header->MaxNumberOfLevels() + 1); 1815 1816 return B_OK; 1817 } 1818 } 1819 } 1820 RETURN_ERROR(B_ERROR); 1821 } 1822 1823 1824 /*! Removes the duplicate index/value pair from the tree. 1825 It's part of the private tree interface. 1826 */ 1827 status_t 1828 BPlusTree::_RemoveDuplicate(Transaction& transaction, 1829 const bplustree_node* node, CachedNode& cached, uint16 index, 1830 off_t value) 1831 { 1832 off_t* values = node->Values(); 1833 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1834 1835 CachedNode cachedDuplicate(this); 1836 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1837 bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction, 1838 duplicateOffset, false); 1839 if (duplicate == NULL) 1840 return B_IO_ERROR; 1841 1842 // if it's a duplicate fragment, remove the entry from there 1843 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1844 duplicate_array* array = duplicate->FragmentAt( 1845 bplustree_node::FragmentIndex(oldValue)); 1846 int32 arrayCount = array->Count(); 1847 1848 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) { 1849 FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %" 1850 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1851 (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, 1852 arrayCount, fStream->ID())); 1853 return B_BAD_DATA; 1854 } 1855 if (!array->Remove(value)) { 1856 FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node " 1857 "%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value, 1858 duplicateOffset, fStream->ID())); 1859 return B_ENTRY_NOT_FOUND; 1860 } 1861 1862 // remove the array from the fragment node if it is empty 1863 if (--arrayCount == 1) { 1864 // set the link to the remaining value 1865 if (cached.MakeWritable(transaction) == NULL) 1866 return B_IO_ERROR; 1867 1868 values[index] = array->values[0]; 1869 1870 // Remove the whole fragment node, if this was the only array, 1871 // otherwise free just the array 1872 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1873 status_t status = cachedDuplicate.Free(transaction, 1874 duplicateOffset); 1875 if (status != B_OK) 1876 return status; 1877 } else 1878 array->count = 0; 1879 } 1880 return B_OK; 1881 } 1882 1883 // Remove value from a duplicate node! 1884 1885 duplicate_array* array = NULL; 1886 int32 arrayCount = 0; 1887 1888 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1889 FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF 1890 ", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID())); 1891 return B_BAD_DATA; 1892 } 1893 1894 // Search the duplicate nodes until the entry could be found (and removed) 1895 while (duplicate != NULL) { 1896 array = duplicate->DuplicateArray(); 1897 arrayCount = array->Count(); 1898 1899 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { 1900 FATAL(("_RemoveDuplicate: Invalid array size in duplicate %" 1901 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1902 duplicateOffset, arrayCount, fStream->ID())); 1903 return B_BAD_DATA; 1904 } 1905 1906 if (array->Remove(value)) { 1907 arrayCount--; 1908 break; 1909 } 1910 1911 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1912 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1913 1914 cachedDuplicate.UnsetUnchanged(transaction); 1915 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, 1916 false); 1917 } 1918 if (duplicate == NULL) 1919 RETURN_ERROR(B_IO_ERROR); 1920 1921 // The entry got removed from the duplicate node, but we might want to free 1922 // it now in case it's empty 1923 1924 while (true) { 1925 off_t left = duplicate->LeftLink(); 1926 off_t right = duplicate->RightLink(); 1927 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1928 1929 if ((isLast && arrayCount == 1) || arrayCount == 0) { 1930 // Free empty duplicate page, link their siblings together, and 1931 // update the duplicate link if needed (ie. when we either remove 1932 // the last duplicate node or have a new first one) 1933 1934 if (left == BPLUSTREE_NULL) { 1935 // the duplicate link points to us 1936 if (cached.MakeWritable(transaction) == NULL) 1937 return B_IO_ERROR; 1938 1939 if (arrayCount == 1) { 1940 // This is the last node, and there is only one value left; 1941 // replace the duplicate link with that value, it's no 1942 // duplicate anymore 1943 values[index] = array->values[0]; 1944 } else { 1945 // Move the duplicate link to the next node 1946 values[index] = HOST_ENDIAN_TO_BFS_INT64( 1947 bplustree_node::MakeLink( 1948 BPLUSTREE_DUPLICATE_NODE, right)); 1949 } 1950 } 1951 1952 status_t status = cachedDuplicate.Free(transaction, 1953 duplicateOffset); 1954 if (status != B_OK) 1955 return status; 1956 1957 if (left != BPLUSTREE_NULL 1958 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, 1959 false)) != NULL) { 1960 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1961 1962 // If the next node is the last node, we need to free that node 1963 // and convert the duplicate entry back into a normal entry 1964 array = duplicate->DuplicateArray(); 1965 arrayCount = array->Count(); 1966 if (right == BPLUSTREE_NULL 1967 && duplicate->LeftLink() == BPLUSTREE_NULL 1968 && arrayCount <= NUM_FRAGMENT_VALUES) { 1969 duplicateOffset = left; 1970 continue; 1971 } 1972 } 1973 if (right != BPLUSTREE_NULL 1974 && (duplicate = cachedDuplicate.SetToWritable(transaction, 1975 right, false)) != NULL) { 1976 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1977 1978 // Again, we may need to turn the duplicate entry back into a 1979 // normal entry 1980 array = duplicate->DuplicateArray(); 1981 arrayCount = array->Count(); 1982 if (left == BPLUSTREE_NULL 1983 && duplicate->RightLink() == BPLUSTREE_NULL 1984 && arrayCount <= NUM_FRAGMENT_VALUES) { 1985 duplicateOffset = right; 1986 continue; 1987 } 1988 } 1989 return B_OK; 1990 } 1991 if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) { 1992 // If the number of entries fits in a duplicate fragment, then 1993 // either find a free fragment node, or convert this node to a 1994 // fragment node. 1995 CachedNode cachedOther(this); 1996 1997 bplustree_node* fragment = NULL; 1998 uint32 fragmentIndex = 0; 1999 off_t offset; 2000 if (_FindFreeDuplicateFragment(transaction, node, cachedOther, 2001 &offset, &fragment, &fragmentIndex) == B_OK) { 2002 // move to other node 2003 duplicate_array* target = fragment->FragmentAt(fragmentIndex); 2004 memcpy(target, array, 2005 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2006 2007 cachedDuplicate.Free(transaction, duplicateOffset); 2008 duplicateOffset = offset; 2009 } else { 2010 // convert node 2011 memmove(duplicate, array, 2012 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2013 memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 2014 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2015 } 2016 2017 if (cached.MakeWritable(transaction) == NULL) 2018 return B_IO_ERROR; 2019 2020 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 2021 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 2022 } 2023 return B_OK; 2024 } 2025 } 2026 2027 2028 /*! Removes the key with the given index from the specified node. 2029 Since it has to get the key from the node anyway (to obtain it's 2030 pointer), it's not needed to pass the key & its length, although 2031 the calling method (BPlusTree::Remove()) have this data. 2032 */ 2033 void 2034 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index) 2035 { 2036 // should never happen, but who knows? 2037 if (index > node->NumKeys() && node->NumKeys() > 0) { 2038 FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF 2039 "\n", index, fStream->ID())); 2040 return; 2041 } 2042 2043 off_t* values = node->Values(); 2044 2045 // if we would have to drop the overflow link, drop 2046 // the last key instead and update the overflow link 2047 // to the value of that one 2048 if (!node->IsLeaf() && index == node->NumKeys()) 2049 node->overflow_link = values[--index]; 2050 2051 uint16 length; 2052 uint8* key = node->KeyAt(index, &length); 2053 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize 2054 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2055 FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length, 2056 fStream->ID())); 2057 fStream->GetVolume()->Panic(); 2058 return; 2059 } 2060 2061 uint16* keyLengths = node->KeyLengths(); 2062 uint8* keys = node->Keys(); 2063 2064 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 2065 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64( 2066 node->AllKeyLength() - length); 2067 2068 off_t* newValues = node->Values(); 2069 uint16* newKeyLengths = node->KeyLengths(); 2070 2071 // move key data 2072 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 2073 2074 // move and update key lengths 2075 if (index > 0 && newKeyLengths != keyLengths) 2076 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 2077 for (uint16 i = index; i < node->NumKeys(); i++) { 2078 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 2079 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 2080 } 2081 2082 // move values 2083 if (index > 0) 2084 memmove(newValues, values, index * sizeof(off_t)); 2085 if (node->NumKeys() > index) { 2086 memmove(newValues + index, values + index + 1, 2087 (node->NumKeys() - index) * sizeof(off_t)); 2088 } 2089 } 2090 2091 2092 /*! Removes the specified key from the tree. The "value" parameter is only used 2093 for trees which allow duplicates, so you may safely ignore it. 2094 It's not an optional parameter, so at least you have to think about it. 2095 You need to have the inode write locked. 2096 */ 2097 status_t 2098 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength, 2099 off_t value) 2100 { 2101 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2102 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 2103 RETURN_ERROR(B_BAD_VALUE); 2104 2105 ASSERT_WRITE_LOCKED_INODE(fStream); 2106 2107 Stack<node_and_key> stack; 2108 if (_SeekDown(stack, key, keyLength) != B_OK) 2109 RETURN_ERROR(B_ERROR); 2110 2111 node_and_key nodeAndKey; 2112 const bplustree_node* node; 2113 2114 CachedNode cached(this); 2115 while (stack.Pop(&nodeAndKey) 2116 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 2117 #ifdef DEBUG 2118 NodeChecker checker(node, fNodeSize, "remove"); 2119 #endif 2120 if (node->IsLeaf()) { 2121 // first round, check for duplicate entries 2122 status_t status = _FindKey(node, key, keyLength, 2123 &nodeAndKey.keyIndex); 2124 if (status != B_OK) 2125 RETURN_ERROR(status); 2126 2127 // Is this a duplicate entry? 2128 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 2129 node->Values()[nodeAndKey.keyIndex]))) { 2130 if (fAllowDuplicates) { 2131 return _RemoveDuplicate(transaction, node, cached, 2132 nodeAndKey.keyIndex, value); 2133 } 2134 2135 FATAL(("dupliate node found where no duplicates are " 2136 "allowed, inode %" B_PRIdOFF "!\n", fStream->ID())); 2137 RETURN_ERROR(B_ERROR); 2138 } else { 2139 if (node->Values()[nodeAndKey.keyIndex] != value) 2140 return B_ENTRY_NOT_FOUND; 2141 2142 // If we will remove the last key, the iterator will be set 2143 // to the next node after the current - if there aren't any 2144 // more nodes, we need a way to prevent the TreeIterators to 2145 // touch the old node again, we use BPLUSTREE_FREE for this 2146 off_t next = node->RightLink() == BPLUSTREE_NULL 2147 ? BPLUSTREE_FREE : node->RightLink(); 2148 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 2149 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 2150 } 2151 } 2152 2153 bplustree_node* writableNode = cached.MakeWritable(transaction); 2154 if (writableNode == NULL) 2155 return B_IO_ERROR; 2156 2157 // if it's an empty root node, we have to convert it 2158 // to a leaf node by dropping the overflow link, or, 2159 // if it's already a leaf node, just empty it 2160 if (nodeAndKey.nodeOffset == fHeader.RootNode() 2161 && (node->NumKeys() == 0 2162 || (node->NumKeys() == 1 && node->IsLeaf()))) { 2163 writableNode->overflow_link 2164 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2165 writableNode->all_key_count = 0; 2166 writableNode->all_key_length = 0; 2167 2168 // if we've made a leaf node out of the root node, we need 2169 // to reset the maximum number of levels in the header 2170 if (fHeader.MaxNumberOfLevels() != 1) { 2171 CachedNode cached(this); 2172 bplustree_header* header 2173 = cached.SetToWritableHeader(transaction); 2174 if (header == NULL) 2175 return B_IO_ERROR; 2176 2177 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 2178 } 2179 return B_OK; 2180 } 2181 2182 // if there is only one key left, we don't have to remove 2183 // it, we can just dump the node (index nodes still have 2184 // the overflow link, so we have to drop the last key) 2185 if (writableNode->NumKeys() > 1 2186 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { 2187 _RemoveKey(writableNode, nodeAndKey.keyIndex); 2188 return B_OK; 2189 } 2190 2191 // when we are here, we can just free the node, but 2192 // we have to update the right/left link of the 2193 // siblings first 2194 CachedNode otherCached(this); 2195 bplustree_node* other = otherCached.SetToWritable(transaction, 2196 writableNode->LeftLink()); 2197 if (other != NULL) 2198 other->right_link = writableNode->right_link; 2199 2200 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) 2201 != NULL) { 2202 other->left_link = writableNode->left_link; 2203 } 2204 2205 cached.Free(transaction, nodeAndKey.nodeOffset); 2206 } 2207 RETURN_ERROR(B_ERROR); 2208 } 2209 2210 2211 /*! Replaces the value for the key in the tree. 2212 Returns B_OK if the key could be found and its value replaced, 2213 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 2214 to indicate that something went terribly wrong. 2215 Note that this doesn't work with duplicates - it will just 2216 return B_BAD_TYPE if you call this function on a tree where 2217 duplicates are allowed. 2218 You need to have the inode write locked. 2219 */ 2220 status_t 2221 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, 2222 off_t value) 2223 { 2224 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2225 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2226 || key == NULL) 2227 RETURN_ERROR(B_BAD_VALUE); 2228 2229 if (fAllowDuplicates) 2230 RETURN_ERROR(B_BAD_TYPE); 2231 2232 ASSERT_WRITE_LOCKED_INODE(fStream); 2233 2234 off_t nodeOffset = fHeader.RootNode(); 2235 CachedNode cached(this); 2236 const bplustree_node* node; 2237 2238 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2239 uint16 keyIndex = 0; 2240 off_t nextOffset; 2241 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2242 &nextOffset); 2243 2244 if (node->OverflowLink() == BPLUSTREE_NULL) { 2245 if (status == B_OK) { 2246 bplustree_node* writableNode = cached.MakeWritable(transaction); 2247 if (writableNode != NULL) { 2248 writableNode->Values()[keyIndex] 2249 = HOST_ENDIAN_TO_BFS_INT64(value); 2250 } else 2251 status = B_IO_ERROR; 2252 } 2253 2254 return status; 2255 } else if (nextOffset == nodeOffset) 2256 RETURN_ERROR(B_ERROR); 2257 2258 nodeOffset = nextOffset; 2259 } 2260 RETURN_ERROR(B_ERROR); 2261 } 2262 #endif // !_BOOT_MODE 2263 2264 2265 /*! Searches the key in the tree, and stores the offset found in _value, 2266 if successful. 2267 It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack 2268 while it descends the tree. 2269 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not. 2270 It can also return other errors to indicate that something went wrong. 2271 Note that this doesn't work with duplicates - it will just return 2272 B_BAD_TYPE if you call this function on a tree where duplicates are 2273 allowed. 2274 You need to have the inode read or write locked. 2275 */ 2276 status_t 2277 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value) 2278 { 2279 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2280 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2281 || key == NULL) 2282 RETURN_ERROR(B_BAD_VALUE); 2283 2284 if (fAllowDuplicates) 2285 RETURN_ERROR(B_BAD_TYPE); 2286 2287 #if !_BOOT_MODE 2288 ASSERT_READ_LOCKED_INODE(fStream); 2289 #endif 2290 2291 off_t nodeOffset = fHeader.RootNode(); 2292 CachedNode cached(this); 2293 const bplustree_node* node; 2294 2295 #ifdef DEBUG 2296 int32 levels = 0; 2297 #endif 2298 2299 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2300 uint16 keyIndex = 0; 2301 off_t nextOffset; 2302 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2303 &nextOffset); 2304 2305 #ifdef DEBUG 2306 levels++; 2307 #endif 2308 if (node->OverflowLink() == BPLUSTREE_NULL) { 2309 if (status == B_OK && _value != NULL) 2310 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 2311 2312 #ifdef DEBUG 2313 if (levels != (int32)fHeader.MaxNumberOfLevels()) 2314 DEBUGGER(("levels don't match")); 2315 #endif 2316 return status; 2317 } else if (nextOffset == nodeOffset) 2318 RETURN_ERROR(B_ERROR); 2319 2320 nodeOffset = nextOffset; 2321 } 2322 FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %" 2323 B_PRIdOFF "\n", nodeOffset, fStream->ID())); 2324 RETURN_ERROR(B_ERROR); 2325 } 2326 2327 2328 #if !_BOOT_MODE 2329 status_t 2330 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset, 2331 const uint8* largestKey, uint16 largestKeyLength, 2332 const bplustree_node* parent) 2333 { 2334 if (parent->CheckIntegrity(fNodeSize) != B_OK) { 2335 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check " 2336 "failed!\n", fStream->ID(), offset); 2337 check.FoundError(); 2338 return B_OK; 2339 } 2340 if (level >= fHeader.MaxNumberOfLevels()) { 2341 dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF 2342 "!\n", fStream->ID(), offset); 2343 check.FoundError(); 2344 return B_OK; 2345 } 2346 2347 check.SetLevel(level); 2348 2349 if (check.Visited(offset)) { 2350 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n", 2351 fStream->ID(), offset); 2352 check.FoundError(); 2353 return B_OK; 2354 } 2355 2356 check.SetVisited(offset); 2357 2358 uint32 count = parent->NumKeys(); 2359 off_t* values = parent->Values(); 2360 off_t lastOffset = check.PreviousOffset(level); 2361 CachedNode cached(this); 2362 2363 for (uint32 i = 0; i < count; i++) { 2364 uint16 keyLength; 2365 uint8* key = parent->KeyAt(i, &keyLength); 2366 if (largestKey != NULL) { 2367 int result = _CompareKeys(key, keyLength, largestKey, 2368 largestKeyLength); 2369 if (result > 0 || (result == 0 && i != count - 1)) { 2370 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %" 2371 B_PRIu32 " larger than it should!\n", 2372 fStream->ID(), offset, i); 2373 check.FoundError(); 2374 } 2375 } 2376 2377 off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]); 2378 if (bplustree_node::IsDuplicate(childOffset)) { 2379 // Walk the duplicate nodes 2380 off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset); 2381 off_t lastDuplicateOffset = BPLUSTREE_NULL; 2382 2383 while (duplicateOffset != BPLUSTREE_NULL) { 2384 const bplustree_node* node; 2385 status_t status = cached.SetTo(duplicateOffset, &node, false); 2386 if (status != B_OK) { 2387 if (status == B_IO_ERROR) 2388 return B_IO_ERROR; 2389 2390 dprintf("inode %" B_PRIdOFF ": duplicate node at %" 2391 B_PRIdOFF " could not be read: %s\n", fStream->ID(), 2392 duplicateOffset, strerror(status)); 2393 check.FoundError(); 2394 break; 2395 } 2396 2397 bool isFragmentNode = bplustree_node::LinkType(childOffset) 2398 == BPLUSTREE_DUPLICATE_FRAGMENT; 2399 bool isKnownFragment = isFragmentNode 2400 && check.VisitedFragment(duplicateOffset); 2401 2402 if (!isKnownFragment && check.Visited(duplicateOffset)) { 2403 dprintf("inode %" B_PRIdOFF ": %s node at %" 2404 B_PRIdOFF " already visited, referenced from %" 2405 B_PRIdOFF "!\n", fStream->ID(), 2406 isFragmentNode ? "fragment" : "duplicate", 2407 duplicateOffset, offset); 2408 check.FoundError(); 2409 break; 2410 } 2411 2412 // Fragment nodes may be visited more than once from different 2413 // places 2414 if (!check.Visited(duplicateOffset)) 2415 check.SetVisited(duplicateOffset); 2416 if (!isKnownFragment && isFragmentNode) 2417 check.SetVisitedFragment(duplicateOffset); 2418 2419 duplicate_array* array; 2420 int32 minSize; 2421 int32 maxSize; 2422 if (isFragmentNode) { 2423 array = node->FragmentAt( 2424 bplustree_node::FragmentIndex(childOffset)); 2425 minSize = 2; 2426 maxSize = NUM_FRAGMENT_VALUES; 2427 } else { 2428 array = node->DuplicateArray(); 2429 minSize = 1; 2430 maxSize = NUM_DUPLICATE_VALUES; 2431 } 2432 int32 arrayCount = array->Count(); 2433 2434 if (arrayCount < minSize || arrayCount > maxSize) { 2435 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2436 " has invalid array size %" B_PRId32 "!\n", 2437 fStream->ID(), duplicateOffset, arrayCount); 2438 check.FoundError(); 2439 } else { 2440 // Simple check if the values in the array may be valid 2441 for (int32 j = 0; j < arrayCount; j++) { 2442 if (!fStream->GetVolume()->IsValidInodeBlock( 2443 array->ValueAt(j))) { 2444 dprintf("inode %" B_PRIdOFF ": duplicate at %" 2445 B_PRIdOFF " contains invalid block %" B_PRIdOFF 2446 " at %" B_PRId32 "!\n", fStream->ID(), 2447 duplicateOffset, array->ValueAt(j), j); 2448 check.FoundError(); 2449 break; 2450 } 2451 } 2452 } 2453 2454 // A fragment node is not linked (and does not have valid links) 2455 if (isFragmentNode) 2456 break; 2457 2458 if (node->LeftLink() != lastDuplicateOffset) { 2459 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2460 " has wrong left link %" B_PRIdOFF ", expected %" 2461 B_PRIdOFF "!\n", fStream->ID(), duplicateOffset, 2462 node->LeftLink(), lastDuplicateOffset); 2463 check.FoundError(); 2464 } 2465 2466 lastDuplicateOffset = duplicateOffset; 2467 duplicateOffset = node->RightLink(); 2468 } 2469 } else if (!parent->IsLeaf()) { 2470 // Test a regular child node recursively 2471 off_t nextOffset = parent->OverflowLink(); 2472 if (i < count - 1) 2473 nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]); 2474 2475 if (i == 0 && lastOffset != BPLUSTREE_NULL) { 2476 // Test right link of the previous node 2477 const bplustree_node* previous = cached.SetTo(lastOffset, true); 2478 if (previous == NULL) 2479 return B_IO_ERROR; 2480 2481 if (previous->RightLink() != childOffset) { 2482 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2483 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2484 "!\n", fStream->ID(), lastOffset, previous->RightLink(), 2485 childOffset); 2486 check.FoundError(); 2487 } 2488 } 2489 2490 status_t status = _ValidateChild(check, cached, level, childOffset, 2491 lastOffset, nextOffset, key, keyLength); 2492 if (status != B_OK) 2493 return status; 2494 } else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) { 2495 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains " 2496 "invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n", 2497 fStream->ID(), offset, childOffset, i); 2498 check.FoundError(); 2499 } 2500 2501 lastOffset = childOffset; 2502 } 2503 2504 if (parent->OverflowLink() != BPLUSTREE_NULL) { 2505 off_t childOffset = parent->OverflowLink(); 2506 status_t status = _ValidateChild(check, cached, level, childOffset, 2507 lastOffset, 0, NULL, 0); 2508 if (status != B_OK) 2509 return status; 2510 2511 lastOffset = childOffset; 2512 } 2513 2514 check.SetPreviousOffset(level, lastOffset); 2515 return B_OK; 2516 } 2517 2518 2519 status_t 2520 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level, 2521 off_t offset, off_t lastOffset, off_t nextOffset, 2522 const uint8* key, uint16 keyLength) 2523 { 2524 const bplustree_node* node; 2525 status_t status = cached.SetTo(offset, &node, true); 2526 if (status != B_OK) { 2527 if (status == B_IO_ERROR) 2528 return B_IO_ERROR; 2529 2530 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be " 2531 "read: %s\n", fStream->ID(), offset, strerror(status)); 2532 check.FoundError(); 2533 return B_OK; 2534 } 2535 2536 if (node->LeftLink() != lastOffset) { 2537 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2538 "wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF 2539 "!\n", fStream->ID(), offset, node->LeftLink(), lastOffset); 2540 check.FoundError(); 2541 } 2542 2543 if (nextOffset != 0 && node->RightLink() != nextOffset) { 2544 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2545 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2546 "!\n", fStream->ID(), offset, node->RightLink(), nextOffset); 2547 check.FoundError(); 2548 } 2549 2550 return _ValidateChildren(check, level + 1, offset, key, keyLength, node); 2551 } 2552 #endif // !_BOOT_MODE 2553 2554 2555 // #pragma mark - 2556 2557 2558 TreeIterator::TreeIterator(BPlusTree* tree) 2559 : 2560 fTree(tree), 2561 fCurrentNodeOffset(BPLUSTREE_NULL) 2562 { 2563 #if !_BOOT_MODE 2564 tree->_AddIterator(this); 2565 #endif 2566 } 2567 2568 2569 TreeIterator::~TreeIterator() 2570 { 2571 #if !_BOOT_MODE 2572 if (fTree) 2573 fTree->_RemoveIterator(this); 2574 #endif 2575 } 2576 2577 2578 status_t 2579 TreeIterator::Goto(int8 to) 2580 { 2581 if (fTree == NULL || fTree->fStream == NULL) 2582 RETURN_ERROR(B_BAD_VALUE); 2583 2584 #if !_BOOT_MODE 2585 // lock access to stream 2586 InodeReadLocker locker(fTree->fStream); 2587 #endif 2588 2589 off_t nodeOffset = fTree->fHeader.RootNode(); 2590 CachedNode cached(fTree); 2591 const bplustree_node* node; 2592 2593 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2594 // is the node a leaf node? 2595 if (node->OverflowLink() == BPLUSTREE_NULL) { 2596 fCurrentNodeOffset = nodeOffset; 2597 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 2598 fDuplicateNode = BPLUSTREE_NULL; 2599 2600 return B_OK; 2601 } 2602 2603 // get the next node offset depending on the direction (and if there 2604 // are any keys in that node at all) 2605 off_t nextOffset; 2606 if (to == BPLUSTREE_END || node->all_key_count == 0) 2607 nextOffset = node->OverflowLink(); 2608 else { 2609 if (node->AllKeyLength() > fTree->fNodeSize 2610 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 2611 - 8 * node->NumKeys()) 2612 RETURN_ERROR(B_ERROR); 2613 2614 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 2615 } 2616 if (nextOffset == nodeOffset) 2617 break; 2618 2619 nodeOffset = nextOffset; 2620 } 2621 FATAL(("%s fails\n", __FUNCTION__)); 2622 2623 RETURN_ERROR(B_ERROR); 2624 } 2625 2626 2627 /*! Iterates through the tree in the specified direction. 2628 When it iterates through duplicates, the "key" is only updated for the 2629 first entry - if you need to know when this happens, use the "duplicate" 2630 parameter which is 0 for no duplicate, 1 for the first, and 2 for all 2631 the other duplicates. 2632 That's not too nice, but saves the 256 bytes that would be needed to 2633 store the last key - if this will ever become an issue, it will be 2634 easy to change. 2635 The other advantage of this is, that the queries can skip all duplicates 2636 at once when they are not relevant to them. 2637 */ 2638 status_t 2639 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength, 2640 uint16 maxLength, off_t* value, uint16* duplicate) 2641 { 2642 if (fTree == NULL) 2643 return B_INTERRUPTED; 2644 2645 bool forward = direction == BPLUSTREE_FORWARD; 2646 2647 if (fCurrentNodeOffset == BPLUSTREE_NULL 2648 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK) 2649 RETURN_ERROR(B_ERROR); 2650 2651 // if the tree was emptied since the last call 2652 if (fCurrentNodeOffset == BPLUSTREE_FREE) 2653 return B_ENTRY_NOT_FOUND; 2654 2655 #if !_BOOT_MODE 2656 // lock access to stream 2657 InodeReadLocker locker(fTree->fStream); 2658 #endif 2659 2660 CachedNode cached(fTree); 2661 const bplustree_node* node; 2662 2663 if (fDuplicateNode != BPLUSTREE_NULL) { 2664 // regardless of traverse direction the duplicates are always presented 2665 // in the same order; since they are all considered as equal, this 2666 // shouldn't cause any problems 2667 2668 if (!fIsFragment || fDuplicate < fNumDuplicates) { 2669 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2670 false); 2671 } else 2672 node = NULL; 2673 2674 if (node != NULL) { 2675 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 2676 // If the node is out of duplicates, we go directly to the next 2677 // one 2678 fDuplicateNode = node->RightLink(); 2679 if (fDuplicateNode != BPLUSTREE_NULL 2680 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 2681 fNumDuplicates = node->CountDuplicates(fDuplicateNode, 2682 false); 2683 fDuplicate = 0; 2684 } 2685 } 2686 if (fDuplicate < fNumDuplicates) { 2687 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, 2688 fDuplicate++); 2689 if (duplicate) 2690 *duplicate = 2; 2691 return B_OK; 2692 } 2693 } 2694 fDuplicateNode = BPLUSTREE_NULL; 2695 } 2696 2697 off_t savedNodeOffset = fCurrentNodeOffset; 2698 int32 savedKey = fCurrentKey; 2699 2700 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 2701 RETURN_ERROR(B_ERROR); 2702 2703 if (duplicate) 2704 *duplicate = 0; 2705 2706 fCurrentKey += direction; 2707 2708 // is the current key in the current node? 2709 while ((forward && fCurrentKey >= node->NumKeys()) 2710 || (!forward && fCurrentKey < 0)) { 2711 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2712 2713 // are there any more nodes? 2714 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2715 node = cached.SetTo(fCurrentNodeOffset); 2716 if (!node) 2717 RETURN_ERROR(B_ERROR); 2718 2719 // reset current key 2720 fCurrentKey = forward ? 0 : node->NumKeys() - 1; 2721 } else { 2722 // there are no nodes left, so turn back to the last key 2723 fCurrentNodeOffset = savedNodeOffset; 2724 fCurrentKey = savedKey; 2725 2726 return B_ENTRY_NOT_FOUND; 2727 } 2728 } 2729 2730 if (node->all_key_count == 0) 2731 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2732 2733 uint16 length = 0; 2734 uint8* keyStart = node->KeyAt(fCurrentKey, &length); 2735 if (keyStart + length + sizeof(off_t) + sizeof(uint16) 2736 > (uint8*)node + fTree->fNodeSize 2737 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2738 #if !_BOOT_MODE 2739 fTree->fStream->GetVolume()->Panic(); 2740 #endif 2741 RETURN_ERROR(B_BAD_DATA); 2742 } 2743 2744 // include the termination for string types 2745 bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE; 2746 if (length + (needsTermination ? 1 : 0) > maxLength) { 2747 if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) { 2748 // The buffer is too small, restore the last key and return 2749 // an error 2750 fCurrentNodeOffset = savedNodeOffset; 2751 fCurrentKey = savedKey; 2752 return B_BUFFER_OVERFLOW; 2753 } 2754 2755 // Always cut off strings at the maximum buffer size, and leave 2756 // room for a terminating null byte. 2757 // This allows to handle larger key sizes gracefully. 2758 length = maxLength - 1; 2759 } 2760 2761 memcpy(key, keyStart, length); 2762 2763 if (needsTermination) 2764 ((char*)key)[length] = '\0'; 2765 2766 *keyLength = length; 2767 2768 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2769 2770 // duplicate fragments? 2771 uint8 type = bplustree_node::LinkType(offset); 2772 if (type == BPLUSTREE_DUPLICATE_FRAGMENT 2773 || type == BPLUSTREE_DUPLICATE_NODE) { 2774 fDuplicateNode = offset; 2775 2776 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2777 false); 2778 if (node == NULL) 2779 RETURN_ERROR(B_ERROR); 2780 2781 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 2782 2783 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 2784 if (fNumDuplicates) { 2785 offset = node->DuplicateAt(offset, fIsFragment, 0); 2786 fDuplicate = 1; 2787 if (duplicate) 2788 *duplicate = 1; 2789 } else { 2790 // Shouldn't happen, but we're dealing here with potentially 2791 // corrupt disks... 2792 fDuplicateNode = BPLUSTREE_NULL; 2793 offset = 0; 2794 } 2795 } 2796 *value = offset; 2797 2798 return B_OK; 2799 } 2800 2801 2802 /*! This is more or less a copy of BPlusTree::Find() - but it just 2803 sets the current position in the iterator, regardless of if the 2804 key could be found or not. 2805 */ 2806 status_t 2807 TreeIterator::Find(const uint8* key, uint16 keyLength) 2808 { 2809 if (fTree == NULL) 2810 return B_INTERRUPTED; 2811 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2812 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2813 || key == NULL) 2814 RETURN_ERROR(B_BAD_VALUE); 2815 2816 #if !_BOOT_MODE 2817 // lock access to stream 2818 InodeReadLocker locker(fTree->fStream); 2819 #endif 2820 2821 off_t nodeOffset = fTree->fHeader.RootNode(); 2822 2823 CachedNode cached(fTree); 2824 const bplustree_node* node; 2825 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2826 uint16 keyIndex = 0; 2827 off_t nextOffset; 2828 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, 2829 &nextOffset); 2830 2831 if (node->OverflowLink() == BPLUSTREE_NULL) { 2832 fCurrentNodeOffset = nodeOffset; 2833 fCurrentKey = keyIndex - 1; 2834 fDuplicateNode = BPLUSTREE_NULL; 2835 2836 return status; 2837 } else if (nextOffset == nodeOffset) 2838 RETURN_ERROR(B_ERROR); 2839 2840 nodeOffset = nextOffset; 2841 } 2842 RETURN_ERROR(B_ERROR); 2843 } 2844 2845 2846 void 2847 TreeIterator::SkipDuplicates() 2848 { 2849 fDuplicateNode = BPLUSTREE_NULL; 2850 } 2851 2852 2853 void 2854 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, 2855 uint16 splitAt, int8 change) 2856 { 2857 if (offset != fCurrentNodeOffset) 2858 return; 2859 2860 if (nextOffset != BPLUSTREE_NULL) { 2861 fCurrentNodeOffset = nextOffset; 2862 if (splitAt <= fCurrentKey) { 2863 fCurrentKey -= splitAt; 2864 keyIndex -= splitAt; 2865 } 2866 } 2867 2868 // Adjust fCurrentKey to point to the same key as before. 2869 // Note, that if a key is inserted at the current position 2870 // it won't be included in this tree transition. 2871 if (keyIndex <= fCurrentKey) 2872 fCurrentKey += change; 2873 2874 // TODO: duplicate handling! 2875 } 2876 2877 2878 void 2879 TreeIterator::Stop() 2880 { 2881 fTree = NULL; 2882 } 2883 2884 2885 #ifdef DEBUG 2886 void 2887 TreeIterator::Dump() 2888 { 2889 __out("TreeIterator at %p:\n", this); 2890 __out("\tfTree = %p\n", fTree); 2891 __out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset); 2892 __out("\tfCurrentKey = %d\n", (int)fCurrentKey); 2893 __out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n", 2894 bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, 2895 fDuplicateNode); 2896 __out("\tfDuplicate = %u\n", fDuplicate); 2897 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2898 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); 2899 } 2900 #endif 2901 2902 2903 // #pragma mark - 2904 2905 2906 bool 2907 bplustree_header::IsValid() const 2908 { 2909 return Magic() == BPLUSTREE_MAGIC 2910 && (RootNode() % NodeSize()) == 0 2911 && IsValidLink(RootNode()) 2912 && IsValidLink(FreeNode()); 2913 } 2914 2915 2916 // #pragma mark - 2917 2918 2919 void 2920 bplustree_node::Initialize() 2921 { 2922 left_link = right_link = overflow_link 2923 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2924 all_key_count = 0; 2925 all_key_length = 0; 2926 } 2927 2928 2929 uint8* 2930 bplustree_node::KeyAt(int32 index, uint16* keyLength) const 2931 { 2932 if (index < 0 || index > NumKeys()) 2933 return NULL; 2934 2935 uint8* keyStart = Keys(); 2936 uint16* keyLengths = KeyLengths(); 2937 2938 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2939 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2940 if (index > 0) 2941 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2942 2943 return keyStart; 2944 } 2945 2946 2947 uint8 2948 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2949 { 2950 // the duplicate fragment handling is currently hard-coded to a node size 2951 // of 1024 bytes - with future versions of BFS, this may be a problem 2952 2953 if (isFragment) { 2954 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2955 2956 return ((off_t*)this)[fragment]; 2957 } 2958 return OverflowLink(); 2959 } 2960 2961 2962 off_t 2963 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2964 { 2965 uint32 start; 2966 if (isFragment) 2967 start = 8 * ((uint64)offset & 0x3ff); 2968 else 2969 start = 2; 2970 2971 return ((off_t*)this)[start + 1 + index]; 2972 } 2973 2974 2975 /*! Although the name suggests it, this function doesn't return the real 2976 used fragment count; at least, it can only count to two: it returns 2977 0, if there is no fragment used, 1 if there is only one fragment 2978 used, and 2 if there are at least 2 fragments used. 2979 */ 2980 uint32 2981 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2982 { 2983 uint32 used = 0; 2984 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2985 duplicate_array* array = FragmentAt(i); 2986 if (array->Count() > 0 && ++used > 1) 2987 return used; 2988 } 2989 return used; 2990 } 2991 2992 2993 status_t 2994 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2995 { 2996 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2997 DEBUGGER(("invalid node: key/length count")); 2998 2999 for (int32 i = 0; i < NumKeys(); i++) { 3000 uint16 length = 0; 3001 uint8* key = KeyAt(i, &length); 3002 if (key + length + sizeof(off_t) + sizeof(uint16) 3003 > (uint8*)this + nodeSize 3004 || length > BPLUSTREE_MAX_KEY_LENGTH) { 3005 dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i); 3006 return B_BAD_DATA; 3007 } 3008 if (Values()[i] == -1) { 3009 dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values " 3010 "corrupted\n", this, (int)i, Values()[i]); 3011 return B_BAD_DATA; 3012 } 3013 } 3014 return B_OK; 3015 } 3016 3017 3018 // #pragma mark - 3019 3020 3021 #if !_BOOT_MODE 3022 BitmapArray::BitmapArray(size_t numBits) 3023 { 3024 fSize = (numBits + 7) / 8; 3025 fBitmap = (uint8*)calloc(fSize, 1); 3026 fCountSet = 0; 3027 } 3028 3029 3030 BitmapArray::~BitmapArray() 3031 { 3032 free(fBitmap); 3033 } 3034 3035 3036 status_t 3037 BitmapArray::InitCheck() const 3038 { 3039 return fBitmap != NULL ? B_OK : B_NO_MEMORY; 3040 } 3041 3042 3043 bool 3044 BitmapArray::IsSet(size_t index) const 3045 { 3046 uint32 byteIndex = index / 8; 3047 if (byteIndex >= fSize) 3048 return false; 3049 3050 return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0; 3051 } 3052 3053 3054 void 3055 BitmapArray::Set(size_t index, bool set) 3056 { 3057 uint32 byteIndex = index / 8; 3058 if (byteIndex >= fSize) 3059 return; 3060 3061 if (set) { 3062 fBitmap[byteIndex] |= 1UL << (index & 0x7); 3063 fCountSet++; 3064 } else { 3065 fBitmap[byteIndex] &= ~(1UL << (index & 0x7)); 3066 fCountSet--; 3067 } 3068 } 3069 #endif // !_BOOT_MODE 3070 3071 3072 // #pragma mark - 3073 3074 3075 bool 3076 duplicate_array::_FindInternal(off_t value, int32& index) const 3077 { 3078 int32 min = 0, max = Count() - 1; 3079 off_t cmp; 3080 while (min <= max) { 3081 index = (min + max) / 2; 3082 3083 cmp = ValueAt(index) - value; 3084 if (cmp < 0) 3085 min = index + 1; 3086 else if (cmp > 0) 3087 max = index - 1; 3088 else 3089 return true; 3090 } 3091 return false; 3092 } 3093 3094 3095 void 3096 duplicate_array::Insert(off_t value) 3097 { 3098 // if there are more than 8 values in this array, use a 3099 // binary search, if not, just iterate linearly to find 3100 // the insertion point 3101 int32 size = Count(); 3102 int32 i = 0; 3103 if (size > 8 ) { 3104 if (!_FindInternal(value, i) && ValueAt(i) <= value) 3105 i++; 3106 } else { 3107 for (i = 0; i < size; i++) { 3108 if (ValueAt(i) > value) 3109 break; 3110 } 3111 } 3112 3113 memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t)); 3114 values[i] = HOST_ENDIAN_TO_BFS_INT64(value); 3115 count = HOST_ENDIAN_TO_BFS_INT64(size + 1); 3116 } 3117 3118 3119 bool 3120 duplicate_array::Remove(off_t value) 3121 { 3122 int32 index = Find(value); 3123 if (index == -1) 3124 return false; 3125 3126 int32 newSize = Count() - 1; 3127 memmove(&values[index], &values[index + 1], 3128 (newSize - index) * sizeof(off_t)); 3129 count = HOST_ENDIAN_TO_BFS_INT64(newSize); 3130 3131 return true; 3132 } 3133 3134 3135 #if _BOOT_MODE 3136 } // namespace BFS 3137 #endif 3138