1 /* 2 * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 * 5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares 6 * no code but achieves binary compatibility with the on disk format. 7 */ 8 9 10 //! B+Tree implementation 11 12 13 #include "BPlusTree.h" 14 15 #include <file_systems/QueryParserUtils.h> 16 17 #include "Debug.h" 18 #include "Utility.h" 19 20 #if !_BOOT_MODE 21 # include "Inode.h" 22 #else 23 # include "Stream.h" 24 25 // BFS::Stream from the bootloader has the same API as Inode. 26 # define Inode BFS::Stream 27 28 # define strerror(x) "error message unavailable" 29 30 namespace BFS { 31 #endif 32 33 34 /*! Simple array used for the duplicate handling in the B+Tree. This is an 35 on disk structure. 36 */ 37 struct duplicate_array { 38 off_t count; 39 off_t values[0]; 40 41 inline bool IsEmpty() const 42 { 43 return count == 0; 44 } 45 46 inline int32 Count() const 47 { 48 return (int32)BFS_ENDIAN_TO_HOST_INT64(count); 49 } 50 51 inline off_t ValueAt(uint32 index) const 52 { 53 return BFS_ENDIAN_TO_HOST_INT64(values[index]); 54 } 55 56 inline void SetValueAt(uint32 index, off_t value) 57 { 58 values[index] = HOST_ENDIAN_TO_BFS_INT64(value); 59 } 60 61 inline int32 Find(off_t value) const 62 { 63 int32 i; 64 return _FindInternal(value, i) ? i : -1; 65 } 66 67 void Insert(off_t value); 68 bool Remove(off_t value); 69 70 private: 71 bool _FindInternal(off_t value, int32& index) const; 72 } _PACKED; 73 74 75 #ifdef DEBUG 76 class NodeChecker { 77 public: 78 NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text) 79 : 80 fNode(node), 81 fSize(nodeSize), 82 fText(text) 83 { 84 Check("integrity check failed on construction."); 85 } 86 87 ~NodeChecker() 88 { 89 Check("integrity check failed on destruction."); 90 } 91 92 void 93 Check(const char* message) 94 { 95 if (fNode->CheckIntegrity(fSize) != B_OK) { 96 dprintf("%s: %s\n", fText, message); 97 DEBUGGER(("NodeChecker integrity check failed!")); 98 } 99 } 100 101 private: 102 const bplustree_node* fNode; 103 int32 fSize; 104 const char* fText; 105 }; 106 #endif // DEBUG 107 108 109 #if !_BOOT_MODE 110 class BitmapArray { 111 public: 112 BitmapArray(size_t numBits); 113 ~BitmapArray(); 114 115 status_t InitCheck() const; 116 117 bool IsSet(size_t index) const; 118 void Set(size_t index, bool set); 119 120 size_t CountSet() const { return fCountSet; } 121 122 private: 123 uint8* fBitmap; 124 size_t fSize; 125 size_t fCountSet; 126 }; 127 128 129 struct TreeCheck { 130 TreeCheck(BPlusTree* tree) 131 : 132 fLevelCount(0), 133 fFreeCount(0), 134 fNodeSize(tree->NodeSize()), 135 fMaxLevels(tree->fHeader.MaxNumberOfLevels()), 136 fFoundErrors(0), 137 fVisited(tree->Stream()->Size() / tree->NodeSize()), 138 fVisitedFragment(tree->Stream()->Size() / tree->NodeSize()) 139 { 140 fPreviousOffsets = (off_t*)malloc( 141 sizeof(off_t) * tree->fHeader.MaxNumberOfLevels()); 142 if (fPreviousOffsets != NULL) { 143 for (size_t i = 0; i < fMaxLevels; i++) 144 fPreviousOffsets[i] = BPLUSTREE_NULL; 145 } 146 } 147 148 ~TreeCheck() 149 { 150 free(fPreviousOffsets); 151 } 152 153 status_t InitCheck() const 154 { 155 if (fPreviousOffsets == NULL) 156 return B_NO_MEMORY; 157 158 status_t status = fVisited.InitCheck(); 159 if (status != B_OK) 160 return status; 161 162 return fVisitedFragment.InitCheck(); 163 } 164 165 bool Visited(off_t offset) const 166 { 167 return fVisited.IsSet(offset / fNodeSize); 168 } 169 170 void SetVisited(off_t offset) 171 { 172 fVisited.Set(offset / fNodeSize, true); 173 } 174 175 size_t VisitedCount() const 176 { 177 return fVisited.CountSet(); 178 } 179 180 bool VisitedFragment(off_t offset) const 181 { 182 return fVisitedFragment.IsSet(offset / fNodeSize); 183 } 184 185 void SetVisitedFragment(off_t offset) 186 { 187 fVisitedFragment.Set(offset / fNodeSize, true); 188 } 189 190 uint32 MaxLevels() const 191 { 192 return fLevelCount; 193 } 194 195 void SetLevel(uint32 level) 196 { 197 if (fLevelCount < level) 198 fLevelCount = level; 199 } 200 201 off_t PreviousOffset(uint32 level) 202 { 203 return fPreviousOffsets[level]; 204 } 205 206 void SetPreviousOffset(uint32 level, off_t offset) 207 { 208 fPreviousOffsets[level] = offset; 209 } 210 211 void FoundError() 212 { 213 fFoundErrors++; 214 } 215 216 bool ErrorsFound() 217 { 218 return fFoundErrors != 0; 219 } 220 221 private: 222 uint32 fLevelCount; 223 uint32 fFreeCount; 224 uint32 fNodeSize; 225 uint32 fMaxLevels; 226 uint32 fFoundErrors; 227 BitmapArray fVisited; 228 BitmapArray fVisitedFragment; 229 off_t* fPreviousOffsets; 230 }; 231 232 233 // #pragma mark - 234 235 236 // Node Caching for the BPlusTree class 237 // 238 // With write support, there is the need for a function that allocates new 239 // nodes by either returning empty nodes, or by growing the file's data stream 240 // 241 // !! The CachedNode class assumes that you have properly locked the stream 242 // !! before asking for nodes. 243 // 244 // Note: This code will fail if the block size is smaller than the node size! 245 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 246 // is hard-coded to 1024 bytes, that's not an issue now. 247 248 void 249 CachedNode::UnsetUnchanged(Transaction& transaction) 250 { 251 if (fTree == NULL || fTree->fStream == NULL) 252 return; 253 254 if (fNode != NULL) { 255 void* cache = fTree->fStream->GetVolume()->BlockCache(); 256 257 block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID()); 258 block_cache_put(cache, fBlockNumber); 259 fNode = NULL; 260 } 261 } 262 #endif // !_BOOT_MODE 263 264 265 void 266 CachedNode::Unset() 267 { 268 if (fTree == NULL || fTree->fStream == NULL) 269 return; 270 271 if (fNode != NULL) { 272 #if !_BOOT_MODE 273 if (fWritable && fOffset == 0) { 274 // The B+tree header has been updated - we need to update the 275 // BPlusTrees copy of it, as well. 276 memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header)); 277 } 278 279 block_cache_put(fTree->fStream->GetVolume()->BlockCache(), 280 fBlockNumber); 281 #endif // !_BOOT_MODE 282 283 fNode = NULL; 284 } 285 } 286 287 288 const bplustree_node* 289 CachedNode::SetTo(off_t offset, bool check) 290 { 291 const bplustree_node* node; 292 if (SetTo(offset, &node, check) == B_OK) 293 return node; 294 295 return NULL; 296 } 297 298 299 status_t 300 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check) 301 { 302 if (fTree == NULL || fTree->fStream == NULL) 303 RETURN_ERROR(B_BAD_VALUE); 304 305 Unset(); 306 307 // You can only ask for nodes at valid positions - you can't 308 // even access the b+tree header with this method (use SetToHeader() 309 // instead) 310 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize 311 || offset <= 0 312 || (offset % fTree->fNodeSize) != 0) { 313 RETURN_ERROR(B_BAD_VALUE); 314 } 315 316 if (InternalSetTo(NULL, offset) != NULL && check) { 317 // sanity checks (links, all_key_count) 318 if (!fTree->fHeader.CheckNode(fNode)) { 319 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" 320 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, 321 fBlockNumber, fTree->fStream->ID())); 322 return B_BAD_DATA; 323 } 324 } 325 326 *_node = fNode; 327 return B_OK; 328 } 329 330 331 #if !_BOOT_MODE 332 bplustree_node* 333 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check) 334 { 335 if (fTree == NULL || fTree->fStream == NULL) { 336 REPORT_ERROR(B_BAD_VALUE); 337 return NULL; 338 } 339 340 Unset(); 341 342 // You can only ask for nodes at valid positions - you can't 343 // even access the b+tree header with this method (use SetToHeader() 344 // instead) 345 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize 346 || offset <= 0 347 || (offset % fTree->fNodeSize) != 0) 348 return NULL; 349 350 if (InternalSetTo(&transaction, offset) != NULL && check) { 351 // sanity checks (links, all_key_count) 352 if (!fTree->fHeader.CheckNode(fNode)) { 353 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" 354 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, 355 fBlockNumber, fTree->fStream->ID())); 356 return NULL; 357 } 358 } 359 return fNode; 360 } 361 362 363 bplustree_node* 364 CachedNode::MakeWritable(Transaction& transaction) 365 { 366 if (fNode == NULL) 367 return NULL; 368 369 if (block_cache_make_writable(transaction.GetVolume()->BlockCache(), 370 fBlockNumber, transaction.ID()) == B_OK) { 371 return fNode; 372 } 373 374 return NULL; 375 } 376 #endif // !_BOOT_MODE 377 378 379 const bplustree_header* 380 CachedNode::SetToHeader() 381 { 382 if (fTree == NULL || fTree->fStream == NULL) { 383 REPORT_ERROR(B_BAD_VALUE); 384 return NULL; 385 } 386 387 Unset(); 388 389 InternalSetTo(NULL, 0LL); 390 return (bplustree_header*)fNode; 391 } 392 393 394 #if !_BOOT_MODE 395 bplustree_header* 396 CachedNode::SetToWritableHeader(Transaction& transaction) 397 { 398 if (fTree == NULL || fTree->fStream == NULL) { 399 REPORT_ERROR(B_BAD_VALUE); 400 return NULL; 401 } 402 403 Unset(); 404 405 InternalSetTo(&transaction, 0LL); 406 407 if (fNode != NULL && !fTree->fInTransaction) { 408 transaction.AddListener(fTree); 409 fTree->fInTransaction = true; 410 411 if (!transaction.GetVolume()->IsInitializing()) { 412 acquire_vnode(transaction.GetVolume()->FSVolume(), 413 fTree->fStream->ID()); 414 } 415 } 416 417 return (bplustree_header*)fNode; 418 } 419 #endif // !_BOOT_MODE 420 421 422 bplustree_node* 423 CachedNode::InternalSetTo(Transaction* transaction, off_t offset) 424 { 425 fNode = NULL; 426 fOffset = offset; 427 428 off_t fileOffset; 429 block_run run; 430 if (offset < fTree->fStream->Size() 431 && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) { 432 433 #if !_BOOT_MODE 434 Volume* volume = fTree->fStream->GetVolume(); 435 #else 436 Volume* volume = &fTree->fStream->GetVolume(); 437 #endif 438 439 int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); 440 fBlockNumber = volume->ToBlock(run) + blockOffset; 441 uint8* block = NULL; 442 443 #if !_BOOT_MODE 444 if (transaction != NULL) { 445 block = (uint8*)block_cache_get_writable(volume->BlockCache(), 446 fBlockNumber, transaction->ID()); 447 fWritable = true; 448 } else { 449 block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber); 450 fWritable = false; 451 } 452 #else // !_BOOT_MODE 453 if (fBlock == NULL) { 454 fBlock = (uint8*)malloc(volume->BlockSize()); 455 if (fBlock == NULL) 456 return NULL; 457 } 458 459 if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(), 460 fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) { 461 block = fBlock; 462 } 463 464 fWritable = false; 465 #endif // _BOOT_MODE 466 467 if (block != NULL) { 468 // The node is somewhere in that block... 469 // (confusing offset calculation) 470 fNode = (bplustree_node*)(block + offset 471 - (fileOffset + (blockOffset << volume->BlockShift()))); 472 } else 473 REPORT_ERROR(B_IO_ERROR); 474 } 475 return fNode; 476 } 477 478 479 #if !_BOOT_MODE 480 status_t 481 CachedNode::Free(Transaction& transaction, off_t offset) 482 { 483 if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL) 484 RETURN_ERROR(B_BAD_VALUE); 485 486 // TODO: scan the free nodes list and remove all nodes at the end 487 // of the tree - perhaps that shouldn't be done everytime that 488 // function is called, perhaps it should be done when the directory 489 // inode is closed or based on some calculation or whatever... 490 491 CachedNode cached(fTree); 492 bplustree_header* header = cached.SetToWritableHeader(transaction); 493 if (header == NULL) 494 return B_IO_ERROR; 495 496 #if 0 497 // TODO: temporarily disabled because CheckNode() doesn't like this... 498 // Also, it's such an edge case that it's almost useless, anyway. 499 // if the node is the last one in the tree, we shrink 500 // the tree and file size by one node 501 off_t lastOffset = header->MaximumSize() - fTree->fNodeSize; 502 if (offset == lastOffset) { 503 status_t status = fTree->fStream->SetFileSize(transaction, lastOffset); 504 if (status != B_OK) 505 return status; 506 507 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset); 508 return B_OK; 509 } 510 #endif 511 512 // add the node to the free nodes list 513 fNode->left_link = header->free_node_pointer; 514 fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 515 516 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset); 517 return B_OK; 518 } 519 520 521 status_t 522 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node, 523 off_t* _offset) 524 { 525 if (fTree == NULL || fTree->fStream == NULL) 526 RETURN_ERROR(B_BAD_VALUE); 527 528 Unset(); 529 530 bplustree_header* header; 531 status_t status; 532 533 // if there are any free nodes, recycle them 534 if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) { 535 CachedNode cached(fTree); 536 header = cached.SetToWritableHeader(transaction); 537 if (header == NULL) 538 return B_IO_ERROR; 539 540 *_offset = header->FreeNode(); 541 *_node = fNode; 542 543 // set new free node pointer 544 header->free_node_pointer = fNode->left_link; 545 fNode->Initialize(); 546 return B_OK; 547 } 548 549 // allocate space for a new node 550 Inode* stream = fTree->fStream; 551 if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK) 552 return status; 553 554 CachedNode cached(fTree); 555 header = cached.SetToWritableHeader(transaction); 556 if (header == NULL) 557 return B_IO_ERROR; 558 559 // the maximum_size has to be changed before the call to SetTo() - or 560 // else it will fail because the requested node is out of bounds 561 off_t offset = header->MaximumSize(); 562 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() 563 + fTree->fNodeSize); 564 565 cached.Unset(); 566 // SetToWritable() below needs the new values in the tree's header 567 568 if (SetToWritable(transaction, offset, false) == NULL) 569 RETURN_ERROR(B_ERROR); 570 571 fNode->Initialize(); 572 573 *_offset = offset; 574 *_node = fNode; 575 return B_OK; 576 } 577 578 579 // #pragma mark - 580 581 582 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize) 583 : 584 fStream(NULL), 585 fInTransaction(false) 586 { 587 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 588 SetTo(transaction, stream); 589 } 590 #endif // !_BOOT_MODE 591 592 593 BPlusTree::BPlusTree(Inode* stream) 594 : 595 fStream(NULL), 596 fInTransaction(false) 597 { 598 #if !_BOOT_MODE 599 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 600 #endif 601 602 SetTo(stream); 603 } 604 605 606 BPlusTree::BPlusTree() 607 : 608 fStream(NULL), 609 fNodeSize(BPLUSTREE_NODE_SIZE), 610 fAllowDuplicates(true), 611 fInTransaction(false), 612 fStatus(B_NO_INIT) 613 { 614 #if !_BOOT_MODE 615 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 616 #endif 617 } 618 619 620 BPlusTree::~BPlusTree() 621 { 622 #if !_BOOT_MODE 623 // if there are any TreeIterators left, we need to stop them 624 // (can happen when the tree's inode gets deleted while 625 // traversing the tree - a TreeIterator doesn't lock the inode) 626 mutex_lock(&fIteratorLock); 627 628 SinglyLinkedList<TreeIterator>::ConstIterator iterator 629 = fIterators.GetIterator(); 630 while (iterator.HasNext()) 631 iterator.Next()->Stop(); 632 633 mutex_destroy(&fIteratorLock); 634 635 ASSERT(!fInTransaction); 636 #endif // !_BOOT_MODE 637 } 638 639 640 #if !_BOOT_MODE 641 /*! Create a new B+Tree on the specified stream */ 642 status_t 643 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize) 644 { 645 // initializes in-memory B+Tree 646 647 fStream = stream; 648 649 CachedNode cached(this); 650 bplustree_header* header = cached.SetToWritableHeader(transaction); 651 if (header == NULL) { 652 // allocate space for new header + node! 653 fStatus = stream->SetFileSize(transaction, nodeSize * 2); 654 if (fStatus != B_OK) 655 RETURN_ERROR(fStatus); 656 657 header = cached.SetToWritableHeader(transaction); 658 if (header == NULL) 659 RETURN_ERROR(fStatus = B_ERROR); 660 } 661 662 fAllowDuplicates = stream->IsIndex() 663 || (stream->Mode() & S_ALLOW_DUPS) != 0; 664 665 fNodeSize = nodeSize; 666 667 // initialize b+tree header 668 header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC); 669 header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize); 670 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 671 header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode())); 672 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize); 673 header->free_node_pointer 674 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 675 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2); 676 677 cached.Unset(); 678 679 // initialize b+tree root node 680 cached.SetToWritable(transaction, fHeader.RootNode(), false); 681 if (cached.Node() == NULL) 682 RETURN_ERROR(B_IO_ERROR); 683 684 cached.Node()->Initialize(); 685 686 return fStatus = B_OK; 687 } 688 #endif // !_BOOT_MODE 689 690 691 status_t 692 BPlusTree::SetTo(Inode* stream) 693 { 694 if (stream == NULL) 695 RETURN_ERROR(fStatus = B_BAD_VALUE); 696 697 fStream = stream; 698 699 // get on-disk B+Tree header 700 701 CachedNode cached(this); 702 const bplustree_header* header = cached.SetToHeader(); 703 if (header != NULL) 704 memcpy(&fHeader, header, sizeof(bplustree_header)); 705 else 706 RETURN_ERROR(fStatus = B_IO_ERROR); 707 708 // is header valid? 709 710 if (fHeader.MaximumSize() != stream->Size()) { 711 dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %" 712 B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size()); 713 // we can't change the header since we don't have a transaction 714 //fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size()); 715 } 716 if (!fHeader.IsValid()) { 717 #ifdef DEBUG 718 dump_bplustree_header(&fHeader); 719 dump_block((const char*)&fHeader, 128); 720 #endif 721 RETURN_ERROR(fStatus = B_BAD_DATA); 722 } 723 724 fNodeSize = fHeader.NodeSize(); 725 726 // validity check 727 static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, 728 S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX, 729 S_DOUBLE_INDEX}; 730 uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX 731 | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX 732 | S_FLOAT_INDEX | S_DOUBLE_INDEX); 733 734 if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE 735 || ((stream->Mode() & S_INDEX_DIR) != 0 736 && kToMode[fHeader.DataType()] != mode) 737 || !stream->IsContainer()) { 738 D( dump_bplustree_header(&fHeader); 739 dump_inode(&stream->Node()); 740 ); 741 RETURN_ERROR(fStatus = B_BAD_TYPE); 742 } 743 744 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 745 // in the original BFS code - we will honour it nevertheless 746 fAllowDuplicates = stream->IsIndex() 747 || (stream->Mode() & S_ALLOW_DUPS) != 0; 748 749 cached.SetTo(fHeader.RootNode()); 750 RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA); 751 } 752 753 754 status_t 755 BPlusTree::InitCheck() 756 { 757 return fStatus; 758 } 759 760 761 #if !_BOOT_MODE 762 status_t 763 BPlusTree::Validate(bool repair, bool& _errorsFound) 764 { 765 TreeCheck check(this); 766 if (check.InitCheck() != B_OK) 767 return B_NO_MEMORY; 768 769 check.SetVisited(0); 770 771 // Walk the free nodes 772 773 CachedNode cached(this); 774 off_t freeOffset = fHeader.FreeNode(); 775 while (freeOffset > 0) { 776 const bplustree_node* node; 777 status_t status = cached.SetTo(freeOffset, &node, false); 778 if (status != B_OK) { 779 if (status == B_IO_ERROR) 780 return B_IO_ERROR; 781 782 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could " 783 "not be read: %s\n", fStream->ID(), freeOffset, 784 strerror(status)); 785 check.FoundError(); 786 break; 787 } 788 789 if (check.Visited(freeOffset)) { 790 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF 791 " circular!\n", fStream->ID(), freeOffset); 792 // TODO: if 'repair' is true, we could collect all unvisited nodes 793 // at the end, and put the into the free list 794 check.FoundError(); 795 break; 796 } 797 798 check.SetVisited(freeOffset); 799 800 if (node->OverflowLink() != BPLUSTREE_FREE) { 801 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF 802 " misses free mark!\n", fStream->ID(), freeOffset); 803 } 804 freeOffset = node->LeftLink(); 805 } 806 807 // Iterate over the complete tree recursively 808 809 const bplustree_node* root; 810 status_t status = cached.SetTo(fHeader.RootNode(), &root, true); 811 if (status != B_OK) 812 return status; 813 814 status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root); 815 816 if (check.ErrorsFound()) 817 _errorsFound = true; 818 819 if (status != B_OK) 820 return status; 821 822 if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) { 823 dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, " 824 "declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(), 825 fHeader.MaxNumberOfLevels()); 826 } 827 828 if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) { 829 dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF 830 " nodes.\n", fStream->ID(), check.VisitedCount(), 831 fHeader.MaximumSize() / fNodeSize); 832 } 833 834 return B_OK; 835 } 836 837 838 status_t 839 BPlusTree::MakeEmpty() 840 { 841 // Put all nodes into the free list in order 842 Transaction transaction(fStream->GetVolume(), fStream->BlockNumber()); 843 844 // Reset the header, and root node 845 CachedNode cached(this); 846 bplustree_header* header = cached.SetToWritableHeader(transaction); 847 if (header == NULL) 848 return B_IO_ERROR; 849 850 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 851 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize()); 852 if (fStream->Size() > (off_t)NodeSize() * 2) 853 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize()); 854 else { 855 header->free_node_pointer 856 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 857 } 858 859 bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false); 860 if (node == NULL) 861 return B_IO_ERROR; 862 863 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 864 node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 865 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 866 node->all_key_count = 0; 867 node->all_key_length = 0; 868 869 for (off_t offset = 2 * NodeSize(); offset < fStream->Size(); 870 offset += NodeSize()) { 871 bplustree_node* node = cached.SetToWritable(transaction, offset, false); 872 if (node == NULL) { 873 dprintf("--> could not open %" B_PRIdOFF "\n", offset); 874 return B_IO_ERROR; 875 } 876 if (offset < fStream->Size() - (off_t)NodeSize()) 877 node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize()); 878 else 879 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 880 881 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 882 883 // It's not important to write it out in a single transaction; just 884 // make sure it doesn't get too large 885 if (offset % (1024 * 1024) == 0) { 886 transaction.Done(); 887 transaction.Start(fStream->GetVolume(), fStream->BlockNumber()); 888 } 889 } 890 891 return transaction.Done(); 892 } 893 894 895 int32 896 BPlusTree::TypeCodeToKeyType(type_code code) 897 { 898 switch (code) { 899 case B_STRING_TYPE: 900 return BPLUSTREE_STRING_TYPE; 901 case B_SSIZE_T_TYPE: 902 case B_INT32_TYPE: 903 return BPLUSTREE_INT32_TYPE; 904 case B_SIZE_T_TYPE: 905 case B_UINT32_TYPE: 906 return BPLUSTREE_UINT32_TYPE; 907 case B_OFF_T_TYPE: 908 case B_INT64_TYPE: 909 return BPLUSTREE_INT64_TYPE; 910 case B_UINT64_TYPE: 911 return BPLUSTREE_UINT64_TYPE; 912 case B_FLOAT_TYPE: 913 return BPLUSTREE_FLOAT_TYPE; 914 case B_DOUBLE_TYPE: 915 return BPLUSTREE_DOUBLE_TYPE; 916 } 917 return -1; 918 } 919 920 921 int32 922 BPlusTree::ModeToKeyType(mode_t mode) 923 { 924 switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 925 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { 926 case S_INT_INDEX: 927 return BPLUSTREE_INT32_TYPE; 928 case S_UINT_INDEX: 929 return BPLUSTREE_UINT32_TYPE; 930 case S_LONG_LONG_INDEX: 931 return BPLUSTREE_INT64_TYPE; 932 case S_ULONG_LONG_INDEX: 933 return BPLUSTREE_UINT64_TYPE; 934 case S_FLOAT_INDEX: 935 return BPLUSTREE_FLOAT_TYPE; 936 case S_DOUBLE_INDEX: 937 return BPLUSTREE_DOUBLE_TYPE; 938 case S_STR_INDEX: 939 default: 940 // default is for standard directories 941 return BPLUSTREE_STRING_TYPE; 942 } 943 } 944 #endif // !_BOOT_MODE 945 946 947 // #pragma mark - TransactionListener implementation 948 949 950 #if !_BOOT_MODE 951 void 952 BPlusTree::TransactionDone(bool success) 953 { 954 if (!success) { 955 // update header from disk 956 CachedNode cached(this); 957 const bplustree_header* header = cached.SetToHeader(); 958 if (header != NULL) 959 memcpy(&fHeader, header, sizeof(bplustree_header)); 960 } 961 } 962 963 964 void 965 BPlusTree::RemovedFromTransaction() 966 { 967 fInTransaction = false; 968 969 if (!fStream->GetVolume()->IsInitializing()) 970 put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID()); 971 } 972 973 974 // #pragma mark - 975 976 977 void 978 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, 979 uint16 splitAt, int8 change) 980 { 981 // Although every iterator which is affected by this update currently 982 // waits on a semaphore, other iterators could be added/removed at 983 // any time, so we need to protect this loop 984 MutexLocker _(fIteratorLock); 985 986 SinglyLinkedList<TreeIterator>::ConstIterator iterator 987 = fIterators.GetIterator(); 988 while (iterator.HasNext()) 989 iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change); 990 } 991 992 993 void 994 BPlusTree::_AddIterator(TreeIterator* iterator) 995 { 996 MutexLocker _(fIteratorLock); 997 fIterators.Add(iterator); 998 } 999 1000 1001 void 1002 BPlusTree::_RemoveIterator(TreeIterator* iterator) 1003 { 1004 MutexLocker _(fIteratorLock); 1005 fIterators.Remove(iterator); 1006 } 1007 #endif // !_BOOT_MODE 1008 1009 1010 int32 1011 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2, 1012 int keyLength2) 1013 { 1014 type_code type = 0; 1015 switch (fHeader.DataType()) { 1016 case BPLUSTREE_STRING_TYPE: 1017 type = B_STRING_TYPE; 1018 break; 1019 case BPLUSTREE_INT32_TYPE: 1020 type = B_INT32_TYPE; 1021 break; 1022 case BPLUSTREE_UINT32_TYPE: 1023 type = B_UINT32_TYPE; 1024 break; 1025 case BPLUSTREE_INT64_TYPE: 1026 type = B_INT64_TYPE; 1027 break; 1028 case BPLUSTREE_UINT64_TYPE: 1029 type = B_UINT64_TYPE; 1030 break; 1031 case BPLUSTREE_FLOAT_TYPE: 1032 type = B_FLOAT_TYPE; 1033 break; 1034 case BPLUSTREE_DOUBLE_TYPE: 1035 type = B_DOUBLE_TYPE; 1036 break; 1037 } 1038 return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2); 1039 } 1040 1041 1042 status_t 1043 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key, 1044 uint16 keyLength, uint16* _index, off_t* _next) 1045 { 1046 #ifdef DEBUG 1047 NodeChecker checker(node, fNodeSize, "find"); 1048 #endif 1049 1050 if (node->all_key_count == 0) { 1051 if (_index) 1052 *_index = 0; 1053 if (_next) 1054 *_next = node->OverflowLink(); 1055 return B_ENTRY_NOT_FOUND; 1056 } 1057 1058 Unaligned<off_t>* values = node->Values(); 1059 int16 saveIndex = -1; 1060 1061 // binary search in the key array 1062 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { 1063 uint16 i = (first + last) >> 1; 1064 1065 uint16 searchLength = 0; 1066 uint8* searchKey = node->KeyAt(i, &searchLength); 1067 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) 1068 > (uint8*)node + fNodeSize 1069 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { 1070 #if !_BOOT_MODE 1071 fStream->GetVolume()->Panic(); 1072 #endif 1073 RETURN_ERROR(B_BAD_DATA); 1074 } 1075 1076 int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength); 1077 if (cmp < 0) { 1078 last = i - 1; 1079 saveIndex = i; 1080 } else if (cmp > 0) { 1081 saveIndex = first = i + 1; 1082 } else { 1083 if (_index) 1084 *_index = i; 1085 if (_next) 1086 *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]); 1087 return B_OK; 1088 } 1089 } 1090 1091 if (_index) 1092 *_index = saveIndex; 1093 if (_next) { 1094 if (saveIndex == node->NumKeys()) 1095 *_next = node->OverflowLink(); 1096 else 1097 *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); 1098 } 1099 return B_ENTRY_NOT_FOUND; 1100 } 1101 1102 1103 #if !_BOOT_MODE 1104 /*! Prepares the stack to contain all nodes that were passed while 1105 following the key, from the root node to the leaf node that could 1106 or should contain that key. 1107 */ 1108 status_t 1109 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key, 1110 uint16 keyLength) 1111 { 1112 // set the root node to begin with 1113 node_and_key nodeAndKey; 1114 nodeAndKey.nodeOffset = fHeader.RootNode(); 1115 1116 CachedNode cached(this); 1117 const bplustree_node* node; 1118 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1119 // if we are already on leaf level, we're done 1120 if (node->OverflowLink() == BPLUSTREE_NULL) { 1121 // node that the keyIndex is not properly set here (but it's not 1122 // needed in the calling functions anyway)! 1123 nodeAndKey.keyIndex = 0; 1124 stack.Push(nodeAndKey); 1125 return B_OK; 1126 } 1127 1128 off_t nextOffset; 1129 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex, 1130 &nextOffset); 1131 1132 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 1133 RETURN_ERROR(B_ERROR); 1134 1135 if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) { 1136 dprintf("BPlusTree::_SeekDown() node walked too deep.\n"); 1137 break; 1138 } 1139 1140 // put the node offset & the correct keyIndex on the stack 1141 stack.Push(nodeAndKey); 1142 1143 nodeAndKey.nodeOffset = nextOffset; 1144 } 1145 1146 FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %" 1147 B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID())); 1148 return B_ERROR; 1149 } 1150 1151 1152 /*! This will find a free duplicate fragment in the given bplustree_node. 1153 The CachedNode will be set to the writable fragment on success. 1154 */ 1155 status_t 1156 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction, 1157 const bplustree_node* node, CachedNode& cached, 1158 off_t* _offset, bplustree_node** _fragment, uint32* _index) 1159 { 1160 Unaligned<off_t>* values = node->Values(); 1161 for (int32 i = 0; i < node->NumKeys(); i++) { 1162 off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]); 1163 1164 // does the value link to a duplicate fragment? 1165 if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT) 1166 continue; 1167 1168 const bplustree_node* fragment = cached.SetTo( 1169 bplustree_node::FragmentOffset(value), false); 1170 if (fragment == NULL) { 1171 FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %" 1172 B_PRIdOFF "\n", value, fStream->ID())); 1173 continue; 1174 } 1175 1176 // see if there is some space left for us 1177 uint32 num = bplustree_node::MaxFragments(fNodeSize); 1178 for (uint32 j = 0; j < num; j++) { 1179 duplicate_array* array = fragment->FragmentAt(j); 1180 1181 if (array->IsEmpty()) { 1182 // found an unused fragment 1183 *_fragment = cached.MakeWritable(transaction); 1184 if (*_fragment == NULL) 1185 return B_IO_ERROR; 1186 1187 *_offset = bplustree_node::FragmentOffset(value); 1188 *_index = j; 1189 return B_OK; 1190 } 1191 } 1192 } 1193 return B_ENTRY_NOT_FOUND; 1194 } 1195 1196 1197 status_t 1198 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached, 1199 const bplustree_node* node, uint16 index, off_t value) 1200 { 1201 CachedNode cachedDuplicate(this); 1202 Unaligned<off_t>* values = node->Values(); 1203 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1204 status_t status; 1205 off_t offset; 1206 1207 if (bplustree_node::IsDuplicate(oldValue)) { 1208 // If it's a duplicate fragment, try to insert it into that, or if it 1209 // doesn't fit anymore, create a new duplicate node 1210 1211 if (bplustree_node::LinkType(oldValue) 1212 == BPLUSTREE_DUPLICATE_FRAGMENT) { 1213 bplustree_node* duplicate = cachedDuplicate.SetToWritable( 1214 transaction, bplustree_node::FragmentOffset(oldValue), false); 1215 if (duplicate == NULL) 1216 return B_IO_ERROR; 1217 1218 duplicate_array* array = duplicate->FragmentAt( 1219 bplustree_node::FragmentIndex(oldValue)); 1220 int32 arrayCount = array->Count(); 1221 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) { 1222 FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment " 1223 "%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1224 (int)bplustree_node::FragmentIndex(oldValue), 1225 bplustree_node::FragmentOffset(oldValue), arrayCount, 1226 fStream->ID())); 1227 return B_BAD_DATA; 1228 } 1229 1230 if (arrayCount < NUM_FRAGMENT_VALUES) { 1231 array->Insert(value); 1232 } else { 1233 // Test if the fragment will be empty if we remove this key's 1234 // values 1235 if (duplicate->FragmentsUsed(fNodeSize) < 2) { 1236 // The node will be empty without our values, so let us 1237 // reuse it as a duplicate node 1238 offset = bplustree_node::FragmentOffset(oldValue); 1239 1240 memmove(duplicate->DuplicateArray(), array, 1241 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1242 duplicate->left_link = duplicate->right_link 1243 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1244 1245 array = duplicate->DuplicateArray(); 1246 array->Insert(value); 1247 } else { 1248 // Create a new duplicate node 1249 1250 cachedDuplicate.UnsetUnchanged(transaction); 1251 // The old duplicate has not been touched, so we can 1252 // reuse it 1253 1254 bplustree_node* newDuplicate; 1255 status = cachedDuplicate.Allocate(transaction, 1256 &newDuplicate, &offset); 1257 if (status != B_OK) 1258 RETURN_ERROR(status); 1259 1260 // Copy the array from the fragment node to the duplicate 1261 // node and free the old entry (by zero'ing all values) 1262 newDuplicate->overflow_link = array->count; 1263 memcpy(&newDuplicate->all_key_count, &array->values[0], 1264 array->Count() * sizeof(off_t)); 1265 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1266 1267 array = newDuplicate->DuplicateArray(); 1268 array->Insert(value); 1269 } 1270 1271 // Update the main pointer to link to a duplicate node 1272 if (cached.MakeWritable(transaction) == NULL) 1273 return B_IO_ERROR; 1274 1275 values[index] 1276 = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1277 BPLUSTREE_DUPLICATE_NODE, offset)); 1278 } 1279 1280 return B_OK; 1281 } 1282 1283 // Put the value into a dedicated duplicate node 1284 1285 // search for free space in the duplicate nodes of that key 1286 duplicate_array* array; 1287 int32 arrayCount; 1288 const bplustree_node* duplicate; 1289 off_t duplicateOffset; 1290 do { 1291 duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1292 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 1293 if (duplicate == NULL) 1294 return B_IO_ERROR; 1295 1296 array = duplicate->DuplicateArray(); 1297 arrayCount =array->Count(); 1298 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { 1299 FATAL(("_InsertDuplicate: Invalid array size in duplicate %" 1300 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1301 duplicateOffset, arrayCount, fStream->ID())); 1302 return B_BAD_DATA; 1303 } 1304 } while (arrayCount >= NUM_DUPLICATE_VALUES 1305 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); 1306 1307 bplustree_node* writableDuplicate 1308 = cachedDuplicate.MakeWritable(transaction); 1309 if (writableDuplicate == NULL) 1310 return B_IO_ERROR; 1311 1312 if (arrayCount < NUM_DUPLICATE_VALUES) { 1313 array = writableDuplicate->DuplicateArray(); 1314 array->Insert(value); 1315 } else { 1316 // no space left - add a new duplicate node 1317 1318 bplustree_node* newDuplicate; 1319 status = cachedDuplicate.Allocate(transaction, &newDuplicate, 1320 &offset); 1321 if (status != B_OK) 1322 RETURN_ERROR(status); 1323 1324 // link the two nodes together 1325 writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); 1326 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); 1327 1328 array = newDuplicate->DuplicateArray(); 1329 array->count = 0; 1330 array->Insert(value); 1331 } 1332 return B_OK; 1333 } 1334 1335 // Search for a free duplicate fragment or create a new one 1336 // to insert the duplicate value into 1337 1338 uint32 fragmentIndex = 0; 1339 bplustree_node* fragment; 1340 if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate, 1341 &offset, &fragment, &fragmentIndex) != B_OK) { 1342 // allocate a new duplicate fragment node 1343 status = cachedDuplicate.Allocate(transaction, &fragment, &offset); 1344 if (status != B_OK) 1345 RETURN_ERROR(status); 1346 1347 memset(fragment, 0, fNodeSize); 1348 } 1349 1350 duplicate_array* array = fragment->FragmentAt(fragmentIndex); 1351 array->Insert(oldValue); 1352 array->Insert(value); 1353 1354 if (cached.MakeWritable(transaction) == NULL) 1355 return B_IO_ERROR; 1356 1357 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1358 BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex)); 1359 1360 return B_OK; 1361 } 1362 1363 1364 void 1365 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key, 1366 uint16 keyLength, off_t value) 1367 { 1368 // should never happen, but who knows? 1369 if (index > node->NumKeys()) 1370 return; 1371 1372 Unaligned<off_t>* values = node->Values(); 1373 Unaligned<uint16>* keyLengths = node->KeyLengths(); 1374 uint8* keys = node->Keys(); 1375 1376 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); 1377 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() 1378 + keyLength); 1379 1380 Unaligned<off_t>* newValues = node->Values(); 1381 Unaligned<uint16>* newKeyLengths = node->KeyLengths(); 1382 1383 // move values and copy new value into them 1384 memmove(newValues + index + 1, values + index, 1385 sizeof(off_t) * (node->NumKeys() - 1 - index)); 1386 memmove(newValues, values, sizeof(off_t) * index); 1387 1388 newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value); 1389 1390 // move and update key length index 1391 for (uint16 i = node->NumKeys(); i-- > index + 1;) { 1392 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1393 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength); 1394 } 1395 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); 1396 1397 int32 keyStart; 1398 newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength 1399 + (keyStart = index > 0 1400 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0)); 1401 1402 // move keys and copy new key into them 1403 uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]); 1404 int32 size = node->AllKeyLength() - length; 1405 if (size > 0) 1406 memmove(keys + length, keys + length - keyLength, size); 1407 1408 memcpy(keys + keyStart, key, keyLength); 1409 } 1410 1411 1412 /*! Splits the \a node into two halves - the other half will be put into 1413 \a other. It also takes care to create a new overflow link if the node 1414 to split is an index node. 1415 */ 1416 status_t 1417 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset, 1418 bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key, 1419 uint16* _keyLength, off_t* _value) 1420 { 1421 if (*_keyIndex > node->NumKeys() + 1) 1422 return B_BAD_VALUE; 1423 1424 Unaligned<uint16>* inKeyLengths = node->KeyLengths(); 1425 Unaligned<off_t>* inKeyValues = node->Values(); 1426 uint8* inKeys = node->Keys(); 1427 uint8* outKeys = other->Keys(); 1428 int32 keyIndex = *_keyIndex; // can become less than zero! 1429 1430 if (keyIndex > node->NumKeys()) { 1431 FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF 1432 "\n", (int)keyIndex, node->NumKeys(), fStream->ID())); 1433 return B_BAD_VALUE; 1434 } 1435 1436 // How many keys will fit in one (half) page? 1437 // The following loop will find the answer to this question and 1438 // change the key lengths indices for their new home 1439 1440 // "bytes" is the number of bytes written for the new key, 1441 // "bytesBefore" are the bytes before that key 1442 // "bytesAfter" are the bytes after the new key, if any 1443 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 1444 1445 size_t size = fNodeSize >> 1; 1446 int32 out, in; 1447 size_t keyLengths = 0; 1448 for (in = out = 0; in < node->NumKeys() + 1;) { 1449 keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]); 1450 1451 if (in == keyIndex && !bytes) { 1452 bytes = *_keyLength; 1453 bytesBefore = in > 0 1454 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1455 } else { 1456 if (keyIndex < out) 1457 bytesAfter = keyLengths - bytesBefore; 1458 1459 in++; 1460 } 1461 out++; 1462 1463 if (key_align(sizeof(bplustree_node) + bytes + keyLengths) 1464 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1465 // we have found the number of keys in the new node! 1466 break; 1467 } 1468 } 1469 1470 // if the new key was not inserted, set the length of the keys 1471 // that can be copied directly 1472 1473 if (keyIndex >= out && in > 0) 1474 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); 1475 else if (keyIndex + 1 < out) 1476 bytesAfter = keyLengths - bytesBefore; 1477 1478 if (bytesBefore < 0 || bytesAfter < 0) 1479 return B_BAD_DATA; 1480 1481 other->left_link = node->left_link; 1482 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1483 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1484 + bytesAfter); 1485 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1486 1487 Unaligned<uint16>* outKeyLengths = other->KeyLengths(); 1488 Unaligned<off_t>* outKeyValues = other->Values(); 1489 int32 keys = out > keyIndex ? keyIndex : out; 1490 1491 if (bytesBefore) { 1492 // copy the keys 1493 memcpy(outKeys, inKeys, bytesBefore); 1494 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1495 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1496 } 1497 if (bytes) { 1498 // copy the newly inserted key 1499 memcpy(outKeys + bytesBefore, key, bytes); 1500 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1501 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1502 1503 if (bytesAfter) { 1504 // copy the keys after the new key 1505 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, 1506 bytesAfter); 1507 keys = out - keyIndex - 1; 1508 for (int32 i = 0;i < keys;i++) { 1509 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( 1510 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) 1511 + bytes); 1512 } 1513 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, 1514 keys * sizeof(off_t)); 1515 } 1516 } 1517 1518 // if the new key was already inserted, we shouldn't use it again 1519 if (in != out) 1520 keyIndex--; 1521 1522 int32 total = bytesBefore + bytesAfter; 1523 1524 // these variables are for the key that will be returned 1525 // to the parent node 1526 uint8* newKey = NULL; 1527 uint16 newLength = 0; 1528 bool newAllocated = false; 1529 1530 // If we have split an index node, we have to drop the first key 1531 // of the next node (which can also be the new key to insert). 1532 // The dropped key is also the one which has to be inserted in 1533 // the parent node, so we will set the "newKey" already here. 1534 if (node->OverflowLink() != BPLUSTREE_NULL) { 1535 if (in == keyIndex) { 1536 newKey = key; 1537 newLength = *_keyLength; 1538 1539 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1540 keyIndex--; 1541 } else { 1542 // If a key is dropped (is not the new key), we have to copy 1543 // it, because it would be lost if not. 1544 uint8* droppedKey = node->KeyAt(in, &newLength); 1545 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) 1546 > (uint8*)node + fNodeSize 1547 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1548 fStream->GetVolume()->Panic(); 1549 RETURN_ERROR(B_BAD_DATA); 1550 } 1551 newKey = (uint8*)malloc(newLength); 1552 if (newKey == NULL) 1553 return B_NO_MEMORY; 1554 1555 newAllocated = true; 1556 memcpy(newKey, droppedKey, newLength); 1557 1558 other->overflow_link = inKeyValues[in]; 1559 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); 1560 } 1561 } 1562 1563 // and now the same game for the other page and the rest of the keys 1564 // (but with memmove() instead of memcpy(), because they may overlap) 1565 1566 bytesBefore = bytesAfter = bytes = 0; 1567 out = 0; 1568 int32 skip = in; 1569 while (in < node->NumKeys() + 1) { 1570 if (in == keyIndex && !bytes) { 1571 // it's enough to set bytesBefore once here, because we do 1572 // not need to know the exact length of all keys in this 1573 // loop 1574 bytesBefore = in > skip 1575 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1576 bytes = *_keyLength; 1577 out++; 1578 } else { 1579 if (in < node->NumKeys()) { 1580 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1581 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); 1582 1583 if (bytes) { 1584 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1585 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); 1586 1587 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1588 - bytesBefore - bytes; 1589 } 1590 out++; 1591 } 1592 in++; 1593 } 1594 } 1595 1596 // adjust the byte counts (since we were a bit lazy in the loop) 1597 if (keyIndex < skip) 1598 bytesBefore = node->AllKeyLength() - total; 1599 1600 if (bytesBefore < 0 || bytesAfter < 0) { 1601 if (newAllocated) 1602 free(newKey); 1603 return B_BAD_DATA; 1604 } 1605 1606 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1607 // right link, and overflow link can stay the same 1608 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1609 + bytesAfter); 1610 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1611 1612 // array positions have changed 1613 outKeyLengths = node->KeyLengths(); 1614 outKeyValues = node->Values(); 1615 1616 // move the keys in the old node: the order is important here, 1617 // because we don't want to overwrite any contents 1618 1619 keys = keyIndex <= skip ? out : keyIndex - skip; 1620 keyIndex -= skip; 1621 in = out - keyIndex - 1; 1622 // Note: keyIndex and in will contain invalid values when the new key 1623 // went to the other node. But in this case bytes and bytesAfter are 1624 // 0 and subsequently we never use keyIndex and in. 1625 1626 if (bytesBefore) 1627 memmove(inKeys, inKeys + total, bytesBefore); 1628 if (bytesAfter) { 1629 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, 1630 bytesAfter); 1631 } 1632 1633 if (bytesBefore) 1634 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1635 if (bytesAfter) { 1636 // if byteAfter is > 0, keyIndex is larger than skip 1637 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, 1638 in * sizeof(uint16)); 1639 } 1640 1641 if (bytesBefore) 1642 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1643 if (bytesAfter) { 1644 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, 1645 in * sizeof(off_t)); 1646 } 1647 1648 if (bytes) { 1649 // finally, copy the newly inserted key (don't overwrite anything) 1650 memcpy(inKeys + bytesBefore, key, bytes); 1651 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1652 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1653 } 1654 1655 // Prepare the key that will be inserted in the parent node which 1656 // is either the dropped key or the last of the other node. 1657 // If it's the dropped key, "newKey" was already set earlier. 1658 1659 if (newKey == NULL) 1660 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1661 1662 memcpy(key, newKey, newLength); 1663 *_keyLength = newLength; 1664 *_value = otherOffset; 1665 1666 if (newAllocated) 1667 free(newKey); 1668 1669 return B_OK; 1670 } 1671 1672 1673 /*! This inserts a key into the tree. The changes made to the tree will 1674 all be part of the \a transaction. 1675 You need to have the inode write locked. 1676 */ 1677 status_t 1678 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength, 1679 off_t value) 1680 { 1681 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1682 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1683 RETURN_ERROR(B_BAD_VALUE); 1684 #ifdef DEBUG 1685 if (value < 0) 1686 panic("tried to insert invalid value %" B_PRId64 "!\n", value); 1687 #endif 1688 1689 ASSERT_WRITE_LOCKED_INODE(fStream); 1690 1691 Stack<node_and_key> stack; 1692 if (_SeekDown(stack, key, keyLength) != B_OK) 1693 RETURN_ERROR(B_ERROR); 1694 1695 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH]; 1696 memcpy(keyBuffer, key, keyLength); 1697 1698 node_and_key nodeAndKey; 1699 const bplustree_node* node; 1700 1701 CachedNode cached(this); 1702 while (stack.Pop(&nodeAndKey) 1703 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1704 #ifdef DEBUG 1705 NodeChecker checker(node, fNodeSize, "insert"); 1706 #endif 1707 if (node->IsLeaf()) { 1708 // first round, check for duplicate entries 1709 status_t status = _FindKey(node, key, keyLength, 1710 &nodeAndKey.keyIndex); 1711 1712 // is this a duplicate entry? 1713 if (status == B_OK) { 1714 if (fAllowDuplicates) { 1715 status = _InsertDuplicate(transaction, cached, node, 1716 nodeAndKey.keyIndex, value); 1717 if (status != B_OK) 1718 RETURN_ERROR(status); 1719 return B_OK; 1720 } 1721 1722 return B_NAME_IN_USE; 1723 } 1724 } 1725 1726 bplustree_node* writableNode = cached.MakeWritable(transaction); 1727 if (writableNode == NULL) 1728 return B_IO_ERROR; 1729 1730 // is the node big enough to hold the pair? 1731 if (int32(key_align(sizeof(bplustree_node) 1732 + writableNode->AllKeyLength() + keyLength) 1733 + (writableNode->NumKeys() + 1) * (sizeof(uint16) 1734 + sizeof(off_t))) < fNodeSize) { 1735 _InsertKey(writableNode, nodeAndKey.keyIndex, 1736 keyBuffer, keyLength, value); 1737 _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, 1738 nodeAndKey.keyIndex, 0, 1); 1739 1740 return B_OK; 1741 } else { 1742 CachedNode cachedNewRoot(this); 1743 CachedNode cachedOther(this); 1744 1745 // do we need to allocate a new root node? if so, then do 1746 // it now 1747 off_t newRoot = BPLUSTREE_NULL; 1748 if (nodeAndKey.nodeOffset == fHeader.RootNode()) { 1749 bplustree_node* root; 1750 status_t status = cachedNewRoot.Allocate(transaction, &root, 1751 &newRoot); 1752 if (status != B_OK) { 1753 // The tree is most likely corrupted! 1754 // But it's still sane at leaf level - we could set 1755 // a flag in the header that forces the tree to be 1756 // rebuild next time... 1757 // But since we will have journaling, that's not a big 1758 // problem anyway. 1759 RETURN_ERROR(status); 1760 } 1761 } 1762 1763 // reserve space for the other node 1764 bplustree_node* other; 1765 off_t otherOffset; 1766 status_t status = cachedOther.Allocate(transaction, &other, 1767 &otherOffset); 1768 if (status != B_OK) { 1769 cachedNewRoot.Free(transaction, newRoot); 1770 RETURN_ERROR(status); 1771 } 1772 1773 if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other, 1774 otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength, 1775 &value) != B_OK) { 1776 // free root node & other node here 1777 cachedOther.Free(transaction, otherOffset); 1778 cachedNewRoot.Free(transaction, newRoot); 1779 1780 RETURN_ERROR(B_ERROR); 1781 } 1782 #ifdef DEBUG 1783 checker.Check("insert split"); 1784 NodeChecker otherChecker(other, fNodeSize, "insert split other"); 1785 #endif 1786 1787 _UpdateIterators(nodeAndKey.nodeOffset, otherOffset, 1788 nodeAndKey.keyIndex, writableNode->NumKeys(), 1); 1789 1790 // update the right link of the node in the left of the new node 1791 if ((other = cachedOther.SetToWritable(transaction, 1792 other->LeftLink())) != NULL) { 1793 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1794 } 1795 1796 // create a new root if necessary 1797 if (newRoot != BPLUSTREE_NULL) { 1798 bplustree_node* root = cachedNewRoot.Node(); 1799 1800 _InsertKey(root, 0, keyBuffer, keyLength, 1801 writableNode->LeftLink()); 1802 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 1803 nodeAndKey.nodeOffset); 1804 1805 CachedNode cached(this); 1806 bplustree_header* header 1807 = cached.SetToWritableHeader(transaction); 1808 if (header == NULL) 1809 return B_IO_ERROR; 1810 1811 // finally, update header to point to the new root 1812 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1813 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32( 1814 header->MaxNumberOfLevels() + 1); 1815 1816 return B_OK; 1817 } 1818 } 1819 } 1820 RETURN_ERROR(B_ERROR); 1821 } 1822 1823 1824 /*! Removes the duplicate index/value pair from the tree. 1825 It's part of the private tree interface. 1826 */ 1827 status_t 1828 BPlusTree::_RemoveDuplicate(Transaction& transaction, 1829 const bplustree_node* node, CachedNode& cached, uint16 index, 1830 off_t value) 1831 { 1832 Unaligned<off_t>* values = node->Values(); 1833 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1834 1835 CachedNode cachedDuplicate(this); 1836 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1837 bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction, 1838 duplicateOffset, false); 1839 if (duplicate == NULL) 1840 return B_IO_ERROR; 1841 1842 // if it's a duplicate fragment, remove the entry from there 1843 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1844 duplicate_array* array = duplicate->FragmentAt( 1845 bplustree_node::FragmentIndex(oldValue)); 1846 int32 arrayCount = array->Count(); 1847 1848 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) { 1849 FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %" 1850 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1851 (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, 1852 arrayCount, fStream->ID())); 1853 return B_BAD_DATA; 1854 } 1855 if (!array->Remove(value)) { 1856 FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node " 1857 "%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value, 1858 duplicateOffset, fStream->ID())); 1859 return B_ENTRY_NOT_FOUND; 1860 } 1861 1862 // remove the array from the fragment node if it is empty 1863 if (--arrayCount == 1) { 1864 // set the link to the remaining value 1865 if (cached.MakeWritable(transaction) == NULL) 1866 return B_IO_ERROR; 1867 1868 values[index] = array->values[0]; 1869 1870 // Remove the whole fragment node, if this was the only array, 1871 // otherwise free just the array 1872 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1873 status_t status = cachedDuplicate.Free(transaction, 1874 duplicateOffset); 1875 if (status != B_OK) 1876 return status; 1877 } else 1878 array->count = 0; 1879 } 1880 return B_OK; 1881 } 1882 1883 // Remove value from a duplicate node! 1884 1885 duplicate_array* array = NULL; 1886 int32 arrayCount = 0; 1887 1888 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1889 FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF 1890 ", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID())); 1891 return B_BAD_DATA; 1892 } 1893 1894 // Search the duplicate nodes until the entry could be found (and removed) 1895 while (duplicate != NULL) { 1896 array = duplicate->DuplicateArray(); 1897 arrayCount = array->Count(); 1898 1899 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { 1900 FATAL(("_RemoveDuplicate: Invalid array size in duplicate %" 1901 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1902 duplicateOffset, arrayCount, fStream->ID())); 1903 return B_BAD_DATA; 1904 } 1905 1906 if (array->Remove(value)) { 1907 arrayCount--; 1908 break; 1909 } 1910 1911 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1912 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1913 1914 cachedDuplicate.UnsetUnchanged(transaction); 1915 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, 1916 false); 1917 } 1918 if (duplicate == NULL) 1919 RETURN_ERROR(B_IO_ERROR); 1920 1921 // The entry got removed from the duplicate node, but we might want to free 1922 // it now in case it's empty 1923 1924 while (true) { 1925 off_t left = duplicate->LeftLink(); 1926 off_t right = duplicate->RightLink(); 1927 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1928 1929 if ((isLast && arrayCount == 1) || arrayCount == 0) { 1930 // Free empty duplicate page, link their siblings together, and 1931 // update the duplicate link if needed (ie. when we either remove 1932 // the last duplicate node or have a new first one) 1933 1934 if (left == BPLUSTREE_NULL) { 1935 // the duplicate link points to us 1936 if (cached.MakeWritable(transaction) == NULL) 1937 return B_IO_ERROR; 1938 1939 if (arrayCount == 1) { 1940 // This is the last node, and there is only one value left; 1941 // replace the duplicate link with that value, it's no 1942 // duplicate anymore 1943 values[index] = array->values[0]; 1944 } else { 1945 // Move the duplicate link to the next node 1946 values[index] = HOST_ENDIAN_TO_BFS_INT64( 1947 bplustree_node::MakeLink( 1948 BPLUSTREE_DUPLICATE_NODE, right)); 1949 } 1950 } 1951 1952 status_t status = cachedDuplicate.Free(transaction, 1953 duplicateOffset); 1954 if (status != B_OK) 1955 return status; 1956 1957 if (left != BPLUSTREE_NULL 1958 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, 1959 false)) != NULL) { 1960 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1961 1962 // If the next node is the last node, we need to free that node 1963 // and convert the duplicate entry back into a normal entry 1964 array = duplicate->DuplicateArray(); 1965 arrayCount = array->Count(); 1966 if (right == BPLUSTREE_NULL 1967 && duplicate->LeftLink() == BPLUSTREE_NULL 1968 && arrayCount <= NUM_FRAGMENT_VALUES) { 1969 duplicateOffset = left; 1970 continue; 1971 } 1972 } 1973 if (right != BPLUSTREE_NULL 1974 && (duplicate = cachedDuplicate.SetToWritable(transaction, 1975 right, false)) != NULL) { 1976 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1977 1978 // Again, we may need to turn the duplicate entry back into a 1979 // normal entry 1980 array = duplicate->DuplicateArray(); 1981 arrayCount = array->Count(); 1982 if (left == BPLUSTREE_NULL 1983 && duplicate->RightLink() == BPLUSTREE_NULL 1984 && arrayCount <= NUM_FRAGMENT_VALUES) { 1985 duplicateOffset = right; 1986 continue; 1987 } 1988 } 1989 return B_OK; 1990 } 1991 if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) { 1992 // If the number of entries fits in a duplicate fragment, then 1993 // either find a free fragment node, or convert this node to a 1994 // fragment node. 1995 CachedNode cachedOther(this); 1996 1997 bplustree_node* fragment = NULL; 1998 uint32 fragmentIndex = 0; 1999 off_t offset; 2000 if (_FindFreeDuplicateFragment(transaction, node, cachedOther, 2001 &offset, &fragment, &fragmentIndex) == B_OK) { 2002 // move to other node 2003 duplicate_array* target = fragment->FragmentAt(fragmentIndex); 2004 memcpy(target, array, 2005 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2006 2007 cachedDuplicate.Free(transaction, duplicateOffset); 2008 duplicateOffset = offset; 2009 } else { 2010 // convert node 2011 memmove(duplicate, array, 2012 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2013 memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 2014 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2015 } 2016 2017 if (cached.MakeWritable(transaction) == NULL) 2018 return B_IO_ERROR; 2019 2020 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 2021 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 2022 } 2023 return B_OK; 2024 } 2025 } 2026 2027 2028 /*! Removes the key with the given index from the specified node. 2029 Since it has to get the key from the node anyway (to obtain it's 2030 pointer), it's not needed to pass the key & its length, although 2031 the calling method (BPlusTree::Remove()) have this data. 2032 */ 2033 void 2034 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index) 2035 { 2036 // should never happen, but who knows? 2037 if (index > node->NumKeys() && node->NumKeys() > 0) { 2038 FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF 2039 "\n", index, fStream->ID())); 2040 return; 2041 } 2042 2043 Unaligned<off_t>* values = node->Values(); 2044 2045 // if we would have to drop the overflow link, drop 2046 // the last key instead and update the overflow link 2047 // to the value of that one 2048 if (!node->IsLeaf() && index == node->NumKeys()) 2049 node->overflow_link = values[--index]; 2050 2051 uint16 length = 0; 2052 uint8* key = node->KeyAt(index, &length); 2053 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize 2054 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2055 FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length, 2056 fStream->ID())); 2057 fStream->GetVolume()->Panic(); 2058 return; 2059 } 2060 2061 Unaligned<uint16>* keyLengths = node->KeyLengths(); 2062 uint8* keys = node->Keys(); 2063 2064 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 2065 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64( 2066 node->AllKeyLength() - length); 2067 2068 Unaligned<off_t>* newValues = node->Values(); 2069 Unaligned<uint16>* newKeyLengths = node->KeyLengths(); 2070 2071 // move key data 2072 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 2073 2074 // move and update key lengths 2075 if (index > 0 && newKeyLengths != keyLengths) 2076 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 2077 for (uint16 i = index; i < node->NumKeys(); i++) { 2078 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 2079 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 2080 } 2081 2082 // move values 2083 if (index > 0) 2084 memmove(newValues, values, index * sizeof(off_t)); 2085 if (node->NumKeys() > index) { 2086 memmove(newValues + index, values + index + 1, 2087 (node->NumKeys() - index) * sizeof(off_t)); 2088 } 2089 } 2090 2091 2092 /*! Removes the specified key from the tree. The "value" parameter is only used 2093 for trees which allow duplicates, so you may safely ignore it. 2094 It's not an optional parameter, so at least you have to think about it. 2095 You need to have the inode write locked. 2096 */ 2097 status_t 2098 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength, 2099 off_t value) 2100 { 2101 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2102 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 2103 RETURN_ERROR(B_BAD_VALUE); 2104 2105 ASSERT_WRITE_LOCKED_INODE(fStream); 2106 2107 Stack<node_and_key> stack; 2108 if (_SeekDown(stack, key, keyLength) != B_OK) 2109 RETURN_ERROR(B_ERROR); 2110 2111 node_and_key nodeAndKey; 2112 const bplustree_node* node; 2113 2114 CachedNode cached(this); 2115 while (stack.Pop(&nodeAndKey) 2116 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 2117 #ifdef DEBUG 2118 NodeChecker checker(node, fNodeSize, "remove"); 2119 #endif 2120 if (node->IsLeaf()) { 2121 // first round, check for duplicate entries 2122 status_t status = _FindKey(node, key, keyLength, 2123 &nodeAndKey.keyIndex); 2124 if (status != B_OK) 2125 RETURN_ERROR(status); 2126 2127 // Is this a duplicate entry? 2128 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 2129 node->Values()[nodeAndKey.keyIndex]))) { 2130 if (fAllowDuplicates) { 2131 return _RemoveDuplicate(transaction, node, cached, 2132 nodeAndKey.keyIndex, value); 2133 } 2134 2135 FATAL(("dupliate node found where no duplicates are " 2136 "allowed, inode %" B_PRIdOFF "!\n", fStream->ID())); 2137 RETURN_ERROR(B_ERROR); 2138 } else { 2139 if (node->Values()[nodeAndKey.keyIndex] != value) 2140 return B_ENTRY_NOT_FOUND; 2141 2142 // If we will remove the last key, the iterator will be set 2143 // to the next node after the current - if there aren't any 2144 // more nodes, we need a way to prevent the TreeIterators to 2145 // touch the old node again, we use BPLUSTREE_FREE for this 2146 off_t next = node->RightLink() == BPLUSTREE_NULL 2147 ? BPLUSTREE_FREE : node->RightLink(); 2148 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 2149 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 2150 } 2151 } 2152 2153 bplustree_node* writableNode = cached.MakeWritable(transaction); 2154 if (writableNode == NULL) 2155 return B_IO_ERROR; 2156 2157 // if it's an empty root node, we have to convert it 2158 // to a leaf node by dropping the overflow link, or, 2159 // if it's already a leaf node, just empty it 2160 if (nodeAndKey.nodeOffset == fHeader.RootNode() 2161 && (node->NumKeys() == 0 2162 || (node->NumKeys() == 1 && node->IsLeaf()))) { 2163 writableNode->overflow_link 2164 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2165 writableNode->all_key_count = 0; 2166 writableNode->all_key_length = 0; 2167 2168 // if we've made a leaf node out of the root node, we need 2169 // to reset the maximum number of levels in the header 2170 if (fHeader.MaxNumberOfLevels() != 1) { 2171 CachedNode cached(this); 2172 bplustree_header* header 2173 = cached.SetToWritableHeader(transaction); 2174 if (header == NULL) 2175 return B_IO_ERROR; 2176 2177 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 2178 } 2179 return B_OK; 2180 } 2181 2182 // if there is only one key left, we don't have to remove 2183 // it, we can just dump the node (index nodes still have 2184 // the overflow link, so we have to drop the last key) 2185 if (writableNode->NumKeys() > 1 2186 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { 2187 _RemoveKey(writableNode, nodeAndKey.keyIndex); 2188 return B_OK; 2189 } 2190 2191 // when we are here, we can just free the node, but 2192 // we have to update the right/left link of the 2193 // siblings first 2194 CachedNode otherCached(this); 2195 bplustree_node* other = otherCached.SetToWritable(transaction, 2196 writableNode->LeftLink()); 2197 if (other != NULL) 2198 other->right_link = writableNode->right_link; 2199 2200 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) 2201 != NULL) { 2202 other->left_link = writableNode->left_link; 2203 } 2204 2205 cached.Free(transaction, nodeAndKey.nodeOffset); 2206 } 2207 RETURN_ERROR(B_ERROR); 2208 } 2209 2210 2211 /*! Replaces the value for the key in the tree. 2212 Returns B_OK if the key could be found and its value replaced, 2213 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 2214 to indicate that something went terribly wrong. 2215 Note that this doesn't work with duplicates - it will just 2216 return B_BAD_TYPE if you call this function on a tree where 2217 duplicates are allowed. 2218 You need to have the inode write locked. 2219 */ 2220 status_t 2221 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, 2222 off_t value) 2223 { 2224 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2225 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2226 || key == NULL) 2227 RETURN_ERROR(B_BAD_VALUE); 2228 2229 if (fAllowDuplicates) 2230 RETURN_ERROR(B_BAD_TYPE); 2231 2232 ASSERT_WRITE_LOCKED_INODE(fStream); 2233 2234 off_t nodeOffset = fHeader.RootNode(); 2235 CachedNode cached(this); 2236 const bplustree_node* node; 2237 2238 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2239 uint16 keyIndex = 0; 2240 off_t nextOffset; 2241 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2242 &nextOffset); 2243 2244 if (node->OverflowLink() == BPLUSTREE_NULL) { 2245 if (status == B_OK) { 2246 bplustree_node* writableNode = cached.MakeWritable(transaction); 2247 if (writableNode != NULL) { 2248 writableNode->Values()[keyIndex] 2249 = HOST_ENDIAN_TO_BFS_INT64(value); 2250 } else 2251 status = B_IO_ERROR; 2252 } 2253 2254 return status; 2255 } else if (nextOffset == nodeOffset) 2256 RETURN_ERROR(B_ERROR); 2257 2258 nodeOffset = nextOffset; 2259 } 2260 RETURN_ERROR(B_ERROR); 2261 } 2262 #endif // !_BOOT_MODE 2263 2264 2265 /*! Searches the key in the tree, and stores the offset found in _value, 2266 if successful. 2267 It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack 2268 while it descends the tree. 2269 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not. 2270 It can also return other errors to indicate that something went wrong. 2271 Note that this doesn't work with duplicates - it will just return 2272 B_BAD_TYPE if you call this function on a tree where duplicates are 2273 allowed. 2274 You need to have the inode read or write locked. 2275 */ 2276 status_t 2277 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value) 2278 { 2279 if (key == NULL || keyLength < BPLUSTREE_MIN_KEY_LENGTH) 2280 RETURN_ERROR(B_BAD_VALUE); 2281 2282 if (keyLength > BPLUSTREE_MAX_KEY_LENGTH) 2283 RETURN_ERROR(B_NAME_TOO_LONG); 2284 2285 if (fAllowDuplicates) 2286 RETURN_ERROR(B_BAD_TYPE); 2287 2288 #if !_BOOT_MODE 2289 ASSERT_READ_LOCKED_INODE(fStream); 2290 #endif 2291 2292 off_t nodeOffset = fHeader.RootNode(); 2293 CachedNode cached(this); 2294 const bplustree_node* node; 2295 2296 #ifdef DEBUG 2297 int32 levels = 0; 2298 #endif 2299 2300 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2301 uint16 keyIndex = 0; 2302 off_t nextOffset; 2303 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2304 &nextOffset); 2305 2306 #ifdef DEBUG 2307 levels++; 2308 #endif 2309 if (node->OverflowLink() == BPLUSTREE_NULL) { 2310 if (status == B_OK && _value != NULL) 2311 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 2312 2313 #ifdef DEBUG 2314 if (levels != (int32)fHeader.MaxNumberOfLevels()) 2315 DEBUGGER(("levels don't match")); 2316 #endif 2317 return status; 2318 } else if (nextOffset == nodeOffset) 2319 RETURN_ERROR(B_ERROR); 2320 2321 nodeOffset = nextOffset; 2322 } 2323 FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %" 2324 B_PRIdOFF "\n", nodeOffset, fStream->ID())); 2325 RETURN_ERROR(B_ERROR); 2326 } 2327 2328 2329 #if !_BOOT_MODE 2330 status_t 2331 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset, 2332 const uint8* largestKey, uint16 largestKeyLength, 2333 const bplustree_node* parent) 2334 { 2335 if (parent->CheckIntegrity(fNodeSize) != B_OK) { 2336 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check " 2337 "failed!\n", fStream->ID(), offset); 2338 check.FoundError(); 2339 return B_OK; 2340 } 2341 if (level >= fHeader.MaxNumberOfLevels()) { 2342 dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF 2343 "!\n", fStream->ID(), offset); 2344 check.FoundError(); 2345 return B_OK; 2346 } 2347 2348 check.SetLevel(level); 2349 2350 if (check.Visited(offset)) { 2351 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n", 2352 fStream->ID(), offset); 2353 check.FoundError(); 2354 return B_OK; 2355 } 2356 2357 check.SetVisited(offset); 2358 2359 uint32 count = parent->NumKeys(); 2360 Unaligned<off_t>* values = parent->Values(); 2361 off_t lastOffset = check.PreviousOffset(level); 2362 CachedNode cached(this); 2363 2364 for (uint32 i = 0; i < count; i++) { 2365 uint16 keyLength; 2366 uint8* key = parent->KeyAt(i, &keyLength); 2367 if (largestKey != NULL) { 2368 int result = _CompareKeys(key, keyLength, largestKey, 2369 largestKeyLength); 2370 if (result > 0 || (result == 0 && i != count - 1)) { 2371 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %" 2372 B_PRIu32 " larger than it should!\n", 2373 fStream->ID(), offset, i); 2374 check.FoundError(); 2375 } 2376 } 2377 2378 off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]); 2379 if (bplustree_node::IsDuplicate(childOffset)) { 2380 // Walk the duplicate nodes 2381 off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset); 2382 off_t lastDuplicateOffset = BPLUSTREE_NULL; 2383 2384 while (duplicateOffset != BPLUSTREE_NULL) { 2385 const bplustree_node* node; 2386 status_t status = cached.SetTo(duplicateOffset, &node, false); 2387 if (status != B_OK) { 2388 if (status == B_IO_ERROR) 2389 return B_IO_ERROR; 2390 2391 dprintf("inode %" B_PRIdOFF ": duplicate node at %" 2392 B_PRIdOFF " could not be read: %s\n", fStream->ID(), 2393 duplicateOffset, strerror(status)); 2394 check.FoundError(); 2395 break; 2396 } 2397 2398 bool isFragmentNode = bplustree_node::LinkType(childOffset) 2399 == BPLUSTREE_DUPLICATE_FRAGMENT; 2400 bool isKnownFragment = isFragmentNode 2401 && check.VisitedFragment(duplicateOffset); 2402 2403 if (!isKnownFragment && check.Visited(duplicateOffset)) { 2404 dprintf("inode %" B_PRIdOFF ": %s node at %" 2405 B_PRIdOFF " already visited, referenced from %" 2406 B_PRIdOFF "!\n", fStream->ID(), 2407 isFragmentNode ? "fragment" : "duplicate", 2408 duplicateOffset, offset); 2409 check.FoundError(); 2410 break; 2411 } 2412 2413 // Fragment nodes may be visited more than once from different 2414 // places 2415 if (!check.Visited(duplicateOffset)) 2416 check.SetVisited(duplicateOffset); 2417 if (!isKnownFragment && isFragmentNode) 2418 check.SetVisitedFragment(duplicateOffset); 2419 2420 duplicate_array* array; 2421 int32 minSize; 2422 int32 maxSize; 2423 if (isFragmentNode) { 2424 array = node->FragmentAt( 2425 bplustree_node::FragmentIndex(childOffset)); 2426 minSize = 2; 2427 maxSize = NUM_FRAGMENT_VALUES; 2428 } else { 2429 array = node->DuplicateArray(); 2430 minSize = 1; 2431 maxSize = NUM_DUPLICATE_VALUES; 2432 } 2433 int32 arrayCount = array->Count(); 2434 2435 if (arrayCount < minSize || arrayCount > maxSize) { 2436 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2437 " has invalid array size %" B_PRId32 "!\n", 2438 fStream->ID(), duplicateOffset, arrayCount); 2439 check.FoundError(); 2440 } else { 2441 // Simple check if the values in the array may be valid 2442 for (int32 j = 0; j < arrayCount; j++) { 2443 if (!fStream->GetVolume()->IsValidInodeBlock( 2444 array->ValueAt(j))) { 2445 dprintf("inode %" B_PRIdOFF ": duplicate at %" 2446 B_PRIdOFF " contains invalid block %" B_PRIdOFF 2447 " at %" B_PRId32 "!\n", fStream->ID(), 2448 duplicateOffset, array->ValueAt(j), j); 2449 check.FoundError(); 2450 break; 2451 } 2452 } 2453 } 2454 2455 // A fragment node is not linked (and does not have valid links) 2456 if (isFragmentNode) 2457 break; 2458 2459 if (node->LeftLink() != lastDuplicateOffset) { 2460 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2461 " has wrong left link %" B_PRIdOFF ", expected %" 2462 B_PRIdOFF "!\n", fStream->ID(), duplicateOffset, 2463 node->LeftLink(), lastDuplicateOffset); 2464 check.FoundError(); 2465 } 2466 2467 lastDuplicateOffset = duplicateOffset; 2468 duplicateOffset = node->RightLink(); 2469 } 2470 } else if (!parent->IsLeaf()) { 2471 // Test a regular child node recursively 2472 off_t nextOffset = parent->OverflowLink(); 2473 if (i < count - 1) 2474 nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]); 2475 2476 if (i == 0 && lastOffset != BPLUSTREE_NULL) { 2477 // Test right link of the previous node 2478 const bplustree_node* previous = cached.SetTo(lastOffset, true); 2479 if (previous == NULL) 2480 return B_IO_ERROR; 2481 2482 if (previous->RightLink() != childOffset) { 2483 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2484 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2485 "!\n", fStream->ID(), lastOffset, previous->RightLink(), 2486 childOffset); 2487 check.FoundError(); 2488 } 2489 } 2490 2491 status_t status = _ValidateChild(check, cached, level, childOffset, 2492 lastOffset, nextOffset, key, keyLength); 2493 if (status != B_OK) 2494 return status; 2495 } else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) { 2496 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains " 2497 "invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n", 2498 fStream->ID(), offset, childOffset, i); 2499 check.FoundError(); 2500 } 2501 2502 lastOffset = childOffset; 2503 } 2504 2505 if (parent->OverflowLink() != BPLUSTREE_NULL) { 2506 off_t childOffset = parent->OverflowLink(); 2507 status_t status = _ValidateChild(check, cached, level, childOffset, 2508 lastOffset, 0, NULL, 0); 2509 if (status != B_OK) 2510 return status; 2511 2512 lastOffset = childOffset; 2513 } 2514 2515 check.SetPreviousOffset(level, lastOffset); 2516 return B_OK; 2517 } 2518 2519 2520 status_t 2521 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level, 2522 off_t offset, off_t lastOffset, off_t nextOffset, 2523 const uint8* key, uint16 keyLength) 2524 { 2525 const bplustree_node* node; 2526 status_t status = cached.SetTo(offset, &node, true); 2527 if (status != B_OK) { 2528 if (status == B_IO_ERROR) 2529 return B_IO_ERROR; 2530 2531 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be " 2532 "read: %s\n", fStream->ID(), offset, strerror(status)); 2533 check.FoundError(); 2534 return B_OK; 2535 } 2536 2537 if (node->LeftLink() != lastOffset) { 2538 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2539 "wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF 2540 "!\n", fStream->ID(), offset, node->LeftLink(), lastOffset); 2541 check.FoundError(); 2542 } 2543 2544 if (nextOffset != 0 && node->RightLink() != nextOffset) { 2545 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2546 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2547 "!\n", fStream->ID(), offset, node->RightLink(), nextOffset); 2548 check.FoundError(); 2549 } 2550 2551 return _ValidateChildren(check, level + 1, offset, key, keyLength, node); 2552 } 2553 #endif // !_BOOT_MODE 2554 2555 2556 // #pragma mark - 2557 2558 2559 TreeIterator::TreeIterator(BPlusTree* tree) 2560 : 2561 fTree(tree), 2562 fCurrentNodeOffset(BPLUSTREE_NULL) 2563 { 2564 #if !_BOOT_MODE 2565 tree->_AddIterator(this); 2566 #endif 2567 } 2568 2569 2570 TreeIterator::~TreeIterator() 2571 { 2572 #if !_BOOT_MODE 2573 if (fTree) 2574 fTree->_RemoveIterator(this); 2575 #endif 2576 } 2577 2578 2579 status_t 2580 TreeIterator::Goto(int8 to) 2581 { 2582 if (fTree == NULL || fTree->fStream == NULL) 2583 RETURN_ERROR(B_BAD_VALUE); 2584 2585 #if !_BOOT_MODE 2586 // lock access to stream 2587 InodeReadLocker locker(fTree->fStream); 2588 #endif 2589 2590 off_t nodeOffset = fTree->fHeader.RootNode(); 2591 CachedNode cached(fTree); 2592 const bplustree_node* node; 2593 2594 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2595 // is the node a leaf node? 2596 if (node->OverflowLink() == BPLUSTREE_NULL) { 2597 fCurrentNodeOffset = nodeOffset; 2598 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 2599 fDuplicateNode = BPLUSTREE_NULL; 2600 2601 return B_OK; 2602 } 2603 2604 // get the next node offset depending on the direction (and if there 2605 // are any keys in that node at all) 2606 off_t nextOffset; 2607 if (to == BPLUSTREE_END || node->all_key_count == 0) 2608 nextOffset = node->OverflowLink(); 2609 else { 2610 if (node->AllKeyLength() > fTree->fNodeSize 2611 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 2612 - 8 * node->NumKeys()) 2613 RETURN_ERROR(B_ERROR); 2614 2615 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 2616 } 2617 if (nextOffset == nodeOffset) 2618 break; 2619 2620 nodeOffset = nextOffset; 2621 } 2622 FATAL(("%s fails\n", __FUNCTION__)); 2623 2624 RETURN_ERROR(B_ERROR); 2625 } 2626 2627 2628 /*! Iterates through the tree in the specified direction. 2629 When it iterates through duplicates, the "key" is only updated for the 2630 first entry - if you need to know when this happens, use the "duplicate" 2631 parameter which is 0 for no duplicate, 1 for the first, and 2 for all 2632 the other duplicates. 2633 That's not too nice, but saves the 256 bytes that would be needed to 2634 store the last key - if this will ever become an issue, it will be 2635 easy to change. 2636 The other advantage of this is, that the queries can skip all duplicates 2637 at once when they are not relevant to them. 2638 */ 2639 status_t 2640 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength, 2641 uint16 maxLength, off_t* value, uint16* duplicate) 2642 { 2643 if (fTree == NULL) 2644 return B_INTERRUPTED; 2645 2646 bool forward = direction == BPLUSTREE_FORWARD; 2647 2648 if (fCurrentNodeOffset == BPLUSTREE_NULL 2649 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK) 2650 RETURN_ERROR(B_ERROR); 2651 2652 // if the tree was emptied since the last call 2653 if (fCurrentNodeOffset == BPLUSTREE_FREE) 2654 return B_ENTRY_NOT_FOUND; 2655 2656 #if !_BOOT_MODE 2657 // lock access to stream 2658 InodeReadLocker locker(fTree->fStream); 2659 #endif 2660 2661 CachedNode cached(fTree); 2662 const bplustree_node* node; 2663 2664 if (fDuplicateNode != BPLUSTREE_NULL) { 2665 // regardless of traverse direction the duplicates are always presented 2666 // in the same order; since they are all considered as equal, this 2667 // shouldn't cause any problems 2668 2669 if (!fIsFragment || fDuplicate < fNumDuplicates) { 2670 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2671 false); 2672 } else 2673 node = NULL; 2674 2675 if (node != NULL) { 2676 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 2677 // If the node is out of duplicates, we go directly to the next 2678 // one 2679 fDuplicateNode = node->RightLink(); 2680 if (fDuplicateNode != BPLUSTREE_NULL 2681 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 2682 fNumDuplicates = node->CountDuplicates(fDuplicateNode, 2683 false); 2684 fDuplicate = 0; 2685 } 2686 } 2687 if (fDuplicate < fNumDuplicates) { 2688 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, 2689 fDuplicate++); 2690 if (duplicate) 2691 *duplicate = 2; 2692 return B_OK; 2693 } 2694 } 2695 fDuplicateNode = BPLUSTREE_NULL; 2696 } 2697 2698 off_t savedNodeOffset = fCurrentNodeOffset; 2699 int32 savedKey = fCurrentKey; 2700 2701 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 2702 RETURN_ERROR(B_ERROR); 2703 2704 if (duplicate) 2705 *duplicate = 0; 2706 2707 fCurrentKey += direction; 2708 2709 // is the current key in the current node? 2710 while ((forward && fCurrentKey >= node->NumKeys()) 2711 || (!forward && fCurrentKey < 0)) { 2712 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2713 2714 // are there any more nodes? 2715 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2716 node = cached.SetTo(fCurrentNodeOffset); 2717 if (!node) 2718 RETURN_ERROR(B_ERROR); 2719 2720 // reset current key 2721 fCurrentKey = forward ? 0 : node->NumKeys() - 1; 2722 } else { 2723 // there are no nodes left, so turn back to the last key 2724 fCurrentNodeOffset = savedNodeOffset; 2725 fCurrentKey = savedKey; 2726 2727 return B_ENTRY_NOT_FOUND; 2728 } 2729 } 2730 2731 if (node->all_key_count == 0) 2732 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2733 2734 uint16 length = 0; 2735 uint8* keyStart = node->KeyAt(fCurrentKey, &length); 2736 if (keyStart + length + sizeof(off_t) + sizeof(uint16) 2737 > (uint8*)node + fTree->fNodeSize 2738 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2739 #if !_BOOT_MODE 2740 fTree->fStream->GetVolume()->Panic(); 2741 #endif 2742 RETURN_ERROR(B_BAD_DATA); 2743 } 2744 2745 // include the termination for string types 2746 bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE; 2747 if (length + (needsTermination ? 1 : 0) > maxLength) { 2748 if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) { 2749 // The buffer is too small, restore the last key and return 2750 // an error 2751 fCurrentNodeOffset = savedNodeOffset; 2752 fCurrentKey = savedKey; 2753 return B_BUFFER_OVERFLOW; 2754 } 2755 2756 // Always cut off strings at the maximum buffer size, and leave 2757 // room for a terminating null byte. 2758 // This allows to handle larger key sizes gracefully. 2759 length = maxLength - 1; 2760 } 2761 2762 memcpy(key, keyStart, length); 2763 2764 if (needsTermination) 2765 ((char*)key)[length] = '\0'; 2766 2767 *keyLength = length; 2768 2769 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2770 2771 // duplicate fragments? 2772 uint8 type = bplustree_node::LinkType(offset); 2773 if (type == BPLUSTREE_DUPLICATE_FRAGMENT 2774 || type == BPLUSTREE_DUPLICATE_NODE) { 2775 fDuplicateNode = offset; 2776 2777 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2778 false); 2779 if (node == NULL) 2780 RETURN_ERROR(B_ERROR); 2781 2782 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 2783 2784 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 2785 if (fNumDuplicates) { 2786 offset = node->DuplicateAt(offset, fIsFragment, 0); 2787 fDuplicate = 1; 2788 if (duplicate) 2789 *duplicate = 1; 2790 } else { 2791 // Shouldn't happen, but we're dealing here with potentially 2792 // corrupt disks... 2793 fDuplicateNode = BPLUSTREE_NULL; 2794 offset = 0; 2795 } 2796 } 2797 *value = offset; 2798 2799 return B_OK; 2800 } 2801 2802 2803 /*! This is more or less a copy of BPlusTree::Find() - but it just 2804 sets the current position in the iterator, regardless of if the 2805 key could be found or not. 2806 */ 2807 status_t 2808 TreeIterator::Find(const uint8* key, uint16 keyLength) 2809 { 2810 if (fTree == NULL) 2811 return B_INTERRUPTED; 2812 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2813 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2814 || key == NULL) 2815 RETURN_ERROR(B_BAD_VALUE); 2816 2817 #if !_BOOT_MODE 2818 // lock access to stream 2819 InodeReadLocker locker(fTree->fStream); 2820 #endif 2821 2822 off_t nodeOffset = fTree->fHeader.RootNode(); 2823 2824 CachedNode cached(fTree); 2825 const bplustree_node* node; 2826 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2827 uint16 keyIndex = 0; 2828 off_t nextOffset = 0; 2829 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, 2830 &nextOffset); 2831 2832 if (node->OverflowLink() == BPLUSTREE_NULL) { 2833 fCurrentNodeOffset = nodeOffset; 2834 fCurrentKey = keyIndex - 1; 2835 fDuplicateNode = BPLUSTREE_NULL; 2836 2837 return status; 2838 } else if (nextOffset == nodeOffset) 2839 RETURN_ERROR(B_ERROR); 2840 2841 nodeOffset = nextOffset; 2842 } 2843 RETURN_ERROR(B_ERROR); 2844 } 2845 2846 2847 void 2848 TreeIterator::SkipDuplicates() 2849 { 2850 fDuplicateNode = BPLUSTREE_NULL; 2851 } 2852 2853 2854 void 2855 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, 2856 uint16 splitAt, int8 change) 2857 { 2858 if (offset != fCurrentNodeOffset) 2859 return; 2860 2861 if (nextOffset != BPLUSTREE_NULL) { 2862 fCurrentNodeOffset = nextOffset; 2863 if (splitAt <= fCurrentKey) { 2864 fCurrentKey -= splitAt; 2865 keyIndex -= splitAt; 2866 } 2867 } 2868 2869 // Adjust fCurrentKey to point to the same key as before. 2870 // Note, that if a key is inserted at the current position 2871 // it won't be included in this tree transition. 2872 if (keyIndex <= fCurrentKey) 2873 fCurrentKey += change; 2874 2875 // TODO: duplicate handling! 2876 } 2877 2878 2879 void 2880 TreeIterator::Stop() 2881 { 2882 fTree = NULL; 2883 } 2884 2885 2886 #ifdef DEBUG 2887 void 2888 TreeIterator::Dump() 2889 { 2890 __out("TreeIterator at %p:\n", this); 2891 __out("\tfTree = %p\n", fTree); 2892 __out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset); 2893 __out("\tfCurrentKey = %d\n", (int)fCurrentKey); 2894 __out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n", 2895 bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, 2896 fDuplicateNode); 2897 __out("\tfDuplicate = %u\n", fDuplicate); 2898 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2899 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); 2900 } 2901 #endif 2902 2903 2904 // #pragma mark - 2905 2906 2907 bool 2908 bplustree_header::IsValid() const 2909 { 2910 return Magic() == BPLUSTREE_MAGIC 2911 && (RootNode() % NodeSize()) == 0 2912 && IsValidLink(RootNode()) 2913 && IsValidLink(FreeNode()); 2914 } 2915 2916 2917 // #pragma mark - 2918 2919 2920 void 2921 bplustree_node::Initialize() 2922 { 2923 left_link = right_link = overflow_link 2924 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2925 all_key_count = 0; 2926 all_key_length = 0; 2927 } 2928 2929 2930 uint8* 2931 bplustree_node::KeyAt(int32 index, uint16* keyLength) const 2932 { 2933 if (index < 0 || index > NumKeys()) 2934 return NULL; 2935 2936 uint8* keyStart = Keys(); 2937 Unaligned<uint16>* keyLengths = KeyLengths(); 2938 2939 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2940 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2941 if (index > 0) 2942 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2943 2944 return keyStart; 2945 } 2946 2947 2948 uint8 2949 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2950 { 2951 // the duplicate fragment handling is currently hard-coded to a node size 2952 // of 1024 bytes - with future versions of BFS, this may be a problem 2953 2954 if (isFragment) { 2955 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2956 2957 return ((off_t*)this)[fragment]; 2958 } 2959 return OverflowLink(); 2960 } 2961 2962 2963 off_t 2964 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2965 { 2966 uint32 start; 2967 if (isFragment) 2968 start = 8 * ((uint64)offset & 0x3ff); 2969 else 2970 start = 2; 2971 2972 return ((off_t*)this)[start + 1 + index]; 2973 } 2974 2975 2976 /*! Although the name suggests it, this function doesn't return the real 2977 used fragment count; at least, it can only count to two: it returns 2978 0, if there is no fragment used, 1 if there is only one fragment 2979 used, and 2 if there are at least 2 fragments used. 2980 */ 2981 uint32 2982 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2983 { 2984 uint32 used = 0; 2985 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2986 duplicate_array* array = FragmentAt(i); 2987 if (array->Count() > 0 && ++used > 1) 2988 return used; 2989 } 2990 return used; 2991 } 2992 2993 2994 status_t 2995 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2996 { 2997 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2998 DEBUGGER(("invalid node: key/length count")); 2999 3000 for (int32 i = 0; i < NumKeys(); i++) { 3001 uint16 length = 0; 3002 uint8* key = KeyAt(i, &length); 3003 if (key + length + sizeof(off_t) + sizeof(uint16) 3004 > (uint8*)this + nodeSize 3005 || length > BPLUSTREE_MAX_KEY_LENGTH) { 3006 dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i); 3007 return B_BAD_DATA; 3008 } 3009 if (Values()[i] == -1) { 3010 dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values " 3011 "corrupted\n", this, (int)i, Values()[i].value); 3012 return B_BAD_DATA; 3013 } 3014 } 3015 return B_OK; 3016 } 3017 3018 3019 // #pragma mark - 3020 3021 3022 #if !_BOOT_MODE 3023 BitmapArray::BitmapArray(size_t numBits) 3024 { 3025 fSize = (numBits + 7) / 8; 3026 fBitmap = (uint8*)calloc(fSize, 1); 3027 fCountSet = 0; 3028 } 3029 3030 3031 BitmapArray::~BitmapArray() 3032 { 3033 free(fBitmap); 3034 } 3035 3036 3037 status_t 3038 BitmapArray::InitCheck() const 3039 { 3040 return fBitmap != NULL ? B_OK : B_NO_MEMORY; 3041 } 3042 3043 3044 bool 3045 BitmapArray::IsSet(size_t index) const 3046 { 3047 uint32 byteIndex = index / 8; 3048 if (byteIndex >= fSize) 3049 return false; 3050 3051 return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0; 3052 } 3053 3054 3055 void 3056 BitmapArray::Set(size_t index, bool set) 3057 { 3058 uint32 byteIndex = index / 8; 3059 if (byteIndex >= fSize) 3060 return; 3061 3062 if (set) { 3063 fBitmap[byteIndex] |= 1UL << (index & 0x7); 3064 fCountSet++; 3065 } else { 3066 fBitmap[byteIndex] &= ~(1UL << (index & 0x7)); 3067 fCountSet--; 3068 } 3069 } 3070 #endif // !_BOOT_MODE 3071 3072 3073 // #pragma mark - 3074 3075 3076 bool 3077 duplicate_array::_FindInternal(off_t value, int32& index) const 3078 { 3079 int32 min = 0, max = Count() - 1; 3080 off_t cmp; 3081 while (min <= max) { 3082 index = (min + max) / 2; 3083 3084 cmp = ValueAt(index) - value; 3085 if (cmp < 0) 3086 min = index + 1; 3087 else if (cmp > 0) 3088 max = index - 1; 3089 else 3090 return true; 3091 } 3092 return false; 3093 } 3094 3095 3096 void 3097 duplicate_array::Insert(off_t value) 3098 { 3099 // if there are more than 8 values in this array, use a 3100 // binary search, if not, just iterate linearly to find 3101 // the insertion point 3102 int32 size = Count(); 3103 int32 i = 0; 3104 if (size > 8 ) { 3105 if (!_FindInternal(value, i) && ValueAt(i) <= value) 3106 i++; 3107 } else { 3108 for (i = 0; i < size; i++) { 3109 if (ValueAt(i) > value) 3110 break; 3111 } 3112 } 3113 3114 memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t)); 3115 values[i] = HOST_ENDIAN_TO_BFS_INT64(value); 3116 count = HOST_ENDIAN_TO_BFS_INT64(size + 1); 3117 } 3118 3119 3120 bool 3121 duplicate_array::Remove(off_t value) 3122 { 3123 int32 index = Find(value); 3124 if (index == -1) 3125 return false; 3126 3127 int32 newSize = Count() - 1; 3128 memmove(&values[index], &values[index + 1], 3129 (newSize - index) * sizeof(off_t)); 3130 count = HOST_ENDIAN_TO_BFS_INT64(newSize); 3131 3132 return true; 3133 } 3134 3135 3136 #if _BOOT_MODE 3137 } // namespace BFS 3138 #endif 3139