/*
 * Copyright 2001-2014, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 *
 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
 * no code but achieves binary compatibility with the on disk format.
 */


//!	B+Tree implementation


#include "BPlusTree.h"

#include <file_systems/QueryParserUtils.h>

#include "Debug.h"
#include "Utility.h"

#if !_BOOT_MODE
#	include "Inode.h"
#else
#	include "Stream.h"

// BFS::Stream from the bootloader has the same API as Inode.
#	define Inode BFS::Stream

#	define strerror(x) "error message unavailable"

namespace BFS {
#endif


/*!	Simple array used for the duplicate handling in the B+Tree. This is an
	on disk structure.
*/
struct duplicate_array {
	off_t	count;
	off_t	values[0];

	inline bool IsEmpty() const
	{
		return count == 0;
	}

	inline int32 Count() const
	{
		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
	}

	inline off_t ValueAt(uint32 index) const
	{
		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
	}

	inline void SetValueAt(uint32 index, off_t value)
	{
		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
	}

	inline int32 Find(off_t value) const
	{
		int32 i;
		return _FindInternal(value, i) ? i : -1;
	}

	void Insert(off_t value);
	bool Remove(off_t value);

private:
	bool _FindInternal(off_t value, int32& index) const;
} _PACKED;


#ifdef DEBUG
class NodeChecker {
public:
	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
		:
		fNode(node),
		fSize(nodeSize),
		fText(text)
	{
		Check("integrity check failed on construction.");
	}

	~NodeChecker()
	{
		Check("integrity check failed on destruction.");
	}

	void
	Check(const char* message)
	{
		if (fNode->CheckIntegrity(fSize) != B_OK) {
			dprintf("%s: %s\n", fText, message);
			DEBUGGER(("NodeChecker integrity check failed!"));
		}
	}

private:
	const bplustree_node*	fNode;
	int32					fSize;
	const char*				fText;
};
#endif // DEBUG


#if !_BOOT_MODE
class BitmapArray {
public:
			BitmapArray(size_t numBits);
			~BitmapArray();

	status_t	InitCheck() const;

	bool		IsSet(size_t index) const;
	void		Set(size_t index, bool set);

	size_t		CountSet() const { return fCountSet; }

private:
	uint8*		fBitmap;
	size_t		fSize;
	size_t		fCountSet;
};


struct TreeCheck {
	TreeCheck(BPlusTree* tree)
		:
		fLevelCount(0),
		fFreeCount(0),
		fNodeSize(tree->NodeSize()),
		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
		fFoundErrors(0),
		fVisited(tree->Stream()->Size() / tree->NodeSize()),
		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
	{
		fPreviousOffsets = (off_t*)malloc(
			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
		if (fPreviousOffsets != NULL) {
			for (size_t i = 0; i < fMaxLevels; i++)
				fPreviousOffsets[i] = BPLUSTREE_NULL;
		}
	}

	~TreeCheck()
	{
		free(fPreviousOffsets);
	}

	status_t InitCheck() const
	{
		if (fPreviousOffsets == NULL)
			return B_NO_MEMORY;

		status_t status = fVisited.InitCheck();
		if (status != B_OK)
			return status;

		return fVisitedFragment.InitCheck();
	}

	bool Visited(off_t offset) const
	{
		return fVisited.IsSet(offset / fNodeSize);
	}

	void SetVisited(off_t offset)
	{
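		// Node offsets are multiples of the node size, so offset / fNodeSize
		// serves as the bit index into the visited bitmap.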
		fVisited.Set(offset / fNodeSize, true);
	}

	size_t VisitedCount() const
	{
		return fVisited.CountSet();
	}

	bool VisitedFragment(off_t offset) const
	{
		return fVisitedFragment.IsSet(offset / fNodeSize);
	}

	void SetVisitedFragment(off_t offset)
	{
		fVisitedFragment.Set(offset / fNodeSize, true);
	}

	uint32 MaxLevels() const
	{
		return fLevelCount;
	}

	void SetLevel(uint32 level)
	{
		if (fLevelCount < level)
			fLevelCount = level;
	}

	off_t PreviousOffset(uint32 level)
	{
		return fPreviousOffsets[level];
	}

	void SetPreviousOffset(uint32 level, off_t offset)
	{
		fPreviousOffsets[level] = offset;
	}

	void FoundError()
	{
		fFoundErrors++;
	}

	bool ErrorsFound()
	{
		return fFoundErrors != 0;
	}

private:
	uint32		fLevelCount;
	uint32		fFreeCount;
	uint32		fNodeSize;
	uint32		fMaxLevels;
	uint32		fFoundErrors;
	BitmapArray	fVisited;
	BitmapArray	fVisitedFragment;
	off_t*		fPreviousOffsets;
};


// #pragma mark -


// Node Caching for the BPlusTree class
//
// With write support, there is the need for a function that allocates new
// nodes by either returning empty nodes, or by growing the file's data stream
//
// !! The CachedNode class assumes that you have properly locked the stream
// !! before asking for nodes.
//
// Note: This code will fail if the block size is smaller than the node size!
// Since BFS supports block sizes of 1024 bytes or greater, and the node size
// is hard-coded to 1024 bytes, that's not an issue now.

void
CachedNode::UnsetUnchanged(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		void* cache = fTree->fStream->GetVolume()->BlockCache();

		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
		block_cache_put(cache, fBlockNumber);
		fNode = NULL;
	}
}
#endif // !_BOOT_MODE


void
CachedNode::Unset()
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
#if !_BOOT_MODE
		if (fWritable && fOffset == 0) {
			// The B+tree header has been updated - we need to update the
			// BPlusTree's copy of it, as well.
			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
		}

		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
			fBlockNumber);
#endif // !_BOOT_MODE

		fNode = NULL;
	}
}


const bplustree_node*
CachedNode::SetTo(off_t offset, bool check)
{
	const bplustree_node* node;
	if (SetTo(offset, &node, check) == B_OK)
		return node;

	return NULL;
}


status_t
CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0) {
		RETURN_ERROR(B_BAD_VALUE);
	}

	if (InternalSetTo(NULL, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader.CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
				fBlockNumber, fTree->fStream->ID()));
			return B_BAD_DATA;
		}
	}

	*_node = fNode;
	return B_OK;
}


#if !_BOOT_MODE
bplustree_node*
CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(&transaction, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader.CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
				fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


bplustree_node*
CachedNode::MakeWritable(Transaction& transaction)
{
	if (fNode == NULL)
		return NULL;

	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
			fBlockNumber, transaction.ID()) == B_OK) {
		return fNode;
	}

	return NULL;
}
#endif // !_BOOT_MODE


const bplustree_header*
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(NULL, 0LL);
	return (bplustree_header*)fNode;
}


#if !_BOOT_MODE
bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(&transaction, 0LL);

	if (fNode != NULL && !fTree->fInTransaction) {
		transaction.AddListener(fTree);
		fTree->fInTransaction = true;

		if (!transaction.GetVolume()->IsInitializing()) {
			acquire_vnode(transaction.GetVolume()->FSVolume(),
				fTree->fStream->ID());
		}
	}

	return (bplustree_header*)fNode;
}
#endif // !_BOOT_MODE


bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {

#if !_BOOT_MODE
		Volume* volume = fTree->fStream->GetVolume();
#else
		Volume* volume = &fTree->fStream->GetVolume();
#endif

		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8* block = NULL;

#if !_BOOT_MODE
		if (transaction != NULL) {
			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}
#else // !_BOOT_MODE
		if (fBlock == NULL) {
			fBlock = (uint8*)malloc(volume->BlockSize());
			if (fBlock == NULL)
				return NULL;
		}

		if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
				fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
			block = fBlock;
		}

		fWritable = false;
#endif // _BOOT_MODE

		if (block != NULL) {
			// The node is somewhere in that block...
			// (confusing offset calculation)
			fNode = (bplustree_node*)(block + offset
				- (fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}


#if !_BOOT_MODE
status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// TODO: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done every time this
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	CachedNode cached(fTree);
	bplustree_header* header = cached.SetToWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

#if 0
	// TODO: temporarily disabled because CheckNode() doesn't like this...
	// Also, it's such an edge case that it's almost useless, anyway.
	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status != B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}
#endif

	// add the node to the free nodes list
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}


status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
	off_t* _offset)
{
	if (fTree == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header* header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
		CachedNode cached(fTree);
		header = cached.SetToWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode* stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
		return status;

	CachedNode cached(fTree);
	header = cached.SetToWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	cached.Unset();
		// SetToWritable() below needs the new values in the tree's header

	if (SetToWritable(transaction, offset, false) == NULL)
		RETURN_ERROR(B_ERROR);

	fNode->Initialize();

	*_offset = offset;
	*_node = fNode;
	return B_OK;
}


// #pragma mark -


BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
	:
	fStream(NULL),
	fInTransaction(false)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(transaction, stream);
}
#endif // !_BOOT_MODE


BPlusTree::BPlusTree(Inode* stream)
	:
	fStream(NULL),
	fInTransaction(false)
{
#if !_BOOT_MODE
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
#endif

	SetTo(stream);
}


BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fInTransaction(false),
	fStatus(B_NO_INIT)
{
#if !_BOOT_MODE
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
#endif
}


BPlusTree::~BPlusTree()
{
#if !_BOOT_MODE
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	mutex_lock(&fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Stop();

	mutex_destroy(&fIteratorLock);

	ASSERT(!fInTransaction);
#endif // !_BOOT_MODE
}


#if !_BOOT_MODE
/*!	Create a new B+Tree on the specified stream */
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	CachedNode cached(this);
	bplustree_header* header = cached.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus != B_OK)
			RETURN_ERROR(fStatus);

		header = cached.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	cached.Unset();

	// initialize b+tree root node
	cached.SetToWritable(transaction, fHeader.RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}
#endif // !_BOOT_MODE


status_t
BPlusTree::SetTo(Inode* stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	fStream = stream;

	// get on-disk B+Tree header

	CachedNode cached(this);
	const bplustree_header* header = cached.SetToHeader();
	if (header != NULL)
		memcpy(&fHeader, header, sizeof(bplustree_header));
	else
		RETURN_ERROR(fStatus = B_IO_ERROR);

	// is header valid?
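	// (Summary of the checks below: the declared maximum_size has to match
	// the stream size, fHeader.IsValid() has to accept the header itself, and
	// the key data type has to agree with the index mode bits of the inode.)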

	if (fHeader.MaximumSize() != stream->Size()) {
		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
		// we can't change the header since we don't have a transaction
		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (!fHeader.IsValid()) {
#ifdef DEBUG
		dump_bplustree_header(&fHeader);
		dump_block((const char*)&fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader.NodeSize();

	// validity check
	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
		S_DOUBLE_INDEX};
	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
		| S_FLOAT_INDEX | S_DOUBLE_INDEX);

	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
		|| ((stream->Mode() & S_INDEX_DIR) != 0
			&& kToMode[fHeader.DataType()] != mode)
		|| !stream->IsContainer()) {
		D(	dump_bplustree_header(&fHeader);
			dump_inode(&stream->Node());
		);
		RETURN_ERROR(fStatus = B_BAD_TYPE);
	}

	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
	// in the original BFS code - we will honour it nevertheless
	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	cached.SetTo(fHeader.RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


status_t
BPlusTree::InitCheck()
{
	return fStatus;
}


#if !_BOOT_MODE
status_t
BPlusTree::Validate(bool repair, bool& _errorsFound)
{
	TreeCheck check(this);
	if (check.InitCheck() != B_OK)
		return B_NO_MEMORY;

	check.SetVisited(0);

	// Walk the free nodes

	CachedNode cached(this);
	off_t freeOffset = fHeader.FreeNode();
	while (freeOffset > 0) {
		const bplustree_node* node;
		status_t status = cached.SetTo(freeOffset, &node, false);
		if (status != B_OK) {
			if (status == B_IO_ERROR)
				return B_IO_ERROR;

			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
				"not be read: %s\n", fStream->ID(), freeOffset,
				strerror(status));
			check.FoundError();
			break;
		}

		if (check.Visited(freeOffset)) {
			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
				" circular!\n", fStream->ID(), freeOffset);
			// TODO: if 'repair' is true, we could collect all unvisited nodes
			// at the end, and put them into the free list
			check.FoundError();
			break;
		}

		check.SetVisited(freeOffset);

		if (node->OverflowLink() != BPLUSTREE_FREE) {
			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
				" misses free mark!\n", fStream->ID(), freeOffset);
		}
		freeOffset = node->LeftLink();
	}

	// Iterate over the complete tree recursively

	const bplustree_node* root;
	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
	if (status != B_OK)
		return status;

	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);

	if (check.ErrorsFound())
		_errorsFound = true;

	if (status != B_OK)
		return status;

	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
			fHeader.MaxNumberOfLevels());
	}

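	// (Every node of the stream should have been seen exactly once, either
	// during the tree walk or while following the free list; the count check
	// below reports any nodes that were missed.)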
	if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
			" nodes.\n", fStream->ID(), check.VisitedCount(),
			fHeader.MaximumSize() / fNodeSize);
	}

	return B_OK;
}


status_t
BPlusTree::MakeEmpty()
{
	// Put all nodes into the free list in order
	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());

	// Reset the header, and root node
	CachedNode cached(this);
	bplustree_header* header = cached.SetToWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
	if (fStream->Size() > (off_t)NodeSize() * 2)
		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
	else {
		header->free_node_pointer
			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	}

	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
	if (node == NULL)
		return B_IO_ERROR;

	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	node->all_key_count = 0;
	node->all_key_length = 0;

	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
			offset += NodeSize()) {
		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
		if (node == NULL) {
			dprintf("--> could not open %" B_PRIdOFF "\n", offset);
			return B_IO_ERROR;
		}
		if (offset < fStream->Size() - (off_t)NodeSize())
			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
		else
			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

		// It's not important to write it out in a single transaction; just
		// make sure it doesn't get too large
		if (offset % (1024 * 1024) == 0) {
			transaction.Done();
			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
		}
	}

	return transaction.Done();
}


int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
	switch (code) {
		case B_STRING_TYPE:
			return BPLUSTREE_STRING_TYPE;
		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return BPLUSTREE_INT32_TYPE;
		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			return BPLUSTREE_UINT32_TYPE;
		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			return BPLUSTREE_INT64_TYPE;
		case B_UINT64_TYPE:
			return BPLUSTREE_UINT64_TYPE;
		case B_FLOAT_TYPE:
			return BPLUSTREE_FLOAT_TYPE;
		case B_DOUBLE_TYPE:
			return BPLUSTREE_DOUBLE_TYPE;
	}
	return -1;
}


int32
BPlusTree::ModeToKeyType(mode_t mode)
{
	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
		case S_INT_INDEX:
			return BPLUSTREE_INT32_TYPE;
		case S_UINT_INDEX:
			return BPLUSTREE_UINT32_TYPE;
		case S_LONG_LONG_INDEX:
			return BPLUSTREE_INT64_TYPE;
		case S_ULONG_LONG_INDEX:
			return BPLUSTREE_UINT64_TYPE;
		case S_FLOAT_INDEX:
			return BPLUSTREE_FLOAT_TYPE;
		case S_DOUBLE_INDEX:
			return BPLUSTREE_DOUBLE_TYPE;
		case S_STR_INDEX:
		default:
			// default is for standard directories
			return BPLUSTREE_STRING_TYPE;
	}
}
#endif // !_BOOT_MODE


// #pragma mark - TransactionListener implementation


#if !_BOOT_MODE
void
BPlusTree::TransactionDone(bool success)
{
	if (!success) {
		// update header from disk
		CachedNode cached(this);
		const bplustree_header* header = cached.SetToHeader();
		if (header != NULL)
			memcpy(&fHeader, header, sizeof(bplustree_header));
	}
}


void
BPlusTree::RemovedFromTransaction()
{
	fInTransaction = false;

	if (!fStream->GetVolume()->IsInitializing())
		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
}


// #pragma mark -


void
BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// Although every iterator which is affected by this update currently
	// waits on a semaphore, other iterators could be added/removed at
	// any time, so we need to protect this loop
	MutexLocker _(fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
}


void
BPlusTree::_AddIterator(TreeIterator* iterator)
{
	MutexLocker _(fIteratorLock);
	fIterators.Add(iterator);
}


void
BPlusTree::_RemoveIterator(TreeIterator* iterator)
{
	MutexLocker _(fIteratorLock);
	fIterators.Remove(iterator);
}
#endif // !_BOOT_MODE


int32
BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
	int keyLength2)
{
	type_code type = 0;
	switch (fHeader.DataType()) {
		case BPLUSTREE_STRING_TYPE:
			type = B_STRING_TYPE;
			break;
		case BPLUSTREE_INT32_TYPE:
			type = B_INT32_TYPE;
			break;
		case BPLUSTREE_UINT32_TYPE:
			type = B_UINT32_TYPE;
			break;
		case BPLUSTREE_INT64_TYPE:
			type = B_INT64_TYPE;
			break;
		case BPLUSTREE_UINT64_TYPE:
			type = B_UINT64_TYPE;
			break;
		case BPLUSTREE_FLOAT_TYPE:
			type = B_FLOAT_TYPE;
			break;
		case BPLUSTREE_DOUBLE_TYPE:
			type = B_DOUBLE_TYPE;
			break;
	}
	return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
}


status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
	uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
	NodeChecker checker(node, fNodeSize, "find");
#endif

	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t* values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8* searchKey = node->KeyAt(i, &searchLength);
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8*)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
#if !_BOOT_MODE
			fStream->GetVolume()->Panic();
#endif
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}


#if !_BOOT_MODE
/*!	Prepares the stack to contain all nodes that were passed while
	following the key, from the root node to the leaf node that could
	or should contain that key.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader.RootNode();

	CachedNode cached(this);
	const bplustree_node* node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
			break;
		}

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
	return B_ERROR;
}


/*!	This will find a free duplicate fragment in the given bplustree_node.
	The CachedNode will be set to the writable fragment on success.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached,
	off_t* _offset, bplustree_node** _fragment, uint32* _index)
{
	off_t* values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
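		// (Background sketch: bplustree_node::MakeLink() encodes the link
		// type in the upper bits of a value; LinkType() reads it back, while
		// FragmentOffset()/FragmentIndex() recover the fragment node's offset
		// and the slot inside it - see their definitions in BPlusTree.h.)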
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node* fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
				B_PRIdOFF "\n", value, fStream->ID()));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array* array = fragment->FragmentAt(j);

			if (array->IsEmpty()) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}


status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
	const bplustree_node* node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue)
				== BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
				transaction, bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array* array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			int32 arrayCount = array->Count();
			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue), arrayCount,
					fStream->ID()));
				return B_BAD_DATA;
			}

			if (arrayCount < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// Test if the fragment will be empty if we remove this key's
				// values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// The node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// Create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// The old duplicate has not been touched, so we can
						// reuse it

					bplustree_node* newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status != B_OK)
						RETURN_ERROR(status);

					// Copy the array from the fragment node to the duplicate
					// node and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = array->count;
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->Count() * sizeof(off_t));
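					// (A duplicate node's DuplicateArray() lives at its
					// overflow_link field, with the values following in the
					// key area, which is why the copy above targets
					// overflow_link/all_key_count before the old fragment
					// array is cleared below.)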
					memset(array, 0,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// Update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index]
					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		duplicate_array* array;
		int32 arrayCount;
		const bplustree_node* duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			arrayCount = array->Count();
			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
					duplicateOffset, arrayCount, fStream->ID()));
				return B_BAD_DATA;
			}
		} while (arrayCount >= NUM_DUPLICATE_VALUES
			&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node* writableDuplicate
			= cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (arrayCount < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node* newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
				&offset);
			if (status != B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node* fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
		if (status != B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}


void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t* values = node->Values();
	uint16* keyLengths = node->KeyLengths();
	uint8* keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	off_t* newValues = node->Values();
	uint16* newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}


/*!	Splits the \a node into two halves - the other half will be put into
	\a other. It also takes care to create a new overflow link if the node
	to split is an index node.
*/
status_t
BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
	uint16* _keyLength, off_t* _value)
{
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	uint16* inKeyLengths = node->KeyLengths();
	off_t* inKeyValues = node->Values();
	uint8* inKeys = node->Keys();
	uint8* outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	if (keyIndex > node->NumKeys()) {
		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
		return B_BAD_VALUE;
	}

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	size_t size = fNodeSize >> 1;
	int32 out, in;
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes) {
			bytesBefore = in > 0
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
		}

		if (in == keyIndex && !bytes) {
			bytes = *_keyLength;
		} else {
			if (keyIndex < out) {
				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
					- bytesBefore;
			}

			in++;
		}
		out++;

		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
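			// (I.e. the aligned key bytes plus one length/value index entry
			// per key would no longer fit into half of the node.)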
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16* outKeyLengths = other->KeyLengths();
	off_t* outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
				bytesAfter);
			keys = out - keyIndex - 1;
			for (int32 i = 0; i < keys; i++) {
				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
						+ bytes);
			}
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
				keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8* newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8* droppedKey = node->KeyAt(in, &newLength);
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
					> (uint8*)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8*)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;

			newAllocated = true;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
			bytes = *_keyLength;
			out++;
		} else {
			if (in < node->NumKeys()) {
				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

				if (bytes) {
					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
						- bytesBefore - bytes;
				}
				out++;
			}
			in++;
		}
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0) {
		if (newAllocated)
			free(newKey);
		return B_BAD_DATA;
	}

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;
	in = out - keyIndex - 1;
		// Note: keyIndex and in will contain invalid values when the new key
		// went to the other node. But in this case bytes and bytesAfter are
		// 0 and subsequently we never use keyIndex and in.

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter) {
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
			bytesAfter);
	}

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	if (bytesAfter) {
		// if bytesAfter is > 0, keyIndex is larger than skip
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
			in * sizeof(uint16));
	}

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter) {
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
			in * sizeof(off_t));
	}

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}


/*!	This inserts a key into the tree. The changes made to the tree will
	all be part of the \a transaction.
	You need to have the inode write locked.
*/
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %" B_PRId64 "!\n", value);
#endif

	ASSERT_WRITE_LOCKED_INODE(fStream);

	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node* node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = _InsertDuplicate(transaction, cached, node,
						nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				return B_NAME_IN_USE;
			}
		}

		bplustree_node* writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
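		// (Rough numbers, assuming the default 1024-byte nodes: every key
		// costs its aligned key bytes plus sizeof(uint16) + sizeof(off_t)
		// = 10 bytes for the length index and value arrays; once that total
		// would reach fNodeSize, the node has to be split instead.)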
		if (int32(key_align(sizeof(bplustree_node)
				+ writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
				+ sizeof(off_t))) < fNodeSize) {
			_InsertKey(writableNode, nodeAndKey.keyIndex,
				keyBuffer, keyLength, value);
			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
				nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
				bplustree_node* root;
				status_t status = cachedNewRoot.Allocate(transaction, &root,
					&newRoot);
				if (status != B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuilt next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node* other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other,
				&otherOffset);
			if (status != B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
					&value) != B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction,
					other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node* root = cachedNewRoot.Node();

				_InsertKey(root, 0, keyBuffer, keyLength,
					writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
					nodeAndKey.nodeOffset);

				CachedNode cached(this);
				bplustree_header* header
					= cached.SetToWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
					header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}


/*!	Removes the duplicate index/value pair from the tree.
	It's part of the private tree interface.
*/
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached, uint16 index,
	off_t value)
{
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array* array = duplicate->FragmentAt(
			bplustree_node::FragmentIndex(oldValue));
		int32 arrayCount = array->Count();

		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
				arrayCount, fStream->ID()));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
				duplicateOffset, fStream->ID()));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		if (--arrayCount == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction,
					duplicateOffset);
				if (status != B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!
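	// (Dedicated duplicate nodes form a doubly linked chain through
	// left_link/right_link; each node keeps its values in the duplicate_array
	// returned by DuplicateArray(), which is what the loop below walks.)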

	duplicate_array* array = NULL;
	int32 arrayCount = 0;

	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		arrayCount = array->Count();

		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
				duplicateOffset, arrayCount, fStream->ID()));
			return B_BAD_DATA;
		}

		if (array->Remove(value)) {
			arrayCount--;
			break;
		}

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
			false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && arrayCount == 1) || arrayCount == 0) {
			// Free the empty duplicate page, link its siblings together, and
			// update the duplicate link if needed (i.e. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (arrayCount == 1) {
					// This is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no
					// duplicate anymore
					values[index] = array->values[0];
				} else {
					// Move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(
						bplustree_node::MakeLink(
							BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status = cachedDuplicate.Free(transaction,
				duplicateOffset);
			if (status != B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
						false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				arrayCount = array->Count();
				if (right == BPLUSTREE_NULL
					&& duplicate->LeftLink() == BPLUSTREE_NULL
					&& arrayCount <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
						right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a
				// normal entry
				array = duplicate->DuplicateArray();
				arrayCount = array->Count();
				if (left == BPLUSTREE_NULL
					&& duplicate->RightLink() == BPLUSTREE_NULL
					&& arrayCount <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
continue; 1988 } 1989 } 1990 return B_OK; 1991 } 1992 if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) { 1993 // If the number of entries fits in a duplicate fragment, then 1994 // either find a free fragment node, or convert this node to a 1995 // fragment node. 1996 CachedNode cachedOther(this); 1997 1998 bplustree_node* fragment = NULL; 1999 uint32 fragmentIndex = 0; 2000 off_t offset; 2001 if (_FindFreeDuplicateFragment(transaction, node, cachedOther, 2002 &offset, &fragment, &fragmentIndex) == B_OK) { 2003 // move to other node 2004 duplicate_array* target = fragment->FragmentAt(fragmentIndex); 2005 memcpy(target, array, 2006 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2007 2008 cachedDuplicate.Free(transaction, duplicateOffset); 2009 duplicateOffset = offset; 2010 } else { 2011 // convert node 2012 memmove(duplicate, array, 2013 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2014 memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 2015 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2016 } 2017 2018 if (cached.MakeWritable(transaction) == NULL) 2019 return B_IO_ERROR; 2020 2021 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 2022 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 2023 } 2024 return B_OK; 2025 } 2026 } 2027 2028 2029 /*! Removes the key with the given index from the specified node. 2030 Since it has to get the key from the node anyway (to obtain its 2031 pointer), there is no need to pass the key and its length, although 2032 the calling method (BPlusTree::Remove()) has this data. 2033 */ 2034 void 2035 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index) 2036 { 2037 // should never happen, but who knows? 2038 if (index > node->NumKeys() && node->NumKeys() > 0) { 2039 FATAL(("Asked to remove key outside of limits: %u, inode %" B_PRIdOFF 2040 "\n", index, fStream->ID())); 2041 return; 2042 } 2043 2044 off_t* values = node->Values(); 2045 2046 // if we would have to drop the overflow link, drop 2047 // the last key instead and update the overflow link 2048 // to the value of that one 2049 if (!node->IsLeaf() && index == node->NumKeys()) 2050 node->overflow_link = values[--index]; 2051 2052 uint16 length; 2053 uint8* key = node->KeyAt(index, &length); 2054 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize 2055 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2056 FATAL(("Key length too long: %s, %u inode %" B_PRIdOFF "\n", key, length, 2057 fStream->ID())); 2058 fStream->GetVolume()->Panic(); 2059 return; 2060 } 2061 2062 uint16* keyLengths = node->KeyLengths(); 2063 uint8* keys = node->Keys(); 2064 2065 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 2066 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64( 2067 node->AllKeyLength() - length); 2068 2069 off_t* newValues = node->Values(); 2070 uint16* newKeyLengths = node->KeyLengths(); 2071 2072 // move key data 2073 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 2074 2075 // move and update key lengths 2076 if (index > 0 && newKeyLengths != keyLengths) 2077 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 2078 for (uint16 i = index; i < node->NumKeys(); i++) { 2079 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 2080 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 2081 } 2082 2083 // move values 2084 if (index > 0) 2085 memmove(newValues, values, index * sizeof(off_t)); 2086 if (node->NumKeys() > index) { 2087 memmove(newValues + index, values + index + 1, 2088 (node->NumKeys() - index) * sizeof(off_t)); 2089 }
2090 } 2091 2092 2093 /*! Removes the specified key from the tree. The "value" parameter is only used 2094 for trees which allow duplicates, so you may safely ignore it. 2095 It's not an optional parameter, so at least you have to think about it. 2096 You need to have the inode write locked. 2097 */ 2098 status_t 2099 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength, 2100 off_t value) 2101 { 2102 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2103 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 2104 RETURN_ERROR(B_BAD_VALUE); 2105 2106 ASSERT_WRITE_LOCKED_INODE(fStream); 2107 2108 Stack<node_and_key> stack; 2109 if (_SeekDown(stack, key, keyLength) != B_OK) 2110 RETURN_ERROR(B_ERROR); 2111 2112 node_and_key nodeAndKey; 2113 const bplustree_node* node; 2114 2115 CachedNode cached(this); 2116 while (stack.Pop(&nodeAndKey) 2117 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 2118 #ifdef DEBUG 2119 NodeChecker checker(node, fNodeSize, "remove"); 2120 #endif 2121 if (node->IsLeaf()) { 2122 // first round, check for duplicate entries 2123 status_t status = _FindKey(node, key, keyLength, 2124 &nodeAndKey.keyIndex); 2125 if (status != B_OK) 2126 RETURN_ERROR(status); 2127 2128 // Is this a duplicate entry? 2129 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 2130 node->Values()[nodeAndKey.keyIndex]))) { 2131 if (fAllowDuplicates) { 2132 return _RemoveDuplicate(transaction, node, cached, 2133 nodeAndKey.keyIndex, value); 2134 } 2135 2136 FATAL(("duplicate node found where no duplicates are " 2137 "allowed, inode %" B_PRIdOFF "!\n", fStream->ID())); 2138 RETURN_ERROR(B_ERROR); 2139 } else { 2140 if (node->Values()[nodeAndKey.keyIndex] != value) 2141 return B_ENTRY_NOT_FOUND; 2142 2143 // If we remove the last key, the iterators will be set 2144 // to the next node after the current one - if there aren't any 2145 // more nodes, we need a way to prevent the TreeIterators from 2146 // touching the old node again; we use BPLUSTREE_FREE for this 2147 off_t next = node->RightLink() == BPLUSTREE_NULL 2148 ? BPLUSTREE_FREE : node->RightLink(); 2149 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 2150 ?
next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 2151 } 2152 } 2153 2154 bplustree_node* writableNode = cached.MakeWritable(transaction); 2155 if (writableNode == NULL) 2156 return B_IO_ERROR; 2157 2158 // if it's an empty root node, we have to convert it 2159 // to a leaf node by dropping the overflow link, or, 2160 // if it's already a leaf node, just empty it 2161 if (nodeAndKey.nodeOffset == fHeader.RootNode() 2162 && (node->NumKeys() == 0 2163 || (node->NumKeys() == 1 && node->IsLeaf()))) { 2164 writableNode->overflow_link 2165 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2166 writableNode->all_key_count = 0; 2167 writableNode->all_key_length = 0; 2168 2169 // if we've made a leaf node out of the root node, we need 2170 // to reset the maximum number of levels in the header 2171 if (fHeader.MaxNumberOfLevels() != 1) { 2172 CachedNode cached(this); 2173 bplustree_header* header 2174 = cached.SetToWritableHeader(transaction); 2175 if (header == NULL) 2176 return B_IO_ERROR; 2177 2178 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 2179 } 2180 return B_OK; 2181 } 2182 2183 // if there is only one key left, we don't have to remove 2184 // it, we can just dump the node (index nodes still have 2185 // the overflow link, so we have to drop the last key) 2186 if (writableNode->NumKeys() > 1 2187 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { 2188 _RemoveKey(writableNode, nodeAndKey.keyIndex); 2189 return B_OK; 2190 } 2191 2192 // when we are here, we can just free the node, but 2193 // we have to update the right/left link of the 2194 // siblings first 2195 CachedNode otherCached(this); 2196 bplustree_node* other = otherCached.SetToWritable(transaction, 2197 writableNode->LeftLink()); 2198 if (other != NULL) 2199 other->right_link = writableNode->right_link; 2200 2201 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) 2202 != NULL) { 2203 other->left_link = writableNode->left_link; 2204 } 2205 2206 cached.Free(transaction, nodeAndKey.nodeOffset); 2207 } 2208 RETURN_ERROR(B_ERROR); 2209 } 2210 2211 2212 /*! Replaces the value for the key in the tree. 2213 Returns B_OK if the key could be found and its value replaced, 2214 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 2215 to indicate that something went terribly wrong. 2216 Note that this doesn't work with duplicates - it will just 2217 return B_BAD_TYPE if you call this function on a tree where 2218 duplicates are allowed. 2219 You need to have the inode write locked. 
2220 */ 2221 status_t 2222 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, 2223 off_t value) 2224 { 2225 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2226 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2227 || key == NULL) 2228 RETURN_ERROR(B_BAD_VALUE); 2229 2230 if (fAllowDuplicates) 2231 RETURN_ERROR(B_BAD_TYPE); 2232 2233 ASSERT_WRITE_LOCKED_INODE(fStream); 2234 2235 off_t nodeOffset = fHeader.RootNode(); 2236 CachedNode cached(this); 2237 const bplustree_node* node; 2238 2239 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2240 uint16 keyIndex = 0; 2241 off_t nextOffset; 2242 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2243 &nextOffset); 2244 2245 if (node->OverflowLink() == BPLUSTREE_NULL) { 2246 if (status == B_OK) { 2247 bplustree_node* writableNode = cached.MakeWritable(transaction); 2248 if (writableNode != NULL) { 2249 writableNode->Values()[keyIndex] 2250 = HOST_ENDIAN_TO_BFS_INT64(value); 2251 } else 2252 status = B_IO_ERROR; 2253 } 2254 2255 return status; 2256 } else if (nextOffset == nodeOffset) 2257 RETURN_ERROR(B_ERROR); 2258 2259 nodeOffset = nextOffset; 2260 } 2261 RETURN_ERROR(B_ERROR); 2262 } 2263 #endif // !_BOOT_MODE 2264 2265 2266 /*! Searches the key in the tree, and stores the offset found in _value, 2267 if successful. 2268 It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack 2269 while it descends the tree. 2270 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not. 2271 It can also return other errors to indicate that something went wrong. 2272 Note that this doesn't work with duplicates - it will just return 2273 B_BAD_TYPE if you call this function on a tree where duplicates are 2274 allowed. 2275 You need to have the inode read or write locked. 2276 */ 2277 status_t 2278 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value) 2279 { 2280 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2281 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2282 || key == NULL) 2283 RETURN_ERROR(B_BAD_VALUE); 2284 2285 if (fAllowDuplicates) 2286 RETURN_ERROR(B_BAD_TYPE); 2287 2288 #if !_BOOT_MODE 2289 ASSERT_READ_LOCKED_INODE(fStream); 2290 #endif 2291 2292 off_t nodeOffset = fHeader.RootNode(); 2293 CachedNode cached(this); 2294 const bplustree_node* node; 2295 2296 #ifdef DEBUG 2297 int32 levels = 0; 2298 #endif 2299 2300 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2301 uint16 keyIndex = 0; 2302 off_t nextOffset; 2303 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2304 &nextOffset); 2305 2306 #ifdef DEBUG 2307 levels++; 2308 #endif 2309 if (node->OverflowLink() == BPLUSTREE_NULL) { 2310 if (status == B_OK && _value != NULL) 2311 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 2312 2313 #ifdef DEBUG 2314 if (levels != (int32)fHeader.MaxNumberOfLevels()) 2315 DEBUGGER(("levels don't match")); 2316 #endif 2317 return status; 2318 } else if (nextOffset == nodeOffset) 2319 RETURN_ERROR(B_ERROR); 2320 2321 nodeOffset = nextOffset; 2322 } 2323 FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %" 2324 B_PRIdOFF "\n", nodeOffset, fStream->ID())); 2325 RETURN_ERROR(B_ERROR); 2326 } 2327 2328 2329 #if !_BOOT_MODE 2330 status_t 2331 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset, 2332 const uint8* largestKey, uint16 largestKeyLength, 2333 const bplustree_node* parent) 2334 { 2335 if (parent->CheckIntegrity(fNodeSize) != B_OK) { 2336 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check " 2337 "failed!\n", fStream->ID(), offset); 2338 
check.FoundError(); 2339 return B_OK; 2340 } 2341 if (level >= fHeader.MaxNumberOfLevels()) { 2342 dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF 2343 "!\n", fStream->ID(), offset); 2344 check.FoundError(); 2345 return B_OK; 2346 } 2347 2348 check.SetLevel(level); 2349 2350 if (check.Visited(offset)) { 2351 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n", 2352 fStream->ID(), offset); 2353 check.FoundError(); 2354 return B_OK; 2355 } 2356 2357 check.SetVisited(offset); 2358 2359 uint32 count = parent->NumKeys(); 2360 off_t* values = parent->Values(); 2361 off_t lastOffset = check.PreviousOffset(level); 2362 CachedNode cached(this); 2363 2364 for (uint32 i = 0; i < count; i++) { 2365 uint16 keyLength; 2366 uint8* key = parent->KeyAt(i, &keyLength); 2367 if (largestKey != NULL) { 2368 int result = _CompareKeys(key, keyLength, largestKey, 2369 largestKeyLength); 2370 if (result >= 0) { 2371 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %" 2372 B_PRIu32 " larger than it should!\n", 2373 fStream->ID(), offset, i); 2374 check.FoundError(); 2375 } 2376 } 2377 2378 off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]); 2379 if (bplustree_node::IsDuplicate(childOffset)) { 2380 // Walk the duplicate nodes 2381 off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset); 2382 off_t lastDuplicateOffset = BPLUSTREE_NULL; 2383 2384 while (duplicateOffset != BPLUSTREE_NULL) { 2385 const bplustree_node* node; 2386 status_t status = cached.SetTo(duplicateOffset, &node, false); 2387 if (status != B_OK) { 2388 if (status == B_IO_ERROR) 2389 return B_IO_ERROR; 2390 2391 dprintf("inode %" B_PRIdOFF ": duplicate node at %" 2392 B_PRIdOFF " could not be read: %s\n", fStream->ID(), 2393 duplicateOffset, strerror(status)); 2394 check.FoundError(); 2395 break; 2396 } 2397 2398 bool isFragmentNode = bplustree_node::LinkType(childOffset) 2399 == BPLUSTREE_DUPLICATE_FRAGMENT; 2400 bool isKnownFragment = isFragmentNode 2401 && check.VisitedFragment(duplicateOffset); 2402 2403 if (!isKnownFragment && check.Visited(duplicateOffset)) { 2404 dprintf("inode %" B_PRIdOFF ": %s node at %" 2405 B_PRIdOFF " already visited, referenced from %" 2406 B_PRIdOFF "!\n", fStream->ID(), 2407 isFragmentNode ? 
"fragment" : "duplicate", 2408 duplicateOffset, offset); 2409 check.FoundError(); 2410 break; 2411 } 2412 2413 // Fragment nodes may be visited more than once from different 2414 // places 2415 if (!check.Visited(duplicateOffset)) 2416 check.SetVisited(duplicateOffset); 2417 if (!isKnownFragment && isFragmentNode) 2418 check.SetVisitedFragment(duplicateOffset); 2419 2420 duplicate_array* array; 2421 int32 minSize; 2422 int32 maxSize; 2423 if (isFragmentNode) { 2424 array = node->FragmentAt( 2425 bplustree_node::FragmentIndex(childOffset)); 2426 minSize = 2; 2427 maxSize = NUM_FRAGMENT_VALUES; 2428 } else { 2429 array = node->DuplicateArray(); 2430 minSize = 1; 2431 maxSize = NUM_DUPLICATE_VALUES; 2432 } 2433 int32 arrayCount = array->Count(); 2434 2435 if (arrayCount < minSize || arrayCount > maxSize) { 2436 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2437 " has invalid array size %" B_PRId32 "!\n", 2438 fStream->ID(), duplicateOffset, arrayCount); 2439 check.FoundError(); 2440 } else { 2441 // Simple check if the values in the array may be valid 2442 for (int32 j = 0; j < arrayCount; j++) { 2443 if (!fStream->GetVolume()->IsValidInodeBlock( 2444 array->ValueAt(j))) { 2445 dprintf("inode %" B_PRIdOFF ": duplicate at %" 2446 B_PRIdOFF " contains invalid block %" B_PRIdOFF 2447 " at %" B_PRId32 "!\n", fStream->ID(), 2448 duplicateOffset, array->ValueAt(j), j); 2449 check.FoundError(); 2450 break; 2451 } 2452 } 2453 } 2454 2455 // A fragment node is not linked (and does not have valid links) 2456 if (isFragmentNode) 2457 break; 2458 2459 if (node->LeftLink() != lastDuplicateOffset) { 2460 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2461 " has wrong left link %" B_PRIdOFF ", expected %" 2462 B_PRIdOFF "!\n", fStream->ID(), duplicateOffset, 2463 node->LeftLink(), lastDuplicateOffset); 2464 check.FoundError(); 2465 } 2466 2467 lastDuplicateOffset = duplicateOffset; 2468 duplicateOffset = node->RightLink(); 2469 } 2470 } else if (!parent->IsLeaf()) { 2471 // Test a regular child node recursively 2472 off_t nextOffset = parent->OverflowLink(); 2473 if (i < count - 1) 2474 nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]); 2475 2476 if (i == 0 && lastOffset != BPLUSTREE_NULL) { 2477 // Test right link of the previous node 2478 const bplustree_node* previous = cached.SetTo(lastOffset, true); 2479 if (previous == NULL) 2480 return B_IO_ERROR; 2481 2482 if (previous->RightLink() != childOffset) { 2483 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2484 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2485 "!\n", fStream->ID(), lastOffset, previous->RightLink(), 2486 childOffset); 2487 check.FoundError(); 2488 } 2489 } 2490 2491 status_t status = _ValidateChild(check, cached, level, childOffset, 2492 lastOffset, nextOffset, key, keyLength); 2493 if (status != B_OK) 2494 return status; 2495 } else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) { 2496 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains " 2497 "invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n", 2498 fStream->ID(), offset, childOffset, i); 2499 check.FoundError(); 2500 } 2501 2502 lastOffset = childOffset; 2503 } 2504 2505 if (parent->OverflowLink() != BPLUSTREE_NULL) { 2506 off_t childOffset = parent->OverflowLink(); 2507 status_t status = _ValidateChild(check, cached, level, childOffset, 2508 lastOffset, 0, NULL, 0); 2509 if (status != B_OK) 2510 return status; 2511 2512 lastOffset = childOffset; 2513 } 2514 2515 check.SetPreviousOffset(level, lastOffset); 2516 
return B_OK; 2517 } 2518 2519 2520 status_t 2521 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level, 2522 off_t offset, off_t lastOffset, off_t nextOffset, 2523 const uint8* key, uint16 keyLength) 2524 { 2525 const bplustree_node* node; 2526 status_t status = cached.SetTo(offset, &node, true); 2527 if (status != B_OK) { 2528 if (status == B_IO_ERROR) 2529 return B_IO_ERROR; 2530 2531 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be " 2532 "read: %s\n", fStream->ID(), offset, strerror(status)); 2533 check.FoundError(); 2534 return B_OK; 2535 } 2536 2537 if (node->LeftLink() != lastOffset) { 2538 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2539 "wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF 2540 "!\n", fStream->ID(), offset, node->LeftLink(), lastOffset); 2541 check.FoundError(); 2542 } 2543 2544 if (nextOffset != 0 && node->RightLink() != nextOffset) { 2545 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2546 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2547 "!\n", fStream->ID(), offset, node->RightLink(), nextOffset); 2548 check.FoundError(); 2549 } 2550 2551 return _ValidateChildren(check, level + 1, offset, key, keyLength, node); 2552 } 2553 #endif // !_BOOT_MODE 2554 2555 2556 // #pragma mark - 2557 2558 2559 TreeIterator::TreeIterator(BPlusTree* tree) 2560 : 2561 fTree(tree), 2562 fCurrentNodeOffset(BPLUSTREE_NULL) 2563 { 2564 #if !_BOOT_MODE 2565 tree->_AddIterator(this); 2566 #endif 2567 } 2568 2569 2570 TreeIterator::~TreeIterator() 2571 { 2572 #if !_BOOT_MODE 2573 if (fTree) 2574 fTree->_RemoveIterator(this); 2575 #endif 2576 } 2577 2578 2579 status_t 2580 TreeIterator::Goto(int8 to) 2581 { 2582 if (fTree == NULL || fTree->fStream == NULL) 2583 RETURN_ERROR(B_BAD_VALUE); 2584 2585 #if !_BOOT_MODE 2586 // lock access to stream 2587 InodeReadLocker locker(fTree->fStream); 2588 #endif 2589 2590 off_t nodeOffset = fTree->fHeader.RootNode(); 2591 CachedNode cached(fTree); 2592 const bplustree_node* node; 2593 2594 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2595 // is the node a leaf node? 2596 if (node->OverflowLink() == BPLUSTREE_NULL) { 2597 fCurrentNodeOffset = nodeOffset; 2598 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 2599 fDuplicateNode = BPLUSTREE_NULL; 2600 2601 return B_OK; 2602 } 2603 2604 // get the next node offset depending on the direction (and if there 2605 // are any keys in that node at all) 2606 off_t nextOffset; 2607 if (to == BPLUSTREE_END || node->all_key_count == 0) 2608 nextOffset = node->OverflowLink(); 2609 else { 2610 if (node->AllKeyLength() > fTree->fNodeSize 2611 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 2612 - 8 * node->NumKeys()) 2613 RETURN_ERROR(B_ERROR); 2614 2615 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 2616 } 2617 if (nextOffset == nodeOffset) 2618 break; 2619 2620 nodeOffset = nextOffset; 2621 } 2622 FATAL(("%s fails\n", __FUNCTION__)); 2623 2624 RETURN_ERROR(B_ERROR); 2625 } 2626 2627 2628 /*! Iterates through the tree in the specified direction. 2629 When it iterates through duplicates, the "key" is only updated for the 2630 first entry - if you need to know when this happens, use the "duplicate" 2631 parameter which is 0 for no duplicate, 1 for the first, and 2 for all 2632 the other duplicates. 2633 That's not too nice, but saves the 256 bytes that would be needed to 2634 store the last key - if this will ever become an issue, it will be 2635 easy to change. 
2636 The other advantage of this is, that the queries can skip all duplicates 2637 at once when they are not relevant to them. 2638 */ 2639 status_t 2640 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength, 2641 uint16 maxLength, off_t* value, uint16* duplicate) 2642 { 2643 if (fTree == NULL) 2644 return B_INTERRUPTED; 2645 2646 bool forward = direction == BPLUSTREE_FORWARD; 2647 2648 if (fCurrentNodeOffset == BPLUSTREE_NULL 2649 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK) 2650 RETURN_ERROR(B_ERROR); 2651 2652 // if the tree was emptied since the last call 2653 if (fCurrentNodeOffset == BPLUSTREE_FREE) 2654 return B_ENTRY_NOT_FOUND; 2655 2656 #if !_BOOT_MODE 2657 // lock access to stream 2658 InodeReadLocker locker(fTree->fStream); 2659 #endif 2660 2661 CachedNode cached(fTree); 2662 const bplustree_node* node; 2663 2664 if (fDuplicateNode != BPLUSTREE_NULL) { 2665 // regardless of traverse direction the duplicates are always presented 2666 // in the same order; since they are all considered as equal, this 2667 // shouldn't cause any problems 2668 2669 if (!fIsFragment || fDuplicate < fNumDuplicates) { 2670 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2671 false); 2672 } else 2673 node = NULL; 2674 2675 if (node != NULL) { 2676 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 2677 // If the node is out of duplicates, we go directly to the next 2678 // one 2679 fDuplicateNode = node->RightLink(); 2680 if (fDuplicateNode != BPLUSTREE_NULL 2681 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 2682 fNumDuplicates = node->CountDuplicates(fDuplicateNode, 2683 false); 2684 fDuplicate = 0; 2685 } 2686 } 2687 if (fDuplicate < fNumDuplicates) { 2688 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, 2689 fDuplicate++); 2690 if (duplicate) 2691 *duplicate = 2; 2692 return B_OK; 2693 } 2694 } 2695 fDuplicateNode = BPLUSTREE_NULL; 2696 } 2697 2698 off_t savedNodeOffset = fCurrentNodeOffset; 2699 int32 savedKey = fCurrentKey; 2700 2701 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 2702 RETURN_ERROR(B_ERROR); 2703 2704 if (duplicate) 2705 *duplicate = 0; 2706 2707 fCurrentKey += direction; 2708 2709 // is the current key in the current node? 2710 while ((forward && fCurrentKey >= node->NumKeys()) 2711 || (!forward && fCurrentKey < 0)) { 2712 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2713 2714 // are there any more nodes? 2715 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2716 node = cached.SetTo(fCurrentNodeOffset); 2717 if (!node) 2718 RETURN_ERROR(B_ERROR); 2719 2720 // reset current key 2721 fCurrentKey = forward ? 0 : node->NumKeys() - 1; 2722 } else { 2723 // there are no nodes left, so turn back to the last key 2724 fCurrentNodeOffset = savedNodeOffset; 2725 fCurrentKey = savedKey; 2726 2727 return B_ENTRY_NOT_FOUND; 2728 } 2729 } 2730 2731 if (node->all_key_count == 0) 2732 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2733 2734 uint16 length; 2735 uint8* keyStart = node->KeyAt(fCurrentKey, &length); 2736 if (keyStart + length + sizeof(off_t) + sizeof(uint16) 2737 > (uint8*)node + fTree->fNodeSize 2738 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2739 #if !_BOOT_MODE 2740 fTree->fStream->GetVolume()->Panic(); 2741 #endif 2742 RETURN_ERROR(B_BAD_DATA); 2743 } 2744 2745 // include the termination for string types 2746 bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE; 2747 if (length + (needsTermination ? 
1 : 0) > maxLength) { 2748 // the buffer is too small, restore the last key and return an error 2749 fCurrentNodeOffset = savedNodeOffset; 2750 fCurrentKey = savedKey; 2751 return B_BUFFER_OVERFLOW; 2752 } 2753 2754 memcpy(key, keyStart, length); 2755 2756 if (needsTermination) 2757 ((char*)key)[length] = '\0'; 2758 2759 *keyLength = length; 2760 2761 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2762 2763 // duplicate fragments? 2764 uint8 type = bplustree_node::LinkType(offset); 2765 if (type == BPLUSTREE_DUPLICATE_FRAGMENT 2766 || type == BPLUSTREE_DUPLICATE_NODE) { 2767 fDuplicateNode = offset; 2768 2769 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2770 false); 2771 if (node == NULL) 2772 RETURN_ERROR(B_ERROR); 2773 2774 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 2775 2776 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 2777 if (fNumDuplicates) { 2778 offset = node->DuplicateAt(offset, fIsFragment, 0); 2779 fDuplicate = 1; 2780 if (duplicate) 2781 *duplicate = 1; 2782 } else { 2783 // Shouldn't happen, but we're dealing here with potentially 2784 // corrupt disks... 2785 fDuplicateNode = BPLUSTREE_NULL; 2786 offset = 0; 2787 } 2788 } 2789 *value = offset; 2790 2791 return B_OK; 2792 } 2793 2794 2795 /*! This is more or less a copy of BPlusTree::Find() - but it just 2796 sets the current position in the iterator, regardless of if the 2797 key could be found or not. 2798 */ 2799 status_t 2800 TreeIterator::Find(const uint8* key, uint16 keyLength) 2801 { 2802 if (fTree == NULL) 2803 return B_INTERRUPTED; 2804 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2805 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2806 || key == NULL) 2807 RETURN_ERROR(B_BAD_VALUE); 2808 2809 #if !_BOOT_MODE 2810 // lock access to stream 2811 InodeReadLocker locker(fTree->fStream); 2812 #endif 2813 2814 off_t nodeOffset = fTree->fHeader.RootNode(); 2815 2816 CachedNode cached(fTree); 2817 const bplustree_node* node; 2818 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2819 uint16 keyIndex = 0; 2820 off_t nextOffset; 2821 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, 2822 &nextOffset); 2823 2824 if (node->OverflowLink() == BPLUSTREE_NULL) { 2825 fCurrentNodeOffset = nodeOffset; 2826 fCurrentKey = keyIndex - 1; 2827 fDuplicateNode = BPLUSTREE_NULL; 2828 2829 return status; 2830 } else if (nextOffset == nodeOffset) 2831 RETURN_ERROR(B_ERROR); 2832 2833 nodeOffset = nextOffset; 2834 } 2835 RETURN_ERROR(B_ERROR); 2836 } 2837 2838 2839 void 2840 TreeIterator::SkipDuplicates() 2841 { 2842 fDuplicateNode = BPLUSTREE_NULL; 2843 } 2844 2845 2846 void 2847 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, 2848 uint16 splitAt, int8 change) 2849 { 2850 if (offset != fCurrentNodeOffset) 2851 return; 2852 2853 if (nextOffset != BPLUSTREE_NULL) { 2854 fCurrentNodeOffset = nextOffset; 2855 if (splitAt <= fCurrentKey) { 2856 fCurrentKey -= splitAt; 2857 keyIndex -= splitAt; 2858 } 2859 } 2860 2861 // Adjust fCurrentKey to point to the same key as before. 2862 // Note, that if a key is inserted at the current position 2863 // it won't be included in this tree transition. 2864 if (keyIndex <= fCurrentKey) 2865 fCurrentKey += change; 2866 2867 // TODO: duplicate handling! 
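	// Note: only fCurrentNodeOffset and fCurrentKey are adjusted here; an
	// iterator that is currently positioned inside a duplicate chain keeps
	// its old fDuplicateNode/fDuplicate state.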
2868 } 2869 2870 2871 void 2872 TreeIterator::Stop() 2873 { 2874 fTree = NULL; 2875 } 2876 2877 2878 #ifdef DEBUG 2879 void 2880 TreeIterator::Dump() 2881 { 2882 __out("TreeIterator at %p:\n", this); 2883 __out("\tfTree = %p\n", fTree); 2884 __out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset); 2885 __out("\tfCurrentKey = %d\n", (int)fCurrentKey); 2886 __out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n", 2887 bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, 2888 fDuplicateNode); 2889 __out("\tfDuplicate = %u\n", fDuplicate); 2890 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2891 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); 2892 } 2893 #endif 2894 2895 2896 // #pragma mark - 2897 2898 2899 bool 2900 bplustree_header::IsValid() const 2901 { 2902 return Magic() == BPLUSTREE_MAGIC 2903 && (RootNode() % NodeSize()) == 0 2904 && IsValidLink(RootNode()) 2905 && IsValidLink(FreeNode()); 2906 } 2907 2908 2909 // #pragma mark - 2910 2911 2912 void 2913 bplustree_node::Initialize() 2914 { 2915 left_link = right_link = overflow_link 2916 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2917 all_key_count = 0; 2918 all_key_length = 0; 2919 } 2920 2921 2922 uint8* 2923 bplustree_node::KeyAt(int32 index, uint16* keyLength) const 2924 { 2925 if (index < 0 || index > NumKeys()) 2926 return NULL; 2927 2928 uint8* keyStart = Keys(); 2929 uint16* keyLengths = KeyLengths(); 2930 2931 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2932 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2933 if (index > 0) 2934 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2935 2936 return keyStart; 2937 } 2938 2939 2940 uint8 2941 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2942 { 2943 // the duplicate fragment handling is currently hard-coded to a node size 2944 // of 1024 bytes - with future versions of BFS, this may be a problem 2945 2946 if (isFragment) { 2947 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2948 2949 return ((off_t*)this)[fragment]; 2950 } 2951 return OverflowLink(); 2952 } 2953 2954 2955 off_t 2956 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2957 { 2958 uint32 start; 2959 if (isFragment) 2960 start = 8 * ((uint64)offset & 0x3ff); 2961 else 2962 start = 2; 2963 2964 return ((off_t*)this)[start + 1 + index]; 2965 } 2966 2967 2968 /*! Although the name suggests it, this function doesn't return the real 2969 used fragment count; at least, it can only count to two: it returns 2970 0, if there is no fragment used, 1 if there is only one fragment 2971 used, and 2 if there are at least 2 fragments used. 
2972 */ 2973 uint32 2974 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2975 { 2976 uint32 used = 0; 2977 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2978 duplicate_array* array = FragmentAt(i); 2979 if (array->Count() > 0 && ++used > 1) 2980 return used; 2981 } 2982 return used; 2983 } 2984 2985 2986 status_t 2987 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2988 { 2989 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2990 DEBUGGER(("invalid node: key/length count")); 2991 2992 for (int32 i = 0; i < NumKeys(); i++) { 2993 uint16 length; 2994 uint8* key = KeyAt(i, &length); 2995 if (key + length + sizeof(off_t) + sizeof(uint16) 2996 > (uint8*)this + nodeSize 2997 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2998 dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i); 2999 return B_BAD_DATA; 3000 } 3001 if (Values()[i] == -1) { 3002 dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values " 3003 "corrupted\n", this, (int)i, Values()[i]); 3004 return B_BAD_DATA; 3005 } 3006 } 3007 return B_OK; 3008 } 3009 3010 3011 // #pragma mark - 3012 3013 3014 #if !_BOOT_MODE 3015 BitmapArray::BitmapArray(size_t numBits) 3016 { 3017 fSize = (numBits + 7) / 8; 3018 fBitmap = (uint8*)calloc(fSize, 1); 3019 fCountSet = 0; 3020 } 3021 3022 3023 BitmapArray::~BitmapArray() 3024 { 3025 free(fBitmap); 3026 } 3027 3028 3029 status_t 3030 BitmapArray::InitCheck() const 3031 { 3032 return fBitmap != NULL ? B_OK : B_NO_MEMORY; 3033 } 3034 3035 3036 bool 3037 BitmapArray::IsSet(size_t index) const 3038 { 3039 uint32 byteIndex = index / 8; 3040 if (byteIndex >= fSize) 3041 return false; 3042 3043 return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0; 3044 } 3045 3046 3047 void 3048 BitmapArray::Set(size_t index, bool set) 3049 { 3050 uint32 byteIndex = index / 8; 3051 if (byteIndex >= fSize) 3052 return; 3053 3054 if (set) { 3055 fBitmap[byteIndex] |= 1UL << (index & 0x7); 3056 fCountSet++; 3057 } else { 3058 fBitmap[byteIndex] &= ~(1UL << (index & 0x7)); 3059 fCountSet--; 3060 } 3061 } 3062 #endif // !_BOOT_MODE 3063 3064 3065 // #pragma mark - 3066 3067 3068 bool 3069 duplicate_array::_FindInternal(off_t value, int32& index) const 3070 { 3071 int32 min = 0, max = Count() - 1; 3072 off_t cmp; 3073 while (min <= max) { 3074 index = (min + max) / 2; 3075 3076 cmp = ValueAt(index) - value; 3077 if (cmp < 0) 3078 min = index + 1; 3079 else if (cmp > 0) 3080 max = index - 1; 3081 else 3082 return true; 3083 } 3084 return false; 3085 } 3086 3087 3088 void 3089 duplicate_array::Insert(off_t value) 3090 { 3091 // if there are more than 8 values in this array, use a 3092 // binary search, if not, just iterate linearly to find 3093 // the insertion point 3094 int32 size = Count(); 3095 int32 i = 0; 3096 if (size > 8 ) { 3097 if (!_FindInternal(value, i) && ValueAt(i) <= value) 3098 i++; 3099 } else { 3100 for (i = 0; i < size; i++) { 3101 if (ValueAt(i) > value) 3102 break; 3103 } 3104 } 3105 3106 memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t)); 3107 values[i] = HOST_ENDIAN_TO_BFS_INT64(value); 3108 count = HOST_ENDIAN_TO_BFS_INT64(size + 1); 3109 } 3110 3111 3112 bool 3113 duplicate_array::Remove(off_t value) 3114 { 3115 int32 index = Find(value); 3116 if (index == -1) 3117 return false; 3118 3119 int32 newSize = Count() - 1; 3120 memmove(&values[index], &values[index + 1], 3121 (newSize - index) * sizeof(off_t)); 3122 count = HOST_ENDIAN_TO_BFS_INT64(newSize); 3123 3124 return true; 3125 } 3126 3127 3128 #if _BOOT_MODE 3129 } // namespace BFS 3130 #endif 3131
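// The read API above (BPlusTree::Find() and TreeIterator::Traverse()) is
// easiest to see with a caller in front of it. The block below is an
// illustrative sketch only and is never compiled ("#if 0"): it assumes the
// caller already holds the required inode lock, that the tree uses string
// keys (as a directory index does), and the function name is hypothetical.
#if 0
static void
ExampleTreeLookup(BPlusTree* tree)
{
	// direct lookup of a single key (only valid for trees without duplicates)
	off_t value;
	if (tree->Find((const uint8*)"hello", 5, &value) == B_OK)
		dprintf("found \"hello\" -> %" B_PRIdOFF "\n", value);

	// iterate over all keys in ascending order, duplicates included
	TreeIterator iterator(tree);
	char name[BPLUSTREE_MAX_KEY_LENGTH + 1];
	uint16 length;
	while (iterator.Traverse(BPLUSTREE_FORWARD, name, &length, sizeof(name),
			&value, NULL) == B_OK)
		dprintf("key \"%s\" -> %" B_PRIdOFF "\n", name, value);
}
#endif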