1 /* 2 * Copyright 2001-2015, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 * 5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares 6 * no code but achieves binary compatibility with the on disk format. 7 */ 8 9 10 //! B+Tree implementation 11 12 13 #include "BPlusTree.h" 14 15 #include <file_systems/QueryParserUtils.h> 16 17 #include "Debug.h" 18 #include "Utility.h" 19 20 #if !_BOOT_MODE 21 # include "Inode.h" 22 #else 23 # include "Stream.h" 24 25 // BFS::Stream from the bootloader has the same API as Inode. 26 # define Inode BFS::Stream 27 28 # define strerror(x) "error message unavailable" 29 30 namespace BFS { 31 #endif 32 33 34 /*! Simple array used for the duplicate handling in the B+Tree. This is an 35 on disk structure. 36 */ 37 struct duplicate_array { 38 off_t count; 39 off_t values[0]; 40 41 inline bool IsEmpty() const 42 { 43 return count == 0; 44 } 45 46 inline int32 Count() const 47 { 48 return (int32)BFS_ENDIAN_TO_HOST_INT64(count); 49 } 50 51 inline off_t ValueAt(uint32 index) const 52 { 53 return BFS_ENDIAN_TO_HOST_INT64(values[index]); 54 } 55 56 inline void SetValueAt(uint32 index, off_t value) 57 { 58 values[index] = HOST_ENDIAN_TO_BFS_INT64(value); 59 } 60 61 inline int32 Find(off_t value) const 62 { 63 int32 i; 64 return _FindInternal(value, i) ? i : -1; 65 } 66 67 void Insert(off_t value); 68 bool Remove(off_t value); 69 70 private: 71 bool _FindInternal(off_t value, int32& index) const; 72 } _PACKED; 73 74 75 #ifdef DEBUG 76 class NodeChecker { 77 public: 78 NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text) 79 : 80 fNode(node), 81 fSize(nodeSize), 82 fText(text) 83 { 84 Check("integrity check failed on construction."); 85 } 86 87 ~NodeChecker() 88 { 89 Check("integrity check failed on destruction."); 90 } 91 92 void 93 Check(const char* message) 94 { 95 if (fNode->CheckIntegrity(fSize) != B_OK) { 96 dprintf("%s: %s\n", fText, message); 97 DEBUGGER(("NodeChecker integrity check failed!")); 98 } 99 } 100 101 private: 102 const bplustree_node* fNode; 103 int32 fSize; 104 const char* fText; 105 }; 106 #endif // DEBUG 107 108 109 #if !_BOOT_MODE 110 class BitmapArray { 111 public: 112 BitmapArray(size_t numBits); 113 ~BitmapArray(); 114 115 status_t InitCheck() const; 116 117 bool IsSet(size_t index) const; 118 void Set(size_t index, bool set); 119 120 size_t CountSet() const { return fCountSet; } 121 122 private: 123 uint8* fBitmap; 124 size_t fSize; 125 size_t fCountSet; 126 }; 127 128 129 struct TreeCheck { 130 TreeCheck(BPlusTree* tree) 131 : 132 fLevelCount(0), 133 fFreeCount(0), 134 fNodeSize(tree->NodeSize()), 135 fMaxLevels(tree->fHeader.MaxNumberOfLevels()), 136 fFoundErrors(0), 137 fVisited(tree->Stream()->Size() / tree->NodeSize()), 138 fVisitedFragment(tree->Stream()->Size() / tree->NodeSize()) 139 { 140 fPreviousOffsets = (off_t*)malloc( 141 sizeof(off_t) * tree->fHeader.MaxNumberOfLevels()); 142 if (fPreviousOffsets != NULL) { 143 for (size_t i = 0; i < fMaxLevels; i++) 144 fPreviousOffsets[i] = BPLUSTREE_NULL; 145 } 146 } 147 148 ~TreeCheck() 149 { 150 free(fPreviousOffsets); 151 } 152 153 status_t InitCheck() const 154 { 155 if (fPreviousOffsets == NULL) 156 return B_NO_MEMORY; 157 158 status_t status = fVisited.InitCheck(); 159 if (status != B_OK) 160 return status; 161 162 return fVisitedFragment.InitCheck(); 163 } 164 165 bool Visited(off_t offset) const 166 { 167 return fVisited.IsSet(offset / fNodeSize); 168 } 169 170 void SetVisited(off_t offset) 171 { 172 fVisited.Set(offset / fNodeSize, true); 173 } 174 175 size_t VisitedCount() const 176 { 177 return fVisited.CountSet(); 178 } 179 180 bool VisitedFragment(off_t offset) const 181 { 182 return fVisitedFragment.IsSet(offset / fNodeSize); 183 } 184 185 void SetVisitedFragment(off_t offset) 186 { 187 fVisitedFragment.Set(offset / fNodeSize, true); 188 } 189 190 uint32 MaxLevels() const 191 { 192 return fLevelCount; 193 } 194 195 void SetLevel(uint32 level) 196 { 197 if (fLevelCount < level) 198 fLevelCount = level; 199 } 200 201 off_t PreviousOffset(uint32 level) 202 { 203 return fPreviousOffsets[level]; 204 } 205 206 void SetPreviousOffset(uint32 level, off_t offset) 207 { 208 fPreviousOffsets[level] = offset; 209 } 210 211 void FoundError() 212 { 213 fFoundErrors++; 214 } 215 216 bool ErrorsFound() 217 { 218 return fFoundErrors != 0; 219 } 220 221 private: 222 uint32 fLevelCount; 223 uint32 fFreeCount; 224 uint32 fNodeSize; 225 uint32 fMaxLevels; 226 uint32 fFoundErrors; 227 BitmapArray fVisited; 228 BitmapArray fVisitedFragment; 229 off_t* fPreviousOffsets; 230 }; 231 232 233 // #pragma mark - 234 235 236 // Node Caching for the BPlusTree class 237 // 238 // With write support, there is the need for a function that allocates new 239 // nodes by either returning empty nodes, or by growing the file's data stream 240 // 241 // !! The CachedNode class assumes that you have properly locked the stream 242 // !! before asking for nodes. 243 // 244 // Note: This code will fail if the block size is smaller than the node size! 245 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 246 // is hard-coded to 1024 bytes, that's not an issue now. 247 248 void 249 CachedNode::UnsetUnchanged(Transaction& transaction) 250 { 251 if (fTree == NULL || fTree->fStream == NULL) 252 return; 253 254 if (fNode != NULL) { 255 void* cache = fTree->fStream->GetVolume()->BlockCache(); 256 257 block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID()); 258 block_cache_put(cache, fBlockNumber); 259 fNode = NULL; 260 } 261 } 262 #endif // !_BOOT_MODE 263 264 265 void 266 CachedNode::Unset() 267 { 268 if (fTree == NULL || fTree->fStream == NULL) 269 return; 270 271 if (fNode != NULL) { 272 #if !_BOOT_MODE 273 if (fWritable && fOffset == 0) { 274 // The B+tree header has been updated - we need to update the 275 // BPlusTrees copy of it, as well. 276 memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header)); 277 } 278 279 block_cache_put(fTree->fStream->GetVolume()->BlockCache(), 280 fBlockNumber); 281 #endif // !_BOOT_MODE 282 283 fNode = NULL; 284 } 285 } 286 287 288 const bplustree_node* 289 CachedNode::SetTo(off_t offset, bool check) 290 { 291 const bplustree_node* node; 292 if (SetTo(offset, &node, check) == B_OK) 293 return node; 294 295 return NULL; 296 } 297 298 299 status_t 300 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check) 301 { 302 if (fTree == NULL || fTree->fStream == NULL) 303 RETURN_ERROR(B_BAD_VALUE); 304 305 Unset(); 306 307 // You can only ask for nodes at valid positions - you can't 308 // even access the b+tree header with this method (use SetToHeader() 309 // instead) 310 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize 311 || offset <= 0 312 || (offset % fTree->fNodeSize) != 0) { 313 RETURN_ERROR(B_BAD_VALUE); 314 } 315 316 if (InternalSetTo(NULL, offset) != NULL && check) { 317 // sanity checks (links, all_key_count) 318 if (!fTree->fHeader.CheckNode(fNode)) { 319 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" 320 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, 321 fBlockNumber, fTree->fStream->ID())); 322 return B_BAD_DATA; 323 } 324 } 325 326 *_node = fNode; 327 return B_OK; 328 } 329 330 331 #if !_BOOT_MODE 332 bplustree_node* 333 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check) 334 { 335 if (fTree == NULL || fTree->fStream == NULL) { 336 REPORT_ERROR(B_BAD_VALUE); 337 return NULL; 338 } 339 340 Unset(); 341 342 // You can only ask for nodes at valid positions - you can't 343 // even access the b+tree header with this method (use SetToHeader() 344 // instead) 345 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize 346 || offset <= 0 347 || (offset % fTree->fNodeSize) != 0) 348 return NULL; 349 350 if (InternalSetTo(&transaction, offset) != NULL && check) { 351 // sanity checks (links, all_key_count) 352 if (!fTree->fHeader.CheckNode(fNode)) { 353 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %" 354 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset, 355 fBlockNumber, fTree->fStream->ID())); 356 return NULL; 357 } 358 } 359 return fNode; 360 } 361 362 363 bplustree_node* 364 CachedNode::MakeWritable(Transaction& transaction) 365 { 366 if (fNode == NULL) 367 return NULL; 368 369 if (block_cache_make_writable(transaction.GetVolume()->BlockCache(), 370 fBlockNumber, transaction.ID()) == B_OK) { 371 return fNode; 372 } 373 374 return NULL; 375 } 376 #endif // !_BOOT_MODE 377 378 379 const bplustree_header* 380 CachedNode::SetToHeader() 381 { 382 if (fTree == NULL || fTree->fStream == NULL) { 383 REPORT_ERROR(B_BAD_VALUE); 384 return NULL; 385 } 386 387 Unset(); 388 389 InternalSetTo(NULL, 0LL); 390 return (bplustree_header*)fNode; 391 } 392 393 394 #if !_BOOT_MODE 395 bplustree_header* 396 CachedNode::SetToWritableHeader(Transaction& transaction) 397 { 398 if (fTree == NULL || fTree->fStream == NULL) { 399 REPORT_ERROR(B_BAD_VALUE); 400 return NULL; 401 } 402 403 Unset(); 404 405 InternalSetTo(&transaction, 0LL); 406 407 if (fNode != NULL && !fTree->fInTransaction) { 408 transaction.AddListener(fTree); 409 fTree->fInTransaction = true; 410 411 if (!transaction.GetVolume()->IsInitializing()) { 412 acquire_vnode(transaction.GetVolume()->FSVolume(), 413 fTree->fStream->ID()); 414 } 415 } 416 417 return (bplustree_header*)fNode; 418 } 419 #endif // !_BOOT_MODE 420 421 422 bplustree_node* 423 CachedNode::InternalSetTo(Transaction* transaction, off_t offset) 424 { 425 fNode = NULL; 426 fOffset = offset; 427 428 off_t fileOffset; 429 block_run run; 430 if (offset < fTree->fStream->Size() 431 && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) { 432 433 #if !_BOOT_MODE 434 Volume* volume = fTree->fStream->GetVolume(); 435 #else 436 Volume* volume = &fTree->fStream->GetVolume(); 437 #endif 438 439 int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); 440 fBlockNumber = volume->ToBlock(run) + blockOffset; 441 uint8* block = NULL; 442 443 #if !_BOOT_MODE 444 if (transaction != NULL) { 445 block = (uint8*)block_cache_get_writable(volume->BlockCache(), 446 fBlockNumber, transaction->ID()); 447 fWritable = true; 448 } else { 449 block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber); 450 fWritable = false; 451 } 452 #else // !_BOOT_MODE 453 if (fBlock == NULL) { 454 fBlock = (uint8*)malloc(volume->BlockSize()); 455 if (fBlock == NULL) 456 return NULL; 457 } 458 459 if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(), 460 fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) { 461 block = fBlock; 462 } 463 464 fWritable = false; 465 #endif // _BOOT_MODE 466 467 if (block != NULL) { 468 // The node is somewhere in that block... 469 // (confusing offset calculation) 470 fNode = (bplustree_node*)(block + offset 471 - (fileOffset + (blockOffset << volume->BlockShift()))); 472 } else 473 REPORT_ERROR(B_IO_ERROR); 474 } 475 return fNode; 476 } 477 478 479 #if !_BOOT_MODE 480 status_t 481 CachedNode::Free(Transaction& transaction, off_t offset) 482 { 483 if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL) 484 RETURN_ERROR(B_BAD_VALUE); 485 486 // TODO: scan the free nodes list and remove all nodes at the end 487 // of the tree - perhaps that shouldn't be done everytime that 488 // function is called, perhaps it should be done when the directory 489 // inode is closed or based on some calculation or whatever... 490 491 CachedNode cached(fTree); 492 bplustree_header* header = cached.SetToWritableHeader(transaction); 493 if (header == NULL) 494 return B_IO_ERROR; 495 496 #if 0 497 // TODO: temporarily disabled because CheckNode() doesn't like this... 498 // Also, it's such an edge case that it's almost useless, anyway. 499 // if the node is the last one in the tree, we shrink 500 // the tree and file size by one node 501 off_t lastOffset = header->MaximumSize() - fTree->fNodeSize; 502 if (offset == lastOffset) { 503 status_t status = fTree->fStream->SetFileSize(transaction, lastOffset); 504 if (status != B_OK) 505 return status; 506 507 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset); 508 return B_OK; 509 } 510 #endif 511 512 // add the node to the free nodes list 513 fNode->left_link = header->free_node_pointer; 514 fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 515 516 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset); 517 return B_OK; 518 } 519 520 521 status_t 522 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node, 523 off_t* _offset) 524 { 525 if (fTree == NULL || fTree->fStream == NULL) 526 RETURN_ERROR(B_BAD_VALUE); 527 528 Unset(); 529 530 bplustree_header* header; 531 status_t status; 532 533 // if there are any free nodes, recycle them 534 if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) { 535 CachedNode cached(fTree); 536 header = cached.SetToWritableHeader(transaction); 537 if (header == NULL) 538 return B_IO_ERROR; 539 540 *_offset = header->FreeNode(); 541 *_node = fNode; 542 543 // set new free node pointer 544 header->free_node_pointer = fNode->left_link; 545 fNode->Initialize(); 546 return B_OK; 547 } 548 549 // allocate space for a new node 550 Inode* stream = fTree->fStream; 551 if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK) 552 return status; 553 554 CachedNode cached(fTree); 555 header = cached.SetToWritableHeader(transaction); 556 if (header == NULL) 557 return B_IO_ERROR; 558 559 // the maximum_size has to be changed before the call to SetTo() - or 560 // else it will fail because the requested node is out of bounds 561 off_t offset = header->MaximumSize(); 562 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() 563 + fTree->fNodeSize); 564 565 cached.Unset(); 566 // SetToWritable() below needs the new values in the tree's header 567 568 if (SetToWritable(transaction, offset, false) == NULL) 569 RETURN_ERROR(B_ERROR); 570 571 fNode->Initialize(); 572 573 *_offset = offset; 574 *_node = fNode; 575 return B_OK; 576 } 577 578 579 // #pragma mark - 580 581 582 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize) 583 : 584 fStream(NULL), 585 fInTransaction(false) 586 { 587 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 588 SetTo(transaction, stream); 589 } 590 #endif // !_BOOT_MODE 591 592 593 BPlusTree::BPlusTree(Inode* stream) 594 : 595 fStream(NULL), 596 fInTransaction(false) 597 { 598 #if !_BOOT_MODE 599 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 600 #endif 601 602 SetTo(stream); 603 } 604 605 606 BPlusTree::BPlusTree() 607 : 608 fStream(NULL), 609 fNodeSize(BPLUSTREE_NODE_SIZE), 610 fAllowDuplicates(true), 611 fInTransaction(false), 612 fStatus(B_NO_INIT) 613 { 614 #if !_BOOT_MODE 615 mutex_init(&fIteratorLock, "bfs b+tree iterator"); 616 #endif 617 } 618 619 620 BPlusTree::~BPlusTree() 621 { 622 #if !_BOOT_MODE 623 // if there are any TreeIterators left, we need to stop them 624 // (can happen when the tree's inode gets deleted while 625 // traversing the tree - a TreeIterator doesn't lock the inode) 626 mutex_lock(&fIteratorLock); 627 628 SinglyLinkedList<TreeIterator>::Iterator iterator 629 = fIterators.GetIterator(); 630 while (iterator.HasNext()) 631 iterator.Next()->Stop(); 632 633 mutex_destroy(&fIteratorLock); 634 635 ASSERT(!fInTransaction); 636 #endif // !_BOOT_MODE 637 } 638 639 640 #if !_BOOT_MODE 641 /*! Create a new B+Tree on the specified stream */ 642 status_t 643 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize) 644 { 645 // initializes in-memory B+Tree 646 647 fStream = stream; 648 649 CachedNode cached(this); 650 bplustree_header* header = cached.SetToWritableHeader(transaction); 651 if (header == NULL) { 652 // allocate space for new header + node! 653 fStatus = stream->SetFileSize(transaction, nodeSize * 2); 654 if (fStatus != B_OK) 655 RETURN_ERROR(fStatus); 656 657 header = cached.SetToWritableHeader(transaction); 658 if (header == NULL) 659 RETURN_ERROR(fStatus = B_ERROR); 660 } 661 662 fAllowDuplicates = stream->IsIndex() 663 || (stream->Mode() & S_ALLOW_DUPS) != 0; 664 665 fNodeSize = nodeSize; 666 667 // initialize b+tree header 668 header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC); 669 header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize); 670 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 671 header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode())); 672 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize); 673 header->free_node_pointer 674 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 675 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2); 676 677 cached.Unset(); 678 679 // initialize b+tree root node 680 cached.SetToWritable(transaction, fHeader.RootNode(), false); 681 if (cached.Node() == NULL) 682 RETURN_ERROR(B_IO_ERROR); 683 684 cached.Node()->Initialize(); 685 686 return fStatus = B_OK; 687 } 688 #endif // !_BOOT_MODE 689 690 691 status_t 692 BPlusTree::SetTo(Inode* stream) 693 { 694 if (stream == NULL) 695 RETURN_ERROR(fStatus = B_BAD_VALUE); 696 697 fStream = stream; 698 699 // get on-disk B+Tree header 700 701 CachedNode cached(this); 702 const bplustree_header* header = cached.SetToHeader(); 703 if (header != NULL) 704 memcpy(&fHeader, header, sizeof(bplustree_header)); 705 else 706 RETURN_ERROR(fStatus = B_IO_ERROR); 707 708 // is header valid? 709 710 if (fHeader.MaximumSize() != stream->Size()) { 711 dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %" 712 B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size()); 713 // we can't change the header since we don't have a transaction 714 //fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size()); 715 } 716 if (!fHeader.IsValid()) { 717 #ifdef DEBUG 718 dump_bplustree_header(&fHeader); 719 dump_block((const char*)&fHeader, 128); 720 #endif 721 RETURN_ERROR(fStatus = B_BAD_DATA); 722 } 723 724 fNodeSize = fHeader.NodeSize(); 725 726 // validity check 727 static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, 728 S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX, 729 S_DOUBLE_INDEX}; 730 uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX 731 | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX 732 | S_FLOAT_INDEX | S_DOUBLE_INDEX); 733 734 if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE 735 || ((stream->Mode() & S_INDEX_DIR) != 0 736 && kToMode[fHeader.DataType()] != mode) 737 || !stream->IsContainer()) { 738 D( dump_bplustree_header(&fHeader); 739 dump_inode(&stream->Node()); 740 ); 741 RETURN_ERROR(fStatus = B_BAD_TYPE); 742 } 743 744 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 745 // in the original BFS code - we will honour it nevertheless 746 fAllowDuplicates = stream->IsIndex() 747 || (stream->Mode() & S_ALLOW_DUPS) != 0; 748 749 cached.SetTo(fHeader.RootNode()); 750 RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA); 751 } 752 753 754 status_t 755 BPlusTree::InitCheck() 756 { 757 return fStatus; 758 } 759 760 761 #if !_BOOT_MODE 762 status_t 763 BPlusTree::Validate(bool repair, bool& _errorsFound) 764 { 765 TreeCheck check(this); 766 if (check.InitCheck() != B_OK) 767 return B_NO_MEMORY; 768 769 check.SetVisited(0); 770 771 // Walk the free nodes 772 773 CachedNode cached(this); 774 off_t freeOffset = fHeader.FreeNode(); 775 while (freeOffset > 0) { 776 const bplustree_node* node; 777 status_t status = cached.SetTo(freeOffset, &node, false); 778 if (status != B_OK) { 779 if (status == B_IO_ERROR) 780 return B_IO_ERROR; 781 782 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could " 783 "not be read: %s\n", fStream->ID(), freeOffset, 784 strerror(status)); 785 check.FoundError(); 786 break; 787 } 788 789 if (check.Visited(freeOffset)) { 790 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF 791 " circular!\n", fStream->ID(), freeOffset); 792 // TODO: if 'repair' is true, we could collect all unvisited nodes 793 // at the end, and put the into the free list 794 check.FoundError(); 795 break; 796 } 797 798 check.SetVisited(freeOffset); 799 800 if (node->OverflowLink() != BPLUSTREE_FREE) { 801 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF 802 " misses free mark!\n", fStream->ID(), freeOffset); 803 } 804 freeOffset = node->LeftLink(); 805 } 806 807 // Iterate over the complete tree recursively 808 809 const bplustree_node* root; 810 status_t status = cached.SetTo(fHeader.RootNode(), &root, true); 811 if (status != B_OK) 812 return status; 813 814 status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root); 815 816 if (check.ErrorsFound()) 817 _errorsFound = true; 818 819 if (status != B_OK) 820 return status; 821 822 if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) { 823 dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, " 824 "declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(), 825 fHeader.MaxNumberOfLevels()); 826 } 827 828 if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) { 829 dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF 830 " nodes.\n", fStream->ID(), check.VisitedCount(), 831 fHeader.MaximumSize() / fNodeSize); 832 } 833 834 return B_OK; 835 } 836 837 838 status_t 839 BPlusTree::MakeEmpty() 840 { 841 // Put all nodes into the free list in order 842 Transaction transaction(fStream->GetVolume(), fStream->BlockNumber()); 843 844 // Reset the header, and root node 845 CachedNode cached(this); 846 bplustree_header* header = cached.SetToWritableHeader(transaction); 847 if (header == NULL) 848 return B_IO_ERROR; 849 850 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 851 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize()); 852 if (fStream->Size() > (off_t)NodeSize() * 2) 853 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize()); 854 else { 855 header->free_node_pointer 856 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 857 } 858 859 bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false); 860 if (node == NULL) 861 return B_IO_ERROR; 862 863 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 864 node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 865 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 866 node->all_key_count = 0; 867 node->all_key_length = 0; 868 869 for (off_t offset = 2 * NodeSize(); offset < fStream->Size(); 870 offset += NodeSize()) { 871 bplustree_node* node = cached.SetToWritable(transaction, offset, false); 872 if (node == NULL) { 873 dprintf("--> could not open %" B_PRIdOFF "\n", offset); 874 return B_IO_ERROR; 875 } 876 if (offset < fStream->Size() - (off_t)NodeSize()) 877 node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize()); 878 else 879 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 880 881 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 882 883 // It's not important to write it out in a single transaction; just 884 // make sure it doesn't get too large 885 if (offset % (1024 * 1024) == 0) { 886 transaction.Done(); 887 transaction.Start(fStream->GetVolume(), fStream->BlockNumber()); 888 } 889 } 890 891 return transaction.Done(); 892 } 893 894 895 int32 896 BPlusTree::TypeCodeToKeyType(type_code code) 897 { 898 switch (code) { 899 case B_STRING_TYPE: 900 return BPLUSTREE_STRING_TYPE; 901 case B_SSIZE_T_TYPE: 902 case B_INT32_TYPE: 903 return BPLUSTREE_INT32_TYPE; 904 case B_SIZE_T_TYPE: 905 case B_UINT32_TYPE: 906 return BPLUSTREE_UINT32_TYPE; 907 case B_OFF_T_TYPE: 908 case B_INT64_TYPE: 909 return BPLUSTREE_INT64_TYPE; 910 case B_UINT64_TYPE: 911 return BPLUSTREE_UINT64_TYPE; 912 case B_FLOAT_TYPE: 913 return BPLUSTREE_FLOAT_TYPE; 914 case B_DOUBLE_TYPE: 915 return BPLUSTREE_DOUBLE_TYPE; 916 } 917 return -1; 918 } 919 920 921 int32 922 BPlusTree::ModeToKeyType(mode_t mode) 923 { 924 switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 925 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { 926 case S_INT_INDEX: 927 return BPLUSTREE_INT32_TYPE; 928 case S_UINT_INDEX: 929 return BPLUSTREE_UINT32_TYPE; 930 case S_LONG_LONG_INDEX: 931 return BPLUSTREE_INT64_TYPE; 932 case S_ULONG_LONG_INDEX: 933 return BPLUSTREE_UINT64_TYPE; 934 case S_FLOAT_INDEX: 935 return BPLUSTREE_FLOAT_TYPE; 936 case S_DOUBLE_INDEX: 937 return BPLUSTREE_DOUBLE_TYPE; 938 case S_STR_INDEX: 939 default: 940 // default is for standard directories 941 return BPLUSTREE_STRING_TYPE; 942 } 943 } 944 #endif // !_BOOT_MODE 945 946 947 // #pragma mark - TransactionListener implementation 948 949 950 #if !_BOOT_MODE 951 void 952 BPlusTree::TransactionDone(bool success) 953 { 954 if (!success) { 955 // update header from disk 956 CachedNode cached(this); 957 const bplustree_header* header = cached.SetToHeader(); 958 if (header != NULL) 959 memcpy(&fHeader, header, sizeof(bplustree_header)); 960 } 961 } 962 963 964 void 965 BPlusTree::RemovedFromTransaction() 966 { 967 fInTransaction = false; 968 969 if (!fStream->GetVolume()->IsInitializing()) 970 put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID()); 971 } 972 973 974 // #pragma mark - 975 976 977 void 978 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, 979 uint16 splitAt, int8 change) 980 { 981 // Although every iterator which is affected by this update currently 982 // waits on a semaphore, other iterators could be added/removed at 983 // any time, so we need to protect this loop 984 MutexLocker _(fIteratorLock); 985 986 SinglyLinkedList<TreeIterator>::Iterator iterator 987 = fIterators.GetIterator(); 988 while (iterator.HasNext()) 989 iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change); 990 } 991 992 993 void 994 BPlusTree::_AddIterator(TreeIterator* iterator) 995 { 996 MutexLocker _(fIteratorLock); 997 fIterators.Add(iterator); 998 } 999 1000 1001 void 1002 BPlusTree::_RemoveIterator(TreeIterator* iterator) 1003 { 1004 MutexLocker _(fIteratorLock); 1005 fIterators.Remove(iterator); 1006 } 1007 #endif // !_BOOT_MODE 1008 1009 1010 int32 1011 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2, 1012 int keyLength2) 1013 { 1014 type_code type = 0; 1015 switch (fHeader.DataType()) { 1016 case BPLUSTREE_STRING_TYPE: 1017 type = B_STRING_TYPE; 1018 break; 1019 case BPLUSTREE_INT32_TYPE: 1020 type = B_INT32_TYPE; 1021 break; 1022 case BPLUSTREE_UINT32_TYPE: 1023 type = B_UINT32_TYPE; 1024 break; 1025 case BPLUSTREE_INT64_TYPE: 1026 type = B_INT64_TYPE; 1027 break; 1028 case BPLUSTREE_UINT64_TYPE: 1029 type = B_UINT64_TYPE; 1030 break; 1031 case BPLUSTREE_FLOAT_TYPE: 1032 type = B_FLOAT_TYPE; 1033 break; 1034 case BPLUSTREE_DOUBLE_TYPE: 1035 type = B_DOUBLE_TYPE; 1036 break; 1037 } 1038 return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2); 1039 } 1040 1041 1042 status_t 1043 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key, 1044 uint16 keyLength, uint16* _index, off_t* _next) 1045 { 1046 #ifdef DEBUG 1047 NodeChecker checker(node, fNodeSize, "find"); 1048 #endif 1049 1050 if (node->all_key_count == 0) { 1051 if (_index) 1052 *_index = 0; 1053 if (_next) 1054 *_next = node->OverflowLink(); 1055 return B_ENTRY_NOT_FOUND; 1056 } 1057 1058 off_t* values = node->Values(); 1059 int16 saveIndex = -1; 1060 1061 // binary search in the key array 1062 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { 1063 uint16 i = (first + last) >> 1; 1064 1065 uint16 searchLength = 0; 1066 uint8* searchKey = node->KeyAt(i, &searchLength); 1067 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) 1068 > (uint8*)node + fNodeSize 1069 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { 1070 #if !_BOOT_MODE 1071 fStream->GetVolume()->Panic(); 1072 #endif 1073 RETURN_ERROR(B_BAD_DATA); 1074 } 1075 1076 int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength); 1077 if (cmp < 0) { 1078 last = i - 1; 1079 saveIndex = i; 1080 } else if (cmp > 0) { 1081 saveIndex = first = i + 1; 1082 } else { 1083 if (_index) 1084 *_index = i; 1085 if (_next) 1086 *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]); 1087 return B_OK; 1088 } 1089 } 1090 1091 if (_index) 1092 *_index = saveIndex; 1093 if (_next) { 1094 if (saveIndex == node->NumKeys()) 1095 *_next = node->OverflowLink(); 1096 else 1097 *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); 1098 } 1099 return B_ENTRY_NOT_FOUND; 1100 } 1101 1102 1103 #if !_BOOT_MODE 1104 /*! Prepares the stack to contain all nodes that were passed while 1105 following the key, from the root node to the leaf node that could 1106 or should contain that key. 1107 */ 1108 status_t 1109 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key, 1110 uint16 keyLength) 1111 { 1112 // set the root node to begin with 1113 node_and_key nodeAndKey; 1114 nodeAndKey.nodeOffset = fHeader.RootNode(); 1115 1116 CachedNode cached(this); 1117 const bplustree_node* node; 1118 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1119 // if we are already on leaf level, we're done 1120 if (node->OverflowLink() == BPLUSTREE_NULL) { 1121 // node that the keyIndex is not properly set here (but it's not 1122 // needed in the calling functions anyway)! 1123 nodeAndKey.keyIndex = 0; 1124 stack.Push(nodeAndKey); 1125 return B_OK; 1126 } 1127 1128 off_t nextOffset; 1129 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex, 1130 &nextOffset); 1131 1132 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 1133 RETURN_ERROR(B_ERROR); 1134 1135 if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) { 1136 dprintf("BPlusTree::_SeekDown() node walked too deep.\n"); 1137 break; 1138 } 1139 1140 // put the node offset & the correct keyIndex on the stack 1141 stack.Push(nodeAndKey); 1142 1143 nodeAndKey.nodeOffset = nextOffset; 1144 } 1145 1146 FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %" 1147 B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID())); 1148 return B_ERROR; 1149 } 1150 1151 1152 /*! This will find a free duplicate fragment in the given bplustree_node. 1153 The CachedNode will be set to the writable fragment on success. 1154 */ 1155 status_t 1156 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction, 1157 const bplustree_node* node, CachedNode& cached, 1158 off_t* _offset, bplustree_node** _fragment, uint32* _index) 1159 { 1160 off_t* values = node->Values(); 1161 for (int32 i = 0; i < node->NumKeys(); i++) { 1162 off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]); 1163 1164 // does the value link to a duplicate fragment? 1165 if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT) 1166 continue; 1167 1168 const bplustree_node* fragment = cached.SetTo( 1169 bplustree_node::FragmentOffset(value), false); 1170 if (fragment == NULL) { 1171 FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %" 1172 B_PRIdOFF "\n", value, fStream->ID())); 1173 continue; 1174 } 1175 1176 // see if there is some space left for us 1177 uint32 num = bplustree_node::MaxFragments(fNodeSize); 1178 for (uint32 j = 0; j < num; j++) { 1179 duplicate_array* array = fragment->FragmentAt(j); 1180 1181 if (array->IsEmpty()) { 1182 // found an unused fragment 1183 *_fragment = cached.MakeWritable(transaction); 1184 if (*_fragment == NULL) 1185 return B_IO_ERROR; 1186 1187 *_offset = bplustree_node::FragmentOffset(value); 1188 *_index = j; 1189 return B_OK; 1190 } 1191 } 1192 } 1193 return B_ENTRY_NOT_FOUND; 1194 } 1195 1196 1197 status_t 1198 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached, 1199 const bplustree_node* node, uint16 index, off_t value) 1200 { 1201 CachedNode cachedDuplicate(this); 1202 off_t* values = node->Values(); 1203 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1204 status_t status; 1205 off_t offset; 1206 1207 if (bplustree_node::IsDuplicate(oldValue)) { 1208 // If it's a duplicate fragment, try to insert it into that, or if it 1209 // doesn't fit anymore, create a new duplicate node 1210 1211 if (bplustree_node::LinkType(oldValue) 1212 == BPLUSTREE_DUPLICATE_FRAGMENT) { 1213 bplustree_node* duplicate = cachedDuplicate.SetToWritable( 1214 transaction, bplustree_node::FragmentOffset(oldValue), false); 1215 if (duplicate == NULL) 1216 return B_IO_ERROR; 1217 1218 duplicate_array* array = duplicate->FragmentAt( 1219 bplustree_node::FragmentIndex(oldValue)); 1220 int32 arrayCount = array->Count(); 1221 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) { 1222 FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment " 1223 "%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1224 (int)bplustree_node::FragmentIndex(oldValue), 1225 bplustree_node::FragmentOffset(oldValue), arrayCount, 1226 fStream->ID())); 1227 return B_BAD_DATA; 1228 } 1229 1230 if (arrayCount < NUM_FRAGMENT_VALUES) { 1231 array->Insert(value); 1232 } else { 1233 // Test if the fragment will be empty if we remove this key's 1234 // values 1235 if (duplicate->FragmentsUsed(fNodeSize) < 2) { 1236 // The node will be empty without our values, so let us 1237 // reuse it as a duplicate node 1238 offset = bplustree_node::FragmentOffset(oldValue); 1239 1240 memmove(duplicate->DuplicateArray(), array, 1241 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1242 duplicate->left_link = duplicate->right_link 1243 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1244 1245 array = duplicate->DuplicateArray(); 1246 array->Insert(value); 1247 } else { 1248 // Create a new duplicate node 1249 1250 cachedDuplicate.UnsetUnchanged(transaction); 1251 // The old duplicate has not been touched, so we can 1252 // reuse it 1253 1254 bplustree_node* newDuplicate; 1255 status = cachedDuplicate.Allocate(transaction, 1256 &newDuplicate, &offset); 1257 if (status != B_OK) 1258 RETURN_ERROR(status); 1259 1260 // Copy the array from the fragment node to the duplicate 1261 // node and free the old entry (by zero'ing all values) 1262 newDuplicate->overflow_link = array->count; 1263 memcpy(&newDuplicate->all_key_count, &array->values[0], 1264 array->Count() * sizeof(off_t)); 1265 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1266 1267 array = newDuplicate->DuplicateArray(); 1268 array->Insert(value); 1269 } 1270 1271 // Update the main pointer to link to a duplicate node 1272 if (cached.MakeWritable(transaction) == NULL) 1273 return B_IO_ERROR; 1274 1275 values[index] 1276 = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1277 BPLUSTREE_DUPLICATE_NODE, offset)); 1278 } 1279 1280 return B_OK; 1281 } 1282 1283 // Put the value into a dedicated duplicate node 1284 1285 // search for free space in the duplicate nodes of that key 1286 duplicate_array* array; 1287 int32 arrayCount; 1288 const bplustree_node* duplicate; 1289 off_t duplicateOffset; 1290 do { 1291 duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1292 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 1293 if (duplicate == NULL) 1294 return B_IO_ERROR; 1295 1296 array = duplicate->DuplicateArray(); 1297 arrayCount =array->Count(); 1298 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { 1299 FATAL(("_InsertDuplicate: Invalid array size in duplicate %" 1300 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1301 duplicateOffset, arrayCount, fStream->ID())); 1302 return B_BAD_DATA; 1303 } 1304 } while (arrayCount >= NUM_DUPLICATE_VALUES 1305 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); 1306 1307 bplustree_node* writableDuplicate 1308 = cachedDuplicate.MakeWritable(transaction); 1309 if (writableDuplicate == NULL) 1310 return B_IO_ERROR; 1311 1312 if (arrayCount < NUM_DUPLICATE_VALUES) { 1313 array = writableDuplicate->DuplicateArray(); 1314 array->Insert(value); 1315 } else { 1316 // no space left - add a new duplicate node 1317 1318 bplustree_node* newDuplicate; 1319 status = cachedDuplicate.Allocate(transaction, &newDuplicate, 1320 &offset); 1321 if (status != B_OK) 1322 RETURN_ERROR(status); 1323 1324 // link the two nodes together 1325 writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); 1326 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); 1327 1328 array = newDuplicate->DuplicateArray(); 1329 array->count = 0; 1330 array->Insert(value); 1331 } 1332 return B_OK; 1333 } 1334 1335 // Search for a free duplicate fragment or create a new one 1336 // to insert the duplicate value into 1337 1338 uint32 fragmentIndex = 0; 1339 bplustree_node* fragment; 1340 if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate, 1341 &offset, &fragment, &fragmentIndex) != B_OK) { 1342 // allocate a new duplicate fragment node 1343 status = cachedDuplicate.Allocate(transaction, &fragment, &offset); 1344 if (status != B_OK) 1345 RETURN_ERROR(status); 1346 1347 memset(fragment, 0, fNodeSize); 1348 } 1349 1350 duplicate_array* array = fragment->FragmentAt(fragmentIndex); 1351 array->Insert(oldValue); 1352 array->Insert(value); 1353 1354 if (cached.MakeWritable(transaction) == NULL) 1355 return B_IO_ERROR; 1356 1357 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1358 BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex)); 1359 1360 return B_OK; 1361 } 1362 1363 1364 void 1365 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key, 1366 uint16 keyLength, off_t value) 1367 { 1368 // should never happen, but who knows? 1369 if (index > node->NumKeys()) 1370 return; 1371 1372 off_t* values = node->Values(); 1373 uint16* keyLengths = node->KeyLengths(); 1374 uint8* keys = node->Keys(); 1375 1376 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); 1377 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() 1378 + keyLength); 1379 1380 off_t* newValues = node->Values(); 1381 uint16* newKeyLengths = node->KeyLengths(); 1382 1383 // move values and copy new value into them 1384 memmove(newValues + index + 1, values + index, 1385 sizeof(off_t) * (node->NumKeys() - 1 - index)); 1386 memmove(newValues, values, sizeof(off_t) * index); 1387 1388 newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value); 1389 1390 // move and update key length index 1391 for (uint16 i = node->NumKeys(); i-- > index + 1;) { 1392 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1393 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength); 1394 } 1395 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); 1396 1397 int32 keyStart; 1398 newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength 1399 + (keyStart = index > 0 1400 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0)); 1401 1402 // move keys and copy new key into them 1403 uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]); 1404 int32 size = node->AllKeyLength() - length; 1405 if (size > 0) 1406 memmove(keys + length, keys + length - keyLength, size); 1407 1408 memcpy(keys + keyStart, key, keyLength); 1409 } 1410 1411 1412 /*! Splits the \a node into two halves - the other half will be put into 1413 \a other. It also takes care to create a new overflow link if the node 1414 to split is an index node. 1415 */ 1416 status_t 1417 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset, 1418 bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key, 1419 uint16* _keyLength, off_t* _value) 1420 { 1421 if (*_keyIndex > node->NumKeys() + 1) 1422 return B_BAD_VALUE; 1423 1424 uint16* inKeyLengths = node->KeyLengths(); 1425 off_t* inKeyValues = node->Values(); 1426 uint8* inKeys = node->Keys(); 1427 uint8* outKeys = other->Keys(); 1428 int32 keyIndex = *_keyIndex; // can become less than zero! 1429 1430 if (keyIndex > node->NumKeys()) { 1431 FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF 1432 "\n", (int)keyIndex, node->NumKeys(), fStream->ID())); 1433 return B_BAD_VALUE; 1434 } 1435 1436 // How many keys will fit in one (half) page? 1437 // The following loop will find the answer to this question and 1438 // change the key lengths indices for their new home 1439 1440 // "bytes" is the number of bytes written for the new key, 1441 // "bytesBefore" are the bytes before that key 1442 // "bytesAfter" are the bytes after the new key, if any 1443 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 1444 1445 size_t size = fNodeSize >> 1; 1446 int32 out, in; 1447 size_t keyLengths = 0; 1448 for (in = out = 0; in < node->NumKeys() + 1;) { 1449 keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]); 1450 1451 if (in == keyIndex && !bytes) { 1452 bytes = *_keyLength; 1453 bytesBefore = in > 0 1454 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1455 } else { 1456 if (keyIndex < out) 1457 bytesAfter = keyLengths - bytesBefore; 1458 1459 in++; 1460 } 1461 out++; 1462 1463 if (key_align(sizeof(bplustree_node) + bytes + keyLengths) 1464 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1465 // we have found the number of keys in the new node! 1466 break; 1467 } 1468 } 1469 1470 // if the new key was not inserted, set the length of the keys 1471 // that can be copied directly 1472 1473 if (keyIndex >= out && in > 0) 1474 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); 1475 else if (keyIndex + 1 < out) 1476 bytesAfter = keyLengths - bytesBefore; 1477 1478 if (bytesBefore < 0 || bytesAfter < 0) 1479 return B_BAD_DATA; 1480 1481 other->left_link = node->left_link; 1482 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1483 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1484 + bytesAfter); 1485 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1486 1487 uint16* outKeyLengths = other->KeyLengths(); 1488 off_t* outKeyValues = other->Values(); 1489 int32 keys = out > keyIndex ? keyIndex : out; 1490 1491 if (bytesBefore) { 1492 // copy the keys 1493 memcpy(outKeys, inKeys, bytesBefore); 1494 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1495 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1496 } 1497 if (bytes) { 1498 // copy the newly inserted key 1499 memcpy(outKeys + bytesBefore, key, bytes); 1500 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1501 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1502 1503 if (bytesAfter) { 1504 // copy the keys after the new key 1505 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, 1506 bytesAfter); 1507 keys = out - keyIndex - 1; 1508 for (int32 i = 0;i < keys;i++) { 1509 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( 1510 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) 1511 + bytes); 1512 } 1513 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, 1514 keys * sizeof(off_t)); 1515 } 1516 } 1517 1518 // if the new key was already inserted, we shouldn't use it again 1519 if (in != out) 1520 keyIndex--; 1521 1522 int32 total = bytesBefore + bytesAfter; 1523 1524 // these variables are for the key that will be returned 1525 // to the parent node 1526 uint8* newKey = NULL; 1527 uint16 newLength; 1528 bool newAllocated = false; 1529 1530 // If we have split an index node, we have to drop the first key 1531 // of the next node (which can also be the new key to insert). 1532 // The dropped key is also the one which has to be inserted in 1533 // the parent node, so we will set the "newKey" already here. 1534 if (node->OverflowLink() != BPLUSTREE_NULL) { 1535 if (in == keyIndex) { 1536 newKey = key; 1537 newLength = *_keyLength; 1538 1539 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1540 keyIndex--; 1541 } else { 1542 // If a key is dropped (is not the new key), we have to copy 1543 // it, because it would be lost if not. 1544 uint8* droppedKey = node->KeyAt(in, &newLength); 1545 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) 1546 > (uint8*)node + fNodeSize 1547 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1548 fStream->GetVolume()->Panic(); 1549 RETURN_ERROR(B_BAD_DATA); 1550 } 1551 newKey = (uint8*)malloc(newLength); 1552 if (newKey == NULL) 1553 return B_NO_MEMORY; 1554 1555 newAllocated = true; 1556 memcpy(newKey, droppedKey, newLength); 1557 1558 other->overflow_link = inKeyValues[in]; 1559 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); 1560 } 1561 } 1562 1563 // and now the same game for the other page and the rest of the keys 1564 // (but with memmove() instead of memcpy(), because they may overlap) 1565 1566 bytesBefore = bytesAfter = bytes = 0; 1567 out = 0; 1568 int32 skip = in; 1569 while (in < node->NumKeys() + 1) { 1570 if (in == keyIndex && !bytes) { 1571 // it's enough to set bytesBefore once here, because we do 1572 // not need to know the exact length of all keys in this 1573 // loop 1574 bytesBefore = in > skip 1575 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1576 bytes = *_keyLength; 1577 out++; 1578 } else { 1579 if (in < node->NumKeys()) { 1580 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1581 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); 1582 1583 if (bytes) { 1584 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1585 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); 1586 1587 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1588 - bytesBefore - bytes; 1589 } 1590 out++; 1591 } 1592 in++; 1593 } 1594 } 1595 1596 // adjust the byte counts (since we were a bit lazy in the loop) 1597 if (keyIndex < skip) 1598 bytesBefore = node->AllKeyLength() - total; 1599 1600 if (bytesBefore < 0 || bytesAfter < 0) { 1601 if (newAllocated) 1602 free(newKey); 1603 return B_BAD_DATA; 1604 } 1605 1606 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1607 // right link, and overflow link can stay the same 1608 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1609 + bytesAfter); 1610 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1611 1612 // array positions have changed 1613 outKeyLengths = node->KeyLengths(); 1614 outKeyValues = node->Values(); 1615 1616 // move the keys in the old node: the order is important here, 1617 // because we don't want to overwrite any contents 1618 1619 keys = keyIndex <= skip ? out : keyIndex - skip; 1620 keyIndex -= skip; 1621 in = out - keyIndex - 1; 1622 // Note: keyIndex and in will contain invalid values when the new key 1623 // went to the other node. But in this case bytes and bytesAfter are 1624 // 0 and subsequently we never use keyIndex and in. 1625 1626 if (bytesBefore) 1627 memmove(inKeys, inKeys + total, bytesBefore); 1628 if (bytesAfter) { 1629 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, 1630 bytesAfter); 1631 } 1632 1633 if (bytesBefore) 1634 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1635 if (bytesAfter) { 1636 // if byteAfter is > 0, keyIndex is larger than skip 1637 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, 1638 in * sizeof(uint16)); 1639 } 1640 1641 if (bytesBefore) 1642 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1643 if (bytesAfter) { 1644 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, 1645 in * sizeof(off_t)); 1646 } 1647 1648 if (bytes) { 1649 // finally, copy the newly inserted key (don't overwrite anything) 1650 memcpy(inKeys + bytesBefore, key, bytes); 1651 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1652 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1653 } 1654 1655 // Prepare the key that will be inserted in the parent node which 1656 // is either the dropped key or the last of the other node. 1657 // If it's the dropped key, "newKey" was already set earlier. 1658 1659 if (newKey == NULL) 1660 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1661 1662 memcpy(key, newKey, newLength); 1663 *_keyLength = newLength; 1664 *_value = otherOffset; 1665 1666 if (newAllocated) 1667 free(newKey); 1668 1669 return B_OK; 1670 } 1671 1672 1673 /*! This inserts a key into the tree. The changes made to the tree will 1674 all be part of the \a transaction. 1675 You need to have the inode write locked. 1676 */ 1677 status_t 1678 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength, 1679 off_t value) 1680 { 1681 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1682 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1683 RETURN_ERROR(B_BAD_VALUE); 1684 #ifdef DEBUG 1685 if (value < 0) 1686 panic("tried to insert invalid value %" B_PRId64 "!\n", value); 1687 #endif 1688 1689 ASSERT_WRITE_LOCKED_INODE(fStream); 1690 1691 Stack<node_and_key> stack; 1692 if (_SeekDown(stack, key, keyLength) != B_OK) 1693 RETURN_ERROR(B_ERROR); 1694 1695 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 1696 1697 memcpy(keyBuffer, key, keyLength); 1698 keyBuffer[keyLength] = 0; 1699 1700 node_and_key nodeAndKey; 1701 const bplustree_node* node; 1702 1703 CachedNode cached(this); 1704 while (stack.Pop(&nodeAndKey) 1705 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1706 #ifdef DEBUG 1707 NodeChecker checker(node, fNodeSize, "insert"); 1708 #endif 1709 if (node->IsLeaf()) { 1710 // first round, check for duplicate entries 1711 status_t status = _FindKey(node, key, keyLength, 1712 &nodeAndKey.keyIndex); 1713 1714 // is this a duplicate entry? 1715 if (status == B_OK) { 1716 if (fAllowDuplicates) { 1717 status = _InsertDuplicate(transaction, cached, node, 1718 nodeAndKey.keyIndex, value); 1719 if (status != B_OK) 1720 RETURN_ERROR(status); 1721 return B_OK; 1722 } 1723 1724 return B_NAME_IN_USE; 1725 } 1726 } 1727 1728 bplustree_node* writableNode = cached.MakeWritable(transaction); 1729 if (writableNode == NULL) 1730 return B_IO_ERROR; 1731 1732 // is the node big enough to hold the pair? 1733 if (int32(key_align(sizeof(bplustree_node) 1734 + writableNode->AllKeyLength() + keyLength) 1735 + (writableNode->NumKeys() + 1) * (sizeof(uint16) 1736 + sizeof(off_t))) < fNodeSize) { 1737 _InsertKey(writableNode, nodeAndKey.keyIndex, 1738 keyBuffer, keyLength, value); 1739 _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, 1740 nodeAndKey.keyIndex, 0, 1); 1741 1742 return B_OK; 1743 } else { 1744 CachedNode cachedNewRoot(this); 1745 CachedNode cachedOther(this); 1746 1747 // do we need to allocate a new root node? if so, then do 1748 // it now 1749 off_t newRoot = BPLUSTREE_NULL; 1750 if (nodeAndKey.nodeOffset == fHeader.RootNode()) { 1751 bplustree_node* root; 1752 status_t status = cachedNewRoot.Allocate(transaction, &root, 1753 &newRoot); 1754 if (status != B_OK) { 1755 // The tree is most likely corrupted! 1756 // But it's still sane at leaf level - we could set 1757 // a flag in the header that forces the tree to be 1758 // rebuild next time... 1759 // But since we will have journaling, that's not a big 1760 // problem anyway. 1761 RETURN_ERROR(status); 1762 } 1763 } 1764 1765 // reserve space for the other node 1766 bplustree_node* other; 1767 off_t otherOffset; 1768 status_t status = cachedOther.Allocate(transaction, &other, 1769 &otherOffset); 1770 if (status != B_OK) { 1771 cachedNewRoot.Free(transaction, newRoot); 1772 RETURN_ERROR(status); 1773 } 1774 1775 if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other, 1776 otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength, 1777 &value) != B_OK) { 1778 // free root node & other node here 1779 cachedOther.Free(transaction, otherOffset); 1780 cachedNewRoot.Free(transaction, newRoot); 1781 1782 RETURN_ERROR(B_ERROR); 1783 } 1784 #ifdef DEBUG 1785 checker.Check("insert split"); 1786 NodeChecker otherChecker(other, fNodeSize, "insert split other"); 1787 #endif 1788 1789 _UpdateIterators(nodeAndKey.nodeOffset, otherOffset, 1790 nodeAndKey.keyIndex, writableNode->NumKeys(), 1); 1791 1792 // update the right link of the node in the left of the new node 1793 if ((other = cachedOther.SetToWritable(transaction, 1794 other->LeftLink())) != NULL) { 1795 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1796 } 1797 1798 // create a new root if necessary 1799 if (newRoot != BPLUSTREE_NULL) { 1800 bplustree_node* root = cachedNewRoot.Node(); 1801 1802 _InsertKey(root, 0, keyBuffer, keyLength, 1803 writableNode->LeftLink()); 1804 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 1805 nodeAndKey.nodeOffset); 1806 1807 CachedNode cached(this); 1808 bplustree_header* header 1809 = cached.SetToWritableHeader(transaction); 1810 if (header == NULL) 1811 return B_IO_ERROR; 1812 1813 // finally, update header to point to the new root 1814 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1815 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32( 1816 header->MaxNumberOfLevels() + 1); 1817 1818 return B_OK; 1819 } 1820 } 1821 } 1822 RETURN_ERROR(B_ERROR); 1823 } 1824 1825 1826 /*! Removes the duplicate index/value pair from the tree. 1827 It's part of the private tree interface. 1828 */ 1829 status_t 1830 BPlusTree::_RemoveDuplicate(Transaction& transaction, 1831 const bplustree_node* node, CachedNode& cached, uint16 index, 1832 off_t value) 1833 { 1834 off_t* values = node->Values(); 1835 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1836 1837 CachedNode cachedDuplicate(this); 1838 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1839 bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction, 1840 duplicateOffset, false); 1841 if (duplicate == NULL) 1842 return B_IO_ERROR; 1843 1844 // if it's a duplicate fragment, remove the entry from there 1845 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1846 duplicate_array* array = duplicate->FragmentAt( 1847 bplustree_node::FragmentIndex(oldValue)); 1848 int32 arrayCount = array->Count(); 1849 1850 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) { 1851 FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %" 1852 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1853 (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, 1854 arrayCount, fStream->ID())); 1855 return B_BAD_DATA; 1856 } 1857 if (!array->Remove(value)) { 1858 FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node " 1859 "%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value, 1860 duplicateOffset, fStream->ID())); 1861 return B_ENTRY_NOT_FOUND; 1862 } 1863 1864 // remove the array from the fragment node if it is empty 1865 if (--arrayCount == 1) { 1866 // set the link to the remaining value 1867 if (cached.MakeWritable(transaction) == NULL) 1868 return B_IO_ERROR; 1869 1870 values[index] = array->values[0]; 1871 1872 // Remove the whole fragment node, if this was the only array, 1873 // otherwise free just the array 1874 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1875 status_t status = cachedDuplicate.Free(transaction, 1876 duplicateOffset); 1877 if (status != B_OK) 1878 return status; 1879 } else 1880 array->count = 0; 1881 } 1882 return B_OK; 1883 } 1884 1885 // Remove value from a duplicate node! 1886 1887 duplicate_array* array = NULL; 1888 int32 arrayCount = 0; 1889 1890 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1891 FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF 1892 ", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID())); 1893 return B_BAD_DATA; 1894 } 1895 1896 // Search the duplicate nodes until the entry could be found (and removed) 1897 while (duplicate != NULL) { 1898 array = duplicate->DuplicateArray(); 1899 arrayCount = array->Count(); 1900 1901 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) { 1902 FATAL(("_RemoveDuplicate: Invalid array size in duplicate %" 1903 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n", 1904 duplicateOffset, arrayCount, fStream->ID())); 1905 return B_BAD_DATA; 1906 } 1907 1908 if (array->Remove(value)) { 1909 arrayCount--; 1910 break; 1911 } 1912 1913 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1914 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1915 1916 cachedDuplicate.UnsetUnchanged(transaction); 1917 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, 1918 false); 1919 } 1920 if (duplicate == NULL) 1921 RETURN_ERROR(B_IO_ERROR); 1922 1923 // The entry got removed from the duplicate node, but we might want to free 1924 // it now in case it's empty 1925 1926 while (true) { 1927 off_t left = duplicate->LeftLink(); 1928 off_t right = duplicate->RightLink(); 1929 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1930 1931 if ((isLast && arrayCount == 1) || arrayCount == 0) { 1932 // Free empty duplicate page, link their siblings together, and 1933 // update the duplicate link if needed (ie. when we either remove 1934 // the last duplicate node or have a new first one) 1935 1936 if (left == BPLUSTREE_NULL) { 1937 // the duplicate link points to us 1938 if (cached.MakeWritable(transaction) == NULL) 1939 return B_IO_ERROR; 1940 1941 if (arrayCount == 1) { 1942 // This is the last node, and there is only one value left; 1943 // replace the duplicate link with that value, it's no 1944 // duplicate anymore 1945 values[index] = array->values[0]; 1946 } else { 1947 // Move the duplicate link to the next node 1948 values[index] = HOST_ENDIAN_TO_BFS_INT64( 1949 bplustree_node::MakeLink( 1950 BPLUSTREE_DUPLICATE_NODE, right)); 1951 } 1952 } 1953 1954 status_t status = cachedDuplicate.Free(transaction, 1955 duplicateOffset); 1956 if (status != B_OK) 1957 return status; 1958 1959 if (left != BPLUSTREE_NULL 1960 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, 1961 false)) != NULL) { 1962 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1963 1964 // If the next node is the last node, we need to free that node 1965 // and convert the duplicate entry back into a normal entry 1966 array = duplicate->DuplicateArray(); 1967 arrayCount = array->Count(); 1968 if (right == BPLUSTREE_NULL 1969 && duplicate->LeftLink() == BPLUSTREE_NULL 1970 && arrayCount <= NUM_FRAGMENT_VALUES) { 1971 duplicateOffset = left; 1972 continue; 1973 } 1974 } 1975 if (right != BPLUSTREE_NULL 1976 && (duplicate = cachedDuplicate.SetToWritable(transaction, 1977 right, false)) != NULL) { 1978 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1979 1980 // Again, we may need to turn the duplicate entry back into a 1981 // normal entry 1982 array = duplicate->DuplicateArray(); 1983 arrayCount = array->Count(); 1984 if (left == BPLUSTREE_NULL 1985 && duplicate->RightLink() == BPLUSTREE_NULL 1986 && arrayCount <= NUM_FRAGMENT_VALUES) { 1987 duplicateOffset = right; 1988 continue; 1989 } 1990 } 1991 return B_OK; 1992 } 1993 if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) { 1994 // If the number of entries fits in a duplicate fragment, then 1995 // either find a free fragment node, or convert this node to a 1996 // fragment node. 1997 CachedNode cachedOther(this); 1998 1999 bplustree_node* fragment = NULL; 2000 uint32 fragmentIndex = 0; 2001 off_t offset; 2002 if (_FindFreeDuplicateFragment(transaction, node, cachedOther, 2003 &offset, &fragment, &fragmentIndex) == B_OK) { 2004 // move to other node 2005 duplicate_array* target = fragment->FragmentAt(fragmentIndex); 2006 memcpy(target, array, 2007 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2008 2009 cachedDuplicate.Free(transaction, duplicateOffset); 2010 duplicateOffset = offset; 2011 } else { 2012 // convert node 2013 memmove(duplicate, array, 2014 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2015 memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 2016 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 2017 } 2018 2019 if (cached.MakeWritable(transaction) == NULL) 2020 return B_IO_ERROR; 2021 2022 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 2023 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 2024 } 2025 return B_OK; 2026 } 2027 } 2028 2029 2030 /*! Removes the key with the given index from the specified node. 2031 Since it has to get the key from the node anyway (to obtain it's 2032 pointer), it's not needed to pass the key & its length, although 2033 the calling method (BPlusTree::Remove()) have this data. 2034 */ 2035 void 2036 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index) 2037 { 2038 // should never happen, but who knows? 2039 if (index > node->NumKeys() && node->NumKeys() > 0) { 2040 FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF 2041 "\n", index, fStream->ID())); 2042 return; 2043 } 2044 2045 off_t* values = node->Values(); 2046 2047 // if we would have to drop the overflow link, drop 2048 // the last key instead and update the overflow link 2049 // to the value of that one 2050 if (!node->IsLeaf() && index == node->NumKeys()) 2051 node->overflow_link = values[--index]; 2052 2053 uint16 length; 2054 uint8* key = node->KeyAt(index, &length); 2055 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize 2056 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2057 FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length, 2058 fStream->ID())); 2059 fStream->GetVolume()->Panic(); 2060 return; 2061 } 2062 2063 uint16* keyLengths = node->KeyLengths(); 2064 uint8* keys = node->Keys(); 2065 2066 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 2067 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64( 2068 node->AllKeyLength() - length); 2069 2070 off_t* newValues = node->Values(); 2071 uint16* newKeyLengths = node->KeyLengths(); 2072 2073 // move key data 2074 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 2075 2076 // move and update key lengths 2077 if (index > 0 && newKeyLengths != keyLengths) 2078 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 2079 for (uint16 i = index; i < node->NumKeys(); i++) { 2080 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 2081 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 2082 } 2083 2084 // move values 2085 if (index > 0) 2086 memmove(newValues, values, index * sizeof(off_t)); 2087 if (node->NumKeys() > index) { 2088 memmove(newValues + index, values + index + 1, 2089 (node->NumKeys() - index) * sizeof(off_t)); 2090 } 2091 } 2092 2093 2094 /*! Removes the specified key from the tree. The "value" parameter is only used 2095 for trees which allow duplicates, so you may safely ignore it. 2096 It's not an optional parameter, so at least you have to think about it. 2097 You need to have the inode write locked. 2098 */ 2099 status_t 2100 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength, 2101 off_t value) 2102 { 2103 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2104 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 2105 RETURN_ERROR(B_BAD_VALUE); 2106 2107 ASSERT_WRITE_LOCKED_INODE(fStream); 2108 2109 Stack<node_and_key> stack; 2110 if (_SeekDown(stack, key, keyLength) != B_OK) 2111 RETURN_ERROR(B_ERROR); 2112 2113 node_and_key nodeAndKey; 2114 const bplustree_node* node; 2115 2116 CachedNode cached(this); 2117 while (stack.Pop(&nodeAndKey) 2118 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 2119 #ifdef DEBUG 2120 NodeChecker checker(node, fNodeSize, "remove"); 2121 #endif 2122 if (node->IsLeaf()) { 2123 // first round, check for duplicate entries 2124 status_t status = _FindKey(node, key, keyLength, 2125 &nodeAndKey.keyIndex); 2126 if (status != B_OK) 2127 RETURN_ERROR(status); 2128 2129 // Is this a duplicate entry? 2130 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 2131 node->Values()[nodeAndKey.keyIndex]))) { 2132 if (fAllowDuplicates) { 2133 return _RemoveDuplicate(transaction, node, cached, 2134 nodeAndKey.keyIndex, value); 2135 } 2136 2137 FATAL(("dupliate node found where no duplicates are " 2138 "allowed, inode %" B_PRIdOFF "!\n", fStream->ID())); 2139 RETURN_ERROR(B_ERROR); 2140 } else { 2141 if (node->Values()[nodeAndKey.keyIndex] != value) 2142 return B_ENTRY_NOT_FOUND; 2143 2144 // If we will remove the last key, the iterator will be set 2145 // to the next node after the current - if there aren't any 2146 // more nodes, we need a way to prevent the TreeIterators to 2147 // touch the old node again, we use BPLUSTREE_FREE for this 2148 off_t next = node->RightLink() == BPLUSTREE_NULL 2149 ? BPLUSTREE_FREE : node->RightLink(); 2150 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 2151 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 2152 } 2153 } 2154 2155 bplustree_node* writableNode = cached.MakeWritable(transaction); 2156 if (writableNode == NULL) 2157 return B_IO_ERROR; 2158 2159 // if it's an empty root node, we have to convert it 2160 // to a leaf node by dropping the overflow link, or, 2161 // if it's already a leaf node, just empty it 2162 if (nodeAndKey.nodeOffset == fHeader.RootNode() 2163 && (node->NumKeys() == 0 2164 || (node->NumKeys() == 1 && node->IsLeaf()))) { 2165 writableNode->overflow_link 2166 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2167 writableNode->all_key_count = 0; 2168 writableNode->all_key_length = 0; 2169 2170 // if we've made a leaf node out of the root node, we need 2171 // to reset the maximum number of levels in the header 2172 if (fHeader.MaxNumberOfLevels() != 1) { 2173 CachedNode cached(this); 2174 bplustree_header* header 2175 = cached.SetToWritableHeader(transaction); 2176 if (header == NULL) 2177 return B_IO_ERROR; 2178 2179 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 2180 } 2181 return B_OK; 2182 } 2183 2184 // if there is only one key left, we don't have to remove 2185 // it, we can just dump the node (index nodes still have 2186 // the overflow link, so we have to drop the last key) 2187 if (writableNode->NumKeys() > 1 2188 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { 2189 _RemoveKey(writableNode, nodeAndKey.keyIndex); 2190 return B_OK; 2191 } 2192 2193 // when we are here, we can just free the node, but 2194 // we have to update the right/left link of the 2195 // siblings first 2196 CachedNode otherCached(this); 2197 bplustree_node* other = otherCached.SetToWritable(transaction, 2198 writableNode->LeftLink()); 2199 if (other != NULL) 2200 other->right_link = writableNode->right_link; 2201 2202 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) 2203 != NULL) { 2204 other->left_link = writableNode->left_link; 2205 } 2206 2207 cached.Free(transaction, nodeAndKey.nodeOffset); 2208 } 2209 RETURN_ERROR(B_ERROR); 2210 } 2211 2212 2213 /*! Replaces the value for the key in the tree. 2214 Returns B_OK if the key could be found and its value replaced, 2215 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 2216 to indicate that something went terribly wrong. 2217 Note that this doesn't work with duplicates - it will just 2218 return B_BAD_TYPE if you call this function on a tree where 2219 duplicates are allowed. 2220 You need to have the inode write locked. 2221 */ 2222 status_t 2223 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, 2224 off_t value) 2225 { 2226 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2227 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2228 || key == NULL) 2229 RETURN_ERROR(B_BAD_VALUE); 2230 2231 if (fAllowDuplicates) 2232 RETURN_ERROR(B_BAD_TYPE); 2233 2234 ASSERT_WRITE_LOCKED_INODE(fStream); 2235 2236 off_t nodeOffset = fHeader.RootNode(); 2237 CachedNode cached(this); 2238 const bplustree_node* node; 2239 2240 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2241 uint16 keyIndex = 0; 2242 off_t nextOffset; 2243 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2244 &nextOffset); 2245 2246 if (node->OverflowLink() == BPLUSTREE_NULL) { 2247 if (status == B_OK) { 2248 bplustree_node* writableNode = cached.MakeWritable(transaction); 2249 if (writableNode != NULL) { 2250 writableNode->Values()[keyIndex] 2251 = HOST_ENDIAN_TO_BFS_INT64(value); 2252 } else 2253 status = B_IO_ERROR; 2254 } 2255 2256 return status; 2257 } else if (nextOffset == nodeOffset) 2258 RETURN_ERROR(B_ERROR); 2259 2260 nodeOffset = nextOffset; 2261 } 2262 RETURN_ERROR(B_ERROR); 2263 } 2264 #endif // !_BOOT_MODE 2265 2266 2267 /*! Searches the key in the tree, and stores the offset found in _value, 2268 if successful. 2269 It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack 2270 while it descends the tree. 2271 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not. 2272 It can also return other errors to indicate that something went wrong. 2273 Note that this doesn't work with duplicates - it will just return 2274 B_BAD_TYPE if you call this function on a tree where duplicates are 2275 allowed. 2276 You need to have the inode read or write locked. 2277 */ 2278 status_t 2279 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value) 2280 { 2281 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2282 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2283 || key == NULL) 2284 RETURN_ERROR(B_BAD_VALUE); 2285 2286 if (fAllowDuplicates) 2287 RETURN_ERROR(B_BAD_TYPE); 2288 2289 #if !_BOOT_MODE 2290 ASSERT_READ_LOCKED_INODE(fStream); 2291 #endif 2292 2293 off_t nodeOffset = fHeader.RootNode(); 2294 CachedNode cached(this); 2295 const bplustree_node* node; 2296 2297 #ifdef DEBUG 2298 int32 levels = 0; 2299 #endif 2300 2301 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2302 uint16 keyIndex = 0; 2303 off_t nextOffset; 2304 status_t status = _FindKey(node, key, keyLength, &keyIndex, 2305 &nextOffset); 2306 2307 #ifdef DEBUG 2308 levels++; 2309 #endif 2310 if (node->OverflowLink() == BPLUSTREE_NULL) { 2311 if (status == B_OK && _value != NULL) 2312 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 2313 2314 #ifdef DEBUG 2315 if (levels != (int32)fHeader.MaxNumberOfLevels()) 2316 DEBUGGER(("levels don't match")); 2317 #endif 2318 return status; 2319 } else if (nextOffset == nodeOffset) 2320 RETURN_ERROR(B_ERROR); 2321 2322 nodeOffset = nextOffset; 2323 } 2324 FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %" 2325 B_PRIdOFF "\n", nodeOffset, fStream->ID())); 2326 RETURN_ERROR(B_ERROR); 2327 } 2328 2329 2330 #if !_BOOT_MODE 2331 status_t 2332 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset, 2333 const uint8* largestKey, uint16 largestKeyLength, 2334 const bplustree_node* parent) 2335 { 2336 if (parent->CheckIntegrity(fNodeSize) != B_OK) { 2337 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check " 2338 "failed!\n", fStream->ID(), offset); 2339 check.FoundError(); 2340 return B_OK; 2341 } 2342 if (level >= fHeader.MaxNumberOfLevels()) { 2343 dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF 2344 "!\n", fStream->ID(), offset); 2345 check.FoundError(); 2346 return B_OK; 2347 } 2348 2349 check.SetLevel(level); 2350 2351 if (check.Visited(offset)) { 2352 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n", 2353 fStream->ID(), offset); 2354 check.FoundError(); 2355 return B_OK; 2356 } 2357 2358 check.SetVisited(offset); 2359 2360 uint32 count = parent->NumKeys(); 2361 off_t* values = parent->Values(); 2362 off_t lastOffset = check.PreviousOffset(level); 2363 CachedNode cached(this); 2364 2365 for (uint32 i = 0; i < count; i++) { 2366 uint16 keyLength; 2367 uint8* key = parent->KeyAt(i, &keyLength); 2368 if (largestKey != NULL) { 2369 int result = _CompareKeys(key, keyLength, largestKey, 2370 largestKeyLength); 2371 if (result > 0 || (result == 0 && i != count - 1)) { 2372 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %" 2373 B_PRIu32 " larger than it should!\n", 2374 fStream->ID(), offset, i); 2375 check.FoundError(); 2376 } 2377 } 2378 2379 off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]); 2380 if (bplustree_node::IsDuplicate(childOffset)) { 2381 // Walk the duplicate nodes 2382 off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset); 2383 off_t lastDuplicateOffset = BPLUSTREE_NULL; 2384 2385 while (duplicateOffset != BPLUSTREE_NULL) { 2386 const bplustree_node* node; 2387 status_t status = cached.SetTo(duplicateOffset, &node, false); 2388 if (status != B_OK) { 2389 if (status == B_IO_ERROR) 2390 return B_IO_ERROR; 2391 2392 dprintf("inode %" B_PRIdOFF ": duplicate node at %" 2393 B_PRIdOFF " could not be read: %s\n", fStream->ID(), 2394 duplicateOffset, strerror(status)); 2395 check.FoundError(); 2396 break; 2397 } 2398 2399 bool isFragmentNode = bplustree_node::LinkType(childOffset) 2400 == BPLUSTREE_DUPLICATE_FRAGMENT; 2401 bool isKnownFragment = isFragmentNode 2402 && check.VisitedFragment(duplicateOffset); 2403 2404 if (!isKnownFragment && check.Visited(duplicateOffset)) { 2405 dprintf("inode %" B_PRIdOFF ": %s node at %" 2406 B_PRIdOFF " already visited, referenced from %" 2407 B_PRIdOFF "!\n", fStream->ID(), 2408 isFragmentNode ? "fragment" : "duplicate", 2409 duplicateOffset, offset); 2410 check.FoundError(); 2411 break; 2412 } 2413 2414 // Fragment nodes may be visited more than once from different 2415 // places 2416 if (!check.Visited(duplicateOffset)) 2417 check.SetVisited(duplicateOffset); 2418 if (!isKnownFragment && isFragmentNode) 2419 check.SetVisitedFragment(duplicateOffset); 2420 2421 duplicate_array* array; 2422 int32 minSize; 2423 int32 maxSize; 2424 if (isFragmentNode) { 2425 array = node->FragmentAt( 2426 bplustree_node::FragmentIndex(childOffset)); 2427 minSize = 2; 2428 maxSize = NUM_FRAGMENT_VALUES; 2429 } else { 2430 array = node->DuplicateArray(); 2431 minSize = 1; 2432 maxSize = NUM_DUPLICATE_VALUES; 2433 } 2434 int32 arrayCount = array->Count(); 2435 2436 if (arrayCount < minSize || arrayCount > maxSize) { 2437 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2438 " has invalid array size %" B_PRId32 "!\n", 2439 fStream->ID(), duplicateOffset, arrayCount); 2440 check.FoundError(); 2441 } else { 2442 // Simple check if the values in the array may be valid 2443 for (int32 j = 0; j < arrayCount; j++) { 2444 if (!fStream->GetVolume()->IsValidInodeBlock( 2445 array->ValueAt(j))) { 2446 dprintf("inode %" B_PRIdOFF ": duplicate at %" 2447 B_PRIdOFF " contains invalid block %" B_PRIdOFF 2448 " at %" B_PRId32 "!\n", fStream->ID(), 2449 duplicateOffset, array->ValueAt(j), j); 2450 check.FoundError(); 2451 break; 2452 } 2453 } 2454 } 2455 2456 // A fragment node is not linked (and does not have valid links) 2457 if (isFragmentNode) 2458 break; 2459 2460 if (node->LeftLink() != lastDuplicateOffset) { 2461 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF 2462 " has wrong left link %" B_PRIdOFF ", expected %" 2463 B_PRIdOFF "!\n", fStream->ID(), duplicateOffset, 2464 node->LeftLink(), lastDuplicateOffset); 2465 check.FoundError(); 2466 } 2467 2468 lastDuplicateOffset = duplicateOffset; 2469 duplicateOffset = node->RightLink(); 2470 } 2471 } else if (!parent->IsLeaf()) { 2472 // Test a regular child node recursively 2473 off_t nextOffset = parent->OverflowLink(); 2474 if (i < count - 1) 2475 nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]); 2476 2477 if (i == 0 && lastOffset != BPLUSTREE_NULL) { 2478 // Test right link of the previous node 2479 const bplustree_node* previous = cached.SetTo(lastOffset, true); 2480 if (previous == NULL) 2481 return B_IO_ERROR; 2482 2483 if (previous->RightLink() != childOffset) { 2484 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2485 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2486 "!\n", fStream->ID(), lastOffset, previous->RightLink(), 2487 childOffset); 2488 check.FoundError(); 2489 } 2490 } 2491 2492 status_t status = _ValidateChild(check, cached, level, childOffset, 2493 lastOffset, nextOffset, key, keyLength); 2494 if (status != B_OK) 2495 return status; 2496 } else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) { 2497 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains " 2498 "invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n", 2499 fStream->ID(), offset, childOffset, i); 2500 check.FoundError(); 2501 } 2502 2503 lastOffset = childOffset; 2504 } 2505 2506 if (parent->OverflowLink() != BPLUSTREE_NULL) { 2507 off_t childOffset = parent->OverflowLink(); 2508 status_t status = _ValidateChild(check, cached, level, childOffset, 2509 lastOffset, 0, NULL, 0); 2510 if (status != B_OK) 2511 return status; 2512 2513 lastOffset = childOffset; 2514 } 2515 2516 check.SetPreviousOffset(level, lastOffset); 2517 return B_OK; 2518 } 2519 2520 2521 status_t 2522 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level, 2523 off_t offset, off_t lastOffset, off_t nextOffset, 2524 const uint8* key, uint16 keyLength) 2525 { 2526 const bplustree_node* node; 2527 status_t status = cached.SetTo(offset, &node, true); 2528 if (status != B_OK) { 2529 if (status == B_IO_ERROR) 2530 return B_IO_ERROR; 2531 2532 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be " 2533 "read: %s\n", fStream->ID(), offset, strerror(status)); 2534 check.FoundError(); 2535 return B_OK; 2536 } 2537 2538 if (node->LeftLink() != lastOffset) { 2539 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2540 "wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF 2541 "!\n", fStream->ID(), offset, node->LeftLink(), lastOffset); 2542 check.FoundError(); 2543 } 2544 2545 if (nextOffset != 0 && node->RightLink() != nextOffset) { 2546 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has " 2547 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF 2548 "!\n", fStream->ID(), offset, node->RightLink(), nextOffset); 2549 check.FoundError(); 2550 } 2551 2552 return _ValidateChildren(check, level + 1, offset, key, keyLength, node); 2553 } 2554 #endif // !_BOOT_MODE 2555 2556 2557 // #pragma mark - 2558 2559 2560 TreeIterator::TreeIterator(BPlusTree* tree) 2561 : 2562 fTree(tree), 2563 fCurrentNodeOffset(BPLUSTREE_NULL) 2564 { 2565 #if !_BOOT_MODE 2566 tree->_AddIterator(this); 2567 #endif 2568 } 2569 2570 2571 TreeIterator::~TreeIterator() 2572 { 2573 #if !_BOOT_MODE 2574 if (fTree) 2575 fTree->_RemoveIterator(this); 2576 #endif 2577 } 2578 2579 2580 status_t 2581 TreeIterator::Goto(int8 to) 2582 { 2583 if (fTree == NULL || fTree->fStream == NULL) 2584 RETURN_ERROR(B_BAD_VALUE); 2585 2586 #if !_BOOT_MODE 2587 // lock access to stream 2588 InodeReadLocker locker(fTree->fStream); 2589 #endif 2590 2591 off_t nodeOffset = fTree->fHeader.RootNode(); 2592 CachedNode cached(fTree); 2593 const bplustree_node* node; 2594 2595 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2596 // is the node a leaf node? 2597 if (node->OverflowLink() == BPLUSTREE_NULL) { 2598 fCurrentNodeOffset = nodeOffset; 2599 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 2600 fDuplicateNode = BPLUSTREE_NULL; 2601 2602 return B_OK; 2603 } 2604 2605 // get the next node offset depending on the direction (and if there 2606 // are any keys in that node at all) 2607 off_t nextOffset; 2608 if (to == BPLUSTREE_END || node->all_key_count == 0) 2609 nextOffset = node->OverflowLink(); 2610 else { 2611 if (node->AllKeyLength() > fTree->fNodeSize 2612 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 2613 - 8 * node->NumKeys()) 2614 RETURN_ERROR(B_ERROR); 2615 2616 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 2617 } 2618 if (nextOffset == nodeOffset) 2619 break; 2620 2621 nodeOffset = nextOffset; 2622 } 2623 FATAL(("%s fails\n", __FUNCTION__)); 2624 2625 RETURN_ERROR(B_ERROR); 2626 } 2627 2628 2629 /*! Iterates through the tree in the specified direction. 2630 When it iterates through duplicates, the "key" is only updated for the 2631 first entry - if you need to know when this happens, use the "duplicate" 2632 parameter which is 0 for no duplicate, 1 for the first, and 2 for all 2633 the other duplicates. 2634 That's not too nice, but saves the 256 bytes that would be needed to 2635 store the last key - if this will ever become an issue, it will be 2636 easy to change. 2637 The other advantage of this is, that the queries can skip all duplicates 2638 at once when they are not relevant to them. 2639 */ 2640 status_t 2641 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength, 2642 uint16 maxLength, off_t* value, uint16* duplicate) 2643 { 2644 if (fTree == NULL) 2645 return B_INTERRUPTED; 2646 2647 bool forward = direction == BPLUSTREE_FORWARD; 2648 2649 if (fCurrentNodeOffset == BPLUSTREE_NULL 2650 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK) 2651 RETURN_ERROR(B_ERROR); 2652 2653 // if the tree was emptied since the last call 2654 if (fCurrentNodeOffset == BPLUSTREE_FREE) 2655 return B_ENTRY_NOT_FOUND; 2656 2657 #if !_BOOT_MODE 2658 // lock access to stream 2659 InodeReadLocker locker(fTree->fStream); 2660 #endif 2661 2662 CachedNode cached(fTree); 2663 const bplustree_node* node; 2664 2665 if (fDuplicateNode != BPLUSTREE_NULL) { 2666 // regardless of traverse direction the duplicates are always presented 2667 // in the same order; since they are all considered as equal, this 2668 // shouldn't cause any problems 2669 2670 if (!fIsFragment || fDuplicate < fNumDuplicates) { 2671 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2672 false); 2673 } else 2674 node = NULL; 2675 2676 if (node != NULL) { 2677 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 2678 // If the node is out of duplicates, we go directly to the next 2679 // one 2680 fDuplicateNode = node->RightLink(); 2681 if (fDuplicateNode != BPLUSTREE_NULL 2682 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 2683 fNumDuplicates = node->CountDuplicates(fDuplicateNode, 2684 false); 2685 fDuplicate = 0; 2686 } 2687 } 2688 if (fDuplicate < fNumDuplicates) { 2689 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, 2690 fDuplicate++); 2691 if (duplicate) 2692 *duplicate = 2; 2693 return B_OK; 2694 } 2695 } 2696 fDuplicateNode = BPLUSTREE_NULL; 2697 } 2698 2699 off_t savedNodeOffset = fCurrentNodeOffset; 2700 int32 savedKey = fCurrentKey; 2701 2702 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 2703 RETURN_ERROR(B_ERROR); 2704 2705 if (duplicate) 2706 *duplicate = 0; 2707 2708 fCurrentKey += direction; 2709 2710 // is the current key in the current node? 2711 while ((forward && fCurrentKey >= node->NumKeys()) 2712 || (!forward && fCurrentKey < 0)) { 2713 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2714 2715 // are there any more nodes? 2716 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2717 node = cached.SetTo(fCurrentNodeOffset); 2718 if (!node) 2719 RETURN_ERROR(B_ERROR); 2720 2721 // reset current key 2722 fCurrentKey = forward ? 0 : node->NumKeys() - 1; 2723 } else { 2724 // there are no nodes left, so turn back to the last key 2725 fCurrentNodeOffset = savedNodeOffset; 2726 fCurrentKey = savedKey; 2727 2728 return B_ENTRY_NOT_FOUND; 2729 } 2730 } 2731 2732 if (node->all_key_count == 0) 2733 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2734 2735 uint16 length = 0; 2736 uint8* keyStart = node->KeyAt(fCurrentKey, &length); 2737 if (keyStart + length + sizeof(off_t) + sizeof(uint16) 2738 > (uint8*)node + fTree->fNodeSize 2739 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2740 #if !_BOOT_MODE 2741 fTree->fStream->GetVolume()->Panic(); 2742 #endif 2743 RETURN_ERROR(B_BAD_DATA); 2744 } 2745 2746 // include the termination for string types 2747 bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE; 2748 if (length + (needsTermination ? 1 : 0) > maxLength) { 2749 // the buffer is too small, restore the last key and return an error 2750 fCurrentNodeOffset = savedNodeOffset; 2751 fCurrentKey = savedKey; 2752 return B_BUFFER_OVERFLOW; 2753 } 2754 2755 memcpy(key, keyStart, length); 2756 2757 if (needsTermination) 2758 ((char*)key)[length] = '\0'; 2759 2760 *keyLength = length; 2761 2762 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2763 2764 // duplicate fragments? 2765 uint8 type = bplustree_node::LinkType(offset); 2766 if (type == BPLUSTREE_DUPLICATE_FRAGMENT 2767 || type == BPLUSTREE_DUPLICATE_NODE) { 2768 fDuplicateNode = offset; 2769 2770 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), 2771 false); 2772 if (node == NULL) 2773 RETURN_ERROR(B_ERROR); 2774 2775 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 2776 2777 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 2778 if (fNumDuplicates) { 2779 offset = node->DuplicateAt(offset, fIsFragment, 0); 2780 fDuplicate = 1; 2781 if (duplicate) 2782 *duplicate = 1; 2783 } else { 2784 // Shouldn't happen, but we're dealing here with potentially 2785 // corrupt disks... 2786 fDuplicateNode = BPLUSTREE_NULL; 2787 offset = 0; 2788 } 2789 } 2790 *value = offset; 2791 2792 return B_OK; 2793 } 2794 2795 2796 /*! This is more or less a copy of BPlusTree::Find() - but it just 2797 sets the current position in the iterator, regardless of if the 2798 key could be found or not. 2799 */ 2800 status_t 2801 TreeIterator::Find(const uint8* key, uint16 keyLength) 2802 { 2803 if (fTree == NULL) 2804 return B_INTERRUPTED; 2805 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 2806 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2807 || key == NULL) 2808 RETURN_ERROR(B_BAD_VALUE); 2809 2810 #if !_BOOT_MODE 2811 // lock access to stream 2812 InodeReadLocker locker(fTree->fStream); 2813 #endif 2814 2815 off_t nodeOffset = fTree->fHeader.RootNode(); 2816 2817 CachedNode cached(fTree); 2818 const bplustree_node* node; 2819 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2820 uint16 keyIndex = 0; 2821 off_t nextOffset; 2822 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, 2823 &nextOffset); 2824 2825 if (node->OverflowLink() == BPLUSTREE_NULL) { 2826 fCurrentNodeOffset = nodeOffset; 2827 fCurrentKey = keyIndex - 1; 2828 fDuplicateNode = BPLUSTREE_NULL; 2829 2830 return status; 2831 } else if (nextOffset == nodeOffset) 2832 RETURN_ERROR(B_ERROR); 2833 2834 nodeOffset = nextOffset; 2835 } 2836 RETURN_ERROR(B_ERROR); 2837 } 2838 2839 2840 void 2841 TreeIterator::SkipDuplicates() 2842 { 2843 fDuplicateNode = BPLUSTREE_NULL; 2844 } 2845 2846 2847 void 2848 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, 2849 uint16 splitAt, int8 change) 2850 { 2851 if (offset != fCurrentNodeOffset) 2852 return; 2853 2854 if (nextOffset != BPLUSTREE_NULL) { 2855 fCurrentNodeOffset = nextOffset; 2856 if (splitAt <= fCurrentKey) { 2857 fCurrentKey -= splitAt; 2858 keyIndex -= splitAt; 2859 } 2860 } 2861 2862 // Adjust fCurrentKey to point to the same key as before. 2863 // Note, that if a key is inserted at the current position 2864 // it won't be included in this tree transition. 2865 if (keyIndex <= fCurrentKey) 2866 fCurrentKey += change; 2867 2868 // TODO: duplicate handling! 2869 } 2870 2871 2872 void 2873 TreeIterator::Stop() 2874 { 2875 fTree = NULL; 2876 } 2877 2878 2879 #ifdef DEBUG 2880 void 2881 TreeIterator::Dump() 2882 { 2883 __out("TreeIterator at %p:\n", this); 2884 __out("\tfTree = %p\n", fTree); 2885 __out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset); 2886 __out("\tfCurrentKey = %d\n", (int)fCurrentKey); 2887 __out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n", 2888 bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, 2889 fDuplicateNode); 2890 __out("\tfDuplicate = %u\n", fDuplicate); 2891 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2892 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); 2893 } 2894 #endif 2895 2896 2897 // #pragma mark - 2898 2899 2900 bool 2901 bplustree_header::IsValid() const 2902 { 2903 return Magic() == BPLUSTREE_MAGIC 2904 && (RootNode() % NodeSize()) == 0 2905 && IsValidLink(RootNode()) 2906 && IsValidLink(FreeNode()); 2907 } 2908 2909 2910 // #pragma mark - 2911 2912 2913 void 2914 bplustree_node::Initialize() 2915 { 2916 left_link = right_link = overflow_link 2917 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2918 all_key_count = 0; 2919 all_key_length = 0; 2920 } 2921 2922 2923 uint8* 2924 bplustree_node::KeyAt(int32 index, uint16* keyLength) const 2925 { 2926 if (index < 0 || index > NumKeys()) 2927 return NULL; 2928 2929 uint8* keyStart = Keys(); 2930 uint16* keyLengths = KeyLengths(); 2931 2932 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2933 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2934 if (index > 0) 2935 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2936 2937 return keyStart; 2938 } 2939 2940 2941 uint8 2942 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2943 { 2944 // the duplicate fragment handling is currently hard-coded to a node size 2945 // of 1024 bytes - with future versions of BFS, this may be a problem 2946 2947 if (isFragment) { 2948 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2949 2950 return ((off_t*)this)[fragment]; 2951 } 2952 return OverflowLink(); 2953 } 2954 2955 2956 off_t 2957 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2958 { 2959 uint32 start; 2960 if (isFragment) 2961 start = 8 * ((uint64)offset & 0x3ff); 2962 else 2963 start = 2; 2964 2965 return ((off_t*)this)[start + 1 + index]; 2966 } 2967 2968 2969 /*! Although the name suggests it, this function doesn't return the real 2970 used fragment count; at least, it can only count to two: it returns 2971 0, if there is no fragment used, 1 if there is only one fragment 2972 used, and 2 if there are at least 2 fragments used. 2973 */ 2974 uint32 2975 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2976 { 2977 uint32 used = 0; 2978 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2979 duplicate_array* array = FragmentAt(i); 2980 if (array->Count() > 0 && ++used > 1) 2981 return used; 2982 } 2983 return used; 2984 } 2985 2986 2987 status_t 2988 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2989 { 2990 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2991 DEBUGGER(("invalid node: key/length count")); 2992 2993 for (int32 i = 0; i < NumKeys(); i++) { 2994 uint16 length = 0; 2995 uint8* key = KeyAt(i, &length); 2996 if (key + length + sizeof(off_t) + sizeof(uint16) 2997 > (uint8*)this + nodeSize 2998 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2999 dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i); 3000 return B_BAD_DATA; 3001 } 3002 if (Values()[i] == -1) { 3003 dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values " 3004 "corrupted\n", this, (int)i, Values()[i]); 3005 return B_BAD_DATA; 3006 } 3007 } 3008 return B_OK; 3009 } 3010 3011 3012 // #pragma mark - 3013 3014 3015 #if !_BOOT_MODE 3016 BitmapArray::BitmapArray(size_t numBits) 3017 { 3018 fSize = (numBits + 7) / 8; 3019 fBitmap = (uint8*)calloc(fSize, 1); 3020 fCountSet = 0; 3021 } 3022 3023 3024 BitmapArray::~BitmapArray() 3025 { 3026 free(fBitmap); 3027 } 3028 3029 3030 status_t 3031 BitmapArray::InitCheck() const 3032 { 3033 return fBitmap != NULL ? B_OK : B_NO_MEMORY; 3034 } 3035 3036 3037 bool 3038 BitmapArray::IsSet(size_t index) const 3039 { 3040 uint32 byteIndex = index / 8; 3041 if (byteIndex >= fSize) 3042 return false; 3043 3044 return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0; 3045 } 3046 3047 3048 void 3049 BitmapArray::Set(size_t index, bool set) 3050 { 3051 uint32 byteIndex = index / 8; 3052 if (byteIndex >= fSize) 3053 return; 3054 3055 if (set) { 3056 fBitmap[byteIndex] |= 1UL << (index & 0x7); 3057 fCountSet++; 3058 } else { 3059 fBitmap[byteIndex] &= ~(1UL << (index & 0x7)); 3060 fCountSet--; 3061 } 3062 } 3063 #endif // !_BOOT_MODE 3064 3065 3066 // #pragma mark - 3067 3068 3069 bool 3070 duplicate_array::_FindInternal(off_t value, int32& index) const 3071 { 3072 int32 min = 0, max = Count() - 1; 3073 off_t cmp; 3074 while (min <= max) { 3075 index = (min + max) / 2; 3076 3077 cmp = ValueAt(index) - value; 3078 if (cmp < 0) 3079 min = index + 1; 3080 else if (cmp > 0) 3081 max = index - 1; 3082 else 3083 return true; 3084 } 3085 return false; 3086 } 3087 3088 3089 void 3090 duplicate_array::Insert(off_t value) 3091 { 3092 // if there are more than 8 values in this array, use a 3093 // binary search, if not, just iterate linearly to find 3094 // the insertion point 3095 int32 size = Count(); 3096 int32 i = 0; 3097 if (size > 8 ) { 3098 if (!_FindInternal(value, i) && ValueAt(i) <= value) 3099 i++; 3100 } else { 3101 for (i = 0; i < size; i++) { 3102 if (ValueAt(i) > value) 3103 break; 3104 } 3105 } 3106 3107 memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t)); 3108 values[i] = HOST_ENDIAN_TO_BFS_INT64(value); 3109 count = HOST_ENDIAN_TO_BFS_INT64(size + 1); 3110 } 3111 3112 3113 bool 3114 duplicate_array::Remove(off_t value) 3115 { 3116 int32 index = Find(value); 3117 if (index == -1) 3118 return false; 3119 3120 int32 newSize = Count() - 1; 3121 memmove(&values[index], &values[index + 1], 3122 (newSize - index) * sizeof(off_t)); 3123 count = HOST_ENDIAN_TO_BFS_INT64(newSize); 3124 3125 return true; 3126 } 3127 3128 3129 #if _BOOT_MODE 3130 } // namespace BFS 3131 #endif 3132