/*
 * Copyright 2001-2008, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 *
 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
 * no code but achieves binary compatibility with the on disk format.
 */

//! B+Tree implementation


#include "Debug.h"
#include "BPlusTree.h"
#include "Inode.h"
#include "Utility.h"


#ifdef DEBUG
/*!	Debug-only RAII helper: runs bplustree_node::CheckIntegrity() on the
	given node both when the checker is constructed and when it goes out
	of scope, printing \a text plus a message via dprintf() on failure.
	Place one at the top of a function to catch corruption caused within
	that function's scope.
*/
class NodeChecker {
public:
	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
		:
		fNode(node),
		fSize(nodeSize),
		fText(text)
	{
		Check("integrity check failed on construction.");
	}

	~NodeChecker()
	{
		Check("integrity check failed on destruction.");
	}

	void
	Check(const char* message)
	{
		// only reports - it does not abort or return an error to the caller
		if (fNode->CheckIntegrity(fSize) < B_OK)
			dprintf("%s: %s\n", fText, message);
	}

private:
	const bplustree_node* fNode;	// node under observation (not owned)
	int32 fSize;					// node size used for the integrity check
	const char* fText;				// context string prefixed to reports
};
#endif


// Node Caching for the BPlusTree class
//
// With write support, there is the need for a function that allocates new
// nodes by either returning empty nodes, or by growing the file's data stream
//
// !! The CachedNode class assumes that you have properly locked the stream
// !! before asking for nodes.
//
// Note: This code will fail if the block size is smaller than the node size!
// Since BFS supports block sizes of 1024 bytes or greater, and the node size
// is hard-coded to 1024 bytes, that's not an issue now.

/*!	Releases the cached block and reverts its dirty flag in the given
	transaction, i.e. the node is treated as if it had not been modified.
*/
void
CachedNode::UnsetUnchanged(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		void* cache = fTree->fStream->GetVolume()->BlockCache();

		// mark the block not dirty for this transaction before releasing it
		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
		block_cache_put(cache, fBlockNumber);
		fNode = NULL;
	}
}


/*!	Releases the cached block back to the block cache, if any is held. */
void
CachedNode::Unset()
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
			fBlockNumber);
		fNode = NULL;
	}
}


/*!	Sets the cache to the read-only node at \a offset within the tree
	stream. If \a check is true, the node is additionally run through the
	header's sanity checks. Returns NULL on any failure.
*/
const bplustree_node*
CachedNode::SetTo(off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(NULL, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


/*!	Same as SetTo(), but fetches the node writable within the given
	transaction.
*/
bplustree_node*
CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(&transaction, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


/*!	Upgrades the currently cached (read-only) node to writable within the
	given transaction. Returns the node, or NULL if nothing is cached or
	the block cache refuses.
*/
bplustree_node*
CachedNode::MakeWritable(Transaction& transaction)
{
	if (fNode == NULL)
		return NULL;

	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
			fBlockNumber, transaction.ID()) == B_OK)
		return fNode;

	return NULL;
}


/*!	Like MakeWritable(), but casts the result to the tree header (which is
	stored in the first node of the stream).
*/
bplustree_header*
CachedNode::MakeWritableHeader(Transaction& transaction)
{
	return (bplustree_header*)MakeWritable(transaction);
}


/*!	Sets the cache to the read-only tree header at stream offset 0. */
const bplustree_header*
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(NULL, 0LL);
	return (bplustree_header*)fNode;
}


/*!	Sets the cache to the writable tree header at stream offset 0. */
bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(&transaction, 0LL);
	return (bplustree_header*)fNode;
}


/*!	Maps the tree stream offset \a offset to its on-disk block and gets
	that block from the block cache - writable within \a transaction if
	one is given, read-only otherwise. Sets fNode/fOffset/fBlockNumber/
	fWritable accordingly; fNode stays NULL on failure.
*/
bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume* volume = fTree->fStream->GetVolume();

		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8* block;

		if (transaction != NULL) {
			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// The node is somewhere in that block...
			// (confusing offset calculation)
			fNode = (bplustree_node*)(block + offset
				- (fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}


/*!	Puts the currently cached node (which must live at \a offset) onto the
	tree's free node list; the node stays allocated in the stream and can
	be recycled by Allocate().
*/
status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// TODO: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done everytime that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header* header
		= fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

#if 0
	// TODO: temporarily disabled because CheckNode() doesn't like this...
	// Also, it's such an edge case that it's almost useless, anyway.
	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}
#endif

	// add the node to the free nodes list
	// (the node's left_link becomes the next-free pointer, and its
	// overflow_link marks it as BPLUSTREE_FREE)
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}


/*!	Hands out a fresh, initialized node: first tries to recycle one from
	the free node list; if that list is empty, the stream is grown by one
	node size and the header's maximum_size updated accordingly. On
	success the new node and its tree offset are returned through
	\a _node and \a _offset.
*/
status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
	off_t* _offset)
{
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header* header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode* stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		- fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}


// #pragma mark -


/*!	Creates a new tree on \a stream within \a transaction (see the
	two-argument SetTo() below).
*/
BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(transaction, stream);
}


/*!	Opens an existing tree from \a stream (see SetTo(Inode*) below). */
BPlusTree::BPlusTree(Inode* stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(stream);
}


/*!	Constructs an uninitialized tree; InitCheck() reports B_NO_INIT until
	one of the SetTo() variants succeeds.
*/
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
}


BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	mutex_lock(&fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Stop();

	mutex_destroy(&fIteratorLock);
}


/*! Create a new B+Tree on the specified stream */
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header* header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	// duplicates are allowed for indices (unless it's the root index
	// directory itself), or when S_ALLOW_DUPS is set explicitly
	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
		&& stream->BlockRun() != stream->Parent())
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}


/*!	Opens an existing B+Tree from \a stream, validating the on-disk
	header (magic, size, links, and data type against the inode's index
	mode) before use. Sets and returns fStatus.
*/
status_t
BPlusTree::SetTo(Inode* stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?

	if (fHeader->MaximumSize() != stream->Size()) {
		dprintf("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size());
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char*)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	{
		// the order of this table must match the BPLUSTREE_*_TYPE constants
		// (it is indexed by fHeader->DataType())
		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
			S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
			S_DOUBLE_INDEX};
		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
			| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
			| S_FLOAT_INDEX | S_DOUBLE_INDEX);

		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| ((stream->Mode() & S_INDEX_DIR) != 0
				&& toMode[fHeader->DataType()] != mode)
			|| !stream->IsContainer()) {
			D(	dump_bplustree_header(fHeader);
				dump_inode(&stream->Node());
			);
			RETURN_ERROR(fStatus = B_BAD_TYPE);
		}

		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
		// in the original BFS code - we will honour it nevertheless
		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
			|| (stream->Mode() & S_ALLOW_DUPS) != 0;
	}

	// make sure the root node is accessible before declaring success
	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


/*!	Returns the result of the last SetTo() (B_NO_INIT if none was run). */
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}


/*!	Maps a Be API type_code to the matching BPLUSTREE_*_TYPE key type,
	or -1 for unsupported codes.
*/
int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
	switch (code) {
		case B_STRING_TYPE:
			return BPLUSTREE_STRING_TYPE;
		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return BPLUSTREE_INT32_TYPE;
		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			return BPLUSTREE_UINT32_TYPE;
		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			return BPLUSTREE_INT64_TYPE;
		case B_UINT64_TYPE:
			return BPLUSTREE_UINT64_TYPE;
		case B_FLOAT_TYPE:
			return BPLUSTREE_FLOAT_TYPE;
		case B_DOUBLE_TYPE:
			return BPLUSTREE_DOUBLE_TYPE;
	}
	return -1;
}


/*!	Maps an inode's index mode bits to the matching BPLUSTREE_*_TYPE key
	type; plain directories (no index bits) fall through to string keys.
*/
int32
BPlusTree::ModeToKeyType(mode_t mode)
{
	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
		case S_INT_INDEX:
			return BPLUSTREE_INT32_TYPE;
		case S_UINT_INDEX:
			return BPLUSTREE_UINT32_TYPE;
		case S_LONG_LONG_INDEX:
			return BPLUSTREE_INT64_TYPE;
		case S_ULONG_LONG_INDEX:
			return BPLUSTREE_UINT64_TYPE;
		case S_FLOAT_INDEX:
			return BPLUSTREE_FLOAT_TYPE;
		case S_DOUBLE_INDEX:
			return BPLUSTREE_DOUBLE_TYPE;
		case S_STR_INDEX:
		default:
			// default is for standard directories
			return BPLUSTREE_STRING_TYPE;
	}
}


// #pragma mark -


/*!	Notifies all registered TreeIterators about a structural change
	(insert/remove/split) so they can adjust their positions.
*/
void
BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// Although every iterator which is affected by this update currently
	// waits on a semaphore, other iterators could be added/removed at
	// any time, so we need to protect this loop
	MutexLocker _(fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
}


void
BPlusTree::_AddIterator(TreeIterator* iterator)
{
	// registers the iterator so it receives _UpdateIterators() callbacks
	MutexLocker _(fIteratorLock);
	fIterators.Add(iterator);
}


void
BPlusTree::_RemoveIterator(TreeIterator* iterator)
{
	MutexLocker _(fIteratorLock);
	fIterators.Remove(iterator);
}


/*!	Compares two keys using the comparison function that belongs to the
	tree's on-disk data type (see compareKeys()).
*/
int32
BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
	int keyLength2)
{
	type_code type = 0;
	switch (fHeader->data_type) {
		case BPLUSTREE_STRING_TYPE:
			type = B_STRING_TYPE;
			break;
		case BPLUSTREE_INT32_TYPE:
			type = B_INT32_TYPE;
			break;
		case BPLUSTREE_UINT32_TYPE:
			type = B_UINT32_TYPE;
			break;
		case BPLUSTREE_INT64_TYPE:
			type = B_INT64_TYPE;
			break;
		case BPLUSTREE_UINT64_TYPE:
			type = B_UINT64_TYPE;
			break;
		case BPLUSTREE_FLOAT_TYPE:
			type = B_FLOAT_TYPE;
			break;
		case BPLUSTREE_DOUBLE_TYPE:
			type = B_DOUBLE_TYPE;
			break;
	}
	return compareKeys(type, key1, keyLength1, key2, keyLength2);
}


/*!	Binary-searches \a node for \a key. On an exact match, B_OK is
	returned with the key's index in \a _index and its value in \a _next.
	On a miss, B_ENTRY_NOT_FOUND is returned, \a _index is the position
	where the key would have to be inserted, and \a _next is the child
	link to follow (the overflow link when the key sorts past all keys).
*/
status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
	uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
	NodeChecker checker(node, fNodeSize, "find");
#endif

	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t* values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8* searchKey = node->KeyAt(i, &searchLength);
		// guard against corrupted key data pointing outside the node
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8*)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}


/*! Prepares the stack to contain all nodes that were passed while
	following the key, from the root node to the leaf node that could
	or should contain that key.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node* node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		// a child link pointing back at this node would loop forever
		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		if ((uint32)stack.CountItems() > fHeader->MaxNumberOfLevels()) {
			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
			break;
		}

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::_SeekDown() could not open node %Ld\n",
		nodeAndKey.nodeOffset));
	return B_ERROR;
}


/*! This will find a free duplicate fragment in the given bplustree_node.
	The CachedNode will be set to the writable fragment on success.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached,
	off_t* _offset, bplustree_node** _fragment, uint32* _index)
{
	off_t* values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node* fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array* array = fragment->FragmentAt(j);

			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}


/*!	Inserts \a value as an additional (duplicate) value for the key at
	\a index in \a node. Depending on the current state of that key's
	value link, the value ends up in a duplicate fragment, an existing
	duplicate node, or a newly allocated duplicate node.
*/
status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
	const bplustree_node* node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue)
				== BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node* duplicate
				= cachedDuplicate.SetToWritable(transaction,
					bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array* array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%d] size in fragment "
					"%Ld == %Ld!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue), array->count));
				return
					B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// Test if the fragment will be empty if we remove this key's
				// values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// The node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// Create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// The old duplicate has not been touched, so we can
						// reuse it

					bplustree_node* newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// Copy the array from the fragment node to the duplicate
					// node and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
						array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// Update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index]
					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		duplicate_array* array;
		const bplustree_node* duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				// NOTE(review): message says "removeDuplicate" although we
				// are in _InsertDuplicate - looks like a copy/paste slip in
				// the log text; confirm before changing the string
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld "
					"== %Ld!\n", duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
			&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node* writableDuplicate
			= cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node* newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
				&offset);
			if (status < B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node* fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		if ((status = cachedDuplicate.Allocate(transaction, &fragment,
				&offset)) < B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	// the fragment now holds both the original and the new value
	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}


/*!	Inserts the given \a key / \a value pair at \a index into \a node,
	shifting the value, key-length, and key arrays accordingly. The
	caller must have verified that the node has enough free space.
*/
void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t* values = node->Values();
	uint16* keyLengths = node->KeyLengths();
	uint8* keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	// Values()/KeyLengths() depend on all_key_count/all_key_length, so
	// the arrays have moved now that those fields were updated
	off_t* newValues = node->Values();
	uint16* newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}


/*! Splits the \a node into two halves - the other half will be put into
It also takes care to create a new overflow link if the node 998 to split is an index node. 999 */ 1000 status_t 1001 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset, 1002 bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key, 1003 uint16* _keyLength, off_t* _value) 1004 { 1005 if (*_keyIndex > node->NumKeys() + 1) 1006 return B_BAD_VALUE; 1007 1008 uint16* inKeyLengths = node->KeyLengths(); 1009 off_t* inKeyValues = node->Values(); 1010 uint8* inKeys = node->Keys(); 1011 uint8* outKeys = other->Keys(); 1012 int32 keyIndex = *_keyIndex; // can become less than zero! 1013 1014 if (keyIndex > node->NumKeys()) { 1015 FATAL(("key index out of bounds: %d, num keys: %u\n", (int)keyIndex, 1016 node->NumKeys())); 1017 return B_BAD_VALUE; 1018 } 1019 1020 // how many keys will fit in one (half) page? 1021 // that loop will find the answer to this question and 1022 // change the key lengths indices for their new home 1023 1024 // "bytes" is the number of bytes written for the new key, 1025 // "bytesBefore" are the bytes before that key 1026 // "bytesAfter" are the bytes after the new key, if any 1027 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 1028 1029 size_t size = fNodeSize >> 1; 1030 int32 out, in; 1031 for (in = out = 0; in < node->NumKeys() + 1;) { 1032 if (!bytes) { 1033 bytesBefore = in > 0 1034 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1035 } 1036 1037 if (in == keyIndex && !bytes) { 1038 bytes = *_keyLength; 1039 } else { 1040 if (keyIndex < out) { 1041 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1042 - bytesBefore; 1043 } 1044 1045 in++; 1046 } 1047 out++; 1048 1049 if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) 1050 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1051 // we have found the number of keys in the new node! 
1052 break; 1053 } 1054 } 1055 1056 // if the new key was not inserted, set the length of the keys 1057 // that can be copied directly 1058 if (keyIndex >= out && in > 0) 1059 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); 1060 1061 if (bytesBefore < 0 || bytesAfter < 0) 1062 return B_BAD_DATA; 1063 1064 other->left_link = node->left_link; 1065 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1066 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1067 + bytesAfter); 1068 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1069 1070 uint16* outKeyLengths = other->KeyLengths(); 1071 off_t* outKeyValues = other->Values(); 1072 int32 keys = out > keyIndex ? keyIndex : out; 1073 1074 if (bytesBefore) { 1075 // copy the keys 1076 memcpy(outKeys, inKeys, bytesBefore); 1077 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1078 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1079 } 1080 if (bytes) { 1081 // copy the newly inserted key 1082 memcpy(outKeys + bytesBefore, key, bytes); 1083 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1084 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1085 1086 if (bytesAfter) { 1087 // copy the keys after the new key 1088 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, 1089 bytesAfter); 1090 keys = out - keyIndex - 1; 1091 for (int32 i = 0;i < keys;i++) { 1092 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( 1093 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) 1094 + bytes); 1095 } 1096 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, 1097 keys * sizeof(off_t)); 1098 } 1099 } 1100 1101 // if the new key was already inserted, we shouldn't use it again 1102 if (in != out) 1103 keyIndex--; 1104 1105 int32 total = bytesBefore + bytesAfter; 1106 1107 // these variables are for the key that will be returned 1108 // to the parent node 1109 uint8* newKey = NULL; 1110 uint16 newLength; 1111 bool newAllocated 
= false; 1112 1113 // If we have split an index node, we have to drop the first key 1114 // of the next node (which can also be the new key to insert). 1115 // The dropped key is also the one which has to be inserted in 1116 // the parent node, so we will set the "newKey" already here. 1117 if (node->OverflowLink() != BPLUSTREE_NULL) { 1118 if (in == keyIndex) { 1119 newKey = key; 1120 newLength = *_keyLength; 1121 1122 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1123 keyIndex--; 1124 } else { 1125 // If a key is dropped (is not the new key), we have to copy 1126 // it, because it would be lost if not. 1127 uint8* droppedKey = node->KeyAt(in, &newLength); 1128 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) 1129 > (uint8*)node + fNodeSize 1130 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1131 fStream->GetVolume()->Panic(); 1132 RETURN_ERROR(B_BAD_DATA); 1133 } 1134 newKey = (uint8*)malloc(newLength); 1135 if (newKey == NULL) 1136 return B_NO_MEMORY; 1137 1138 newAllocated = true; 1139 memcpy(newKey, droppedKey, newLength); 1140 1141 other->overflow_link = inKeyValues[in]; 1142 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); 1143 } 1144 } 1145 1146 // and now the same game for the other page and the rest of the keys 1147 // (but with memmove() instead of memcpy(), because they may overlap) 1148 1149 bytesBefore = bytesAfter = bytes = 0; 1150 out = 0; 1151 int32 skip = in; 1152 while (in < node->NumKeys() + 1) { 1153 if (in == keyIndex && !bytes) { 1154 // it's enough to set bytesBefore once here, because we do 1155 // not need to know the exact length of all keys in this 1156 // loop 1157 bytesBefore = in > skip 1158 ? 
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1159 bytes = *_keyLength; 1160 out++; 1161 } else { 1162 if (in < node->NumKeys()) { 1163 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1164 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); 1165 1166 if (bytes) { 1167 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1168 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); 1169 1170 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1171 - bytesBefore - bytes; 1172 } 1173 out++; 1174 } 1175 in++; 1176 } 1177 } 1178 1179 // adjust the byte counts (since we were a bit lazy in the loop) 1180 if (keyIndex < skip) 1181 bytesBefore = node->AllKeyLength() - total; 1182 1183 if (bytesBefore < 0 || bytesAfter < 0) { 1184 if (newAllocated) 1185 free(newKey); 1186 return B_BAD_DATA; 1187 } 1188 1189 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1190 // right link, and overflow link can stay the same 1191 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1192 + bytesAfter); 1193 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1194 1195 // array positions have changed 1196 outKeyLengths = node->KeyLengths(); 1197 outKeyValues = node->Values(); 1198 1199 // move the keys in the old node: the order is important here, 1200 // because we don't want to overwrite any contents 1201 1202 keys = keyIndex <= skip ? out : keyIndex - skip; 1203 keyIndex -= skip; 1204 in = out - keyIndex - 1; 1205 // Note: keyIndex and in will contain invalid values when the new key 1206 // went to the other node. But in this case bytes and bytesAfter are 1207 // 0 and subsequently we never use keyIndex and in. 
1208 1209 if (bytesBefore) 1210 memmove(inKeys, inKeys + total, bytesBefore); 1211 if (bytesAfter) { 1212 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, 1213 bytesAfter); 1214 } 1215 1216 if (bytesBefore) 1217 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1218 if (bytesAfter) { 1219 // if byteAfter is > 0, keyIndex is larger than skip 1220 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, 1221 in * sizeof(uint16)); 1222 } 1223 1224 if (bytesBefore) 1225 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1226 if (bytesAfter) { 1227 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, 1228 in * sizeof(off_t)); 1229 } 1230 1231 if (bytes) { 1232 // finally, copy the newly inserted key (don't overwrite anything) 1233 memcpy(inKeys + bytesBefore, key, bytes); 1234 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1235 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1236 } 1237 1238 // Prepare the key that will be inserted in the parent node which 1239 // is either the dropped key or the last of the other node. 1240 // If it's the dropped key, "newKey" was already set earlier. 1241 1242 if (newKey == NULL) 1243 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1244 1245 memcpy(key, newKey, newLength); 1246 *_keyLength = newLength; 1247 *_value = otherOffset; 1248 1249 if (newAllocated) 1250 free(newKey); 1251 1252 return B_OK; 1253 } 1254 1255 1256 /*! This inserts a key into the tree. The changes made to the tree will 1257 all be part of the \a transaction. 1258 You need to have the inode write locked. 
*/
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	ASSERT_WRITE_LOCKED_INODE(fStream);

	// _SeekDown() records the path from the root to the target leaf on
	// the stack; we pop it leaf-first below so that node splits can
	// propagate upwards towards the root.
	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	// Work on a local copy of the key: _SplitNode() overwrites this buffer
	// (and keyLength/value) with the key that has to be inserted into the
	// parent node on the next loop iteration.
	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node* node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = _InsertDuplicate(transaction, cached, node,
						nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		bplustree_node* writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(key_align(sizeof(bplustree_node)
				+ writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
				+ sizeof(off_t))) < fNodeSize) {
			// there is enough room - insert here, done; no split needed,
			// so the nodes above this one remain untouched
			_InsertKey(writableNode, nodeAndKey.keyIndex,
				keyBuffer, keyLength, value);
			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
				nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node* root;
				status_t status = cachedNewRoot.Allocate(transaction, &root,
					&newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node* other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other,
				&otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			// _SplitNode() rewrites keyBuffer/keyLength/value with the
			// separator key that must be inserted into the parent node
			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
					&value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction,
					other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node* root = cachedNewRoot.Node();

				_InsertKey(root, 0, keyBuffer, keyLength,
					writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
					nodeAndKey.nodeOffset);

				bplustree_header* header
					= fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
					header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}


/*!	Removes the duplicate index/value pair from the tree.
	It's part of the private tree interface.
*/
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached, uint16 index,
	off_t value)
{
	// the value stored in the leaf is a link that encodes whether the
	// duplicates live in a fragment slot or in a full duplicate node
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array* array = duplicate->FragmentAt(
			bplustree_node::FragmentIndex(oldValue));

		// sanity check the on-disk array size before touching it
		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld "
				"== %Ld!\n", (int)bplustree_node::FragmentIndex(oldValue),
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		if (array->count == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction,
					duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array* array = NULL;

	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
			duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == "
				"%Ld!\n", duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		// nothing was changed in this node, so don't mark it dirty
		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
			false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (ie. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// This is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no
					// duplicate anymore
					values[index] = array->values[0];
				} else {
					// Move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(
						bplustree_node::MakeLink(
							BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status = cachedDuplicate.Free(transaction,
				duplicateOffset);
			if (status < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
						false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL
					&& duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
						right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a
				// normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL
					&& duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node* fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}


/*!	Removes the key with the given index from the specified node.
	Since it has to get the key from the node anyway (to obtain its
	pointer), there is no need to pass the key & its length, although
	the calling method (BPlusTree::Remove()) has this data.
*/
void
BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
{
	// should never happen, but who knows?
1611 if (index > node->NumKeys() && node->NumKeys() > 0) { 1612 FATAL(("Asked me to remove key outer limits: %u\n", index)); 1613 return; 1614 } 1615 1616 off_t* values = node->Values(); 1617 1618 // if we would have to drop the overflow link, drop 1619 // the last key instead and update the overflow link 1620 // to the value of that one 1621 if (!node->IsLeaf() && index == node->NumKeys()) 1622 node->overflow_link = values[--index]; 1623 1624 uint16 length; 1625 uint8* key = node->KeyAt(index, &length); 1626 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize 1627 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1628 FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length, 1629 (int)fStream->BlockRun().allocation_group, 1630 fStream->BlockRun().start)); 1631 fStream->GetVolume()->Panic(); 1632 return; 1633 } 1634 1635 uint16* keyLengths = node->KeyLengths(); 1636 uint8* keys = node->Keys(); 1637 1638 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 1639 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64( 1640 node->AllKeyLength() - length); 1641 1642 off_t* newValues = node->Values(); 1643 uint16* newKeyLengths = node->KeyLengths(); 1644 1645 // move key data 1646 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 1647 1648 // move and update key lengths 1649 if (index > 0 && newKeyLengths != keyLengths) 1650 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 1651 for (uint16 i = index; i < node->NumKeys(); i++) { 1652 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1653 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 1654 } 1655 1656 // move values 1657 if (index > 0) 1658 memmove(newValues, values, index * sizeof(off_t)); 1659 if (node->NumKeys() > index) { 1660 memmove(newValues + index, values + index + 1, 1661 (node->NumKeys() - index) * sizeof(off_t)); 1662 } 1663 } 1664 1665 1666 /*! Removes the specified key from the tree. 
The "value" parameter is only used 1667 for trees which allow duplicates, so you may safely ignore it. 1668 It's not an optional parameter, so at least you have to think about it. 1669 You need to have the inode write locked. 1670 */ 1671 status_t 1672 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength, 1673 off_t value) 1674 { 1675 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1676 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1677 RETURN_ERROR(B_BAD_VALUE); 1678 1679 ASSERT_WRITE_LOCKED_INODE(fStream); 1680 1681 Stack<node_and_key> stack; 1682 if (_SeekDown(stack, key, keyLength) != B_OK) 1683 RETURN_ERROR(B_ERROR); 1684 1685 node_and_key nodeAndKey; 1686 const bplustree_node* node; 1687 1688 CachedNode cached(this); 1689 while (stack.Pop(&nodeAndKey) 1690 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1691 #ifdef DEBUG 1692 NodeChecker checker(node, fNodeSize, "remove"); 1693 #endif 1694 if (node->IsLeaf()) { 1695 // first round, check for duplicate entries 1696 status_t status = _FindKey(node, key, keyLength, 1697 &nodeAndKey.keyIndex); 1698 if (status < B_OK) 1699 RETURN_ERROR(status); 1700 1701 // If we will remove the last key, the iterator will be set 1702 // to the next node after the current - if there aren't any 1703 // more nodes, we need a way to prevent the TreeIterators to 1704 // touch the old node again, we use BPLUSTREE_FREE for this 1705 off_t next = node->RightLink() == BPLUSTREE_NULL 1706 ? BPLUSTREE_FREE : node->RightLink(); 1707 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 1708 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 1709 1710 // is this a duplicate entry? 
1711 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 1712 node->Values()[nodeAndKey.keyIndex]))) { 1713 if (fAllowDuplicates) { 1714 return _RemoveDuplicate(transaction, node, cached, 1715 nodeAndKey.keyIndex, value); 1716 } else { 1717 FATAL(("dupliate node found where no duplicates are " 1718 "allowed!\n")); 1719 RETURN_ERROR(B_ERROR); 1720 } 1721 } 1722 } 1723 1724 bplustree_node* writableNode = cached.MakeWritable(transaction); 1725 if (writableNode == NULL) 1726 return B_IO_ERROR; 1727 1728 // if it's an empty root node, we have to convert it 1729 // to a leaf node by dropping the overflow link, or, 1730 // if it's already a leaf node, just empty it 1731 if (nodeAndKey.nodeOffset == fHeader->RootNode() 1732 && (node->NumKeys() == 0 1733 || (node->NumKeys() == 1 && node->IsLeaf()))) { 1734 writableNode->overflow_link 1735 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1736 writableNode->all_key_count = 0; 1737 writableNode->all_key_length = 0; 1738 1739 // if we've made a leaf node out of the root node, we need 1740 // to reset the maximum number of levels in the header 1741 if (fHeader->MaxNumberOfLevels() != 1) { 1742 bplustree_header* header 1743 = fCachedHeader.MakeWritableHeader(transaction); 1744 if (header == NULL) 1745 return B_IO_ERROR; 1746 1747 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 1748 } 1749 return B_OK; 1750 } 1751 1752 // if there is only one key left, we don't have to remove 1753 // it, we can just dump the node (index nodes still have 1754 // the overflow link, so we have to drop the last key) 1755 if (writableNode->NumKeys() > 1 1756 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { 1757 _RemoveKey(writableNode, nodeAndKey.keyIndex); 1758 return B_OK; 1759 } 1760 1761 // when we are here, we can just free the node, but 1762 // we have to update the right/left link of the 1763 // siblings first 1764 CachedNode otherCached(this); 1765 bplustree_node* other = 
otherCached.SetToWritable(transaction, 1766 writableNode->LeftLink()); 1767 if (other != NULL) 1768 other->right_link = writableNode->right_link; 1769 1770 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) 1771 != NULL) { 1772 other->left_link = writableNode->left_link; 1773 } 1774 1775 cached.Free(transaction, nodeAndKey.nodeOffset); 1776 } 1777 RETURN_ERROR(B_ERROR); 1778 } 1779 1780 1781 /*! Replaces the value for the key in the tree. 1782 Returns B_OK if the key could be found and its value replaced, 1783 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 1784 to indicate that something went terribly wrong. 1785 Note that this doesn't work with duplicates - it will just 1786 return B_BAD_TYPE if you call this function on a tree where 1787 duplicates are allowed. 1788 You need to have the inode write locked. 1789 */ 1790 status_t 1791 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, 1792 off_t value) 1793 { 1794 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1795 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1796 || key == NULL) 1797 RETURN_ERROR(B_BAD_VALUE); 1798 1799 if (fAllowDuplicates) 1800 RETURN_ERROR(B_BAD_TYPE); 1801 1802 ASSERT_WRITE_LOCKED_INODE(fStream); 1803 1804 off_t nodeOffset = fHeader->RootNode(); 1805 CachedNode cached(this); 1806 const bplustree_node* node; 1807 1808 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1809 uint16 keyIndex = 0; 1810 off_t nextOffset; 1811 status_t status = _FindKey(node, key, keyLength, &keyIndex, 1812 &nextOffset); 1813 1814 if (node->OverflowLink() == BPLUSTREE_NULL) { 1815 if (status == B_OK) { 1816 bplustree_node* writableNode = cached.MakeWritable(transaction); 1817 if (writableNode != NULL) { 1818 writableNode->Values()[keyIndex] 1819 = HOST_ENDIAN_TO_BFS_INT64(value); 1820 } else 1821 status = B_IO_ERROR; 1822 } 1823 1824 return status; 1825 } else if (nextOffset == nodeOffset) 1826 RETURN_ERROR(B_ERROR); 1827 1828 nodeOffset = nextOffset; 1829 } 
	RETURN_ERROR(B_ERROR);
}


/*!	Searches the key in the tree, and stores the offset found in _value,
	if successful.
	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
	while it descends the tree.
	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
	It can also return other errors to indicate that something went wrong.
	Note that this doesn't work with duplicates - it will just return
	B_BAD_TYPE if you call this function on a tree where duplicates are
	allowed.
	You need to have the inode read or write locked.
*/
status_t
BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	ASSERT_READ_LOCKED_INODE(fStream);

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	const bplustree_node* node;

#ifdef DEBUG
	int32 levels = 0;
#endif

	// descend from the root; a node without overflow link is a leaf
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &keyIndex,
			&nextOffset);

#ifdef DEBUG
		levels++;
#endif
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			if (status == B_OK && _value != NULL)
				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);

#ifdef DEBUG
			// the number of levels visited must match the header's count
			if (levels != (int32)fHeader->MaxNumberOfLevels())
				DEBUGGER(("levels don't match"));
#endif
			return status;
		} else if (nextOffset == nodeOffset) {
			// guard against self-referencing (corrupt) nodes
			RETURN_ERROR(B_ERROR);
		}

		nodeOffset = nextOffset;
	}
	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
	RETURN_ERROR(B_ERROR);
}


// #pragma mark -


TreeIterator::TreeIterator(BPlusTree* tree)
	:
	fTree(tree),
	fCurrentNodeOffset(BPLUSTREE_NULL)
{
	// register with the tree, so it can adjust us on inserts/removals
	tree->_AddIterator(this);
}


TreeIterator::~TreeIterator()
{
	if (fTree)
		fTree->_RemoveIterator(this);
}


/*!	Positions the iterator at the very beginning or end of the tree,
	depending on \a to (BPLUSTREE_BEGIN or BPLUSTREE_END).
*/
status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fHeader == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	off_t nodeOffset = fTree->fHeader->RootNode();
	CachedNode cached(fTree);
	const bplustree_node* node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// park the iterator just before the first/after the last key;
			// the next Traverse() call will step onto a valid key
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			// validate the node's layout before dereferencing its values
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
					- 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}


/*!	Iterates through the tree in the specified direction.
	When it iterates through duplicates, the "key" is only updated for the
	first entry - if you need to know when this happens, use the "duplicate"
	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
	the other duplicates.
	That's not too nice, but saves the 256 bytes that would be needed to
	store the last key - if this will ever become an issue, it will be
	easy to change.
	The other advantage of this is, that the queries can skip all duplicates
	at once when they are not relevant to them.
*/
status_t
TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
	uint16 maxLength, off_t* value, uint16* duplicate)
{
	// fTree is set to NULL by Stop() when the tree goes away under us
	if (fTree == NULL)
		return B_INTERRUPTED;

	bool forward = direction == BPLUSTREE_FORWARD;

	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
		RETURN_ERROR(B_ERROR);

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	CachedNode cached(fTree);
	const bplustree_node* node;

	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented
		// in the same order; since they are all considered as equal, this
		// shouldn't cause any problems

		if (!fIsFragment || fDuplicate < fNumDuplicates) {
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
				false);
		} else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// If the node is out of duplicates, we go directly to the next
				// one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
						false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
					fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		fDuplicateNode = BPLUSTREE_NULL;
	}

	// remember the position so we can restore it if we run off the tree
	off_t savedNodeOffset = fCurrentNodeOffset;
	int32 savedKey = fCurrentKey;

	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	fCurrentKey += direction;

	// is the current key in the current node?
	while ((forward && fCurrentKey >= node->NumKeys())
		|| (!forward && fCurrentKey < 0)) {
		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			// NOTE(review): for backward iteration this sets fCurrentKey to
			// node->NumKeys(), which the loop condition does not re-check;
			// the last valid index would be NumKeys() - 1 - looks like an
			// off-by-one that skips onto an out-of-range key - TODO confirm
			fCurrentKey = forward ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = savedKey;

			return B_ENTRY_NOT_FOUND;
		}
	}

	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?

	// validate the key's extent before copying it out
	uint16 length;
	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
			> (uint8*)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);

	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) {
		// terminate string type
		if (length == maxLength)
			length--;
		((char*)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
		|| type == BPLUSTREE_DUPLICATE_NODE) {
		// remember the duplicate run; subsequent calls will walk it first
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
			false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// Shouldn't happen, but we're dealing here with potentially
			// corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}


/*!	This is more or less a copy of BPlusTree::Find() - but it just
	sets the current position in the iterator, regardless of if the
	key could be found or not.
*/
status_t
TreeIterator::Find(const uint8* key, uint16 keyLength)
{
	// A stopped iterator (see Stop()) has no tree anymore
	if (fTree == NULL)
		return B_INTERRUPTED;
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	off_t nodeOffset = fTree->fHeader->RootNode();

	CachedNode cached(fTree);
	const bplustree_node* node;
	// Walk down from the root; _FindKey() yields the index of the key
	// (or of the insertion position) and, for internal nodes, the child
	// node to descend into.
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
			&nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// We reached a leaf: position the iterator here, regardless
			// of whether the key was actually found (that is the whole
			// point of this method - see the comment above)
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = keyIndex - 1;
			fDuplicateNode = BPLUSTREE_NULL;

			return status;
		} else if (nextOffset == nodeOffset) {
			// A child pointing back to its own node would loop forever -
			// treat it as corruption
			RETURN_ERROR(B_ERROR);
		}

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


/*!	Resets the duplicate state, so that the iterator continues with the
	next key instead of the next duplicate of the current key.
*/
void
TreeIterator::SkipDuplicates()
{
	fDuplicateNode = BPLUSTREE_NULL;
}


/*!	Called by the tree when the node at \a offset has been changed, so
	that the iterator can keep pointing at the same logical key.
	\a nextOffset is the node the keys moved to on a split (or
	BPLUSTREE_NULL), \a splitAt the index of the first key that moved,
	\a keyIndex the position of the modification, and \a change the
	number of keys inserted (positive) or removed (negative) there.
*/
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// Only updates to the node we currently point into concern us
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node was split; if our key moved into the new node,
		// follow it and rebase the key index
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// TODO: duplicate handling!
}


/*!	Detaches the iterator from its tree; any subsequent call that needs
	the tree will fail with B_INTERRUPTED.
*/
void
TreeIterator::Stop()
{
	fTree = NULL;
}


#ifdef DEBUG
//!	Dumps the iterator state to the debug output (DEBUG builds only).
void
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",
		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
		fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif


// #pragma mark -


/*!	Initializes an empty node: all links are set to BPLUSTREE_NULL
	(in on-disk endianness), and it holds no keys yet.
*/
void
bplustree_node::Initialize()
{
	left_link = right_link = overflow_link
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	all_key_count = 0;
	all_key_length = 0;
}


/*!	Returns a pointer to the start of the key at \a index and stores its
	length in \a keyLength, or NULL if the index is out of range.
	The key-length array stores cumulative end offsets, which is why a
	key's length is the difference between its entry and the previous
	one, and its start is the previous entry's offset.
	NOTE(review): the bounds check accepts index == NumKeys(), which
	would read one entry past the key-length array - looks like it
	should be ">="; confirm against callers before changing.
*/
uint8*
bplustree_node::KeyAt(int32 index, uint16* keyLength) const
{
	if (index < 0 || index > NumKeys())
		return NULL;

	uint8* keyStart = Keys();
	uint16* keyLengths = KeyLengths();

	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
	if (index > 0)
		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);

	return keyStart;
}


/*!	Returns the number of duplicates stored for the value referenced by
	\a offset; for duplicate nodes the count lives in the overflow link,
	for fragments it is read directly out of the node's value array.
*/
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem
	// (note the 0x3ff masks below, which assume exactly 10 offset bits)

	if (isFragment) {
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t*)this)[fragment];
	}
	return OverflowLink();
}


/*!	Returns the \a index'th duplicate value of the duplicate run that
	\a offset refers to (fragment or whole duplicate node).
*/
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
	else
		start = 2;

	// skip the count entry at [start], then index into the values
	return ((off_t*)this)[start + 1 + index];
}


/*!	Although the name suggests it, this function doesn't return the real
	used fragment count; at least, it can only count to two: it returns
	0, if there is no fragment used, 1 if there is only one fragment
	used, and 2 if there are at least 2 fragments used.
2280 */ 2281 uint32 2282 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2283 { 2284 uint32 used = 0; 2285 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2286 duplicate_array* array = FragmentAt(i); 2287 if (array->count > 0 && ++used > 1) 2288 return used; 2289 } 2290 return used; 2291 } 2292 2293 2294 #ifdef DEBUG 2295 status_t 2296 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2297 { 2298 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2299 DEBUGGER(("invalid node: key/length count")); 2300 2301 for (int32 i = 0; i < NumKeys(); i++) { 2302 uint16 length; 2303 uint8* key = KeyAt(i, &length); 2304 if (key + length + sizeof(off_t) + sizeof(uint16) 2305 > (uint8*)this + nodeSize 2306 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2307 dprintf("node %p, key %ld\n", this, i); 2308 DEBUGGER(("invalid node: keys corrupted")); 2309 return B_BAD_DATA; 2310 } 2311 if (Values()[i] == -1) { 2312 dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]); 2313 DEBUGGER(("invalid node: values corrupted")); 2314 return B_BAD_DATA; 2315 } 2316 } 2317 return B_OK; 2318 } 2319 #endif 2320 2321 2322 // #pragma mark - 2323 2324 2325 int32 2326 compareKeys(type_code type, const void* key1, int keyLength1, 2327 const void* key2, int keyLength2) 2328 { 2329 // if one of the keys is NULL, bail out gracefully 2330 if (key1 == NULL || key2 == NULL) { 2331 // even if it's most probably a bug in the calling code, we try to 2332 // give a meaningful result 2333 if (key1 == NULL && key2 != NULL) 2334 return 1; 2335 if (key2 == NULL && key1 != NULL) 2336 return -1; 2337 2338 return 0; 2339 } 2340 2341 switch (type) { 2342 case B_STRING_TYPE: 2343 { 2344 int result = memcmp(key1, key2, min_c(keyLength1, keyLength2)); 2345 if (result == 0) { 2346 // ignore trailing null bytes 2347 if ((keyLength1 == keyLength2 + 1 2348 && ((uint8*)key1)[keyLength2] == '\0') 2349 || (keyLength2 == keyLength1 + 1 2350 && ((uint8*)key2)[keyLength1] == '\0')) 2351 return 0; 2352 2353 result = 
keyLength1 - keyLength2; 2354 } 2355 2356 return result; 2357 } 2358 2359 case B_SSIZE_T_TYPE: 2360 case B_INT32_TYPE: 2361 return *(int32*)key1 - *(int32*)key2; 2362 2363 case B_SIZE_T_TYPE: 2364 case B_UINT32_TYPE: 2365 if (*(uint32*)key1 == *(uint32*)key2) 2366 return 0; 2367 if (*(uint32*)key1 > *(uint32*)key2) 2368 return 1; 2369 2370 return -1; 2371 2372 case B_OFF_T_TYPE: 2373 case B_INT64_TYPE: 2374 if (*(int64*)key1 == *(int64*)key2) 2375 return 0; 2376 if (*(int64*)key1 > *(int64*)key2) 2377 return 1; 2378 2379 return -1; 2380 2381 case B_UINT64_TYPE: 2382 if (*(uint64*)key1 == *(uint64*)key2) 2383 return 0; 2384 if (*(uint64*)key1 > *(uint64*)key2) 2385 return 1; 2386 2387 return -1; 2388 2389 case B_FLOAT_TYPE: 2390 { 2391 float result = *(float*)key1 - *(float*)key2; 2392 if (result == 0.0f) 2393 return 0; 2394 2395 return (result < 0.0f) ? -1 : 1; 2396 } 2397 2398 case B_DOUBLE_TYPE: 2399 { 2400 double result = *(double*)key1 - *(double*)key2; 2401 if (result == 0.0) 2402 return 0; 2403 2404 return (result < 0.0) ? -1 : 1; 2405 } 2406 } 2407 2408 // if the type is unknown, the entries don't match... 2409 return -1; 2410 } 2411 2412