/*
 * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 *
 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
 * no code but achieves binary compatibility with the on disk format.
 */


//! B+Tree implementation


#include "Debug.h"
#include "BPlusTree.h"
#include "Inode.h"
#include "Utility.h"


#ifdef DEBUG
class NodeChecker {
public:
	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
		:
		fNode(node),
		fSize(nodeSize),
		fText(text)
	{
		Check("integrity check failed on construction.");
	}

	~NodeChecker()
	{
		Check("integrity check failed on destruction.");
	}

	void
	Check(const char* message)
	{
		if (fNode->CheckIntegrity(fSize) < B_OK)
			dprintf("%s: %s\n", fText, message);
	}

private:
	const bplustree_node*	fNode;
	int32					fSize;
	const char*				fText;
};
#endif


// Node Caching for the BPlusTree class
//
// With write support, there is the need for a function that allocates new
// nodes by either returning empty nodes, or by growing the file's data stream
//
// !! The CachedNode class assumes that you have properly locked the stream
// !! before asking for nodes.
//
// Note: This code will fail if the block size is smaller than the node size!
// Since BFS supports block sizes of 1024 bytes or greater, and the node size
// is hard-coded to 1024 bytes, that's not an issue now.

void
CachedNode::UnsetUnchanged(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		void* cache = fTree->fStream->GetVolume()->BlockCache();

		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
		block_cache_put(cache, fBlockNumber);
		fNode = NULL;
	}
}


void
CachedNode::Unset()
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
			fBlockNumber);
		fNode = NULL;
	}
}


const bplustree_node*
CachedNode::SetTo(off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(NULL, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


bplustree_node*
CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(&transaction, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


bplustree_node*
CachedNode::MakeWritable(Transaction& transaction)
{
	if (fNode == NULL)
		return NULL;

	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
			fBlockNumber, transaction.ID()) == B_OK)
		return fNode;

	return NULL;
}


bplustree_header*
CachedNode::MakeWritableHeader(Transaction& transaction)
{
	return (bplustree_header*)MakeWritable(transaction);
}


const bplustree_header*
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(NULL, 0LL);
	return (bplustree_header*)fNode;
}


bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(&transaction, 0LL);
	return (bplustree_header*)fNode;
}


bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume* volume = fTree->fStream->GetVolume();

		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8* block;

		if (transaction != NULL) {
			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// The node is somewhere in that block...
			// (confusing offset calculation)
			fNode = (bplustree_node*)(block + offset
				- (fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}


status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// TODO: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done every time this
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header* header
		= fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

#if 0
	// TODO: temporarily disabled because CheckNode() doesn't like this...
	// Also, it's such an edge case that it's almost useless, anyway.
	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}
#endif

	// add the node to the free nodes list
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}


status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
	off_t* _offset)
{
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header* header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode* stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		- fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}


// #pragma mark -


BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(transaction, stream);
}


BPlusTree::BPlusTree(Inode* stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(stream);
}


BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
}


BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	mutex_lock(&fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Stop();

	mutex_destroy(&fIteratorLock);
}

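// A quick usage sketch (illustrative only; "inode" and the key used below are
// hypothetical): an existing tree is opened with the Inode-only constructor
// and queried with Find(), while the Transaction flavour of SetTo() below
// creates a brand new tree on the stream. The caller must hold the inode
// lock as documented on the individual methods.
//
//	BPlusTree tree(inode);
//	if (tree.InitCheck() == B_OK) {
//		off_t value;
//		if (tree.Find((const uint8*)"hello", 5, &value) == B_OK)
//			; // "hello" maps to value
//	}
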
/*! Create a new B+Tree on the specified stream */
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header* header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}


status_t
BPlusTree::SetTo(Inode* stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?
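	// (The magic number, the alignment of the root node offset, and the
	// root/free node links are verified below; a size mismatch with the
	// stream is only reported, since fixing it would need a transaction.)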

	if (fHeader->MaximumSize() != stream->Size()) {
		dprintf("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size());
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char*)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	// validity check
	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
		S_DOUBLE_INDEX};
	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
		| S_FLOAT_INDEX | S_DOUBLE_INDEX);

	if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
		|| ((stream->Mode() & S_INDEX_DIR) != 0
			&& kToMode[fHeader->DataType()] != mode)
		|| !stream->IsContainer()) {
		D(	dump_bplustree_header(fHeader);
			dump_inode(&stream->Node());
		);
		RETURN_ERROR(fStatus = B_BAD_TYPE);
	}

	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
	// in the original BFS code - we will honour it nevertheless
	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


status_t
BPlusTree::InitCheck()
{
	return fStatus;
}


int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
	switch (code) {
		case B_STRING_TYPE:
			return BPLUSTREE_STRING_TYPE;
		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return BPLUSTREE_INT32_TYPE;
		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			return BPLUSTREE_UINT32_TYPE;
		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			return BPLUSTREE_INT64_TYPE;
		case B_UINT64_TYPE:
			return BPLUSTREE_UINT64_TYPE;
		case B_FLOAT_TYPE:
			return BPLUSTREE_FLOAT_TYPE;
		case B_DOUBLE_TYPE:
			return BPLUSTREE_DOUBLE_TYPE;
	}
	return -1;
}


int32
BPlusTree::ModeToKeyType(mode_t mode)
{
	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
		case S_INT_INDEX:
			return BPLUSTREE_INT32_TYPE;
		case S_UINT_INDEX:
			return BPLUSTREE_UINT32_TYPE;
		case S_LONG_LONG_INDEX:
			return BPLUSTREE_INT64_TYPE;
		case S_ULONG_LONG_INDEX:
			return BPLUSTREE_UINT64_TYPE;
		case S_FLOAT_INDEX:
			return BPLUSTREE_FLOAT_TYPE;
		case S_DOUBLE_INDEX:
			return BPLUSTREE_DOUBLE_TYPE;
		case S_STR_INDEX:
		default:
			// default is for standard directories
			return BPLUSTREE_STRING_TYPE;
	}
}


// #pragma mark -


void
BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// Although every iterator which is affected by this update currently
	// waits on a semaphore, other iterators could be added/removed at
	// any time, so we need to protect this loop
	MutexLocker _(fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
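	// let every registered iterator adjust its position to the change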
	while (iterator.HasNext())
		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
}


void
BPlusTree::_AddIterator(TreeIterator* iterator)
{
	MutexLocker _(fIteratorLock);
	fIterators.Add(iterator);
}


void
BPlusTree::_RemoveIterator(TreeIterator* iterator)
{
	MutexLocker _(fIteratorLock);
	fIterators.Remove(iterator);
}


int32
BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
	int keyLength2)
{
	type_code type = 0;
	switch (fHeader->data_type) {
		case BPLUSTREE_STRING_TYPE:
			type = B_STRING_TYPE;
			break;
		case BPLUSTREE_INT32_TYPE:
			type = B_INT32_TYPE;
			break;
		case BPLUSTREE_UINT32_TYPE:
			type = B_UINT32_TYPE;
			break;
		case BPLUSTREE_INT64_TYPE:
			type = B_INT64_TYPE;
			break;
		case BPLUSTREE_UINT64_TYPE:
			type = B_UINT64_TYPE;
			break;
		case BPLUSTREE_FLOAT_TYPE:
			type = B_FLOAT_TYPE;
			break;
		case BPLUSTREE_DOUBLE_TYPE:
			type = B_DOUBLE_TYPE;
			break;
	}
	return compareKeys(type, key1, keyLength1, key2, keyLength2);
}


status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
	uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
	NodeChecker checker(node, fNodeSize, "find");
#endif

	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t* values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8* searchKey = node->KeyAt(i, &searchLength);
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8*)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}


/*! Prepares the stack to contain all nodes that were passed while
	following the key, from the root node to the leaf node that could
	or should contain that key.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node* node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		if ((uint32)stack.CountItems() > fHeader->MaxNumberOfLevels()) {
			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
			break;
		}

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::_SeekDown() could not open node %Ld\n",
		nodeAndKey.nodeOffset));
	return B_ERROR;
}


/*! This will find a free duplicate fragment in the given bplustree_node.
	The CachedNode will be set to the writable fragment on success.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached,
	off_t* _offset, bplustree_node** _fragment, uint32* _index)
{
	off_t* values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node* fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array* array = fragment->FragmentAt(j);

			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}


status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
	const bplustree_node* node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue)
				== BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node* duplicate
				= cachedDuplicate.SetToWritable(transaction,
					bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array* array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%d] size in fragment "
					"%Ld == %Ld!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue), array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// Test if the fragment will be empty if we remove this key's
				// values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// The node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// Create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// The old duplicate has not been touched, so we can
						// reuse it

					bplustree_node* newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// Copy the array from the fragment node to the duplicate
					// node and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
						array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// Update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index]
					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		duplicate_array* array;
		const bplustree_node* duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld "
					"== %Ld!\n", duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
			&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node* writableDuplicate
			= cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node* newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
				&offset);
			if (status != B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node* fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
		if (status != B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}


void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t* values = node->Values();
	uint16* keyLengths = node->KeyLengths();
	uint8* keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	off_t* newValues = node->Values();
	uint16* newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}


/*! Splits the \a node into two halves - the other half will be put into
	\a other. It also takes care to create a new overflow link if the node
	to split is an index node.
*/
status_t
BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
	uint16* _keyLength, off_t* _value)
{
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	uint16* inKeyLengths = node->KeyLengths();
	off_t* inKeyValues = node->Values();
	uint8* inKeys = node->Keys();
	uint8* outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	if (keyIndex > node->NumKeys()) {
		FATAL(("key index out of bounds: %d, num keys: %u\n", (int)keyIndex,
			node->NumKeys()));
		return B_BAD_VALUE;
	}

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	size_t size = fNodeSize >> 1;
	int32 out, in;
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes) {
			bytesBefore = in > 0
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
		}

		if (in == keyIndex && !bytes) {
			bytes = *_keyLength;
		} else {
			if (keyIndex < out) {
				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
					- bytesBefore;
			}

			in++;
		}
		out++;

		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16* outKeyLengths = other->KeyLengths();
	off_t* outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
				bytesAfter);
			keys = out - keyIndex - 1;
			for (int32 i = 0; i < keys; i++) {
				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
					+ bytes);
			}
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
				keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8* newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8* droppedKey = node->KeyAt(in, &newLength);
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
					> (uint8*)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8*)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;

			newAllocated = true;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
			bytes = *_keyLength;
			out++;
		} else {
			if (in < node->NumKeys()) {
				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

				if (bytes) {
					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
						- bytesBefore - bytes;
				}
				out++;
			}
			in++;
		}
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0) {
		if (newAllocated)
			free(newKey);
		return B_BAD_DATA;
	}

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;
	in = out - keyIndex - 1;
	// Note: keyIndex and in will contain invalid values when the new key
	// went to the other node. But in this case bytes and bytesAfter are
	// 0 and subsequently we never use keyIndex and in.
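	// The first "total" bytes of key data have left this node (they were
	// either copied to the other node or dropped above), so everything that
	// stays is shifted towards the front below.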

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter) {
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
			bytesAfter);
	}

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	if (bytesAfter) {
		// if bytesAfter is > 0, keyIndex is larger than skip
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
			in * sizeof(uint16));
	}

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter) {
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
			in * sizeof(off_t));
	}

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}


/*! This inserts a key into the tree. The changes made to the tree will
	all be part of the \a transaction.
	You need to have the inode write locked.
*/
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	ASSERT_WRITE_LOCKED_INODE(fStream);

	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node* node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = _InsertDuplicate(transaction, cached, node,
						nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				return B_NAME_IN_USE;
			}
		}

		bplustree_node* writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
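		// (the space needed is the bplustree_node header plus the packed key
		// data, aligned via key_align(), plus one uint16 key length and one
		// off_t value per key)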
		if (int32(key_align(sizeof(bplustree_node)
				+ writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
				+ sizeof(off_t))) < fNodeSize) {
			_InsertKey(writableNode, nodeAndKey.keyIndex,
				keyBuffer, keyLength, value);
			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
				nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node* root;
				status_t status = cachedNewRoot.Allocate(transaction, &root,
					&newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuilt next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node* other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other,
				&otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
					&value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction,
					other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node* root = cachedNewRoot.Node();

				_InsertKey(root, 0, keyBuffer, keyLength,
					writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
					nodeAndKey.nodeOffset);

				bplustree_header* header
					= fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
					header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}


/*! Removes the duplicate index/value pair from the tree.
	It's part of the private tree interface.
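	\a node and \a cached refer to the leaf node that contains the key at
	\a index, and \a value is the single value to remove from that key's
	duplicates.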
*/
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached, uint16 index,
	off_t value)
{
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array* array = duplicate->FragmentAt(
			bplustree_node::FragmentIndex(oldValue));

		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld "
				"== %Ld!\n", (int)bplustree_node::FragmentIndex(oldValue),
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		if (array->count == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction,
					duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array* array = NULL;

	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
			duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == "
				"%Ld!\n", duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
			false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (ie. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// This is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no
					// duplicate anymore
					values[index] = array->values[0];
				} else {
					// Move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(
						bplustree_node::MakeLink(
							BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status = cachedDuplicate.Free(transaction,
				duplicateOffset);
			if (status < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
						false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL
					&& duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
						right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a
				// normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL
					&& duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node* fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}


/*! Removes the key with the given index from the specified node.
	Since it has to get the key from the node anyway (to obtain its
	pointer), there is no need to pass the key and its length, although
	the calling method (BPlusTree::Remove()) has this data.
*/
void
BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
{
	// should never happen, but who knows?
	if (index > node->NumKeys() && node->NumKeys() > 0) {
		FATAL(("Asked to remove key outside of limits: %u\n", index));
		return;
	}

	off_t* values = node->Values();

	// if we would have to drop the overflow link, drop
	// the last key instead and update the overflow link
	// to the value of that one
	if (!node->IsLeaf() && index == node->NumKeys())
		node->overflow_link = values[--index];

	uint16 length;
	uint8* key = node->KeyAt(index, &length);
	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		FATAL(("Key length too long: %s, %u (inode at %d,%u)\n", key, length,
			(int)fStream->BlockRun().allocation_group,
			fStream->BlockRun().start));
		fStream->GetVolume()->Panic();
		return;
	}

	uint16* keyLengths = node->KeyLengths();
	uint8* keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(
		node->AllKeyLength() - length);

	off_t* newValues = node->Values();
	uint16* newKeyLengths = node->KeyLengths();

	// move key data
	memmove(key, key + length, node->AllKeyLength() - (key - keys));

	// move and update key lengths
	if (index > 0 && newKeyLengths != keyLengths)
		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
	for (uint16 i = index; i < node->NumKeys(); i++) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
	}

	// move values
	if (index > 0)
		memmove(newValues, values, index * sizeof(off_t));
	if (node->NumKeys() > index) {
		memmove(newValues + index, values + index + 1,
			(node->NumKeys() - index) * sizeof(off_t));
	}
}


/*! Removes the specified key from the tree. The "value" parameter is only used
	for trees which allow duplicates, so you may safely ignore it.
	It's not an optional parameter, so at least you have to think about it.
	You need to have the inode write locked.
*/
status_t
BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);

	ASSERT_WRITE_LOCKED_INODE(fStream);

	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	node_and_key nodeAndKey;
	const bplustree_node* node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "remove");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);
			if (status != B_OK)
				RETURN_ERROR(status);

			// Is this a duplicate entry?
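			// (duplicates are not stored inline; the value is then a link to
			// a duplicate fragment or duplicate node, which IsDuplicate()
			// detects)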
			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
					node->Values()[nodeAndKey.keyIndex]))) {
				if (fAllowDuplicates) {
					return _RemoveDuplicate(transaction, node, cached,
						nodeAndKey.keyIndex, value);
				}

				FATAL(("duplicate node found where no duplicates are "
					"allowed!\n"));
				RETURN_ERROR(B_ERROR);
			} else {
				if (node->Values()[nodeAndKey.keyIndex] != value)
					return B_ENTRY_NOT_FOUND;

				// If we will remove the last key, the iterator will be set
				// to the next node after the current - if there aren't any
				// more nodes, we need a way to prevent the TreeIterators from
				// touching the old node again; we use BPLUSTREE_FREE for this
				off_t next = node->RightLink() == BPLUSTREE_NULL
					? BPLUSTREE_FREE : node->RightLink();
				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, -1);
			}
		}

		bplustree_node* writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// if it's an empty root node, we have to convert it
		// to a leaf node by dropping the overflow link, or,
		// if it's already a leaf node, just empty it
		if (nodeAndKey.nodeOffset == fHeader->RootNode()
			&& (node->NumKeys() == 0
				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
			writableNode->overflow_link
				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
			writableNode->all_key_count = 0;
			writableNode->all_key_length = 0;

			// if we've made a leaf node out of the root node, we need
			// to reset the maximum number of levels in the header
			if (fHeader->MaxNumberOfLevels() != 1) {
				bplustree_header* header
					= fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
			}
			return B_OK;
		}

		// if there is only one key left, we don't have to remove
		// it, we can just dump the node (index nodes still have
		// the overflow link, so we have to drop the last key)
		if (writableNode->NumKeys() > 1
			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
			_RemoveKey(writableNode, nodeAndKey.keyIndex);
			return B_OK;
		}

		// when we are here, we can just free the node, but
		// we have to update the right/left link of the
		// siblings first
		CachedNode otherCached(this);
		bplustree_node* other = otherCached.SetToWritable(transaction,
			writableNode->LeftLink());
		if (other != NULL)
			other->right_link = writableNode->right_link;

		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
				!= NULL) {
			other->left_link = writableNode->left_link;
		}

		cached.Free(transaction, nodeAndKey.nodeOffset);
	}
	RETURN_ERROR(B_ERROR);
}


/*! Replaces the value for the key in the tree.
	Returns B_OK if the key could be found and its value replaced,
	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
	to indicate that something went terribly wrong.
	Note that this doesn't work with duplicates - it will just
	return B_BAD_TYPE if you call this function on a tree where
	duplicates are allowed.
	You need to have the inode write locked.
*/
status_t
BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	ASSERT_WRITE_LOCKED_INODE(fStream);

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	const bplustree_node* node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &keyIndex,
			&nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			if (status == B_OK) {
				bplustree_node* writableNode = cached.MakeWritable(transaction);
				if (writableNode != NULL) {
					writableNode->Values()[keyIndex]
						= HOST_ENDIAN_TO_BFS_INT64(value);
				} else
					status = B_IO_ERROR;
			}

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


/*! Searches the key in the tree, and stores the offset found in _value,
	if successful.
	It's very similar to BPlusTree::_SeekDown(), but doesn't fill a stack
	while it descends the tree.
	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
	It can also return other errors to indicate that something went wrong.
	Note that this doesn't work with duplicates - it will just return
	B_BAD_TYPE if you call this function on a tree where duplicates are
	allowed.
	You need to have the inode read or write locked.
*/
status_t
BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	ASSERT_READ_LOCKED_INODE(fStream);

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	const bplustree_node* node;

#ifdef DEBUG
	int32 levels = 0;
#endif

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &keyIndex,
			&nextOffset);

#ifdef DEBUG
		levels++;
#endif
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			if (status == B_OK && _value != NULL)
				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);

#ifdef DEBUG
			if (levels != (int32)fHeader->MaxNumberOfLevels())
				DEBUGGER(("levels don't match"));
#endif
			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
	RETURN_ERROR(B_ERROR);
}


// #pragma mark -


TreeIterator::TreeIterator(BPlusTree* tree)
	:
	fTree(tree),
	fCurrentNodeOffset(BPLUSTREE_NULL)
{
	tree->_AddIterator(this);
}


TreeIterator::~TreeIterator()
{
	if (fTree)
		fTree->_RemoveIterator(this);
}


status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fHeader == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	off_t nodeOffset = fTree->fHeader->RootNode();
	CachedNode cached(fTree);
	const bplustree_node* node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
					- 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}


/*!	Iterates through the tree in the specified direction.
	When it iterates through duplicates, the "key" is only updated for the
	first entry - if you need to know when this happens, use the "duplicate"
	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
	the other duplicates.
	That's not too nice, but it saves the 256 bytes that would be needed to
	store the last key - if this ever becomes an issue, it will be easy to
	change.
	The other advantage of this is that queries can skip all duplicates at
	once when they are not relevant to them.
*/
status_t
TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
	uint16 maxLength, off_t* value, uint16* duplicate)
{
	if (fTree == NULL)
		return B_INTERRUPTED;

	bool forward = direction == BPLUSTREE_FORWARD;

	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
		RETURN_ERROR(B_ERROR);
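
	// From here on, Traverse() works in three steps: hand out any
	// duplicates still pending from the previous call, then advance
	// fCurrentKey by one position (following the sibling links when a node
	// is exhausted), and finally decode the new value - which may itself
	// start another run of duplicates.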

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	CachedNode cached(fTree);
	const bplustree_node* node;

	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented
		// in the same order; since they are all considered as equal, this
		// shouldn't cause any problems

		if (!fIsFragment || fDuplicate < fNumDuplicates) {
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
				false);
		} else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// If the node is out of duplicates, we go directly to the next
				// one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
						false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
					fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		fDuplicateNode = BPLUSTREE_NULL;
	}

	off_t savedNodeOffset = fCurrentNodeOffset;
	int32 savedKey = fCurrentKey;

	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	fCurrentKey += direction;

	// is the current key in the current node?
	while ((forward && fCurrentKey >= node->NumKeys())
		|| (!forward && fCurrentKey < 0)) {
		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			fCurrentKey = forward ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = savedKey;

			return B_ENTRY_NOT_FOUND;
		}
	}

	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?

	uint16 length;
	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
			> (uint8*)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);

	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) {
		// terminate string type
		if (length == maxLength)
			length--;
		((char*)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
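	// A value flagged as BPLUSTREE_DUPLICATE_FRAGMENT or
	// BPLUSTREE_DUPLICATE_NODE is not a final value but a link into the
	// duplicate data: LinkType() extracts that flag, and FragmentOffset()
	// masks it (and the fragment index) away to recover the offset of the
	// node that actually holds the duplicate values.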
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
		|| type == BPLUSTREE_DUPLICATE_NODE) {
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
			false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// Shouldn't happen, but we're dealing here with potentially
			// corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}


/*!	This is more or less a copy of BPlusTree::Find() - but it just
	sets the current position in the iterator, regardless of whether
	the key could be found or not.
*/
status_t
TreeIterator::Find(const uint8* key, uint16 keyLength)
{
	if (fTree == NULL)
		return B_INTERRUPTED;
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	off_t nodeOffset = fTree->fHeader->RootNode();

	CachedNode cached(fTree);
	const bplustree_node* node;
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
			&nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = keyIndex - 1;
			fDuplicateNode = BPLUSTREE_NULL;

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


void
TreeIterator::SkipDuplicates()
{
	fDuplicateNode = BPLUSTREE_NULL;
}


void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note that if a key is inserted at the current position,
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// TODO: duplicate handling!
}


void
TreeIterator::Stop()
{
	fTree = NULL;
}


#ifdef DEBUG
void
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",
		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
		fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif


// #pragma mark -


void
bplustree_node::Initialize()
{
	left_link = right_link = overflow_link
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	all_key_count = 0;
	all_key_length = 0;
}


uint8*
bplustree_node::KeyAt(int32 index, uint16* keyLength) const
{
	if (index < 0 || index > NumKeys())
		return NULL;

	uint8* keyStart = Keys();
	uint16* keyLengths = KeyLengths();

	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
	if (index > 0)
		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);

	return keyStart;
}


uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t*)this)[fragment];
	}
	return OverflowLink();
}


off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
	else
		start = 2;

	return ((off_t*)this)[start + 1 + index];
}


/*!	Despite its name, this function doesn't return the real number of used
	fragments; it can only count up to two: it returns 0 if no fragment is
	used, 1 if exactly one fragment is used, and 2 if at least two fragments
	are used.
*/
uint32
bplustree_node::FragmentsUsed(uint32 nodeSize) const
{
	uint32 used = 0;
	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
		duplicate_array* array = FragmentAt(i);
		if (array->count > 0 && ++used > 1)
			return used;
	}
	return used;
}


#ifdef DEBUG
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8* key = KeyAt(i, &length);
		if (key + length + sizeof(off_t) + sizeof(uint16)
				> (uint8*)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %ld\n", this, i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		if (Values()[i] == -1) {
			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
#endif


// #pragma mark -


int32
compareKeys(type_code type, const void* key1, int keyLength1,
	const void* key2, int keyLength2)
{
	// if one of the keys is NULL, bail out gracefully
	if (key1 == NULL || key2 == NULL) {
		// even if it's most probably a bug in the calling code, we try to
		// give a meaningful result
		if (key1 == NULL && key2 != NULL)
			return 1;
		if (key2 == NULL && key1 != NULL)
			return -1;

		return 0;
	}

	switch (type) {
		case B_STRING_TYPE:
		{
			int result = memcmp(key1, key2, min_c(keyLength1, keyLength2));
			if (result == 0) {
				// ignore trailing null bytes
				if ((keyLength1 == keyLength2 + 1
						&& ((uint8*)key1)[keyLength2] == '\0')
					|| (keyLength2 == keyLength1 + 1
						&& ((uint8*)key2)[keyLength1] == '\0'))
					return 0;

				result = keyLength1 - keyLength2;
			}

			return result;
		}

		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return *(int32*)key1 - *(int32*)key2;

		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			if (*(uint32*)key1 == *(uint32*)key2)
				return 0;
			if (*(uint32*)key1 > *(uint32*)key2)
				return 1;

			return -1;

		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			if (*(int64*)key1 == *(int64*)key2)
				return 0;
			if (*(int64*)key1 > *(int64*)key2)
				return 1;

			return -1;

		case B_UINT64_TYPE:
			if (*(uint64*)key1 == *(uint64*)key2)
				return 0;
			if (*(uint64*)key1 > *(uint64*)key2)
				return 1;

			return -1;

		case B_FLOAT_TYPE:
		{
			float result = *(float*)key1 - *(float*)key2;
			if (result == 0.0f)
				return 0;

			return (result < 0.0f) ? -1 : 1;
		}

		case B_DOUBLE_TYPE:
		{
			double result = *(double*)key1 - *(double*)key2;
			if (result == 0.0)
				return 0;

			return (result < 0.0) ? -1 : 1;
		}
	}

	// if the type is unknown, the entries don't match...
	return -1;
}
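
// A small usage sketch for compareKeys() (the keys below are made-up
// illustrations, not data from this file):
//
//	int32 a = 1, b = 2;
//	compareKeys(B_INT32_TYPE, &a, sizeof(a), &b, sizeof(b));
//		// < 0, since 1 < 2
//	compareKeys(B_STRING_TYPE, "foo", 3, "foo", 4);
//		// == 0 - a single trailing '\0' on either key is ignored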