/*
 * Copyright 2001-2008, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 *
 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
 * no code but achieves binary compatibility with the on disk format.
 */

//! B+Tree implementation


#include "Debug.h"
#include "BPlusTree.h"
#include "Inode.h"
#include "Utility.h"


#ifdef DEBUG
class NodeChecker {
public:
	NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text)
		:
		fNode(node),
		fSize(nodeSize),
		fText(text)
	{
		Check("integrity check failed on construction.");
	}

	~NodeChecker()
	{
		Check("integrity check failed on destruction.");
	}

	void
	Check(const char *message)
	{
		if (fNode->CheckIntegrity(fSize) < B_OK)
			dprintf("%s: %s\n", fText, message);
	}

private:
	const bplustree_node *fNode;
	int32 fSize;
	const char *fText;
};
#endif


// Node Caching for the BPlusTree class
//
// With write support, there is the need for a function that allocates new
// nodes by either returning empty nodes, or by growing the file's data stream
//
// !! The CachedNode class assumes that you have properly locked the stream
// !! before asking for nodes.
//
// Note: This code will fail if the block size is smaller than the node size!
// Since BFS supports block sizes of 1024 bytes or greater, and the node size
// is hard-coded to 1024 bytes, that's not an issue now.
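// A minimal usage sketch (hypothetical caller, with the stream already
// locked as required above); SetTo() pins the node's block in the block
// cache, Unset() or setting another node puts it back:
//
//	CachedNode cached(tree);
//	const bplustree_node *node = cached.SetTo(offset);
//	if (node != NULL) {
//		// read-only access to the node ...
//	}
//	// to modify the same node as part of a transaction:
//	bplustree_node *writable = cached.MakeWritable(transaction);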
void
CachedNode::UnsetUnchanged(Transaction &transaction)
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		void *cache = fTree->fStream->GetVolume()->BlockCache();

		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
		block_cache_put(cache, fBlockNumber);
		fNode = NULL;
	}
}


void
CachedNode::Unset()
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber);
		fNode = NULL;
	}
}


const bplustree_node *
CachedNode::SetTo(off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(NULL, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


bplustree_node *
CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(&transaction, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


bplustree_node *
CachedNode::MakeWritable(Transaction &transaction)
{
	if (fNode == NULL)
		return NULL;

	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
			fBlockNumber, transaction.ID()) == B_OK)
		return fNode;

	return NULL;
}


bplustree_header *
CachedNode::MakeWritableHeader(Transaction &transaction)
{
	return (bplustree_header *)MakeWritable(transaction);
}


const bplustree_header *
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(NULL, 0LL);
	return (bplustree_header *)fNode;
}


bplustree_header *
CachedNode::SetToWritableHeader(Transaction &transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(&transaction, 0LL);
	return (bplustree_header *)fNode;
}


bplustree_node *
CachedNode::InternalSetTo(Transaction *transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume *volume = fTree->fStream->GetVolume();

		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8 *block;

		if (transaction != NULL) {
			block = (uint8 *)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// the node is somewhere in that block... (confusing offset calculation)
			fNode = (bplustree_node *)(block + offset
				- (fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}

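// A worked example of InternalSetTo()'s offset arithmetic (hypothetical
// numbers): with a 2048 byte block size and 1024 byte nodes, asking for the
// node at stream offset 5120 inside a block run whose data starts at file
// offset 4096 gives blockOffset = (5120 - 4096) / 2048 = 0, so fBlockNumber
// is the run's first block and the node starts 1024 bytes into that cached
// block (offset - fileOffset - blockOffset * block size).
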
status_t
CachedNode::Free(Transaction &transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// ToDo: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done every time that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

#if 0
	// TODO: temporarily disabled because CheckNode() doesn't like this...
	// Also, it's such an edge case that it's almost useless, anyway.
	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}
#endif

	// add the node to the free nodes list
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}


status_t
CachedNode::Allocate(Transaction &transaction, bplustree_node **_node,
	off_t *_offset)
{
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header *header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode *stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		- fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}


// #pragma mark -


BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(transaction, stream);
}


BPlusTree::BPlusTree(Inode *stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(stream);
}


BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
}


BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	if (fIteratorLock.Lock() < B_OK)
		return;

	TreeIterator *iterator = NULL;
	while ((iterator = fIterators.Next(iterator)) != NULL)
		iterator->Stop();

	fIteratorLock.Unlock();
}


/*!
	Create a new B+Tree on the specified stream.
*/
status_t
BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}

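// After a successful SetTo() with a transaction (see the initialization
// above), the stream contains exactly two nodes: the header node at offset 0
// and an empty root node at offset nodeSize, with maximum_size = 2 * nodeSize
// and an empty free node list (free_node_pointer = BPLUSTREE_NULL).
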
status_t
BPlusTree::SetTo(Inode *stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?

	if (fHeader->MaximumSize() != stream->Size()) {
		FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size()));
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char *)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	{
		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
			S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
			S_DOUBLE_INDEX};
		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
			| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
			| S_FLOAT_INDEX | S_DOUBLE_INDEX);

		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| ((stream->Mode() & S_INDEX_DIR) != 0
				&& toMode[fHeader->DataType()] != mode)
			|| !stream->IsContainer()) {
			D(	dump_bplustree_header(fHeader);
				dump_inode(&stream->Node());
			);
			RETURN_ERROR(fStatus = B_BAD_TYPE);
		}

		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
		// in the original BFS code - we will honour it nevertheless
		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
				&& stream->BlockRun() != stream->Parent())
			|| (stream->Mode() & S_ALLOW_DUPS) != 0;
	}

	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


status_t
BPlusTree::InitCheck()
{
	return fStatus;
}


int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
	switch (code) {
		case B_STRING_TYPE:
			return BPLUSTREE_STRING_TYPE;
		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return BPLUSTREE_INT32_TYPE;
		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			return BPLUSTREE_UINT32_TYPE;
		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			return BPLUSTREE_INT64_TYPE;
		case B_UINT64_TYPE:
			return BPLUSTREE_UINT64_TYPE;
		case B_FLOAT_TYPE:
			return BPLUSTREE_FLOAT_TYPE;
		case B_DOUBLE_TYPE:
			return BPLUSTREE_DOUBLE_TYPE;
	}
	return -1;
}


int32
BPlusTree::ModeToKeyType(mode_t mode)
{
	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
		case S_INT_INDEX:
			return BPLUSTREE_INT32_TYPE;
		case S_UINT_INDEX:
			return BPLUSTREE_UINT32_TYPE;
		case S_LONG_LONG_INDEX:
			return BPLUSTREE_INT64_TYPE;
		case S_ULONG_LONG_INDEX:
			return BPLUSTREE_UINT64_TYPE;
		case S_FLOAT_INDEX:
			return BPLUSTREE_FLOAT_TYPE;
		case S_DOUBLE_INDEX:
			return BPLUSTREE_DOUBLE_TYPE;
		case S_STR_INDEX:
		default:
			// default is for standard directories
			return BPLUSTREE_STRING_TYPE;
	}
}


// #pragma mark -


void
BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// Although every iterator which is affected by this update currently
	// waits on a semaphore, other iterators could be added/removed at
	// any time, so we need to protect this loop
	if (fIteratorLock.Lock() < B_OK)
		return;

	TreeIterator *iterator = NULL;
	while ((iterator = fIterators.Next(iterator)) != NULL)
		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);

	fIteratorLock.Unlock();
}


void
BPlusTree::_AddIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Add(iterator);

	fIteratorLock.Unlock();
}


void
BPlusTree::_RemoveIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Remove(iterator);

	fIteratorLock.Unlock();
}


int32
BPlusTree::_CompareKeys(const void *key1, int keyLength1, const void *key2,
	int keyLength2)
{
	type_code type = 0;
	switch (fHeader->data_type) {
		case BPLUSTREE_STRING_TYPE:
			type = B_STRING_TYPE;
			break;
		case BPLUSTREE_INT32_TYPE:
			type = B_INT32_TYPE;
			break;
		case BPLUSTREE_UINT32_TYPE:
			type = B_UINT32_TYPE;
			break;
		case BPLUSTREE_INT64_TYPE:
			type = B_INT64_TYPE;
			break;
		case BPLUSTREE_UINT64_TYPE:
			type = B_UINT64_TYPE;
			break;
		case BPLUSTREE_FLOAT_TYPE:
			type = B_FLOAT_TYPE;
			break;
		case BPLUSTREE_DOUBLE_TYPE:
			type = B_DOUBLE_TYPE;
			break;
	}
	return compareKeys(type, key1, keyLength1, key2, keyLength2);
}


status_t
BPlusTree::_FindKey(const bplustree_node *node, const uint8 *key,
	uint16 keyLength, uint16 *_index, off_t *_next)
{
#ifdef DEBUG
	NodeChecker checker(node, fNodeSize, "find");
#endif

	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t *values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8 *searchKey = node->KeyAt(i, &searchLength);
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8 *)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}

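// In other words: on an exact match _FindKey() returns B_OK with *_index set
// to the key's position and *_next set to its value; on B_ENTRY_NOT_FOUND,
// *_index is the position where the key would have to be inserted, and
// *_next is the child to descend into for index nodes - the overflow link if
// the search key sorts after the last key in the node.
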
/*!
	Prepares the stack to contain all nodes that were passed while
	following the key, from the root node to the leaf node that could
	or should contain that key.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key> &stack, const uint8 *key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node *node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		if ((uint32)stack.CountItems() > fHeader->MaxNumberOfLevels()) {
			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
			break;
		}

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::_SeekDown() could not open node %Ld\n",
		nodeAndKey.nodeOffset));
	return B_ERROR;
}


/*!
	This will find a free duplicate fragment in the given bplustree_node.
	The CachedNode will be set to the writable fragment on success.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction &transaction,
	const bplustree_node *node, CachedNode &cached,
	off_t *_offset, bplustree_node **_fragment, uint32 *_index)
{
	off_t *values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node *fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array *array = fragment->FragmentAt(j);

			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}


status_t
BPlusTree::_InsertDuplicate(Transaction &transaction, CachedNode &cached,
	const bplustree_node *node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
				bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array *array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue),
					array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// test if the fragment will be empty if we remove this key's values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// the node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// the old duplicate has not been touched, so we can reuse it

					bplustree_node *newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// copy the array from the fragment node to the duplicate node
					// and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
						array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
					BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		duplicate_array *array;
		const bplustree_node *duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
					duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
			&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node *newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
			if (status < B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node *fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		if ((status = cachedDuplicate.Allocate(transaction, &fragment,
				&offset)) < B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}

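// To summarize the duplicate handling above: a leaf value slot either holds
// a plain value, a link to a duplicate fragment (one node shared by several
// small arrays of at most NUM_FRAGMENT_VALUES entries), or a link to a chain
// of dedicated duplicate nodes, each holding up to NUM_DUPLICATE_VALUES
// entries and connected through their left/right links.
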
void
BPlusTree::_InsertKey(bplustree_node *node, uint16 index, uint8 *key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t *values = node->Values();
	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}

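// A small worked example of the key layout maintained by _InsertKey()
// (hypothetical contents): keys are packed back to back and the key length
// array stores cumulative end offsets. A node with the keys "ab" and "cde"
// has all_key_length = 5 and key lengths {2, 5}; inserting "xy" at index 1
// yields the key data "ab" "xy" "cde", all_key_length = 7 and key lengths
// {2, 4, 7}, while the matching value is placed at values[1].
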
/*!
	Splits the \a node into two halves - the other half will be put into
	\a other. It also takes care to create a new overflow link if the node
	to split is an index node.
*/
status_t
BPlusTree::_SplitNode(bplustree_node *node, off_t nodeOffset,
	bplustree_node *other, off_t otherOffset, uint16 *_keyIndex, uint8 *key,
	uint16 *_keyLength, off_t *_value)
{
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	uint16 *inKeyLengths = node->KeyLengths();
	off_t *inKeyValues = node->Values();
	uint8 *inKeys = node->Keys();
	uint8 *outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	if (keyIndex > node->NumKeys()) {
		FATAL(("key index out of bounds: %d, num keys: %u\n", (int)keyIndex,
			node->NumKeys()));
		return B_BAD_VALUE;
	}

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	size_t size = fNodeSize >> 1;
	int32 out, in;
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes) {
			bytesBefore = in > 0
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
		}

		if (in == keyIndex && !bytes) {
			bytes = *_keyLength;
		} else {
			if (keyIndex < out) {
				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
					- bytesBefore;
			}

			in++;
		}
		out++;

		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16 *outKeyLengths = other->KeyLengths();
	off_t *outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
				bytesAfter);
			keys = out - keyIndex - 1;
			for (int32 i = 0; i < keys; i++) {
				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes);
			}
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
				keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8 *newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8 *droppedKey = node->KeyAt(in, &newLength);
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
					> (uint8 *)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8 *)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;

			newAllocated = true;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
			bytes = *_keyLength;
			out++;
		} else {
			if (in < node->NumKeys()) {
				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

				if (bytes) {
					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
						- bytesBefore - bytes;
				}
				out++;
			}
			in++;
		}
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0) {
		if (newAllocated)
			free(newKey);
		return B_BAD_DATA;
	}

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;
	in = out - keyIndex - 1;
		// Note: keyIndex and in will contain invalid values when the new key
		// went to the other node. But in this case bytes and bytesAfter are
		// 0 and subsequently we never use keyIndex and in.

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter) {
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
			bytesAfter);
	}

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	if (bytesAfter) {
		// if bytesAfter is > 0, keyIndex is larger than skip
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
			in * sizeof(uint16));
	}

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter) {
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
			in * sizeof(off_t));
	}

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}

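// To recap what _SplitNode() leaves behind: \a other becomes the new left
// sibling and receives the first "out" keys (including the new key if it
// falls into that half), \a node keeps the remaining keys, and key,
// _keyLength and _value are overwritten with the separator key and the
// offset of \a other, ready to be inserted into the parent node by the
// caller. For index nodes the first key of the right half is dropped and
// propagated upwards instead of being duplicated.
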
/*!
	This inserts a key into the tree. The changes made to the tree will
	all be part of the \a transaction.
*/
status_t
BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node *node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = _InsertDuplicate(transaction, cached, node,
						nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(round_up(sizeof(bplustree_node)
				+ writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
				+ sizeof(off_t))) < fNodeSize) {
			_InsertKey(writableNode, nodeAndKey.keyIndex,
				keyBuffer, keyLength, value);
			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
				nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node *root;
				status_t status = cachedNewRoot.Allocate(transaction, &root,
					&newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuilt next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node *other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other,
				&otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
					&value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction,
					other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node *root = cachedNewRoot.Node();

				_InsertKey(root, 0, keyBuffer, keyLength,
					writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
					nodeAndKey.nodeOffset);

				bplustree_header *header
					= fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
					header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}

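// A minimal usage sketch (hypothetical caller; in a directory the value
// would typically be the inode number of the entry being added):
//
//	status_t status = tree->Insert(transaction, (const uint8 *)name,
//		(uint16)strlen(name), inodeNumber);
//		// B_NAME_IN_USE means the key already exists in a tree that
//		// does not allow duplicates
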
/*!
	Removes the duplicate index/value pair from the tree.
	It's part of the private tree interface.
*/
status_t
BPlusTree::_RemoveDuplicate(Transaction &transaction,
	const bplustree_node *node, CachedNode &cached, uint16 index,
	off_t value)
{
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array *array = duplicate->FragmentAt(
			bplustree_node::FragmentIndex(oldValue));

		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		if (array->count == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction, duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array *array = NULL;

	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
			duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (i.e. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// this is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no
					// duplicate anymore
					values[index] = array->values[0];
				} else {
					// move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status;
			if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node *fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}


/*!
	Removes the key with the given index from the specified node.
	Since it has to get the key from the node anyway (to obtain its
	pointer), there is no need to pass the key and its length, although
	the calling method (BPlusTree::Remove()) has this data.
*/
void
BPlusTree::_RemoveKey(bplustree_node *node, uint16 index)
{
	// should never happen, but who knows?
	if (index > node->NumKeys() && node->NumKeys() > 0) {
		FATAL(("Asked me to remove key outer limits: %u\n", index));
		return;
	}

	off_t *values = node->Values();

	// if we would have to drop the overflow link, drop
	// the last key instead and update the overflow link
	// to the value of that one
	if (!node->IsLeaf() && index == node->NumKeys())
		node->overflow_link = values[--index];

	uint16 length;
	uint8 *key = node->KeyAt(index, &length);
	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		FATAL(("Key length too long: %s, %u (inode at %d,%u)\n", key, length,
			(int)fStream->BlockRun().allocation_group, fStream->BlockRun().start));
		fStream->GetVolume()->Panic();
		return;
	}

	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() - length);

	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move key data
	memmove(key, key + length, node->AllKeyLength() - (key - keys));

	// move and update key lengths
	if (index > 0 && newKeyLengths != keyLengths)
		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
	for (uint16 i = index; i < node->NumKeys(); i++) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
	}

	// move values
	if (index > 0)
		memmove(newValues, values, index * sizeof(off_t));
	if (node->NumKeys() > index)
		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
}


/*!
	Removes the specified key from the tree. The "value" parameter is only used
	for trees which allow duplicates, so you may safely ignore it.
	It's not an optional parameter, so at least you have to think about it.
*/
status_t
BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	node_and_key nodeAndKey;
	const bplustree_node *node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "remove");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);
			if (status < B_OK)
				RETURN_ERROR(status);

			// If we will remove the last key, the iterator will be set
			// to the next node after the current - if there aren't any
			// more nodes, we need a way to prevent the TreeIterators from
			// touching the old node again; we use BPLUSTREE_FREE for this
			off_t next = node->RightLink() == BPLUSTREE_NULL
				? BPLUSTREE_FREE : node->RightLink();
			_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
				? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, -1);

			// is this a duplicate entry?
			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
					node->Values()[nodeAndKey.keyIndex]))) {
				if (fAllowDuplicates) {
					return _RemoveDuplicate(transaction, node, cached,
						nodeAndKey.keyIndex, value);
				} else {
					FATAL(("duplicate node found where no duplicates are allowed!\n"));
					RETURN_ERROR(B_ERROR);
				}
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// if it's an empty root node, we have to convert it
		// to a leaf node by dropping the overflow link, or,
		// if it's already a leaf node, just empty it
		if (nodeAndKey.nodeOffset == fHeader->RootNode()
			&& (node->NumKeys() == 0 || (node->NumKeys() == 1 && node->IsLeaf()))) {
			writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
			writableNode->all_key_count = 0;
			writableNode->all_key_length = 0;

			// if we've made a leaf node out of the root node, we need
			// to reset the maximum number of levels in the header
			if (fHeader->MaxNumberOfLevels() != 1) {
				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
			}
			return B_OK;
		}

		// if there is only one key left, we don't have to remove
		// it, we can just dump the node (index nodes still have
		// the overflow link, so we have to drop the last key)
		if (writableNode->NumKeys() > 1
			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
			_RemoveKey(writableNode, nodeAndKey.keyIndex);
			return B_OK;
		}

		// when we are here, we can just free the node, but
		// we have to update the right/left link of the
		// siblings first
		CachedNode otherCached(this);
		bplustree_node *other = otherCached.SetToWritable(transaction,
			writableNode->LeftLink());
		if (other != NULL)
			other->right_link = writableNode->right_link;

		if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL)
			other->left_link = writableNode->left_link;

		cached.Free(transaction, nodeAndKey.nodeOffset);
	}
	RETURN_ERROR(B_ERROR);
}

/*!
	Replaces the value for the key in the tree.
	Returns B_OK if the key could be found and its value replaced,
	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
	to indicate that something went terribly wrong.
	Note that this doesn't work with duplicates - it will just
	return B_BAD_TYPE if you call this function on a tree where
	duplicates are allowed.
*/
status_t
BPlusTree::Replace(Transaction &transaction, const uint8 *key,
	uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	// lock access to stream (a read lock is okay for this purpose)
	ReadLocked locked(fStream->Lock());

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	const bplustree_node *node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			if (status == B_OK) {
				bplustree_node *writableNode = cached.MakeWritable(transaction);
				if (writableNode != NULL)
					writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value);
				else
					status = B_IO_ERROR;
			}

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


/*!
	Searches the key in the tree, and stores the offset found in
	_value, if successful.
	It's very similar to BPlusTree::SeekDown(), but doesn't fill
	a stack while it descends the tree.
	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
	if not. It can also return other errors to indicate that
	something went wrong.
	Note that this doesn't work with duplicates - it will just
	return B_BAD_TYPE if you call this function on a tree where
	duplicates are allowed.
*/
status_t
BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	// lock access to stream
	ReadLocked locked(fStream->Lock());

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	const bplustree_node *node;

#ifdef DEBUG
	int32 levels = 0;
#endif

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset);

#ifdef DEBUG
		levels++;
#endif
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			if (status == B_OK && _value != NULL)
				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);

#ifdef DEBUG
			if (levels != (int32)fHeader->MaxNumberOfLevels())
				DEBUGGER(("levels don't match"));
#endif
			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
	RETURN_ERROR(B_ERROR);
}


// #pragma mark -


TreeIterator::TreeIterator(BPlusTree *tree)
	:
	fTree(tree),
	fCurrentNodeOffset(BPLUSTREE_NULL),
	fNext(NULL)
{
	tree->_AddIterator(this);
}


TreeIterator::~TreeIterator()
{
	if (fTree)
		fTree->_RemoveIterator(this);
}


status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fHeader == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	off_t nodeOffset = fTree->fHeader->RootNode();
	CachedNode cached(fTree);
	const bplustree_node *node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
					- 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}

RETURN_ERROR(B_BAD_VALUE); 1899 1900 // lock access to stream 1901 ReadLocked locked(fTree->fStream->Lock()); 1902 1903 off_t nodeOffset = fTree->fHeader->RootNode(); 1904 CachedNode cached(fTree); 1905 const bplustree_node *node; 1906 1907 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1908 // is the node a leaf node? 1909 if (node->OverflowLink() == BPLUSTREE_NULL) { 1910 fCurrentNodeOffset = nodeOffset; 1911 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 1912 fDuplicateNode = BPLUSTREE_NULL; 1913 1914 return B_OK; 1915 } 1916 1917 // get the next node offset depending on the direction (and whether there 1918 // are any keys in that node at all) 1919 off_t nextOffset; 1920 if (to == BPLUSTREE_END || node->all_key_count == 0) 1921 nextOffset = node->OverflowLink(); 1922 else { 1923 if (node->AllKeyLength() > fTree->fNodeSize 1924 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 1925 - 8 * node->NumKeys()) 1926 RETURN_ERROR(B_ERROR); 1927 1928 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 1929 } 1930 if (nextOffset == nodeOffset) 1931 break; 1932 1933 nodeOffset = nextOffset; 1934 } 1935 FATAL(("%s fails\n", __FUNCTION__)); 1936 1937 RETURN_ERROR(B_ERROR); 1938 } 1939 1940 1941 /*! 1942 Iterates through the tree in the specified direction. 1943 When it iterates through duplicates, the "key" is only updated for the 1944 first entry - if you need to know when this happens, use the "duplicate" 1945 parameter, which is 0 for no duplicate, 1 for the first, and 2 for all 1946 the other duplicates. 1947 That's not too nice, but saves the 256 bytes that would be needed to 1948 store the last key - if this will ever become an issue, it will be 1949 easy to change. 1950 The other advantage of this is that queries can skip all duplicates 1951 at once when they are not relevant to them. 1952 */ 1953 status_t 1954 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, 1955 uint16 maxLength, off_t *value, uint16 *duplicate) 1956 { 1957 if (fTree == NULL) 1958 return B_INTERRUPTED; 1959 1960 bool forward = direction == BPLUSTREE_FORWARD; 1961 1962 if (fCurrentNodeOffset == BPLUSTREE_NULL 1963 && Goto(forward ?
BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 1964 RETURN_ERROR(B_ERROR); 1965 1966 // if the tree was emptied since the last call 1967 if (fCurrentNodeOffset == BPLUSTREE_FREE) 1968 return B_ENTRY_NOT_FOUND; 1969 1970 // lock access to stream 1971 ReadLocked locked(fTree->fStream->Lock()); 1972 1973 CachedNode cached(fTree); 1974 const bplustree_node *node; 1975 1976 if (fDuplicateNode != BPLUSTREE_NULL) { 1977 // regardless of traverse direction the duplicates are always presented in 1978 // the same order; since they are all considered as equal, this shouldn't 1979 // cause any problems 1980 1981 if (!fIsFragment || fDuplicate < fNumDuplicates) 1982 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 1983 else 1984 node = NULL; 1985 1986 if (node != NULL) { 1987 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 1988 // if the node is out of duplicates, we go directly to the next one 1989 fDuplicateNode = node->RightLink(); 1990 if (fDuplicateNode != BPLUSTREE_NULL 1991 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 1992 fNumDuplicates = node->CountDuplicates(fDuplicateNode, false); 1993 fDuplicate = 0; 1994 } 1995 } 1996 if (fDuplicate < fNumDuplicates) { 1997 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++); 1998 if (duplicate) 1999 *duplicate = 2; 2000 return B_OK; 2001 } 2002 } 2003 fDuplicateNode = BPLUSTREE_NULL; 2004 } 2005 2006 off_t savedNodeOffset = fCurrentNodeOffset; 2007 int32 savedKey = fCurrentKey; 2008 2009 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 2010 RETURN_ERROR(B_ERROR); 2011 2012 if (duplicate) 2013 *duplicate = 0; 2014 2015 fCurrentKey += direction; 2016 2017 // is the current key in the current node? 2018 while ((forward && fCurrentKey >= node->NumKeys()) 2019 || (!forward && fCurrentKey < 0)) { 2020 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2021 2022 // are there any more nodes? 2023 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2024 node = cached.SetTo(fCurrentNodeOffset); 2025 if (!node) 2026 RETURN_ERROR(B_ERROR); 2027 2028 // reset current key 2029 fCurrentKey = forward ? 0 : node->NumKeys(); 2030 } else { 2031 // there are no nodes left, so turn back to the last key 2032 fCurrentNodeOffset = savedNodeOffset; 2033 fCurrentKey = savedKey; 2034 2035 return B_ENTRY_NOT_FOUND; 2036 } 2037 } 2038 2039 if (node->all_key_count == 0) 2040 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2041 2042 uint16 length; 2043 uint8 *keyStart = node->KeyAt(fCurrentKey, &length); 2044 if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize 2045 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2046 fTree->fStream->GetVolume()->Panic(); 2047 RETURN_ERROR(B_BAD_DATA); 2048 } 2049 2050 length = min_c(length, maxLength); 2051 memcpy(key, keyStart, length); 2052 2053 if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) { 2054 // terminate string type 2055 if (length == maxLength) 2056 length--; 2057 ((char *)key)[length] = '\0'; 2058 } 2059 *keyLength = length; 2060 2061 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2062 2063 // duplicate fragments? 
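	// Descriptive note: a value that refers to duplicates is not a plain
	// inode ID but an encoded link. LinkType() extracts the link type
	// (duplicate fragment vs. full duplicate node), FragmentOffset() yields
	// the offset of the node holding the duplicates, and the low bits of the
	// link select the fragment within that node - the same "& 0x3ff" used by
	// CountDuplicates() and DuplicateAt() below. The code that follows
	// decodes the link accordingly and positions the iterator on the first
	// duplicate.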
2064 uint8 type = bplustree_node::LinkType(offset); 2065 if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) { 2066 fDuplicateNode = offset; 2067 2068 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 2069 if (node == NULL) 2070 RETURN_ERROR(B_ERROR); 2071 2072 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 2073 2074 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 2075 if (fNumDuplicates) { 2076 offset = node->DuplicateAt(offset, fIsFragment, 0); 2077 fDuplicate = 1; 2078 if (duplicate) 2079 *duplicate = 1; 2080 } else { 2081 // shouldn't happen, but we're dealing here with potentially corrupt disks... 2082 fDuplicateNode = BPLUSTREE_NULL; 2083 offset = 0; 2084 } 2085 } 2086 *value = offset; 2087 2088 return B_OK; 2089 } 2090 2091 2092 /*! 2093 This is more or less a copy of BPlusTree::Find() - but it just 2094 sets the current position in the iterator, regardless of whether the 2095 key could be found or not. 2096 */ 2097 status_t 2098 TreeIterator::Find(const uint8 *key, uint16 keyLength) 2099 { 2100 if (fTree == NULL) 2101 return B_INTERRUPTED; 2102 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2103 || key == NULL) 2104 RETURN_ERROR(B_BAD_VALUE); 2105 2106 // lock access to stream 2107 ReadLocked locked(fTree->fStream->Lock()); 2108 2109 off_t nodeOffset = fTree->fHeader->RootNode(); 2110 2111 CachedNode cached(fTree); 2112 const bplustree_node *node; 2113 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2114 uint16 keyIndex = 0; 2115 off_t nextOffset; 2116 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, 2117 &nextOffset); 2118 2119 if (node->OverflowLink() == BPLUSTREE_NULL) { 2120 fCurrentNodeOffset = nodeOffset; 2121 fCurrentKey = keyIndex - 1; 2122 fDuplicateNode = BPLUSTREE_NULL; 2123 2124 return status; 2125 } else if (nextOffset == nodeOffset) 2126 RETURN_ERROR(B_ERROR); 2127 2128 nodeOffset = nextOffset; 2129 } 2130 RETURN_ERROR(B_ERROR); 2131 } 2132 2133 2134 void 2135 TreeIterator::SkipDuplicates() 2136 { 2137 fDuplicateNode = BPLUSTREE_NULL; 2138 } 2139 2140 2141 void 2142 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change) 2143 { 2144 if (offset != fCurrentNodeOffset) 2145 return; 2146 2147 if (nextOffset != BPLUSTREE_NULL) { 2148 fCurrentNodeOffset = nextOffset; 2149 if (splitAt <= fCurrentKey) { 2150 fCurrentKey -= splitAt; 2151 keyIndex -= splitAt; 2152 } 2153 } 2154 2155 // Adjust fCurrentKey to point to the same key as before. 2156 // Note that if a key is inserted at the current position, 2157 // it won't be included in this tree transition. 2158 if (keyIndex <= fCurrentKey) 2159 fCurrentKey += change; 2160 2161 // ToDo: duplicate handling! 2162 } 2163 2164 2165 void 2166 TreeIterator::Stop() 2167 { 2168 fTree = NULL; 2169 } 2170 2171 2172 #ifdef DEBUG 2173 void 2174 TreeIterator::Dump() 2175 { 2176 __out("TreeIterator at %p:\n", this); 2177 __out("\tfTree = %p\n", fTree); 2178 __out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset); 2179 __out("\tfCurrentKey = %ld\n", fCurrentKey); 2180 __out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode); 2181 __out("\tfDuplicate = %u\n", fDuplicate); 2182 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2183 __out("\tfIsFragment = %s\n", fIsFragment ?
"true" : "false"); 2184 } 2185 #endif 2186 2187 2188 // #pragma mark - 2189 2190 2191 void 2192 bplustree_node::Initialize() 2193 { 2194 left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2195 all_key_count = 0; 2196 all_key_length = 0; 2197 } 2198 2199 2200 uint8 * 2201 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const 2202 { 2203 if (index < 0 || index > NumKeys()) 2204 return NULL; 2205 2206 uint8 *keyStart = Keys(); 2207 uint16 *keyLengths = KeyLengths(); 2208 2209 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2210 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2211 if (index > 0) 2212 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2213 2214 return keyStart; 2215 } 2216 2217 2218 uint8 2219 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2220 { 2221 // the duplicate fragment handling is currently hard-coded to a node size 2222 // of 1024 bytes - with future versions of BFS, this may be a problem 2223 2224 if (isFragment) { 2225 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2226 2227 return ((off_t *)this)[fragment]; 2228 } 2229 return OverflowLink(); 2230 } 2231 2232 2233 off_t 2234 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2235 { 2236 uint32 start; 2237 if (isFragment) 2238 start = 8 * ((uint64)offset & 0x3ff); 2239 else 2240 start = 2; 2241 2242 return ((off_t *)this)[start + 1 + index]; 2243 } 2244 2245 2246 /*! Although the name suggests it, this function doesn't return the real 2247 used fragment count; at least, it can only count to two: it returns 2248 0, if there is no fragment used, 1 if there is only one fragment 2249 used, and 2 if there are at least 2 fragments used. 
2250 */ 2251 uint32 2252 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2253 { 2254 uint32 used = 0; 2255 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2256 duplicate_array *array = FragmentAt(i); 2257 if (array->count > 0 && ++used > 1) 2258 return used; 2259 } 2260 return used; 2261 } 2262 2263 2264 #ifdef DEBUG 2265 status_t 2266 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2267 { 2268 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2269 DEBUGGER(("invalid node: key/length count")); 2270 2271 for (int32 i = 0; i < NumKeys(); i++) { 2272 uint16 length; 2273 uint8 *key = KeyAt(i, &length); 2274 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize 2275 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2276 dprintf("node %p, key %ld\n", this, i); 2277 DEBUGGER(("invalid node: keys corrupted")); 2278 return B_BAD_DATA; 2279 } 2280 if (Values()[i] == -1) { 2281 dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]); 2282 DEBUGGER(("invalid node: values corrupted")); 2283 return B_BAD_DATA; 2284 } 2285 } 2286 return B_OK; 2287 } 2288 #endif 2289 2290 2291 // #pragma mark - 2292 2293 2294 int32 2295 compareKeys(type_code type, const void *key1, int keyLength1, 2296 const void *key2, int keyLength2) 2297 { 2298 // if one of the keys is NULL, bail out gracefully 2299 if (key1 == NULL || key2 == NULL) { 2300 // even if it's most probably a bug in the calling code, we try to 2301 // give a meaningful result 2302 if (key1 == NULL && key2 != NULL) 2303 return 1; 2304 if (key2 == NULL && key1 != NULL) 2305 return -1; 2306 2307 return 0; 2308 } 2309 2310 switch (type) { 2311 case B_STRING_TYPE: 2312 { 2313 int result = memcmp(key1, key2, min_c(keyLength1, keyLength2)); 2314 if (result == 0) { 2315 // ignore trailing null bytes 2316 if ((keyLength1 == keyLength2 + 1 2317 && ((uint8 *)key1)[keyLength2] == '\0') 2318 || (keyLength2 == keyLength1 + 1 2319 && ((uint8 *)key2)[keyLength1] == '\0')) 2320 return 0; 2321 2322 result = keyLength1 - keyLength2; 2323 } 2324 2325 return result; 2326 } 2327 2328 case B_SSIZE_T_TYPE: 2329 case B_INT32_TYPE: 2330 return *(int32 *)key1 - *(int32 *)key2; 2331 2332 case B_SIZE_T_TYPE: 2333 case B_UINT32_TYPE: 2334 if (*(uint32 *)key1 == *(uint32 *)key2) 2335 return 0; 2336 if (*(uint32 *)key1 > *(uint32 *)key2) 2337 return 1; 2338 2339 return -1; 2340 2341 case B_OFF_T_TYPE: 2342 case B_INT64_TYPE: 2343 if (*(int64 *)key1 == *(int64 *)key2) 2344 return 0; 2345 if (*(int64 *)key1 > *(int64 *)key2) 2346 return 1; 2347 2348 return -1; 2349 2350 case B_UINT64_TYPE: 2351 if (*(uint64 *)key1 == *(uint64 *)key2) 2352 return 0; 2353 if (*(uint64 *)key1 > *(uint64 *)key2) 2354 return 1; 2355 2356 return -1; 2357 2358 case B_FLOAT_TYPE: 2359 { 2360 float result = *(float *)key1 - *(float *)key2; 2361 if (result == 0.0f) 2362 return 0; 2363 2364 return (result < 0.0f) ? -1 : 1; 2365 } 2366 2367 case B_DOUBLE_TYPE: 2368 { 2369 double result = *(double *)key1 - *(double *)key2; 2370 if (result == 0.0) 2371 return 0; 2372 2373 return (result < 0.0) ? -1 : 1; 2374 } 2375 } 2376 2377 // if the type is unknown, the entries don't match... 2378 return -1; 2379 } 2380 2381
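

#if 0
// Illustrative usage sketch only (deliberately not compiled): it shows how the
// TreeIterator API above is typically driven for a string-keyed tree such as a
// directory. "tree" stands for a hypothetical, already constructed BPlusTree;
// error handling is kept to the bare minimum.
static void
dump_tree_entries(BPlusTree *tree)
{
	TreeIterator iterator(tree);
		// the iterator positions itself at the beginning on the first
		// call to Traverse()

	char name[BPLUSTREE_MAX_KEY_LENGTH + 1];
	uint16 length;
	off_t value;

	// Traverse() copies the current key into "name", stores its value, and
	// returns B_ENTRY_NOT_FOUND once the last key has been delivered;
	// duplicates are reported with the key of their first entry.
	while (iterator.Traverse(BPLUSTREE_FORWARD, name, &length,
			sizeof(name), &value, NULL) == B_OK)
		dprintf("key \"%s\" -> value %Ld\n", name, value);
}
#endif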