1 /* 2 * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 * 5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares 6 * no code but achieves binary compatibility with the on disk format. 7 */ 8 9 //! B+Tree implementation 10 11 12 #include "Debug.h" 13 #include "BPlusTree.h" 14 #include "Inode.h" 15 #include "Utility.h" 16 17 18 #ifdef DEBUG 19 class NodeChecker { 20 public: 21 NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text) 22 : 23 fNode(node), 24 fSize(nodeSize), 25 fText(text) 26 { 27 Check("integrity check failed on construction."); 28 } 29 30 ~NodeChecker() 31 { 32 Check("integrity check failed on destruction."); 33 } 34 35 void 36 Check(const char *message) 37 { 38 if (fNode->CheckIntegrity(fSize) < B_OK) 39 dprintf("%s: %s\n", fText, message); 40 } 41 42 private: 43 const bplustree_node *fNode; 44 int32 fSize; 45 const char *fText; 46 }; 47 #endif 48 49 50 // Node Caching for the BPlusTree class 51 // 52 // With write support, there is the need for a function that allocates new 53 // nodes by either returning empty nodes, or by growing the file's data stream 54 // 55 // !! The CachedNode class assumes that you have properly locked the stream 56 // !! before asking for nodes. 57 // 58 // Note: This code will fail if the block size is smaller than the node size! 59 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 60 // is hard-coded to 1024 bytes, that's not an issue now. 61 62 void 63 CachedNode::UnsetUnchanged(Transaction &transaction) 64 { 65 if (fTree == NULL || fTree->fStream == NULL) 66 return; 67 68 if (fNode != NULL) { 69 void *cache = fTree->fStream->GetVolume()->BlockCache(); 70 71 block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID()); 72 block_cache_put(cache, fBlockNumber); 73 fNode = NULL; 74 } 75 } 76 77 78 void 79 CachedNode::Unset() 80 { 81 if (fTree == NULL || fTree->fStream == NULL) 82 return; 83 84 if (fNode != NULL) { 85 block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber); 86 fNode = NULL; 87 } 88 } 89 90 91 const bplustree_node * 92 CachedNode::SetTo(off_t offset, bool check) 93 { 94 if (fTree == NULL || fTree->fStream == NULL) { 95 REPORT_ERROR(B_BAD_VALUE); 96 return NULL; 97 } 98 99 Unset(); 100 101 // You can only ask for nodes at valid positions - you can't 102 // even access the b+tree header with this method (use SetToHeader() 103 // instead) 104 if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize 105 || offset <= 0 106 || (offset % fTree->fNodeSize) != 0) 107 return NULL; 108 109 if (InternalSetTo(NULL, offset) != NULL && check) { 110 // sanity checks (links, all_key_count) 111 if (!fTree->fHeader->CheckNode(fNode)) { 112 FATAL(("invalid node (%p) read from offset %Ld, inode at %Ld\n", 113 fNode, offset, fTree->fStream->ID())); 114 return NULL; 115 } 116 } 117 return fNode; 118 } 119 120 121 bplustree_node * 122 CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check) 123 { 124 if (fTree == NULL || fTree->fStream == NULL) { 125 REPORT_ERROR(B_BAD_VALUE); 126 return NULL; 127 } 128 129 Unset(); 130 131 // You can only ask for nodes at valid positions - you can't 132 // even access the b+tree header with this method (use SetToHeader() 133 // instead) 134 if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize 135 || offset <= 0 136 || (offset % fTree->fNodeSize) != 0) 137 return NULL; 138 139 if (InternalSetTo(&transaction, offset) != NULL && check) { 140 // sanity checks (links, all_key_count) 141 if (!fTree->fHeader->CheckNode(fNode)) { 142 FATAL(("invalid node [%p] read from offset %Ld, inode at %Ld\n", 143 fNode, offset, fTree->fStream->ID())); 144 return NULL; 145 } 146 } 147 return fNode; 148 } 149 150 151 bplustree_node * 152 CachedNode::MakeWritable(Transaction &transaction) 153 { 154 if (fNode == NULL) 155 return NULL; 156 157 if (block_cache_make_writable(transaction.GetVolume()->BlockCache(), 158 fBlockNumber, transaction.ID()) == B_OK) 159 return fNode; 160 161 return NULL; 162 } 163 164 165 bplustree_header * 166 CachedNode::MakeWritableHeader(Transaction &transaction) 167 { 168 return (bplustree_header *)MakeWritable(transaction); 169 } 170 171 172 const bplustree_header * 173 CachedNode::SetToHeader() 174 { 175 if (fTree == NULL || fTree->fStream == NULL) { 176 REPORT_ERROR(B_BAD_VALUE); 177 return NULL; 178 } 179 180 Unset(); 181 182 InternalSetTo(NULL, 0LL); 183 return (bplustree_header *)fNode; 184 } 185 186 187 bplustree_header * 188 CachedNode::SetToWritableHeader(Transaction &transaction) 189 { 190 if (fTree == NULL || fTree->fStream == NULL) { 191 REPORT_ERROR(B_BAD_VALUE); 192 return NULL; 193 } 194 195 Unset(); 196 197 InternalSetTo(&transaction, 0LL); 198 return (bplustree_header *)fNode; 199 } 200 201 202 bplustree_node * 203 CachedNode::InternalSetTo(Transaction *transaction, off_t offset) 204 { 205 fNode = NULL; 206 fOffset = offset; 207 208 off_t fileOffset; 209 block_run run; 210 if (offset < fTree->fStream->Size() 211 && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) { 212 Volume *volume = fTree->fStream->GetVolume(); 213 214 int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); 215 fBlockNumber = volume->ToBlock(run) + blockOffset; 216 uint8 *block; 217 218 if (transaction != NULL) { 219 block = (uint8 *)block_cache_get_writable(volume->BlockCache(), 220 fBlockNumber, transaction->ID()); 221 fWritable = true; 222 } else { 223 block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber); 224 fWritable = false; 225 } 226 227 if (block) { 228 // the node is somewhere in that block... (confusing offset calculation) 229 fNode = (bplustree_node *)(block + offset - 230 (fileOffset + (blockOffset << volume->BlockShift()))); 231 } else 232 REPORT_ERROR(B_IO_ERROR); 233 } 234 return fNode; 235 } 236 237 238 status_t 239 CachedNode::Free(Transaction &transaction, off_t offset) 240 { 241 if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL) 242 RETURN_ERROR(B_BAD_VALUE); 243 244 // ToDo: scan the free nodes list and remove all nodes at the end 245 // of the tree - perhaps that shouldn't be done everytime that 246 // function is called, perhaps it should be done when the directory 247 // inode is closed or based on some calculation or whatever... 248 249 bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction); 250 if (header == NULL) 251 return B_IO_ERROR; 252 253 // if the node is the last one in the tree, we shrink 254 // the tree and file size by one node 255 off_t lastOffset = header->MaximumSize() - fTree->fNodeSize; 256 if (offset == lastOffset) { 257 status_t status = fTree->fStream->SetFileSize(transaction, lastOffset); 258 if (status < B_OK) 259 return status; 260 261 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset); 262 return B_OK; 263 } 264 265 // add the node to the free nodes list 266 fNode->left_link = header->free_node_pointer; 267 fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 268 269 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset); 270 return B_OK; 271 } 272 273 274 status_t 275 CachedNode::Allocate(Transaction &transaction, bplustree_node **_node, 276 off_t *_offset) 277 { 278 if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL) 279 RETURN_ERROR(B_BAD_VALUE); 280 281 Unset(); 282 283 bplustree_header *header; 284 status_t status; 285 286 // if there are any free nodes, recycle them 287 if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) { 288 header = fTree->fCachedHeader.MakeWritableHeader(transaction); 289 if (header == NULL) 290 return B_IO_ERROR; 291 292 *_offset = header->FreeNode(); 293 *_node = fNode; 294 295 // set new free node pointer 296 header->free_node_pointer = fNode->left_link; 297 fNode->Initialize(); 298 return B_OK; 299 } 300 301 // allocate space for a new node 302 Inode *stream = fTree->fStream; 303 if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK) 304 return status; 305 306 header = fTree->fCachedHeader.MakeWritableHeader(transaction); 307 if (header == NULL) 308 return B_IO_ERROR; 309 310 // the maximum_size has to be changed before the call to SetTo() - or 311 // else it will fail because the requested node is out of bounds 312 off_t offset = header->MaximumSize(); 313 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() 314 + fTree->fNodeSize); 315 316 if (SetToWritable(transaction, offset, false) != NULL) { 317 fNode->Initialize(); 318 319 *_offset = offset; 320 *_node = fNode; 321 return B_OK; 322 } 323 324 // revert header size to old value 325 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() 326 - fTree->fNodeSize); 327 RETURN_ERROR(B_ERROR); 328 } 329 330 331 // #pragma mark - 332 333 334 BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize) 335 : 336 fStream(NULL), 337 fHeader(NULL), 338 fCachedHeader(this) 339 { 340 SetTo(transaction, stream); 341 } 342 343 344 BPlusTree::BPlusTree(Inode *stream) 345 : 346 fStream(NULL), 347 fHeader(NULL), 348 fCachedHeader(this) 349 { 350 SetTo(stream); 351 } 352 353 354 BPlusTree::BPlusTree() 355 : 356 fStream(NULL), 357 fHeader(NULL), 358 fCachedHeader(this), 359 fNodeSize(BPLUSTREE_NODE_SIZE), 360 fAllowDuplicates(true), 361 fStatus(B_NO_INIT) 362 { 363 } 364 365 366 BPlusTree::~BPlusTree() 367 { 368 // if there are any TreeIterators left, we need to stop them 369 // (can happen when the tree's inode gets deleted while 370 // traversing the tree - a TreeIterator doesn't lock the inode) 371 if (fIteratorLock.Lock() < B_OK) 372 return; 373 374 TreeIterator *iterator = NULL; 375 while ((iterator = fIterators.Next(iterator)) != NULL) 376 iterator->Stop(); 377 378 fIteratorLock.Unlock(); 379 } 380 381 382 /*! Create a new B+Tree on the specified stream */ 383 status_t 384 BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize) 385 { 386 // initializes in-memory B+Tree 387 388 fStream = stream; 389 390 bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction); 391 if (header == NULL) { 392 // allocate space for new header + node! 393 fStatus = stream->SetFileSize(transaction, nodeSize * 2); 394 if (fStatus < B_OK) 395 RETURN_ERROR(fStatus); 396 397 header = fCachedHeader.SetToWritableHeader(transaction); 398 if (header == NULL) 399 RETURN_ERROR(fStatus = B_ERROR); 400 } 401 402 fHeader = header; 403 fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR 404 && stream->BlockRun() != stream->Parent()) 405 || (stream->Mode() & S_ALLOW_DUPS) != 0; 406 407 fNodeSize = nodeSize; 408 409 // initialize b+tree header 410 header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC); 411 header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize); 412 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 413 header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode())); 414 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize); 415 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 416 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2); 417 418 // initialize b+tree root node 419 CachedNode cached(this); 420 cached.SetToWritable(transaction, header->RootNode(), false); 421 if (cached.Node() == NULL) 422 RETURN_ERROR(B_IO_ERROR); 423 424 cached.Node()->Initialize(); 425 426 return fStatus = B_OK; 427 } 428 429 430 status_t 431 BPlusTree::SetTo(Inode *stream) 432 { 433 if (stream == NULL) 434 RETURN_ERROR(fStatus = B_BAD_VALUE); 435 436 // get on-disk B+Tree header 437 438 fCachedHeader.Unset(); 439 fStream = stream; 440 441 fHeader = fCachedHeader.SetToHeader(); 442 if (fHeader == NULL) 443 RETURN_ERROR(fStatus = B_NO_INIT); 444 445 // is header valid? 446 447 if (fHeader->MaximumSize() != stream->Size()) { 448 FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n", 449 fHeader->MaximumSize(), stream->Size())); 450 // we can't change the header since we don't have a transaction 451 //fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size()); 452 } 453 if (fHeader->Magic() != BPLUSTREE_MAGIC 454 || (fHeader->RootNode() % fHeader->NodeSize()) != 0 455 || !fHeader->IsValidLink(fHeader->RootNode()) 456 || !fHeader->IsValidLink(fHeader->FreeNode())) { 457 #ifdef DEBUG 458 dump_bplustree_header(fHeader); 459 dump_block((const char *)fHeader, 128); 460 #endif 461 RETURN_ERROR(fStatus = B_BAD_DATA); 462 } 463 464 fNodeSize = fHeader->NodeSize(); 465 466 { 467 uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, 468 S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX, 469 S_DOUBLE_INDEX}; 470 uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX 471 | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX 472 | S_FLOAT_INDEX | S_DOUBLE_INDEX); 473 474 if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE 475 || (stream->Mode() & S_INDEX_DIR) 476 && toMode[fHeader->DataType()] != mode 477 || !stream->IsContainer()) { 478 D( dump_bplustree_header(fHeader); 479 dump_inode(&stream->Node()); 480 ); 481 RETURN_ERROR(fStatus = B_BAD_TYPE); 482 } 483 484 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 485 // in the original BFS code - we will honour it nevertheless 486 fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR 487 && stream->BlockRun() != stream->Parent()) 488 || (stream->Mode() & S_ALLOW_DUPS) != 0; 489 } 490 491 CachedNode cached(this, fHeader->RootNode()); 492 RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA); 493 } 494 495 496 status_t 497 BPlusTree::InitCheck() 498 { 499 return fStatus; 500 } 501 502 503 int32 504 BPlusTree::TypeCodeToKeyType(type_code code) 505 { 506 switch (code) { 507 case B_STRING_TYPE: 508 return BPLUSTREE_STRING_TYPE; 509 case B_SSIZE_T_TYPE: 510 case B_INT32_TYPE: 511 return BPLUSTREE_INT32_TYPE; 512 case B_SIZE_T_TYPE: 513 case B_UINT32_TYPE: 514 return BPLUSTREE_UINT32_TYPE; 515 case B_OFF_T_TYPE: 516 case B_INT64_TYPE: 517 return BPLUSTREE_INT64_TYPE; 518 case B_UINT64_TYPE: 519 return BPLUSTREE_UINT64_TYPE; 520 case B_FLOAT_TYPE: 521 return BPLUSTREE_FLOAT_TYPE; 522 case B_DOUBLE_TYPE: 523 return BPLUSTREE_DOUBLE_TYPE; 524 } 525 return -1; 526 } 527 528 529 int32 530 BPlusTree::ModeToKeyType(mode_t mode) 531 { 532 switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 533 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { 534 case S_INT_INDEX: 535 return BPLUSTREE_INT32_TYPE; 536 case S_UINT_INDEX: 537 return BPLUSTREE_UINT32_TYPE; 538 case S_LONG_LONG_INDEX: 539 return BPLUSTREE_INT64_TYPE; 540 case S_ULONG_LONG_INDEX: 541 return BPLUSTREE_UINT64_TYPE; 542 case S_FLOAT_INDEX: 543 return BPLUSTREE_FLOAT_TYPE; 544 case S_DOUBLE_INDEX: 545 return BPLUSTREE_DOUBLE_TYPE; 546 case S_STR_INDEX: 547 default: 548 // default is for standard directories 549 return BPLUSTREE_STRING_TYPE; 550 } 551 } 552 553 554 // #pragma mark - 555 556 557 void 558 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, 559 uint16 splitAt, int8 change) 560 { 561 // Although every iterator which is affected by this update currently 562 // waits on a semaphore, other iterators could be added/removed at 563 // any time, so we need to protect this loop 564 if (fIteratorLock.Lock() < B_OK) 565 return; 566 567 TreeIterator *iterator = NULL; 568 while ((iterator = fIterators.Next(iterator)) != NULL) 569 iterator->Update(offset, nextOffset, keyIndex, splitAt, change); 570 571 fIteratorLock.Unlock(); 572 } 573 574 575 void 576 BPlusTree::_AddIterator(TreeIterator *iterator) 577 { 578 if (fIteratorLock.Lock() < B_OK) 579 return; 580 581 fIterators.Add(iterator); 582 583 fIteratorLock.Unlock(); 584 } 585 586 587 void 588 BPlusTree::_RemoveIterator(TreeIterator *iterator) 589 { 590 if (fIteratorLock.Lock() < B_OK) 591 return; 592 593 fIterators.Remove(iterator); 594 595 fIteratorLock.Unlock(); 596 } 597 598 599 int32 600 BPlusTree::_CompareKeys(const void *key1, int keyLength1, const void *key2, 601 int keyLength2) 602 { 603 type_code type = 0; 604 switch (fHeader->data_type) { 605 case BPLUSTREE_STRING_TYPE: 606 type = B_STRING_TYPE; 607 break; 608 case BPLUSTREE_INT32_TYPE: 609 type = B_INT32_TYPE; 610 break; 611 case BPLUSTREE_UINT32_TYPE: 612 type = B_UINT32_TYPE; 613 break; 614 case BPLUSTREE_INT64_TYPE: 615 type = B_INT64_TYPE; 616 break; 617 case BPLUSTREE_UINT64_TYPE: 618 type = B_UINT64_TYPE; 619 break; 620 case BPLUSTREE_FLOAT_TYPE: 621 type = B_FLOAT_TYPE; 622 break; 623 case BPLUSTREE_DOUBLE_TYPE: 624 type = B_DOUBLE_TYPE; 625 break; 626 } 627 return compareKeys(type, key1, keyLength1, key2, keyLength2); 628 } 629 630 631 status_t 632 BPlusTree::_FindKey(const bplustree_node *node, const uint8 *key, 633 uint16 keyLength, uint16 *_index, off_t *_next) 634 { 635 #ifdef DEBUG 636 NodeChecker checker(node, fNodeSize, "find"); 637 #endif 638 639 if (node->all_key_count == 0) { 640 if (_index) 641 *_index = 0; 642 if (_next) 643 *_next = node->OverflowLink(); 644 return B_ENTRY_NOT_FOUND; 645 } 646 647 off_t *values = node->Values(); 648 int16 saveIndex = -1; 649 650 // binary search in the key array 651 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { 652 uint16 i = (first + last) >> 1; 653 654 uint16 searchLength; 655 uint8 *searchKey = node->KeyAt(i, &searchLength); 656 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) 657 > (uint8 *)node + fNodeSize 658 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { 659 fStream->GetVolume()->Panic(); 660 RETURN_ERROR(B_BAD_DATA); 661 } 662 663 int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength); 664 if (cmp < 0) { 665 last = i - 1; 666 saveIndex = i; 667 } else if (cmp > 0) { 668 saveIndex = first = i + 1; 669 } else { 670 if (_index) 671 *_index = i; 672 if (_next) 673 *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]); 674 return B_OK; 675 } 676 } 677 678 if (_index) 679 *_index = saveIndex; 680 if (_next) { 681 if (saveIndex == node->NumKeys()) 682 *_next = node->OverflowLink(); 683 else 684 *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); 685 } 686 return B_ENTRY_NOT_FOUND; 687 } 688 689 690 /*! 691 Prepares the stack to contain all nodes that were passed while 692 following the key, from the root node to the leaf node that could 693 or should contain that key. 694 */ 695 status_t 696 BPlusTree::_SeekDown(Stack<node_and_key> &stack, const uint8 *key, 697 uint16 keyLength) 698 { 699 // set the root node to begin with 700 node_and_key nodeAndKey; 701 nodeAndKey.nodeOffset = fHeader->RootNode(); 702 703 CachedNode cached(this); 704 const bplustree_node *node; 705 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 706 // if we are already on leaf level, we're done 707 if (node->OverflowLink() == BPLUSTREE_NULL) { 708 // node that the keyIndex is not properly set here (but it's not 709 // needed in the calling functions anyway)! 710 nodeAndKey.keyIndex = 0; 711 stack.Push(nodeAndKey); 712 return B_OK; 713 } 714 715 off_t nextOffset; 716 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex, 717 &nextOffset); 718 719 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 720 RETURN_ERROR(B_ERROR); 721 722 // put the node offset & the correct keyIndex on the stack 723 stack.Push(nodeAndKey); 724 725 nodeAndKey.nodeOffset = nextOffset; 726 } 727 728 FATAL(("BPlusTree::SeekDown() could not open node %Ld\n", 729 nodeAndKey.nodeOffset)); 730 return B_ERROR; 731 } 732 733 734 /*! 735 This will find a free duplicate fragment in the given bplustree_node. 736 The CachedNode will be set to the writable fragment on success. 737 */ 738 status_t 739 BPlusTree::_FindFreeDuplicateFragment(Transaction &transaction, 740 const bplustree_node *node, CachedNode &cached, 741 off_t *_offset, bplustree_node **_fragment, uint32 *_index) 742 { 743 off_t *values = node->Values(); 744 for (int32 i = 0; i < node->NumKeys(); i++) { 745 off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]); 746 747 // does the value link to a duplicate fragment? 748 if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT) 749 continue; 750 751 const bplustree_node *fragment = cached.SetTo( 752 bplustree_node::FragmentOffset(value), false); 753 if (fragment == NULL) { 754 FATAL(("Could not get duplicate fragment at %Ld\n", value)); 755 continue; 756 } 757 758 // see if there is some space left for us 759 uint32 num = bplustree_node::MaxFragments(fNodeSize); 760 for (uint32 j = 0; j < num; j++) { 761 duplicate_array *array = fragment->FragmentAt(j); 762 763 if (array->count == 0) { 764 // found an unused fragment 765 *_fragment = cached.MakeWritable(transaction); 766 if (*_fragment == NULL) 767 return B_IO_ERROR; 768 769 *_offset = bplustree_node::FragmentOffset(value); 770 *_index = j; 771 return B_OK; 772 } 773 } 774 } 775 return B_ENTRY_NOT_FOUND; 776 } 777 778 779 status_t 780 BPlusTree::_InsertDuplicate(Transaction &transaction, CachedNode &cached, 781 const bplustree_node *node, uint16 index, off_t value) 782 { 783 CachedNode cachedDuplicate(this); 784 off_t *values = node->Values(); 785 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 786 status_t status; 787 off_t offset; 788 789 if (bplustree_node::IsDuplicate(oldValue)) { 790 // If it's a duplicate fragment, try to insert it into that, or if it 791 // doesn't fit anymore, create a new duplicate node 792 793 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 794 bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction, 795 bplustree_node::FragmentOffset(oldValue), false); 796 if (duplicate == NULL) 797 return B_IO_ERROR; 798 799 duplicate_array *array = duplicate->FragmentAt( 800 bplustree_node::FragmentIndex(oldValue)); 801 if (array->count > NUM_FRAGMENT_VALUES 802 || array->count < 1) { 803 FATAL(("insertDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n", 804 (int)bplustree_node::FragmentIndex(oldValue), 805 bplustree_node::FragmentOffset(oldValue), 806 array->count)); 807 return B_BAD_DATA; 808 } 809 810 if (array->count < NUM_FRAGMENT_VALUES) { 811 array->Insert(value); 812 } else { 813 // test if the fragment will be empty if we remove this key's values 814 if (duplicate->FragmentsUsed(fNodeSize) < 2) { 815 // the node will be empty without our values, so let us 816 // reuse it as a duplicate node 817 offset = bplustree_node::FragmentOffset(oldValue); 818 819 memmove(duplicate->DuplicateArray(), array, 820 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 821 duplicate->left_link = duplicate->right_link 822 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 823 824 array = duplicate->DuplicateArray(); 825 array->Insert(value); 826 } else { 827 // create a new duplicate node 828 829 cachedDuplicate.UnsetUnchanged(transaction); 830 // the old duplicate has not been touched, so we can reuse it 831 832 bplustree_node *newDuplicate; 833 status = cachedDuplicate.Allocate(transaction, 834 &newDuplicate, &offset); 835 if (status < B_OK) 836 RETURN_ERROR(status); 837 838 // copy the array from the fragment node to the duplicate node 839 // and free the old entry (by zero'ing all values) 840 newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 841 array->count); 842 memcpy(&newDuplicate->all_key_count, &array->values[0], 843 array->count * sizeof(off_t)); 844 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 845 846 array = newDuplicate->DuplicateArray(); 847 array->Insert(value); 848 } 849 850 // update the main pointer to link to a duplicate node 851 if (cached.MakeWritable(transaction) == NULL) 852 return B_IO_ERROR; 853 854 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 855 BPLUSTREE_DUPLICATE_NODE, offset)); 856 } 857 858 return B_OK; 859 } 860 861 // Put the value into a dedicated duplicate node 862 863 // search for free space in the duplicate nodes of that key 864 duplicate_array *array; 865 const bplustree_node *duplicate; 866 off_t duplicateOffset; 867 do { 868 duplicateOffset = bplustree_node::FragmentOffset(oldValue); 869 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 870 if (duplicate == NULL) 871 return B_IO_ERROR; 872 873 array = duplicate->DuplicateArray(); 874 if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) { 875 FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n", 876 duplicateOffset, array->count)); 877 return B_BAD_DATA; 878 } 879 } while (array->count >= NUM_DUPLICATE_VALUES 880 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); 881 882 bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction); 883 if (writableDuplicate == NULL) 884 return B_IO_ERROR; 885 886 if (array->count < NUM_DUPLICATE_VALUES) { 887 array = writableDuplicate->DuplicateArray(); 888 array->Insert(value); 889 } else { 890 // no space left - add a new duplicate node 891 892 bplustree_node *newDuplicate; 893 status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset); 894 if (status < B_OK) 895 RETURN_ERROR(status); 896 897 // link the two nodes together 898 writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); 899 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); 900 901 array = newDuplicate->DuplicateArray(); 902 array->count = 0; 903 array->Insert(value); 904 } 905 return B_OK; 906 } 907 908 // Search for a free duplicate fragment or create a new one 909 // to insert the duplicate value into 910 911 uint32 fragmentIndex = 0; 912 bplustree_node *fragment; 913 if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate, 914 &offset, &fragment, &fragmentIndex) != B_OK) { 915 // allocate a new duplicate fragment node 916 if ((status = cachedDuplicate.Allocate(transaction, &fragment, 917 &offset)) < B_OK) 918 RETURN_ERROR(status); 919 920 memset(fragment, 0, fNodeSize); 921 } 922 923 duplicate_array *array = fragment->FragmentAt(fragmentIndex); 924 array->Insert(oldValue); 925 array->Insert(value); 926 927 if (cached.MakeWritable(transaction) == NULL) 928 return B_IO_ERROR; 929 930 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 931 BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex)); 932 933 return B_OK; 934 } 935 936 937 void 938 BPlusTree::_InsertKey(bplustree_node *node, uint16 index, uint8 *key, 939 uint16 keyLength, off_t value) 940 { 941 // should never happen, but who knows? 942 if (index > node->NumKeys()) 943 return; 944 945 off_t *values = node->Values(); 946 uint16 *keyLengths = node->KeyLengths(); 947 uint8 *keys = node->Keys(); 948 949 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); 950 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() 951 + keyLength); 952 953 off_t *newValues = node->Values(); 954 uint16 *newKeyLengths = node->KeyLengths(); 955 956 // move values and copy new value into them 957 memmove(newValues + index + 1, values + index, 958 sizeof(off_t) * (node->NumKeys() - 1 - index)); 959 memmove(newValues, values, sizeof(off_t) * index); 960 961 newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value); 962 963 // move and update key length index 964 for (uint16 i = node->NumKeys(); i-- > index + 1;) { 965 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 966 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength); 967 } 968 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); 969 970 int32 keyStart; 971 newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength 972 + (keyStart = index > 0 973 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0)); 974 975 // move keys and copy new key into them 976 uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]); 977 int32 size = node->AllKeyLength() - length; 978 if (size > 0) 979 memmove(keys + length, keys + length - keyLength, size); 980 981 memcpy(keys + keyStart, key, keyLength); 982 } 983 984 985 /*! 986 Splits the \a node into two halves - the other half will be put into \a other. 987 It also takes care to create a new overflow link if the node to split is an 988 index node. 989 */ 990 status_t 991 BPlusTree::_SplitNode(bplustree_node *node, off_t nodeOffset, 992 bplustree_node *other, off_t otherOffset, uint16 *_keyIndex, uint8 *key, 993 uint16 *_keyLength, off_t *_value) 994 { 995 if (*_keyIndex > node->NumKeys() + 1) 996 return B_BAD_VALUE; 997 998 uint16 *inKeyLengths = node->KeyLengths(); 999 off_t *inKeyValues = node->Values(); 1000 uint8 *inKeys = node->Keys(); 1001 uint8 *outKeys = other->Keys(); 1002 int32 keyIndex = *_keyIndex; // can become less than zero! 1003 1004 if (keyIndex > node->NumKeys()) { 1005 FATAL(("key index out of bounds: %ld, num keys: %u\n", keyIndex, 1006 node->NumKeys())); 1007 return B_BAD_VALUE; 1008 } 1009 1010 // how many keys will fit in one (half) page? 1011 // that loop will find the answer to this question and 1012 // change the key lengths indices for their new home 1013 1014 // "bytes" is the number of bytes written for the new key, 1015 // "bytesBefore" are the bytes before that key 1016 // "bytesAfter" are the bytes after the new key, if any 1017 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 1018 1019 size_t size = fNodeSize >> 1; 1020 int32 out, in; 1021 for (in = out = 0; in < node->NumKeys() + 1;) { 1022 if (!bytes) { 1023 bytesBefore = in > 0 1024 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1025 } 1026 1027 if (in == keyIndex && !bytes) { 1028 bytes = *_keyLength; 1029 } else { 1030 if (keyIndex < out) { 1031 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1032 - bytesBefore; 1033 } 1034 1035 in++; 1036 } 1037 out++; 1038 1039 if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) 1040 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1041 // we have found the number of keys in the new node! 1042 break; 1043 } 1044 } 1045 1046 // if the new key was not inserted, set the length of the keys 1047 // that can be copied directly 1048 if (keyIndex >= out && in > 0) 1049 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); 1050 1051 if (bytesBefore < 0 || bytesAfter < 0) 1052 return B_BAD_DATA; 1053 1054 other->left_link = node->left_link; 1055 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1056 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1057 + bytesAfter); 1058 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1059 1060 uint16 *outKeyLengths = other->KeyLengths(); 1061 off_t *outKeyValues = other->Values(); 1062 int32 keys = out > keyIndex ? keyIndex : out; 1063 1064 if (bytesBefore) { 1065 // copy the keys 1066 memcpy(outKeys, inKeys, bytesBefore); 1067 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1068 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1069 } 1070 if (bytes) { 1071 // copy the newly inserted key 1072 memcpy(outKeys + bytesBefore, key, bytes); 1073 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1074 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1075 1076 if (bytesAfter) { 1077 // copy the keys after the new key 1078 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, 1079 bytesAfter); 1080 keys = out - keyIndex - 1; 1081 for (int32 i = 0;i < keys;i++) { 1082 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( 1083 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes); 1084 } 1085 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, 1086 keys * sizeof(off_t)); 1087 } 1088 } 1089 1090 // if the new key was already inserted, we shouldn't use it again 1091 if (in != out) 1092 keyIndex--; 1093 1094 int32 total = bytesBefore + bytesAfter; 1095 1096 // these variables are for the key that will be returned 1097 // to the parent node 1098 uint8 *newKey = NULL; 1099 uint16 newLength; 1100 bool newAllocated = false; 1101 1102 // If we have split an index node, we have to drop the first key 1103 // of the next node (which can also be the new key to insert). 1104 // The dropped key is also the one which has to be inserted in 1105 // the parent node, so we will set the "newKey" already here. 1106 if (node->OverflowLink() != BPLUSTREE_NULL) { 1107 if (in == keyIndex) { 1108 newKey = key; 1109 newLength = *_keyLength; 1110 1111 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1112 keyIndex--; 1113 } else { 1114 // If a key is dropped (is not the new key), we have to copy 1115 // it, because it would be lost if not. 1116 uint8 *droppedKey = node->KeyAt(in, &newLength); 1117 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) 1118 > (uint8 *)node + fNodeSize 1119 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1120 fStream->GetVolume()->Panic(); 1121 RETURN_ERROR(B_BAD_DATA); 1122 } 1123 newKey = (uint8 *)malloc(newLength); 1124 if (newKey == NULL) 1125 return B_NO_MEMORY; 1126 1127 newAllocated = true; 1128 memcpy(newKey, droppedKey, newLength); 1129 1130 other->overflow_link = inKeyValues[in]; 1131 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); 1132 } 1133 } 1134 1135 // and now the same game for the other page and the rest of the keys 1136 // (but with memmove() instead of memcpy(), because they may overlap) 1137 1138 bytesBefore = bytesAfter = bytes = 0; 1139 out = 0; 1140 int32 skip = in; 1141 while (in < node->NumKeys() + 1) { 1142 if (in == keyIndex && !bytes) { 1143 // it's enough to set bytesBefore once here, because we do 1144 // not need to know the exact length of all keys in this 1145 // loop 1146 bytesBefore = in > skip 1147 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1148 bytes = *_keyLength; 1149 out++; 1150 } else { 1151 if (in < node->NumKeys()) { 1152 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1153 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); 1154 1155 if (bytes) { 1156 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1157 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); 1158 1159 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1160 - bytesBefore - bytes; 1161 } 1162 out++; 1163 } 1164 in++; 1165 } 1166 } 1167 1168 // adjust the byte counts (since we were a bit lazy in the loop) 1169 if (keyIndex < skip) 1170 bytesBefore = node->AllKeyLength() - total; 1171 1172 if (bytesBefore < 0 || bytesAfter < 0) { 1173 if (newAllocated) 1174 free(newKey); 1175 return B_BAD_DATA; 1176 } 1177 1178 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1179 // right link, and overflow link can stay the same 1180 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1181 + bytesAfter); 1182 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1183 1184 // array positions have changed 1185 outKeyLengths = node->KeyLengths(); 1186 outKeyValues = node->Values(); 1187 1188 // move the keys in the old node: the order is important here, 1189 // because we don't want to overwrite any contents 1190 1191 keys = keyIndex <= skip ? out : keyIndex - skip; 1192 keyIndex -= skip; 1193 in = out - keyIndex - 1; 1194 // Note: keyIndex and in will contain invalid values when the new key 1195 // went to the other node. But in this case bytes and bytesAfter are 1196 // 0 and subsequently we never use keyIndex and in. 1197 1198 if (bytesBefore) 1199 memmove(inKeys, inKeys + total, bytesBefore); 1200 if (bytesAfter) { 1201 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, 1202 bytesAfter); 1203 } 1204 1205 if (bytesBefore) 1206 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1207 if (bytesAfter) { 1208 // if byteAfter is > 0, keyIndex is larger than skip 1209 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, 1210 in * sizeof(uint16)); 1211 } 1212 1213 if (bytesBefore) 1214 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1215 if (bytesAfter) { 1216 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, 1217 in * sizeof(off_t)); 1218 } 1219 1220 if (bytes) { 1221 // finally, copy the newly inserted key (don't overwrite anything) 1222 memcpy(inKeys + bytesBefore, key, bytes); 1223 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1224 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1225 } 1226 1227 // Prepare the key that will be inserted in the parent node which 1228 // is either the dropped key or the last of the other node. 1229 // If it's the dropped key, "newKey" was already set earlier. 1230 1231 if (newKey == NULL) 1232 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1233 1234 memcpy(key, newKey, newLength); 1235 *_keyLength = newLength; 1236 *_value = otherOffset; 1237 1238 if (newAllocated) 1239 free(newKey); 1240 1241 return B_OK; 1242 } 1243 1244 1245 /*! 1246 This inserts a key into the tree. The changes made to the tree will 1247 all be part of the \a transaction. 1248 */ 1249 status_t 1250 BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength, 1251 off_t value) 1252 { 1253 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1254 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1255 RETURN_ERROR(B_BAD_VALUE); 1256 #ifdef DEBUG 1257 if (value < 0) 1258 panic("tried to insert invalid value %Ld!\n", value); 1259 #endif 1260 1261 // lock access to stream 1262 WriteLocked locked(fStream->Lock()); 1263 1264 Stack<node_and_key> stack; 1265 if (_SeekDown(stack, key, keyLength) != B_OK) 1266 RETURN_ERROR(B_ERROR); 1267 1268 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 1269 1270 memcpy(keyBuffer, key, keyLength); 1271 keyBuffer[keyLength] = 0; 1272 1273 node_and_key nodeAndKey; 1274 const bplustree_node *node; 1275 1276 CachedNode cached(this); 1277 while (stack.Pop(&nodeAndKey) 1278 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1279 #ifdef DEBUG 1280 NodeChecker checker(node, fNodeSize, "insert"); 1281 #endif 1282 if (node->IsLeaf()) { 1283 // first round, check for duplicate entries 1284 status_t status = _FindKey(node, key, keyLength, 1285 &nodeAndKey.keyIndex); 1286 1287 // is this a duplicate entry? 1288 if (status == B_OK) { 1289 if (fAllowDuplicates) { 1290 status = _InsertDuplicate(transaction, cached, node, 1291 nodeAndKey.keyIndex, value); 1292 if (status != B_OK) 1293 RETURN_ERROR(status); 1294 return B_OK; 1295 } 1296 1297 RETURN_ERROR(B_NAME_IN_USE); 1298 } 1299 } 1300 1301 bplustree_node *writableNode = cached.MakeWritable(transaction); 1302 if (writableNode == NULL) 1303 return B_IO_ERROR; 1304 1305 // is the node big enough to hold the pair? 1306 if (int32(round_up(sizeof(bplustree_node) 1307 + writableNode->AllKeyLength() + keyLength) 1308 + (writableNode->NumKeys() + 1) * (sizeof(uint16) 1309 + sizeof(off_t))) < fNodeSize) { 1310 _InsertKey(writableNode, nodeAndKey.keyIndex, 1311 keyBuffer, keyLength, value); 1312 _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, 1313 nodeAndKey.keyIndex, 0, 1); 1314 1315 return B_OK; 1316 } else { 1317 CachedNode cachedNewRoot(this); 1318 CachedNode cachedOther(this); 1319 1320 // do we need to allocate a new root node? if so, then do 1321 // it now 1322 off_t newRoot = BPLUSTREE_NULL; 1323 if (nodeAndKey.nodeOffset == fHeader->RootNode()) { 1324 bplustree_node *root; 1325 status_t status = cachedNewRoot.Allocate(transaction, &root, 1326 &newRoot); 1327 if (status < B_OK) { 1328 // The tree is most likely corrupted! 1329 // But it's still sane at leaf level - we could set 1330 // a flag in the header that forces the tree to be 1331 // rebuild next time... 1332 // But since we will have journaling, that's not a big 1333 // problem anyway. 1334 RETURN_ERROR(status); 1335 } 1336 } 1337 1338 // reserve space for the other node 1339 bplustree_node *other; 1340 off_t otherOffset; 1341 status_t status = cachedOther.Allocate(transaction, &other, 1342 &otherOffset); 1343 if (status < B_OK) { 1344 cachedNewRoot.Free(transaction, newRoot); 1345 RETURN_ERROR(status); 1346 } 1347 1348 if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other, 1349 otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength, 1350 &value) < B_OK) { 1351 // free root node & other node here 1352 cachedOther.Free(transaction, otherOffset); 1353 cachedNewRoot.Free(transaction, newRoot); 1354 1355 RETURN_ERROR(B_ERROR); 1356 } 1357 #ifdef DEBUG 1358 checker.Check("insert split"); 1359 NodeChecker otherChecker(other, fNodeSize, "insert split other"); 1360 #endif 1361 1362 _UpdateIterators(nodeAndKey.nodeOffset, otherOffset, 1363 nodeAndKey.keyIndex, writableNode->NumKeys(), 1); 1364 1365 // update the right link of the node in the left of the new node 1366 if ((other = cachedOther.SetToWritable(transaction, 1367 other->LeftLink())) != NULL) { 1368 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1369 } 1370 1371 // create a new root if necessary 1372 if (newRoot != BPLUSTREE_NULL) { 1373 bplustree_node *root = cachedNewRoot.Node(); 1374 1375 _InsertKey(root, 0, keyBuffer, keyLength, 1376 writableNode->LeftLink()); 1377 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 1378 nodeAndKey.nodeOffset); 1379 1380 bplustree_header *header 1381 = fCachedHeader.MakeWritableHeader(transaction); 1382 if (header == NULL) 1383 return B_IO_ERROR; 1384 1385 // finally, update header to point to the new root 1386 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1387 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32( 1388 header->MaxNumberOfLevels() + 1); 1389 1390 return B_OK; 1391 } 1392 } 1393 } 1394 RETURN_ERROR(B_ERROR); 1395 } 1396 1397 1398 /*! 1399 Removes the duplicate index/value pair from the tree. 1400 It's part of the private tree interface. 1401 */ 1402 status_t 1403 BPlusTree::_RemoveDuplicate(Transaction &transaction, 1404 const bplustree_node *node, CachedNode &cached, uint16 index, 1405 off_t value) 1406 { 1407 off_t *values = node->Values(); 1408 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1409 1410 CachedNode cachedDuplicate(this); 1411 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1412 bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction, 1413 duplicateOffset, false); 1414 if (duplicate == NULL) 1415 return B_IO_ERROR; 1416 1417 // if it's a duplicate fragment, remove the entry from there 1418 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1419 duplicate_array *array = duplicate->FragmentAt( 1420 bplustree_node::FragmentIndex(oldValue)); 1421 1422 if (array->count > NUM_FRAGMENT_VALUES 1423 || array->count < 1) { 1424 FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n", 1425 (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count)); 1426 return B_BAD_DATA; 1427 } 1428 if (!array->Remove(value)) { 1429 FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n", 1430 value, duplicateOffset)); 1431 return B_ENTRY_NOT_FOUND; 1432 } 1433 1434 // remove the array from the fragment node if it is empty 1435 if (array->count == 1) { 1436 // set the link to the remaining value 1437 if (cached.MakeWritable(transaction) == NULL) 1438 return B_IO_ERROR; 1439 1440 values[index] = array->values[0]; 1441 1442 // Remove the whole fragment node, if this was the only array, 1443 // otherwise free just the array 1444 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1445 status_t status = cachedDuplicate.Free(transaction, duplicateOffset); 1446 if (status < B_OK) 1447 return status; 1448 } else 1449 array->count = 0; 1450 } 1451 return B_OK; 1452 } 1453 1454 // Remove value from a duplicate node! 1455 1456 duplicate_array *array = NULL; 1457 1458 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1459 FATAL(("invalid duplicate node: first left link points to %Ld!\n", 1460 duplicate->LeftLink())); 1461 return B_BAD_DATA; 1462 } 1463 1464 // Search the duplicate nodes until the entry could be found (and removed) 1465 while (duplicate != NULL) { 1466 array = duplicate->DuplicateArray(); 1467 if (array->count > NUM_DUPLICATE_VALUES 1468 || array->count < 0) { 1469 FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n", 1470 duplicateOffset, array->count)); 1471 return B_BAD_DATA; 1472 } 1473 1474 if (array->Remove(value)) 1475 break; 1476 1477 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1478 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1479 1480 cachedDuplicate.UnsetUnchanged(transaction); 1481 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false); 1482 } 1483 if (duplicate == NULL) 1484 RETURN_ERROR(B_IO_ERROR); 1485 1486 // The entry got removed from the duplicate node, but we might want to free 1487 // it now in case it's empty 1488 1489 while (true) { 1490 off_t left = duplicate->LeftLink(); 1491 off_t right = duplicate->RightLink(); 1492 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1493 1494 if ((isLast && array->count == 1) || array->count == 0) { 1495 // Free empty duplicate page, link their siblings together, and 1496 // update the duplicate link if needed (ie. when we either remove 1497 // the last duplicate node or have a new first one) 1498 1499 if (left == BPLUSTREE_NULL) { 1500 // the duplicate link points to us 1501 if (cached.MakeWritable(transaction) == NULL) 1502 return B_IO_ERROR; 1503 1504 if (array->count == 1) { 1505 // this is the last node, and there is only one value left; 1506 // replace the duplicate link with that value, it's no duplicate 1507 // anymore 1508 values[index] = array->values[0]; 1509 } else { 1510 // move the duplicate link to the next node 1511 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1512 BPLUSTREE_DUPLICATE_NODE, right)); 1513 } 1514 } 1515 1516 status_t status; 1517 if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK) 1518 return status; 1519 1520 if (left != BPLUSTREE_NULL 1521 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) { 1522 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1523 1524 // If the next node is the last node, we need to free that node 1525 // and convert the duplicate entry back into a normal entry 1526 array = duplicate->DuplicateArray(); 1527 if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL 1528 && array->count <= NUM_FRAGMENT_VALUES) { 1529 duplicateOffset = left; 1530 continue; 1531 } 1532 } 1533 if (right != BPLUSTREE_NULL 1534 && (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) { 1535 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1536 1537 // Again, we may need to turn the duplicate entry back into a normal entry 1538 array = duplicate->DuplicateArray(); 1539 if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL 1540 && array->count <= NUM_FRAGMENT_VALUES) { 1541 duplicateOffset = right; 1542 continue; 1543 } 1544 } 1545 return B_OK; 1546 } else if (isLast && array->count <= NUM_FRAGMENT_VALUES) { 1547 // If the number of entries fits in a duplicate fragment, then 1548 // either find a free fragment node, or convert this node to a 1549 // fragment node. 1550 CachedNode cachedOther(this); 1551 1552 bplustree_node *fragment = NULL; 1553 uint32 fragmentIndex = 0; 1554 off_t offset; 1555 if (_FindFreeDuplicateFragment(transaction, node, cachedOther, 1556 &offset, &fragment, &fragmentIndex) == B_OK) { 1557 // move to other node 1558 duplicate_array *target = fragment->FragmentAt(fragmentIndex); 1559 memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1560 1561 cachedDuplicate.Free(transaction, duplicateOffset); 1562 duplicateOffset = offset; 1563 } else { 1564 // convert node 1565 memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1566 memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 1567 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1568 } 1569 1570 if (cached.MakeWritable(transaction) == NULL) 1571 return B_IO_ERROR; 1572 1573 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1574 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 1575 } 1576 return B_OK; 1577 } 1578 } 1579 1580 1581 /*! 1582 Removes the key with the given index from the specified node. 1583 Since it has to get the key from the node anyway (to obtain it's 1584 pointer), it's not needed to pass the key & its length, although 1585 the calling method (BPlusTree::Remove()) have this data. 1586 */ 1587 void 1588 BPlusTree::_RemoveKey(bplustree_node *node, uint16 index) 1589 { 1590 // should never happen, but who knows? 1591 if (index > node->NumKeys() && node->NumKeys() > 0) { 1592 FATAL(("Asked me to remove key outer limits: %u\n", index)); 1593 return; 1594 } 1595 1596 off_t *values = node->Values(); 1597 1598 // if we would have to drop the overflow link, drop 1599 // the last key instead and update the overflow link 1600 // to the value of that one 1601 if (!node->IsLeaf() && index == node->NumKeys()) 1602 node->overflow_link = values[--index]; 1603 1604 uint16 length; 1605 uint8 *key = node->KeyAt(index, &length); 1606 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 1607 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1608 FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length, 1609 (int)fStream->BlockRun().allocation_group, fStream->BlockRun().start)); 1610 fStream->GetVolume()->Panic(); 1611 return; 1612 } 1613 1614 uint16 *keyLengths = node->KeyLengths(); 1615 uint8 *keys = node->Keys(); 1616 1617 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 1618 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length); 1619 1620 off_t *newValues = node->Values(); 1621 uint16 *newKeyLengths = node->KeyLengths(); 1622 1623 // move key data 1624 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 1625 1626 // move and update key lengths 1627 if (index > 0 && newKeyLengths != keyLengths) 1628 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 1629 for (uint16 i = index; i < node->NumKeys(); i++) { 1630 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1631 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 1632 } 1633 1634 // move values 1635 if (index > 0) 1636 memmove(newValues, values, index * sizeof(off_t)); 1637 if (node->NumKeys() > index) 1638 memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t)); 1639 } 1640 1641 1642 /*! 1643 Removes the specified key from the tree. The "value" parameter is only used 1644 for trees which allow duplicates, so you may safely ignore it. 1645 It's not an optional parameter, so at least you have to think about it. 1646 */ 1647 status_t 1648 BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength, 1649 off_t value) 1650 { 1651 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1652 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1653 RETURN_ERROR(B_BAD_VALUE); 1654 1655 // lock access to stream 1656 WriteLocked locked(fStream->Lock()); 1657 1658 Stack<node_and_key> stack; 1659 if (_SeekDown(stack, key, keyLength) != B_OK) 1660 RETURN_ERROR(B_ERROR); 1661 1662 node_and_key nodeAndKey; 1663 const bplustree_node *node; 1664 1665 CachedNode cached(this); 1666 while (stack.Pop(&nodeAndKey) 1667 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1668 #ifdef DEBUG 1669 NodeChecker checker(node, fNodeSize, "remove"); 1670 #endif 1671 if (node->IsLeaf()) { 1672 // first round, check for duplicate entries 1673 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex); 1674 if (status < B_OK) 1675 RETURN_ERROR(status); 1676 1677 // If we will remove the last key, the iterator will be set 1678 // to the next node after the current - if there aren't any 1679 // more nodes, we need a way to prevent the TreeIterators to 1680 // touch the old node again, we use BPLUSTREE_FREE for this 1681 off_t next = node->RightLink() == BPLUSTREE_NULL 1682 ? BPLUSTREE_FREE : node->RightLink(); 1683 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 1684 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 1685 1686 // is this a duplicate entry? 1687 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 1688 node->Values()[nodeAndKey.keyIndex]))) { 1689 if (fAllowDuplicates) { 1690 return _RemoveDuplicate(transaction, node, cached, 1691 nodeAndKey.keyIndex, value); 1692 } else { 1693 FATAL(("dupliate node found where no duplicates are allowed!\n")); 1694 RETURN_ERROR(B_ERROR); 1695 } 1696 } 1697 } 1698 1699 bplustree_node *writableNode = cached.MakeWritable(transaction); 1700 if (writableNode == NULL) 1701 return B_IO_ERROR; 1702 1703 // if it's an empty root node, we have to convert it 1704 // to a leaf node by dropping the overflow link, or, 1705 // if it's already a leaf node, just empty it 1706 if (nodeAndKey.nodeOffset == fHeader->RootNode() 1707 && (node->NumKeys() == 0 || node->NumKeys() == 1 && node->IsLeaf())) { 1708 writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1709 writableNode->all_key_count = 0; 1710 writableNode->all_key_length = 0; 1711 1712 // if we've made a leaf node out of the root node, we need 1713 // to reset the maximum number of levels in the header 1714 if (fHeader->MaxNumberOfLevels() != 1) { 1715 bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction); 1716 if (header == NULL) 1717 return B_IO_ERROR; 1718 1719 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 1720 } 1721 return B_OK; 1722 } 1723 1724 // if there is only one key left, we don't have to remove 1725 // it, we can just dump the node (index nodes still have 1726 // the overflow link, so we have to drop the last key) 1727 if (writableNode->NumKeys() > 1 1728 || !writableNode->IsLeaf() && writableNode->NumKeys() == 1) { 1729 _RemoveKey(writableNode, nodeAndKey.keyIndex); 1730 return B_OK; 1731 } 1732 1733 // when we are here, we can just free the node, but 1734 // we have to update the right/left link of the 1735 // siblings first 1736 CachedNode otherCached(this); 1737 bplustree_node *other = otherCached.SetToWritable(transaction, 1738 writableNode->LeftLink()); 1739 if (other != NULL) 1740 other->right_link = writableNode->right_link; 1741 1742 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL) 1743 other->left_link = writableNode->left_link; 1744 1745 cached.Free(transaction, nodeAndKey.nodeOffset); 1746 } 1747 RETURN_ERROR(B_ERROR); 1748 } 1749 1750 1751 /*! 1752 Replaces the value for the key in the tree. 1753 Returns B_OK if the key could be found and its value replaced, 1754 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 1755 to indicate that something went terribly wrong. 1756 Note that this doesn't work with duplicates - it will just 1757 return B_BAD_TYPE if you call this function on a tree where 1758 duplicates are allowed. 1759 */ 1760 status_t 1761 BPlusTree::Replace(Transaction &transaction, const uint8 *key, 1762 uint16 keyLength, off_t value) 1763 { 1764 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1765 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1766 || key == NULL) 1767 RETURN_ERROR(B_BAD_VALUE); 1768 1769 if (fAllowDuplicates) 1770 RETURN_ERROR(B_BAD_TYPE); 1771 1772 // lock access to stream (a read lock is okay for this purpose) 1773 ReadLocked locked(fStream->Lock()); 1774 1775 off_t nodeOffset = fHeader->RootNode(); 1776 CachedNode cached(this); 1777 const bplustree_node *node; 1778 1779 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1780 uint16 keyIndex = 0; 1781 off_t nextOffset; 1782 status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1783 1784 if (node->OverflowLink() == BPLUSTREE_NULL) { 1785 if (status == B_OK) { 1786 bplustree_node *writableNode = cached.MakeWritable(transaction); 1787 if (writableNode != NULL) 1788 writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value); 1789 else 1790 status = B_IO_ERROR; 1791 } 1792 1793 return status; 1794 } else if (nextOffset == nodeOffset) 1795 RETURN_ERROR(B_ERROR); 1796 1797 nodeOffset = nextOffset; 1798 } 1799 RETURN_ERROR(B_ERROR); 1800 } 1801 1802 1803 /*! 1804 Searches the key in the tree, and stores the offset found in 1805 _value, if successful. 1806 It's very similar to BPlusTree::SeekDown(), but doesn't fill 1807 a stack while it descends the tree. 1808 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND 1809 if not. It can also return other errors to indicate that 1810 something went wrong. 1811 Note that this doesn't work with duplicates - it will just 1812 return B_BAD_TYPE if you call this function on a tree where 1813 duplicates are allowed. 1814 */ 1815 status_t 1816 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value) 1817 { 1818 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1819 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1820 || key == NULL) 1821 RETURN_ERROR(B_BAD_VALUE); 1822 1823 if (fAllowDuplicates) 1824 RETURN_ERROR(B_BAD_TYPE); 1825 1826 // lock access to stream 1827 ReadLocked locked(fStream->Lock()); 1828 1829 off_t nodeOffset = fHeader->RootNode(); 1830 CachedNode cached(this); 1831 const bplustree_node *node; 1832 1833 #ifdef DEBUG 1834 int32 levels = 0; 1835 #endif 1836 1837 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1838 uint16 keyIndex = 0; 1839 off_t nextOffset; 1840 status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1841 1842 #ifdef DEBUG 1843 levels++; 1844 #endif 1845 if (node->OverflowLink() == BPLUSTREE_NULL) { 1846 if (status == B_OK && _value != NULL) 1847 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 1848 1849 #ifdef DEBUG 1850 if (levels != (int32)fHeader->MaxNumberOfLevels()) 1851 DEBUGGER(("levels don't match")); 1852 #endif 1853 return status; 1854 } else if (nextOffset == nodeOffset) 1855 RETURN_ERROR(B_ERROR); 1856 1857 nodeOffset = nextOffset; 1858 } 1859 FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset)); 1860 RETURN_ERROR(B_ERROR); 1861 } 1862 1863 1864 // #pragma mark - 1865 1866 1867 TreeIterator::TreeIterator(BPlusTree *tree) 1868 : 1869 fTree(tree), 1870 fCurrentNodeOffset(BPLUSTREE_NULL), 1871 fNext(NULL) 1872 { 1873 tree->_AddIterator(this); 1874 } 1875 1876 1877 TreeIterator::~TreeIterator() 1878 { 1879 if (fTree) 1880 fTree->_RemoveIterator(this); 1881 } 1882 1883 1884 status_t 1885 TreeIterator::Goto(int8 to) 1886 { 1887 if (fTree == NULL || fTree->fHeader == NULL) 1888 RETURN_ERROR(B_BAD_VALUE); 1889 1890 // lock access to stream 1891 ReadLocked locked(fTree->fStream->Lock()); 1892 1893 off_t nodeOffset = fTree->fHeader->RootNode(); 1894 CachedNode cached(fTree); 1895 const bplustree_node *node; 1896 1897 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1898 // is the node a leaf node? 1899 if (node->OverflowLink() == BPLUSTREE_NULL) { 1900 fCurrentNodeOffset = nodeOffset; 1901 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 1902 fDuplicateNode = BPLUSTREE_NULL; 1903 1904 return B_OK; 1905 } 1906 1907 // get the next node offset depending on the direction (and if there 1908 // are any keys in that node at all) 1909 off_t nextOffset; 1910 if (to == BPLUSTREE_END || node->all_key_count == 0) 1911 nextOffset = node->OverflowLink(); 1912 else { 1913 if (node->AllKeyLength() > fTree->fNodeSize 1914 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 1915 - 8 * node->NumKeys()) 1916 RETURN_ERROR(B_ERROR); 1917 1918 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 1919 } 1920 if (nextOffset == nodeOffset) 1921 break; 1922 1923 nodeOffset = nextOffset; 1924 } 1925 FATAL(("%s fails\n", __FUNCTION__)); 1926 1927 RETURN_ERROR(B_ERROR); 1928 } 1929 1930 1931 /*! 1932 Iterates through the tree in the specified direction. 1933 When it iterates through duplicates, the "key" is only updated for the 1934 first entry - if you need to know when this happens, use the "duplicate" 1935 parameter which is 0 for no duplicate, 1 for the first, and 2 for all 1936 the other duplicates. 1937 That's not too nice, but saves the 256 bytes that would be needed to 1938 store the last key - if this will ever become an issue, it will be 1939 easy to change. 1940 The other advantage of this is, that the queries can skip all duplicates 1941 at once when they are not relevant to them. 1942 */ 1943 status_t 1944 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, 1945 uint16 maxLength, off_t *value, uint16 *duplicate) 1946 { 1947 if (fTree == NULL) 1948 return B_INTERRUPTED; 1949 1950 bool forward = direction == BPLUSTREE_FORWARD; 1951 1952 if (fCurrentNodeOffset == BPLUSTREE_NULL 1953 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 1954 RETURN_ERROR(B_ERROR); 1955 1956 // if the tree was emptied since the last call 1957 if (fCurrentNodeOffset == BPLUSTREE_FREE) 1958 return B_ENTRY_NOT_FOUND; 1959 1960 // lock access to stream 1961 ReadLocked locked(fTree->fStream->Lock()); 1962 1963 CachedNode cached(fTree); 1964 const bplustree_node *node; 1965 1966 if (fDuplicateNode != BPLUSTREE_NULL) { 1967 // regardless of traverse direction the duplicates are always presented in 1968 // the same order; since they are all considered as equal, this shouldn't 1969 // cause any problems 1970 1971 if (!fIsFragment || fDuplicate < fNumDuplicates) 1972 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 1973 else 1974 node = NULL; 1975 1976 if (node != NULL) { 1977 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 1978 // if the node is out of duplicates, we go directly to the next one 1979 fDuplicateNode = node->RightLink(); 1980 if (fDuplicateNode != BPLUSTREE_NULL 1981 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 1982 fNumDuplicates = node->CountDuplicates(fDuplicateNode, false); 1983 fDuplicate = 0; 1984 } 1985 } 1986 if (fDuplicate < fNumDuplicates) { 1987 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++); 1988 if (duplicate) 1989 *duplicate = 2; 1990 return B_OK; 1991 } 1992 } 1993 fDuplicateNode = BPLUSTREE_NULL; 1994 } 1995 1996 off_t savedNodeOffset = fCurrentNodeOffset; 1997 int32 savedKey = fCurrentKey; 1998 1999 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 2000 RETURN_ERROR(B_ERROR); 2001 2002 if (duplicate) 2003 *duplicate = 0; 2004 2005 fCurrentKey += direction; 2006 2007 // is the current key in the current node? 2008 while ((forward && fCurrentKey >= node->NumKeys()) 2009 || (!forward && fCurrentKey < 0)) { 2010 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2011 2012 // are there any more nodes? 2013 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2014 node = cached.SetTo(fCurrentNodeOffset); 2015 if (!node) 2016 RETURN_ERROR(B_ERROR); 2017 2018 // reset current key 2019 fCurrentKey = forward ? 0 : node->NumKeys(); 2020 } else { 2021 // there are no nodes left, so turn back to the last key 2022 fCurrentNodeOffset = savedNodeOffset; 2023 fCurrentKey = savedKey; 2024 2025 return B_ENTRY_NOT_FOUND; 2026 } 2027 } 2028 2029 if (node->all_key_count == 0) 2030 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2031 2032 uint16 length; 2033 uint8 *keyStart = node->KeyAt(fCurrentKey, &length); 2034 if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize 2035 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2036 fTree->fStream->GetVolume()->Panic(); 2037 RETURN_ERROR(B_BAD_DATA); 2038 } 2039 2040 length = min_c(length, maxLength); 2041 memcpy(key, keyStart, length); 2042 2043 if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) { 2044 // terminate string type 2045 if (length == maxLength) 2046 length--; 2047 ((char *)key)[length] = '\0'; 2048 } 2049 *keyLength = length; 2050 2051 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2052 2053 // duplicate fragments? 2054 uint8 type = bplustree_node::LinkType(offset); 2055 if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) { 2056 fDuplicateNode = offset; 2057 2058 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 2059 if (node == NULL) 2060 RETURN_ERROR(B_ERROR); 2061 2062 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 2063 2064 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 2065 if (fNumDuplicates) { 2066 offset = node->DuplicateAt(offset, fIsFragment, 0); 2067 fDuplicate = 1; 2068 if (duplicate) 2069 *duplicate = 1; 2070 } else { 2071 // shouldn't happen, but we're dealing here with potentially corrupt disks... 2072 fDuplicateNode = BPLUSTREE_NULL; 2073 offset = 0; 2074 } 2075 } 2076 *value = offset; 2077 2078 return B_OK; 2079 } 2080 2081 2082 /*! 2083 This is more or less a copy of BPlusTree::Find() - but it just 2084 sets the current position in the iterator, regardless of if the 2085 key could be found or not. 2086 */ 2087 status_t 2088 TreeIterator::Find(const uint8 *key, uint16 keyLength) 2089 { 2090 if (fTree == NULL) 2091 return B_INTERRUPTED; 2092 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2093 || key == NULL) 2094 RETURN_ERROR(B_BAD_VALUE); 2095 2096 // lock access to stream 2097 ReadLocked locked(fTree->fStream->Lock()); 2098 2099 off_t nodeOffset = fTree->fHeader->RootNode(); 2100 2101 CachedNode cached(fTree); 2102 const bplustree_node *node; 2103 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2104 uint16 keyIndex = 0; 2105 off_t nextOffset; 2106 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex, 2107 &nextOffset); 2108 2109 if (node->OverflowLink() == BPLUSTREE_NULL) { 2110 fCurrentNodeOffset = nodeOffset; 2111 fCurrentKey = keyIndex - 1; 2112 fDuplicateNode = BPLUSTREE_NULL; 2113 2114 return status; 2115 } else if (nextOffset == nodeOffset) 2116 RETURN_ERROR(B_ERROR); 2117 2118 nodeOffset = nextOffset; 2119 } 2120 RETURN_ERROR(B_ERROR); 2121 } 2122 2123 2124 void 2125 TreeIterator::SkipDuplicates() 2126 { 2127 fDuplicateNode = BPLUSTREE_NULL; 2128 } 2129 2130 2131 void 2132 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change) 2133 { 2134 if (offset != fCurrentNodeOffset) 2135 return; 2136 2137 if (nextOffset != BPLUSTREE_NULL) { 2138 fCurrentNodeOffset = nextOffset; 2139 if (splitAt <= fCurrentKey) { 2140 fCurrentKey -= splitAt; 2141 keyIndex -= splitAt; 2142 } 2143 } 2144 2145 // Adjust fCurrentKey to point to the same key as before. 2146 // Note, that if a key is inserted at the current position 2147 // it won't be included in this tree transition. 2148 if (keyIndex <= fCurrentKey) 2149 fCurrentKey += change; 2150 2151 // ToDo: duplicate handling! 2152 } 2153 2154 2155 void 2156 TreeIterator::Stop() 2157 { 2158 fTree = NULL; 2159 } 2160 2161 2162 #ifdef DEBUG 2163 void 2164 TreeIterator::Dump() 2165 { 2166 __out("TreeIterator at %p:\n", this); 2167 __out("\tfTree = %p\n", fTree); 2168 __out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset); 2169 __out("\tfCurrentKey = %ld\n", fCurrentKey); 2170 __out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode); 2171 __out("\tfDuplicate = %u\n", fDuplicate); 2172 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2173 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); 2174 } 2175 #endif 2176 2177 2178 // #pragma mark - 2179 2180 2181 void 2182 bplustree_node::Initialize() 2183 { 2184 left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2185 all_key_count = 0; 2186 all_key_length = 0; 2187 } 2188 2189 2190 uint8 * 2191 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const 2192 { 2193 if (index < 0 || index > NumKeys()) 2194 return NULL; 2195 2196 uint8 *keyStart = Keys(); 2197 uint16 *keyLengths = KeyLengths(); 2198 2199 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2200 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2201 if (index > 0) 2202 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2203 2204 return keyStart; 2205 } 2206 2207 2208 uint8 2209 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2210 { 2211 // the duplicate fragment handling is currently hard-coded to a node size 2212 // of 1024 bytes - with future versions of BFS, this may be a problem 2213 2214 if (isFragment) { 2215 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2216 2217 return ((off_t *)this)[fragment]; 2218 } 2219 return OverflowLink(); 2220 } 2221 2222 2223 off_t 2224 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2225 { 2226 uint32 start; 2227 if (isFragment) 2228 start = 8 * ((uint64)offset & 0x3ff); 2229 else 2230 start = 2; 2231 2232 return ((off_t *)this)[start + 1 + index]; 2233 } 2234 2235 2236 /*! Although the name suggests it, this function doesn't return the real 2237 used fragment count; at least, it can only count to two: it returns 2238 0, if there is no fragment used, 1 if there is only one fragment 2239 used, and 2 if there are at least 2 fragments used. 2240 */ 2241 uint32 2242 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2243 { 2244 uint32 used = 0; 2245 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2246 duplicate_array *array = FragmentAt(i); 2247 if (array->count > 0 && ++used > 1) 2248 return used; 2249 } 2250 return used; 2251 } 2252 2253 2254 #ifdef DEBUG 2255 status_t 2256 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2257 { 2258 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2259 DEBUGGER(("invalid node: key/length count")); 2260 2261 for (int32 i = 0; i < NumKeys(); i++) { 2262 uint16 length; 2263 uint8 *key = KeyAt(i, &length); 2264 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize 2265 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2266 dprintf("node %p, key %ld\n", this, i); 2267 DEBUGGER(("invalid node: keys corrupted")); 2268 return B_BAD_DATA; 2269 } 2270 if (Values()[i] == -1) { 2271 dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]); 2272 DEBUGGER(("invalid node: values corrupted")); 2273 return B_BAD_DATA; 2274 } 2275 } 2276 return B_OK; 2277 } 2278 #endif 2279 2280 2281 // #pragma mark - 2282 2283 2284 int32 2285 compareKeys(type_code type, const void *key1, int keyLength1, 2286 const void *key2, int keyLength2) 2287 { 2288 // if one of the keys is NULL, bail out gracefully 2289 if (key1 == NULL || key2 == NULL) { 2290 // even if it's most probably a bug in the calling code, we try to 2291 // give a meaningful result 2292 if (key1 == NULL && key2 != NULL) 2293 return 1; 2294 else if (key2 == NULL && key1 != NULL) 2295 return -1; 2296 2297 return 0; 2298 } 2299 2300 switch (type) { 2301 case B_STRING_TYPE: 2302 { 2303 int len = min_c(keyLength1, keyLength2); 2304 int result = strncmp((const char *)key1, (const char *)key2, len); 2305 2306 if (result == 0 2307 && !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0')) 2308 result = keyLength1 - keyLength2; 2309 2310 return result; 2311 } 2312 2313 case B_SSIZE_T_TYPE: 2314 case B_INT32_TYPE: 2315 return *(int32 *)key1 - *(int32 *)key2; 2316 2317 case B_SIZE_T_TYPE: 2318 case B_UINT32_TYPE: 2319 if (*(uint32 *)key1 == *(uint32 *)key2) 2320 return 0; 2321 else if (*(uint32 *)key1 > *(uint32 *)key2) 2322 return 1; 2323 2324 return -1; 2325 2326 case B_OFF_T_TYPE: 2327 case B_INT64_TYPE: 2328 if (*(int64 *)key1 == *(int64 *)key2) 2329 return 0; 2330 else if (*(int64 *)key1 > *(int64 *)key2) 2331 return 1; 2332 2333 return -1; 2334 2335 case B_UINT64_TYPE: 2336 if (*(uint64 *)key1 == *(uint64 *)key2) 2337 return 0; 2338 else if (*(uint64 *)key1 > *(uint64 *)key2) 2339 return 1; 2340 2341 return -1; 2342 2343 case B_FLOAT_TYPE: 2344 { 2345 float result = *(float *)key1 - *(float *)key2; 2346 if (result == 0.0f) 2347 return 0; 2348 2349 return (result < 0.0f) ? -1 : 1; 2350 } 2351 2352 case B_DOUBLE_TYPE: 2353 { 2354 double result = *(double *)key1 - *(double *)key2; 2355 if (result == 0.0) 2356 return 0; 2357 2358 return (result < 0.0) ? -1 : 1; 2359 } 2360 } 2361 2362 // if the type is unknown, the entries don't match... 2363 return -1; 2364 } 2365 2366