1 /* 2 * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 * 5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares 6 * no code but achieves binary compatibility with the on disk format. 7 */ 8 9 //! B+Tree implementation 10 11 12 #include "Debug.h" 13 #include "BPlusTree.h" 14 #include "Inode.h" 15 #include "Utility.h" 16 17 18 #ifdef DEBUG 19 class NodeChecker { 20 public: 21 NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text) 22 : 23 fNode(node), 24 fSize(nodeSize), 25 fText(text) 26 { 27 Check("integrity check failed on construction."); 28 } 29 30 ~NodeChecker() 31 { 32 Check("integrity check failed on destruction."); 33 } 34 35 void 36 Check(const char *message) 37 { 38 if (fNode->CheckIntegrity(fSize) < B_OK) 39 dprintf("%s: %s\n", fText, message); 40 } 41 42 private: 43 const bplustree_node *fNode; 44 int32 fSize; 45 const char *fText; 46 }; 47 #endif 48 49 50 // Node Caching for the BPlusTree class 51 // 52 // With write support, there is the need for a function that allocates new 53 // nodes by either returning empty nodes, or by growing the file's data stream 54 // 55 // !! The CachedNode class assumes that you have properly locked the stream 56 // !! before asking for nodes. 57 // 58 // Note: This code will fail if the block size is smaller than the node size! 59 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 60 // is hard-coded to 1024 bytes, that's not an issue now. 

/*!	Releases the cached block without marking it dirty in the given
	transaction, so the changes made to it are discarded from the
	transaction's point of view.
*/
void
CachedNode::UnsetUnchanged(Transaction &transaction)
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		void *cache = fTree->fStream->GetVolume()->BlockCache();

		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
		block_cache_put(cache, fBlockNumber);
		fNode = NULL;
	}
}


/*!	Releases the cached block back to the block cache, if any is held. */
void
CachedNode::Unset()
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber);
		fNode = NULL;
	}
}


/*!	Maps the node at \a offset read-only and returns it, or NULL on
	failure. If \a check is true, the node is additionally run through
	the header's CheckNode() sanity checks.
*/
const bplustree_node *
CachedNode::SetTo(off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(NULL, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node (%p) read from offset %Ld, inode at %Ld\n",
				fNode, offset, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


/*!	Writable counterpart of SetTo(): maps the node at \a offset as part
	of \a transaction. If \a check is true, the node is sanity checked
	before it is returned.
*/
bplustree_node *
CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(&transaction, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld, inode at %Ld\n",
				fNode, offset, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


/*!	Upgrades the currently held block to writable within \a transaction.
	Returns the node on success, NULL otherwise.
*/
bplustree_node *
CachedNode::MakeWritable(Transaction &transaction)
{
	if (fNode == NULL)
		return NULL;

	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
			fBlockNumber, transaction.ID()) == B_OK)
		return fNode;

	return NULL;
}


/*!	Same as MakeWritable(), but casts the result to the tree header
	(which lives in the first node-sized chunk of the stream).
*/
bplustree_header *
CachedNode::MakeWritableHeader(Transaction &transaction)
{
	return (bplustree_header *)MakeWritable(transaction);
}


/*!	Maps the b+tree header (at stream offset 0) read-only. */
const bplustree_header *
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(NULL, 0LL);
	return (bplustree_header *)fNode;
}


/*!	Maps the b+tree header writable as part of \a transaction. */
bplustree_header *
CachedNode::SetToWritableHeader(Transaction &transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(&transaction, 0LL);
	return (bplustree_header *)fNode;
}


/*!	Translates the tree-relative \a offset into a block number of the
	underlying stream and gets that block from the block cache - writable
	if a \a transaction is given, read-only otherwise. On success, fNode
	points into the mapped block; on failure it is NULL.
*/
bplustree_node *
CachedNode::InternalSetTo(Transaction *transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume *volume = fTree->fStream->GetVolume();

		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8 *block;

		if (transaction != NULL) {
			block = (uint8 *)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// the node is somewhere in that block... (confusing offset calculation)
			fNode = (bplustree_node *)(block + offset -
				(fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}


/*!	Frees the node at \a offset: if it is the last node of the tree, the
	stream (and the header's maximum size) is shrunk by one node;
	otherwise the node is prepended to the header's free node list.
*/
status_t
CachedNode::Free(Transaction &transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// ToDo: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done everytime that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}

	// add the node to the free nodes list
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}


/*!	Returns a writable, initialized node in \a _node (its tree offset in
	\a _offset) - either by recycling the first entry of the header's
	free node list, or by appending a node-sized chunk to the stream and
	growing the header's maximum size accordingly.
*/
status_t
CachedNode::Allocate(Transaction &transaction, bplustree_node **_node,
	off_t *_offset)
{
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header *header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode *stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		- fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}


// #pragma mark -


/*!	Creates a new (empty) b+tree on \a stream - see SetTo() below. */
BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	// NOTE(review): \a nodeSize is not forwarded to SetTo() here, so the
	// default node size is used regardless of the argument - confirm
	// whether this is intended.
	SetTo(transaction, stream);
}


/*!	Opens an existing b+tree from \a stream - see SetTo() below. */
BPlusTree::BPlusTree(Inode *stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(stream);
}


/*!	Creates an uninitialized tree object; SetTo() has to be called
	before it can be used.
*/
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
}


BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	if (fIteratorLock.Lock() < B_OK)
		return;

	TreeIterator *iterator = NULL;
	while ((iterator = fIterators.Next(iterator)) != NULL)
		iterator->Stop();

	fIteratorLock.Unlock();
}


/*! Create a new B+Tree on the specified stream */
status_t
BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
		&& stream->BlockRun() != stream->Parent())
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}


/*!	Opens an existing b+tree from \a stream: reads the on-disk header and
	validates it (magic, root/free links, node size, key data type vs.
	the inode's index mode) before the tree is used.
*/
status_t
BPlusTree::SetTo(Inode *stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?

	if (fHeader->MaximumSize() != stream->Size()) {
		FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size()));
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char *)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	{
		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
			S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
			S_DOUBLE_INDEX};
		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
			| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
			| S_FLOAT_INDEX | S_DOUBLE_INDEX);

		// NOTE(review): "&&" binds tighter than "||" in the condition
		// below; the effective grouping is ((S_INDEX_DIR set) && (type
		// mismatch)) - apparently intended, but extra parentheses would
		// make it explicit and silence compiler warnings.
		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| (stream->Mode() & S_INDEX_DIR)
			&& toMode[fHeader->DataType()] != mode
			|| !stream->IsContainer()) {
			D(	dump_bplustree_header(fHeader);
				dump_inode(&stream->Node());
			);
			RETURN_ERROR(fStatus = B_BAD_TYPE);
		}

		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
		// in the original BFS code - we will honour it nevertheless
		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
			|| (stream->Mode() & S_ALLOW_DUPS) != 0;
	}

	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


/*!	Returns the status of the last SetTo() call (B_NO_INIT if none). */
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}


/*!	Maps a Be \a type_code to the corresponding on-disk b+tree key type,
	or -1 if the type is not supported by BFS indices.
*/
int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
	switch (code) {
		case B_STRING_TYPE:
			return BPLUSTREE_STRING_TYPE;
		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return BPLUSTREE_INT32_TYPE;
		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			return BPLUSTREE_UINT32_TYPE;
		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			return BPLUSTREE_INT64_TYPE;
		case B_UINT64_TYPE:
			return BPLUSTREE_UINT64_TYPE;
		case B_FLOAT_TYPE:
			return BPLUSTREE_FLOAT_TYPE;
		case B_DOUBLE_TYPE:
			return BPLUSTREE_DOUBLE_TYPE;
	}
	return -1;
}


/*!	Maps the index-type bits of an inode \a mode to the on-disk b+tree
	key type; standard directories default to string keys.
*/
int32
BPlusTree::ModeToKeyType(mode_t mode)
{
	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
		case S_INT_INDEX:
			return BPLUSTREE_INT32_TYPE;
		case S_UINT_INDEX:
			return BPLUSTREE_UINT32_TYPE;
		case S_LONG_LONG_INDEX:
			return BPLUSTREE_INT64_TYPE;
		case S_ULONG_LONG_INDEX:
			return BPLUSTREE_UINT64_TYPE;
		case S_FLOAT_INDEX:
			return BPLUSTREE_FLOAT_TYPE;
		case S_DOUBLE_INDEX:
			return BPLUSTREE_DOUBLE_TYPE;
		case S_STR_INDEX:
		default:
			// default is for standard directories
			return BPLUSTREE_STRING_TYPE;
	}
}


// #pragma mark -


/*!	Notifies all registered TreeIterators about a change in the node at
	\a offset, so they can adapt their position (see TreeIterator::Update()).
*/
void
BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// Although every iterator which is affected by this update currently
	// waits on a semaphore, other iterators could be added/removed at
	// any time, so we need to protect this loop
	if (fIteratorLock.Lock() < B_OK)
		return;

	TreeIterator *iterator = NULL;
	while ((iterator = fIterators.Next(iterator)) != NULL)
		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);

	fIteratorLock.Unlock();
}


/*!	Registers \a iterator with the tree (guarded by fIteratorLock). */
void
BPlusTree::_AddIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Add(iterator);

	fIteratorLock.Unlock();
}


/*!	Removes \a iterator from the tree again (guarded by fIteratorLock). */
void
BPlusTree::_RemoveIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Remove(iterator);

	fIteratorLock.Unlock();
}


/*!	Compares the two keys according to the tree's data type: the on-disk
	key type is mapped to a type_code and the comparison is delegated to
	compareKeys().
*/
int32
BPlusTree::_CompareKeys(const void *key1, int keyLength1, const void *key2,
	int keyLength2)
{
	type_code type = 0;
	switch (fHeader->data_type) {
		case BPLUSTREE_STRING_TYPE:
			type = B_STRING_TYPE;
			break;
		case BPLUSTREE_INT32_TYPE:
			type = B_INT32_TYPE;
			break;
		case BPLUSTREE_UINT32_TYPE:
			type = B_UINT32_TYPE;
			break;
		case BPLUSTREE_INT64_TYPE:
			type = B_INT64_TYPE;
			break;
		case BPLUSTREE_UINT64_TYPE:
			type = B_UINT64_TYPE;
			break;
		case BPLUSTREE_FLOAT_TYPE:
			type = B_FLOAT_TYPE;
			break;
		case BPLUSTREE_DOUBLE_TYPE:
			type = B_DOUBLE_TYPE;
			break;
	}
	return compareKeys(type, key1, keyLength1, key2, keyLength2);
}


/*!	Binary-searches \a key in \a node. On success, \a _index is the key's
	slot and \a _next its value. On B_ENTRY_NOT_FOUND, \a _index is the
	would-be insertion position and \a _next is the link to follow to the
	child (either a value or the overflow link).
*/
status_t
BPlusTree::_FindKey(const bplustree_node *node, const uint8 *key,
	uint16 keyLength, uint16 *_index, off_t *_next)
{
#ifdef DEBUG
	NodeChecker checker(node, fNodeSize, "find");
#endif

	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t *values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8 *searchKey = node->KeyAt(i, &searchLength);
		// guard against a corrupted node: the key must lie completely
		// inside the node, and respect the maximum key length
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8 *)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}


/*!
	Prepares the stack to contain all nodes that were passed while
	following the key, from the root node to the leaf node that could
	or should contain that key.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key> &stack, const uint8 *key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node *node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// node that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::SeekDown() could not open node %Ld\n",
		nodeAndKey.nodeOffset));
	return B_ERROR;
}


/*!
	This will find a free duplicate fragment in the given bplustree_node.
	The CachedNode will be set to the writable fragment on success.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction &transaction,
	const bplustree_node *node, CachedNode &cached,
	off_t *_offset, bplustree_node **_fragment, uint32 *_index)
{
	off_t *values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node *fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array *array = fragment->FragmentAt(j);

			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}


/*!	Inserts a duplicate \a value for the key at \a index in \a node.
	Depending on how many duplicates already exist, the value is put
	into a duplicate fragment, an existing duplicate node, or a newly
	allocated duplicate node - upgrading the link type in the node's
	value array as needed.
*/
status_t
BPlusTree::_InsertDuplicate(Transaction &transaction, CachedNode &cached,
	const bplustree_node *node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
				bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array *array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue),
					array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// test if the fragment will be empty if we remove this key's values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// the node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// the old duplicate has not been touched, so we can reuse it

					bplustree_node *newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// copy the array from the fragment node to the duplicate node
					// and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
						array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
					BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		duplicate_array *array;
		const bplustree_node *duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
					duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
			&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node *newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
			if (status < B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node *fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		if ((status = cachedDuplicate.Allocate(transaction, &fragment,
				&offset)) < B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}


/*!	Inserts the key/value pair at position \a index into \a node, shifting
	the value array, the key length index, and the key data to make room.
	Note: it does not check whether there is enough space available - the
	caller is responsible for that.
*/
void
BPlusTree::_InsertKey(bplustree_node *node, uint16 index, uint8 *key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t *values = node->Values();
	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	// the arrays moved because the key count/length changed above
	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}


/*!
	Splits the \a node into two halves - the other half will be put into \a other.
	It also takes care to create a new overflow link if the node to split is an
	index node.
*/
status_t
BPlusTree::_SplitNode(bplustree_node *node, off_t nodeOffset,
	bplustree_node *other, off_t otherOffset, uint16 *_keyIndex, uint8 *key,
	uint16 *_keyLength, off_t *_value)
{
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	uint16 *inKeyLengths = node->KeyLengths();
	off_t *inKeyValues = node->Values();
	uint8 *inKeys = node->Keys();
	uint8 *outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	size_t size = fNodeSize >> 1;
	int32 out, in;
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes) {
			bytesBefore = in > 0
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
		}

		if (in == keyIndex && !bytes) {
			bytes = *_keyLength;
		} else {
			if (keyIndex < out) {
				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
					- bytesBefore;
			}

			in++;
		}
		out++;

		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16 *outKeyLengths = other->KeyLengths();
	off_t *outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
				bytesAfter);
			keys = out - keyIndex - 1;
			for (int32 i = 0;i < keys;i++) {
				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes);
			}
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
				keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8 *newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8 *droppedKey = node->KeyAt(in, &newLength);
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
					> (uint8 *)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8 *)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;

			newAllocated = true;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
			bytes = *_keyLength;
		} else {
			if (in < node->NumKeys()) {
				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

				if (bytes) {
					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
						- bytesBefore - bytes;
				}
			}
			in++;
		}

		out++;

		// break out when all keys are done
		if (in > node->NumKeys() && keyIndex < in)
			break;
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex >= in && keyIndex - skip < out) {
		bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
			- bytesBefore - total;
	} else if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0) {
		if (newAllocated)
			free(newKey);
		return B_BAD_DATA;
	}

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter) {
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
			bytesAfter);
	}

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	in = out - keyIndex - 1;
	if (bytesAfter) {
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
			in * sizeof(uint16));
	}

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter) {
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
			in * sizeof(off_t));
	}

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}


/*!
	This inserts a key into the tree. The changes made to the tree will
	all be part of the \a transaction.
1245 */ 1246 status_t 1247 BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength, 1248 off_t value) 1249 { 1250 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1251 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1252 RETURN_ERROR(B_BAD_VALUE); 1253 #ifdef DEBUG 1254 if (value < 0) 1255 panic("tried to insert invalid value %Ld!\n", value); 1256 #endif 1257 1258 // lock access to stream 1259 WriteLocked locked(fStream->Lock()); 1260 1261 Stack<node_and_key> stack; 1262 if (_SeekDown(stack, key, keyLength) != B_OK) 1263 RETURN_ERROR(B_ERROR); 1264 1265 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 1266 1267 memcpy(keyBuffer, key, keyLength); 1268 keyBuffer[keyLength] = 0; 1269 1270 node_and_key nodeAndKey; 1271 const bplustree_node *node; 1272 1273 CachedNode cached(this); 1274 while (stack.Pop(&nodeAndKey) 1275 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1276 #ifdef DEBUG 1277 NodeChecker checker(node, fNodeSize, "insert"); 1278 #endif 1279 if (node->IsLeaf()) { 1280 // first round, check for duplicate entries 1281 status_t status = _FindKey(node, key, keyLength, 1282 &nodeAndKey.keyIndex); 1283 1284 // is this a duplicate entry? 1285 if (status == B_OK) { 1286 if (fAllowDuplicates) { 1287 status = _InsertDuplicate(transaction, cached, node, 1288 nodeAndKey.keyIndex, value); 1289 if (status != B_OK) 1290 RETURN_ERROR(status); 1291 return B_OK; 1292 } 1293 1294 RETURN_ERROR(B_NAME_IN_USE); 1295 } 1296 } 1297 1298 bplustree_node *writableNode = cached.MakeWritable(transaction); 1299 if (writableNode == NULL) 1300 return B_IO_ERROR; 1301 1302 // is the node big enough to hold the pair? 
1303 if (int32(round_up(sizeof(bplustree_node) 1304 + writableNode->AllKeyLength() + keyLength) 1305 + (writableNode->NumKeys() + 1) * (sizeof(uint16) 1306 + sizeof(off_t))) < fNodeSize) { 1307 _InsertKey(writableNode, nodeAndKey.keyIndex, 1308 keyBuffer, keyLength, value); 1309 _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, 1310 nodeAndKey.keyIndex, 0, 1); 1311 1312 return B_OK; 1313 } else { 1314 CachedNode cachedNewRoot(this); 1315 CachedNode cachedOther(this); 1316 1317 // do we need to allocate a new root node? if so, then do 1318 // it now 1319 off_t newRoot = BPLUSTREE_NULL; 1320 if (nodeAndKey.nodeOffset == fHeader->RootNode()) { 1321 bplustree_node *root; 1322 status_t status = cachedNewRoot.Allocate(transaction, &root, 1323 &newRoot); 1324 if (status < B_OK) { 1325 // The tree is most likely corrupted! 1326 // But it's still sane at leaf level - we could set 1327 // a flag in the header that forces the tree to be 1328 // rebuild next time... 1329 // But since we will have journaling, that's not a big 1330 // problem anyway. 
1331 RETURN_ERROR(status); 1332 } 1333 } 1334 1335 // reserve space for the other node 1336 bplustree_node *other; 1337 off_t otherOffset; 1338 status_t status = cachedOther.Allocate(transaction, &other, 1339 &otherOffset); 1340 if (status < B_OK) { 1341 cachedNewRoot.Free(transaction, newRoot); 1342 RETURN_ERROR(status); 1343 } 1344 1345 if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other, 1346 otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength, 1347 &value) < B_OK) { 1348 // free root node & other node here 1349 cachedOther.Free(transaction, otherOffset); 1350 cachedNewRoot.Free(transaction, newRoot); 1351 1352 RETURN_ERROR(B_ERROR); 1353 } 1354 #ifdef DEBUG 1355 checker.Check("insert split"); 1356 NodeChecker otherChecker(other, fNodeSize, "insert split other"); 1357 #endif 1358 1359 _UpdateIterators(nodeAndKey.nodeOffset, otherOffset, 1360 nodeAndKey.keyIndex, writableNode->NumKeys(), 1); 1361 1362 // update the right link of the node in the left of the new node 1363 if ((other = cachedOther.SetToWritable(transaction, 1364 other->LeftLink())) != NULL) { 1365 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1366 } 1367 1368 // create a new root if necessary 1369 if (newRoot != BPLUSTREE_NULL) { 1370 bplustree_node *root = cachedNewRoot.Node(); 1371 1372 _InsertKey(root, 0, keyBuffer, keyLength, 1373 writableNode->LeftLink()); 1374 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 1375 nodeAndKey.nodeOffset); 1376 1377 bplustree_header *header 1378 = fCachedHeader.MakeWritableHeader(transaction); 1379 if (header == NULL) 1380 return B_IO_ERROR; 1381 1382 // finally, update header to point to the new root 1383 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1384 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32( 1385 header->MaxNumberOfLevels() + 1); 1386 1387 return B_OK; 1388 } 1389 } 1390 } 1391 RETURN_ERROR(B_ERROR); 1392 } 1393 1394 1395 /*! 1396 Removes the duplicate index/value pair from the tree. 
1397 It's part of the private tree interface. 1398 */ 1399 status_t 1400 BPlusTree::_RemoveDuplicate(Transaction &transaction, 1401 const bplustree_node *node, CachedNode &cached, uint16 index, 1402 off_t value) 1403 { 1404 off_t *values = node->Values(); 1405 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1406 1407 CachedNode cachedDuplicate(this); 1408 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1409 bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction, 1410 duplicateOffset, false); 1411 if (duplicate == NULL) 1412 return B_IO_ERROR; 1413 1414 // if it's a duplicate fragment, remove the entry from there 1415 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1416 duplicate_array *array = duplicate->FragmentAt( 1417 bplustree_node::FragmentIndex(oldValue)); 1418 1419 if (array->count > NUM_FRAGMENT_VALUES 1420 || array->count < 1) { 1421 FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n", 1422 (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count)); 1423 return B_BAD_DATA; 1424 } 1425 if (!array->Remove(value)) { 1426 FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n", 1427 value, duplicateOffset)); 1428 return B_ENTRY_NOT_FOUND; 1429 } 1430 1431 // remove the array from the fragment node if it is empty 1432 if (array->count == 1) { 1433 // set the link to the remaining value 1434 if (cached.MakeWritable(transaction) == NULL) 1435 return B_IO_ERROR; 1436 1437 values[index] = array->values[0]; 1438 1439 // Remove the whole fragment node, if this was the only array, 1440 // otherwise free just the array 1441 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1442 status_t status = cachedDuplicate.Free(transaction, duplicateOffset); 1443 if (status < B_OK) 1444 return status; 1445 } else 1446 array->count = 0; 1447 } 1448 return B_OK; 1449 } 1450 1451 // Remove value from a duplicate node! 
1452 1453 duplicate_array *array = NULL; 1454 1455 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1456 FATAL(("invalid duplicate node: first left link points to %Ld!\n", 1457 duplicate->LeftLink())); 1458 return B_BAD_DATA; 1459 } 1460 1461 // Search the duplicate nodes until the entry could be found (and removed) 1462 while (duplicate != NULL) { 1463 array = duplicate->DuplicateArray(); 1464 if (array->count > NUM_DUPLICATE_VALUES 1465 || array->count < 0) { 1466 FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n", 1467 duplicateOffset, array->count)); 1468 return B_BAD_DATA; 1469 } 1470 1471 if (array->Remove(value)) 1472 break; 1473 1474 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1475 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1476 1477 cachedDuplicate.UnsetUnchanged(transaction); 1478 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false); 1479 } 1480 if (duplicate == NULL) 1481 RETURN_ERROR(B_IO_ERROR); 1482 1483 // The entry got removed from the duplicate node, but we might want to free 1484 // it now in case it's empty 1485 1486 while (true) { 1487 off_t left = duplicate->LeftLink(); 1488 off_t right = duplicate->RightLink(); 1489 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1490 1491 if ((isLast && array->count == 1) || array->count == 0) { 1492 // Free empty duplicate page, link their siblings together, and 1493 // update the duplicate link if needed (ie. 
when we either remove 1494 // the last duplicate node or have a new first one) 1495 1496 if (left == BPLUSTREE_NULL) { 1497 // the duplicate link points to us 1498 if (cached.MakeWritable(transaction) == NULL) 1499 return B_IO_ERROR; 1500 1501 if (array->count == 1) { 1502 // this is the last node, and there is only one value left; 1503 // replace the duplicate link with that value, it's no duplicate 1504 // anymore 1505 values[index] = array->values[0]; 1506 } else { 1507 // move the duplicate link to the next node 1508 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1509 BPLUSTREE_DUPLICATE_NODE, right)); 1510 } 1511 } 1512 1513 status_t status; 1514 if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK) 1515 return status; 1516 1517 if (left != BPLUSTREE_NULL 1518 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) { 1519 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1520 1521 // If the next node is the last node, we need to free that node 1522 // and convert the duplicate entry back into a normal entry 1523 array = duplicate->DuplicateArray(); 1524 if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL 1525 && array->count <= NUM_FRAGMENT_VALUES) { 1526 duplicateOffset = left; 1527 continue; 1528 } 1529 } 1530 if (right != BPLUSTREE_NULL 1531 && (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) { 1532 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1533 1534 // Again, we may need to turn the duplicate entry back into a normal entry 1535 array = duplicate->DuplicateArray(); 1536 if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL 1537 && array->count <= NUM_FRAGMENT_VALUES) { 1538 duplicateOffset = right; 1539 continue; 1540 } 1541 } 1542 return B_OK; 1543 } else if (isLast && array->count <= NUM_FRAGMENT_VALUES) { 1544 // If the number of entries fits in a duplicate fragment, then 1545 // either find a free 
fragment node, or convert this node to a 1546 // fragment node. 1547 CachedNode cachedOther(this); 1548 1549 bplustree_node *fragment = NULL; 1550 uint32 fragmentIndex = 0; 1551 off_t offset; 1552 if (_FindFreeDuplicateFragment(transaction, node, cachedOther, 1553 &offset, &fragment, &fragmentIndex) == B_OK) { 1554 // move to other node 1555 duplicate_array *target = fragment->FragmentAt(fragmentIndex); 1556 memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1557 1558 cachedDuplicate.Free(transaction, duplicateOffset); 1559 duplicateOffset = offset; 1560 } else { 1561 // convert node 1562 memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1563 memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 1564 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1565 } 1566 1567 if (cached.MakeWritable(transaction) == NULL) 1568 return B_IO_ERROR; 1569 1570 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1571 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 1572 } 1573 return B_OK; 1574 } 1575 } 1576 1577 1578 /*! 1579 Removes the key with the given index from the specified node. 1580 Since it has to get the key from the node anyway (to obtain it's 1581 pointer), it's not needed to pass the key & its length, although 1582 the calling method (BPlusTree::Remove()) have this data. 1583 */ 1584 void 1585 BPlusTree::_RemoveKey(bplustree_node *node, uint16 index) 1586 { 1587 // should never happen, but who knows? 
1588 if (index > node->NumKeys() && node->NumKeys() > 0) { 1589 FATAL(("Asked me to remove key outer limits: %u\n", index)); 1590 return; 1591 } 1592 1593 off_t *values = node->Values(); 1594 1595 // if we would have to drop the overflow link, drop 1596 // the last key instead and update the overflow link 1597 // to the value of that one 1598 if (!node->IsLeaf() && index == node->NumKeys()) 1599 node->overflow_link = values[--index]; 1600 1601 uint16 length; 1602 uint8 *key = node->KeyAt(index, &length); 1603 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 1604 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1605 FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length, 1606 (int)fStream->BlockRun().allocation_group, fStream->BlockRun().start)); 1607 fStream->GetVolume()->Panic(); 1608 return; 1609 } 1610 1611 uint16 *keyLengths = node->KeyLengths(); 1612 uint8 *keys = node->Keys(); 1613 1614 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 1615 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length); 1616 1617 off_t *newValues = node->Values(); 1618 uint16 *newKeyLengths = node->KeyLengths(); 1619 1620 // move key data 1621 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 1622 1623 // move and update key lengths 1624 if (index > 0 && newKeyLengths != keyLengths) 1625 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 1626 for (uint16 i = index; i < node->NumKeys(); i++) { 1627 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1628 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 1629 } 1630 1631 // move values 1632 if (index > 0) 1633 memmove(newValues, values, index * sizeof(off_t)); 1634 if (node->NumKeys() > index) 1635 memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t)); 1636 } 1637 1638 1639 /*! 1640 Removes the specified key from the tree. 
The "value" parameter is only used 1641 for trees which allow duplicates, so you may safely ignore it. 1642 It's not an optional parameter, so at least you have to think about it. 1643 */ 1644 status_t 1645 BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength, 1646 off_t value) 1647 { 1648 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1649 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1650 RETURN_ERROR(B_BAD_VALUE); 1651 1652 // lock access to stream 1653 WriteLocked locked(fStream->Lock()); 1654 1655 Stack<node_and_key> stack; 1656 if (_SeekDown(stack, key, keyLength) != B_OK) 1657 RETURN_ERROR(B_ERROR); 1658 1659 node_and_key nodeAndKey; 1660 const bplustree_node *node; 1661 1662 CachedNode cached(this); 1663 while (stack.Pop(&nodeAndKey) 1664 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1665 #ifdef DEBUG 1666 NodeChecker checker(node, fNodeSize, "remove"); 1667 #endif 1668 if (node->IsLeaf()) { 1669 // first round, check for duplicate entries 1670 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex); 1671 if (status < B_OK) 1672 RETURN_ERROR(status); 1673 1674 // If we will remove the last key, the iterator will be set 1675 // to the next node after the current - if there aren't any 1676 // more nodes, we need a way to prevent the TreeIterators to 1677 // touch the old node again, we use BPLUSTREE_FREE for this 1678 off_t next = node->RightLink() == BPLUSTREE_NULL 1679 ? BPLUSTREE_FREE : node->RightLink(); 1680 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 1681 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 1682 1683 // is this a duplicate entry? 
1684 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 1685 node->Values()[nodeAndKey.keyIndex]))) { 1686 if (fAllowDuplicates) { 1687 return _RemoveDuplicate(transaction, node, cached, 1688 nodeAndKey.keyIndex, value); 1689 } else { 1690 FATAL(("dupliate node found where no duplicates are allowed!\n")); 1691 RETURN_ERROR(B_ERROR); 1692 } 1693 } 1694 } 1695 1696 bplustree_node *writableNode = cached.MakeWritable(transaction); 1697 if (writableNode == NULL) 1698 return B_IO_ERROR; 1699 1700 // if it's an empty root node, we have to convert it 1701 // to a leaf node by dropping the overflow link, or, 1702 // if it's already a leaf node, just empty it 1703 if (nodeAndKey.nodeOffset == fHeader->RootNode() 1704 && (node->NumKeys() == 0 || node->NumKeys() == 1 && node->IsLeaf())) { 1705 writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1706 writableNode->all_key_count = 0; 1707 writableNode->all_key_length = 0; 1708 1709 // if we've made a leaf node out of the root node, we need 1710 // to reset the maximum number of levels in the header 1711 if (fHeader->MaxNumberOfLevels() != 1) { 1712 bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction); 1713 if (header == NULL) 1714 return B_IO_ERROR; 1715 1716 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 1717 } 1718 return B_OK; 1719 } 1720 1721 // if there is only one key left, we don't have to remove 1722 // it, we can just dump the node (index nodes still have 1723 // the overflow link, so we have to drop the last key) 1724 if (writableNode->NumKeys() > 1 1725 || !writableNode->IsLeaf() && writableNode->NumKeys() == 1) { 1726 _RemoveKey(writableNode, nodeAndKey.keyIndex); 1727 return B_OK; 1728 } 1729 1730 // when we are here, we can just free the node, but 1731 // we have to update the right/left link of the 1732 // siblings first 1733 CachedNode otherCached(this); 1734 bplustree_node *other = otherCached.SetToWritable(transaction, 1735 
writableNode->LeftLink()); 1736 if (other != NULL) 1737 other->right_link = writableNode->right_link; 1738 1739 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL) 1740 other->left_link = writableNode->left_link; 1741 1742 cached.Free(transaction, nodeAndKey.nodeOffset); 1743 } 1744 RETURN_ERROR(B_ERROR); 1745 } 1746 1747 1748 /*! 1749 Replaces the value for the key in the tree. 1750 Returns B_OK if the key could be found and its value replaced, 1751 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 1752 to indicate that something went terribly wrong. 1753 Note that this doesn't work with duplicates - it will just 1754 return B_BAD_TYPE if you call this function on a tree where 1755 duplicates are allowed. 1756 */ 1757 status_t 1758 BPlusTree::Replace(Transaction &transaction, const uint8 *key, 1759 uint16 keyLength, off_t value) 1760 { 1761 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1762 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1763 || key == NULL) 1764 RETURN_ERROR(B_BAD_VALUE); 1765 1766 if (fAllowDuplicates) 1767 RETURN_ERROR(B_BAD_TYPE); 1768 1769 // lock access to stream (a read lock is okay for this purpose) 1770 ReadLocked locked(fStream->Lock()); 1771 1772 off_t nodeOffset = fHeader->RootNode(); 1773 CachedNode cached(this); 1774 const bplustree_node *node; 1775 1776 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1777 uint16 keyIndex = 0; 1778 off_t nextOffset; 1779 status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1780 1781 if (node->OverflowLink() == BPLUSTREE_NULL) { 1782 if (status == B_OK) { 1783 bplustree_node *writableNode = cached.MakeWritable(transaction); 1784 if (writableNode != NULL) 1785 writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value); 1786 else 1787 status = B_IO_ERROR; 1788 } 1789 1790 return status; 1791 } else if (nextOffset == nodeOffset) 1792 RETURN_ERROR(B_ERROR); 1793 1794 nodeOffset = nextOffset; 1795 } 1796 RETURN_ERROR(B_ERROR); 1797 } 1798 1799 
1800 /*! 1801 Searches the key in the tree, and stores the offset found in 1802 _value, if successful. 1803 It's very similar to BPlusTree::SeekDown(), but doesn't fill 1804 a stack while it descends the tree. 1805 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND 1806 if not. It can also return other errors to indicate that 1807 something went wrong. 1808 Note that this doesn't work with duplicates - it will just 1809 return B_BAD_TYPE if you call this function on a tree where 1810 duplicates are allowed. 1811 */ 1812 status_t 1813 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value) 1814 { 1815 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1816 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1817 || key == NULL) 1818 RETURN_ERROR(B_BAD_VALUE); 1819 1820 if (fAllowDuplicates) 1821 RETURN_ERROR(B_BAD_TYPE); 1822 1823 // lock access to stream 1824 ReadLocked locked(fStream->Lock()); 1825 1826 off_t nodeOffset = fHeader->RootNode(); 1827 CachedNode cached(this); 1828 const bplustree_node *node; 1829 1830 #ifdef DEBUG 1831 int32 levels = 0; 1832 #endif 1833 1834 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1835 uint16 keyIndex = 0; 1836 off_t nextOffset; 1837 status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1838 1839 #ifdef DEBUG 1840 levels++; 1841 #endif 1842 if (node->OverflowLink() == BPLUSTREE_NULL) { 1843 if (status == B_OK && _value != NULL) 1844 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 1845 1846 #ifdef DEBUG 1847 if (levels != (int32)fHeader->MaxNumberOfLevels()) 1848 DEBUGGER(("levels don't match")); 1849 #endif 1850 return status; 1851 } else if (nextOffset == nodeOffset) 1852 RETURN_ERROR(B_ERROR); 1853 1854 nodeOffset = nextOffset; 1855 } 1856 FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset)); 1857 RETURN_ERROR(B_ERROR); 1858 } 1859 1860 1861 // #pragma mark - 1862 1863 1864 TreeIterator::TreeIterator(BPlusTree *tree) 1865 : 1866 fTree(tree), 1867 
fCurrentNodeOffset(BPLUSTREE_NULL), 1868 fNext(NULL) 1869 { 1870 tree->_AddIterator(this); 1871 } 1872 1873 1874 TreeIterator::~TreeIterator() 1875 { 1876 if (fTree) 1877 fTree->_RemoveIterator(this); 1878 } 1879 1880 1881 status_t 1882 TreeIterator::Goto(int8 to) 1883 { 1884 if (fTree == NULL || fTree->fHeader == NULL) 1885 RETURN_ERROR(B_BAD_VALUE); 1886 1887 // lock access to stream 1888 ReadLocked locked(fTree->fStream->Lock()); 1889 1890 off_t nodeOffset = fTree->fHeader->RootNode(); 1891 CachedNode cached(fTree); 1892 const bplustree_node *node; 1893 1894 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1895 // is the node a leaf node? 1896 if (node->OverflowLink() == BPLUSTREE_NULL) { 1897 fCurrentNodeOffset = nodeOffset; 1898 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 1899 fDuplicateNode = BPLUSTREE_NULL; 1900 1901 return B_OK; 1902 } 1903 1904 // get the next node offset depending on the direction (and if there 1905 // are any keys in that node at all) 1906 off_t nextOffset; 1907 if (to == BPLUSTREE_END || node->all_key_count == 0) 1908 nextOffset = node->OverflowLink(); 1909 else { 1910 if (node->AllKeyLength() > fTree->fNodeSize 1911 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize 1912 - 8 * node->NumKeys()) 1913 RETURN_ERROR(B_ERROR); 1914 1915 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 1916 } 1917 if (nextOffset == nodeOffset) 1918 break; 1919 1920 nodeOffset = nextOffset; 1921 } 1922 FATAL(("%s fails\n", __FUNCTION__)); 1923 1924 RETURN_ERROR(B_ERROR); 1925 } 1926 1927 1928 /*! 1929 Iterates through the tree in the specified direction. 1930 When it iterates through duplicates, the "key" is only updated for the 1931 first entry - if you need to know when this happens, use the "duplicate" 1932 parameter which is 0 for no duplicate, 1 for the first, and 2 for all 1933 the other duplicates. 
1934 That's not too nice, but saves the 256 bytes that would be needed to 1935 store the last key - if this will ever become an issue, it will be 1936 easy to change. 1937 The other advantage of this is, that the queries can skip all duplicates 1938 at once when they are not relevant to them. 1939 */ 1940 status_t 1941 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, 1942 uint16 maxLength, off_t *value, uint16 *duplicate) 1943 { 1944 if (fTree == NULL) 1945 return B_INTERRUPTED; 1946 1947 bool forward = direction == BPLUSTREE_FORWARD; 1948 1949 if (fCurrentNodeOffset == BPLUSTREE_NULL 1950 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 1951 RETURN_ERROR(B_ERROR); 1952 1953 // if the tree was emptied since the last call 1954 if (fCurrentNodeOffset == BPLUSTREE_FREE) 1955 return B_ENTRY_NOT_FOUND; 1956 1957 // lock access to stream 1958 ReadLocked locked(fTree->fStream->Lock()); 1959 1960 CachedNode cached(fTree); 1961 const bplustree_node *node; 1962 1963 if (fDuplicateNode != BPLUSTREE_NULL) { 1964 // regardless of traverse direction the duplicates are always presented in 1965 // the same order; since they are all considered as equal, this shouldn't 1966 // cause any problems 1967 1968 if (!fIsFragment || fDuplicate < fNumDuplicates) 1969 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 1970 else 1971 node = NULL; 1972 1973 if (node != NULL) { 1974 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 1975 // if the node is out of duplicates, we go directly to the next one 1976 fDuplicateNode = node->RightLink(); 1977 if (fDuplicateNode != BPLUSTREE_NULL 1978 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 1979 fNumDuplicates = node->CountDuplicates(fDuplicateNode, false); 1980 fDuplicate = 0; 1981 } 1982 } 1983 if (fDuplicate < fNumDuplicates) { 1984 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++); 1985 if (duplicate) 1986 *duplicate = 2; 1987 return B_OK; 1988 } 1989 } 1990 
fDuplicateNode = BPLUSTREE_NULL; 1991 } 1992 1993 off_t savedNodeOffset = fCurrentNodeOffset; 1994 int32 savedKey = fCurrentKey; 1995 1996 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 1997 RETURN_ERROR(B_ERROR); 1998 1999 if (duplicate) 2000 *duplicate = 0; 2001 2002 fCurrentKey += direction; 2003 2004 // is the current key in the current node? 2005 while ((forward && fCurrentKey >= node->NumKeys()) 2006 || (!forward && fCurrentKey < 0)) { 2007 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink(); 2008 2009 // are there any more nodes? 2010 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 2011 node = cached.SetTo(fCurrentNodeOffset); 2012 if (!node) 2013 RETURN_ERROR(B_ERROR); 2014 2015 // reset current key 2016 fCurrentKey = forward ? 0 : node->NumKeys(); 2017 } else { 2018 // there are no nodes left, so turn back to the last key 2019 fCurrentNodeOffset = savedNodeOffset; 2020 fCurrentKey = savedKey; 2021 2022 return B_ENTRY_NOT_FOUND; 2023 } 2024 } 2025 2026 if (node->all_key_count == 0) 2027 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 2028 2029 uint16 length; 2030 uint8 *keyStart = node->KeyAt(fCurrentKey, &length); 2031 if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize 2032 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2033 fTree->fStream->GetVolume()->Panic(); 2034 RETURN_ERROR(B_BAD_DATA); 2035 } 2036 2037 length = min_c(length, maxLength); 2038 memcpy(key, keyStart, length); 2039 2040 if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) { 2041 // terminate string type 2042 if (length == maxLength) 2043 length--; 2044 ((char *)key)[length] = '\0'; 2045 } 2046 *keyLength = length; 2047 2048 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 2049 2050 // duplicate fragments? 
	// NOTE(review): this is the tail of a function whose beginning lies
	// before this chunk; 'offset', 'node', 'cached', 'value' and
	// 'duplicate' are declared above. It checks whether the value found
	// in the leaf is really a link to a duplicate node/fragment, and if
	// so, positions the iterator on the first duplicate.
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
		fDuplicateNode = offset;

		// FragmentOffset() strips the fragment index bits to get the real
		// on-disk offset of the node holding the duplicates
		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			// report the first duplicate value instead of the link
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// shouldn't happen, but we're dealing here with potentially corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}


/*!
	This is more or less a copy of BPlusTree::Find() - but it just
	sets the current position in the iterator, regardless of if the
	key could be found or not.
*/
status_t
TreeIterator::Find(const uint8 *key, uint16 keyLength)
{
	// a NULL fTree means the iterator has been Stop()ed
	if (fTree == NULL)
		return B_INTERRUPTED;
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	off_t nodeOffset = fTree->fHeader->RootNode();

	// walk down from the root until we hit a leaf (no overflow link)
	CachedNode cached(fTree);
	const bplustree_node *node;
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
			&nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// we reached a leaf - take the position whether the key was
			// actually found (status == B_OK) or not
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = keyIndex - 1;
			fDuplicateNode = BPLUSTREE_NULL;

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);
				// corrupt tree: following the child link would loop forever

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


void
TreeIterator::SkipDuplicates()
{
	// forget the duplicate node/fragment we were iterating over
	fDuplicateNode = BPLUSTREE_NULL;
}


/*!	Lets the iterator follow a change made to the node it currently points
	into: \a nextOffset is the node the upper half was moved to on a split
	(with \a splitAt keys moved), and \a change adjusts the key position
	(presumably +1/-1 for a key inserted/removed at \a keyIndex - TODO
	confirm against the callers).
*/
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
{
	// we only care about changes to the node we are currently in
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node has been split - if our position was moved into the new
		// node, make the indices relative to that node
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// ToDo: duplicate handling!
}


void
TreeIterator::Stop()
{
	// invalidate the iterator - subsequent operations return B_INTERRUPTED
	fTree = NULL;
}


#ifdef DEBUG
void
TreeIterator::Dump()
{
	// dump the iterator state to the debug output ("%Ld"/"%ld" are the
	// BeOS-style format specifiers for int64/int32 used throughout this file)
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif


// #pragma mark -


void
bplustree_node::Initialize()
{
	// an empty node: no siblings, no overflow link, no keys
	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	all_key_count = 0;
	all_key_length = 0;
}


/*!	Returns a pointer to the \a index'th key of this node, and its length
	in \a keyLength. The on-disk length array stores accumulated lengths,
	hence the subtraction of the previous entry.
	Returns NULL if \a index is out of range.
*/
uint8 *
bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
{
	// NOTE(review): the bounds check uses '>' and therefore lets
	// index == NumKeys() through, which reads one entry past the key
	// length array - confirm no caller relies on that before tightening
	// it to '>='
	if (index < 0 || index > NumKeys())
		return NULL;

	uint8 *keyStart = Keys();
	uint16 *keyLengths = KeyLengths();

	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
	if (index > 0)
		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);

	return keyStart;
}


/*!	Returns the number of duplicates stored in the duplicate fragment or
	node that \a offset links to. For whole duplicate nodes the count is
	kept in the overflow link field.
*/
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		// the low 10 bits of the link select the fragment within the node;
		// each fragment occupies NUM_FRAGMENT_VALUES + 1 off_t slots, the
		// first of which holds the count.
		// NOTE(review): DuplicateAt() computes the same slot index with a
		// literal 8 - keep the two consistent should NUM_FRAGMENT_VALUES
		// ever change
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t *)this)[fragment];
	}
	return OverflowLink();
		// NOTE(review): the off_t count is truncated to uint8 here - only
		// safe while a duplicate node can never hold more than 255 values;
		// verify against NUM_DUPLICATE_VALUES
}


/*!	Returns the \a index'th duplicate value of the fragment or node that
	\a offset links to, skipping the leading count/link slots.
*/
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
			// first off_t slot of the selected 64 byte fragment
	else
		start = 2;
			// skip left/right links; slot 2 (overflow link) holds the count

	return ((off_t *)this)[start + 1 + index];
}


/*!	Although the name suggests it, this function doesn't return the real
	used fragment count; at least, it can only count to two: it returns
	0, if there is no fragment used, 1 if there is only one fragment
	used, and 2 if there are at least 2 fragments used.
*/
uint32
bplustree_node::FragmentsUsed(uint32 nodeSize) const
{
	uint32 used = 0;
	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
		duplicate_array *array = FragmentAt(i);
		if (array->count > 0 && ++used > 1)
			return used;
				// early out - callers only distinguish 0, 1, and "more"
	}
	return used;
}


#ifdef DEBUG
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	// basic sanity: neither the key count nor the total key length may
	// exceed the node size
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8 *key = KeyAt(i, &length);
		// each key (plus its value and length entries) must lie completely
		// inside the node, and must respect the maximum key length
		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %ld\n", this, i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		// -1 is never a valid value (it is BPLUSTREE_NULL territory)
		if (Values()[i] == -1) {
			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
#endif


// #pragma mark -


/*!	Compares two keys of type \a type; returns a negative, zero, or
	positive value like strcmp(). A NULL key sorts after a non-NULL one
	(see below). Unknown types always compare as "doesn't match" (-1).
*/
int32
compareKeys(type_code type, const void *key1, int keyLength1,
	const void *key2, int keyLength2)
{
	// if one of the keys is NULL, bail out gracefully
	if (key1 == NULL || key2 == NULL) {
		// even if it's most probably a bug in the calling code, we try to
		// give a meaningful result
		if (key1 == NULL && key2 != NULL)
			return 1;
		else if (key2 == NULL && key1 != NULL)
			return -1;

		return 0;
	}

	switch (type) {
		case B_STRING_TYPE:
		{
			int len = min_c(keyLength1, keyLength2);
			int result = strncmp((const char *)key1, (const char *)key2, len);

			// if one key is a prefix of the other, the shorter one sorts
			// first - unless both terminate right at 'len'.
			// NOTE(review): this reads the byte at index 'len', i.e. one
			// past the shorter key's stated length - assumes the buffers
			// are readable (NUL-terminated) there; confirm
			if (result == 0
				&& !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0'))
				result = keyLength1 - keyLength2;

			return result;
		}

		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return *(int32 *)key1 - *(int32 *)key2;
				// NOTE(review): the subtraction overflows for operands
				// more than INT32_MAX apart and then yields the wrong
				// sign - but existing on-disk trees were sorted with this
				// comparator, so confirm before changing it

		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			if (*(uint32 *)key1 == *(uint32 *)key2)
				return 0;
			else if (*(uint32 *)key1 > *(uint32 *)key2)
				return 1;

			return -1;

		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			if (*(int64 *)key1 == *(int64 *)key2)
				return 0;
			else if (*(int64 *)key1 > *(int64 *)key2)
				return 1;

			return -1;

		case B_UINT64_TYPE:
			if (*(uint64 *)key1 == *(uint64 *)key2)
				return 0;
			else if (*(uint64 *)key1 > *(uint64 *)key2)
				return 1;

			return -1;

		case B_FLOAT_TYPE:
		{
			float result = *(float *)key1 - *(float *)key2;
			if (result == 0.0f)
				return 0;

			return (result < 0.0f) ? -1 : 1;
		}

		case B_DOUBLE_TYPE:
		{
			double result = *(double *)key1 - *(double *)key2;
			if (result == 0.0)
				return 0;

			return (result < 0.0) ? -1 : 1;
		}
	}

	// if the type is unknown, the entries don't match...
	return -1;
}
