1 /* BPlusTree - BFS B+Tree implementation 2 * 3 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares 4 * no code but achieves binary compatibility with the on disk format. 5 * 6 * Copyright 2001-2005, Axel Dörfler, axeld@pinc-software.de. 7 * This file may be used under the terms of the MIT License. 8 */ 9 10 11 #include "Debug.h" 12 #include "BPlusTree.h" 13 #include "Inode.h" 14 #include "Utility.h" 15 16 #include <util/kernel_cpp.h> 17 #include <util/Stack.h> 18 #include <TypeConstants.h> 19 20 #include <string.h> 21 #include <stdlib.h> 22 #include <stdio.h> 23 24 25 #ifdef DEBUG 26 class NodeChecker { 27 public: 28 NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text) 29 : 30 fNode(node), 31 fSize(nodeSize), 32 fText(text) 33 { 34 Check("integrity check failed on construction."); 35 } 36 37 ~NodeChecker() 38 { 39 Check("integrity check failed on destruction."); 40 } 41 42 void 43 Check(const char *message) 44 { 45 if (fNode->CheckIntegrity(fSize) < B_OK) 46 dprintf("%s: %s\n", fText, message); 47 } 48 49 private: 50 const bplustree_node *fNode; 51 int32 fSize; 52 const char *fText; 53 }; 54 #endif 55 56 57 // Node Caching for the BPlusTree class 58 // 59 // With write support, there is the need for a function that allocates new 60 // nodes by either returning empty nodes, or by growing the file's data stream 61 // 62 // !! The CachedNode class assumes that you have properly locked the stream 63 // !! before asking for nodes. 64 // 65 // Note: This code will fail if the block size is smaller than the node size! 66 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 67 // is hard-coded to 1024 bytes, that's not an issue now. 68 69 void 70 CachedNode::UnsetUnchanged(Transaction &transaction) 71 { 72 if (fTree == NULL || fTree->fStream == NULL) 73 return; 74 75 if (fNode != NULL) { 76 void *cache = fTree->fStream->GetVolume()->BlockCache(); 77 78 block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID()); 79 block_cache_put(cache, fBlockNumber); 80 fNode = NULL; 81 } 82 } 83 84 85 void 86 CachedNode::Unset() 87 { 88 if (fTree == NULL || fTree->fStream == NULL) 89 return; 90 91 if (fNode != NULL) { 92 block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber); 93 fNode = NULL; 94 } 95 } 96 97 98 const bplustree_node * 99 CachedNode::SetTo(off_t offset, bool check) 100 { 101 if (fTree == NULL || fTree->fStream == NULL) { 102 REPORT_ERROR(B_BAD_VALUE); 103 return NULL; 104 } 105 106 Unset(); 107 108 // You can only ask for nodes at valid positions - you can't 109 // even access the b+tree header with this method (use SetToHeader() 110 // instead) 111 if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize 112 || offset <= 0 113 || (offset % fTree->fNodeSize) != 0) 114 return NULL; 115 116 if (InternalSetTo(NULL, offset) != NULL && check) { 117 // sanity checks (links, all_key_count) 118 if (!fTree->fHeader->CheckNode(fNode)) { 119 FATAL(("invalid node (%p) read from offset %Ld, inode at %Ld\n", 120 fNode, offset, fTree->fStream->ID())); 121 return NULL; 122 } 123 } 124 return fNode; 125 } 126 127 128 bplustree_node * 129 CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check) 130 { 131 if (fTree == NULL || fTree->fStream == NULL) { 132 REPORT_ERROR(B_BAD_VALUE); 133 return NULL; 134 } 135 136 Unset(); 137 138 // You can only ask for nodes at valid positions - you can't 139 // even access the b+tree header with this method (use SetToHeader() 140 // instead) 141 if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize 142 || offset <= 0 143 || (offset % fTree->fNodeSize) != 0) 144 return NULL; 145 146 if (InternalSetTo(&transaction, offset) != NULL && check) { 147 // sanity checks (links, all_key_count) 148 if (!fTree->fHeader->CheckNode(fNode)) { 149 FATAL(("invalid node [%p] read from offset %Ld, inode at %Ld\n", 150 fNode, offset, fTree->fStream->ID())); 151 return NULL; 152 } 153 } 154 return fNode; 155 } 156 157 158 bplustree_node * 159 CachedNode::MakeWritable(Transaction &transaction) 160 { 161 if (fNode == NULL) 162 return NULL; 163 164 if (block_cache_make_writable(transaction.GetVolume()->BlockCache(), 165 fBlockNumber, transaction.ID()) == B_OK) 166 return fNode; 167 168 return NULL; 169 } 170 171 172 bplustree_header * 173 CachedNode::MakeWritableHeader(Transaction &transaction) 174 { 175 return (bplustree_header *)MakeWritable(transaction); 176 } 177 178 179 const bplustree_header * 180 CachedNode::SetToHeader() 181 { 182 if (fTree == NULL || fTree->fStream == NULL) { 183 REPORT_ERROR(B_BAD_VALUE); 184 return NULL; 185 } 186 187 Unset(); 188 189 InternalSetTo(NULL, 0LL); 190 return (bplustree_header *)fNode; 191 } 192 193 194 bplustree_header * 195 CachedNode::SetToWritableHeader(Transaction &transaction) 196 { 197 if (fTree == NULL || fTree->fStream == NULL) { 198 REPORT_ERROR(B_BAD_VALUE); 199 return NULL; 200 } 201 202 Unset(); 203 204 InternalSetTo(&transaction, 0LL); 205 return (bplustree_header *)fNode; 206 } 207 208 209 bplustree_node * 210 CachedNode::InternalSetTo(Transaction *transaction, off_t offset) 211 { 212 fNode = NULL; 213 fOffset = offset; 214 215 off_t fileOffset; 216 block_run run; 217 if (offset < fTree->fStream->Size() 218 && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) { 219 Volume *volume = fTree->fStream->GetVolume(); 220 221 int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); 222 fBlockNumber = volume->ToBlock(run) + blockOffset; 223 uint8 *block; 224 225 if (transaction != NULL) { 226 block = (uint8 *)block_cache_get_writable(volume->BlockCache(), fBlockNumber, transaction->ID()); 227 fWritable = true; 228 } else { 229 block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber); 230 fWritable = false; 231 } 232 233 if (block) { 234 // the node is somewhere in that block... (confusing offset calculation) 235 fNode = (bplustree_node *)(block + offset - 236 (fileOffset + (blockOffset << volume->BlockShift()))); 237 } else 238 REPORT_ERROR(B_IO_ERROR); 239 } 240 return fNode; 241 } 242 243 244 status_t 245 CachedNode::Free(Transaction &transaction, off_t offset) 246 { 247 if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL) 248 RETURN_ERROR(B_BAD_VALUE); 249 250 // ToDo: scan the free nodes list and remove all nodes at the end 251 // of the tree - perhaps that shouldn't be done everytime that 252 // function is called, perhaps it should be done when the directory 253 // inode is closed or based on some calculation or whatever... 254 255 bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction); 256 if (header == NULL) 257 return B_IO_ERROR; 258 259 // if the node is the last one in the tree, we shrink 260 // the tree and file size by one node 261 off_t lastOffset = header->MaximumSize() - fTree->fNodeSize; 262 if (offset == lastOffset) { 263 status_t status = fTree->fStream->SetFileSize(transaction, lastOffset); 264 if (status < B_OK) 265 return status; 266 267 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset); 268 return B_OK; 269 } 270 271 // add the node to the free nodes list 272 fNode->left_link = header->free_node_pointer; 273 fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 274 275 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset); 276 return B_OK; 277 } 278 279 280 status_t 281 CachedNode::Allocate(Transaction &transaction, bplustree_node **_node, off_t *_offset) 282 { 283 if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL) 284 RETURN_ERROR(B_BAD_VALUE); 285 286 Unset(); 287 288 bplustree_header *header; 289 status_t status; 290 291 // if there are any free nodes, recycle them 292 if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) { 293 header = fTree->fCachedHeader.MakeWritableHeader(transaction); 294 if (header == NULL) 295 return B_IO_ERROR; 296 297 *_offset = header->FreeNode(); 298 *_node = fNode; 299 300 // set new free node pointer 301 header->free_node_pointer = fNode->left_link; 302 fNode->Initialize(); 303 return B_OK; 304 } 305 306 // allocate space for a new node 307 Inode *stream = fTree->fStream; 308 if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK) 309 return status; 310 311 header = fTree->fCachedHeader.MakeWritableHeader(transaction); 312 if (header == NULL) 313 return B_IO_ERROR; 314 315 // the maximum_size has to be changed before the call to SetTo() - or 316 // else it will fail because the requested node is out of bounds 317 off_t offset = header->MaximumSize(); 318 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() + fTree->fNodeSize); 319 320 if (SetToWritable(transaction, offset, false) != NULL) { 321 fNode->Initialize(); 322 323 *_offset = offset; 324 *_node = fNode; 325 return B_OK; 326 } 327 328 // revert header size to old value 329 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() - fTree->fNodeSize); 330 RETURN_ERROR(B_ERROR); 331 } 332 333 334 // #pragma mark - 335 336 337 BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize) 338 : 339 fStream(NULL), 340 fHeader(NULL), 341 fCachedHeader(this) 342 { 343 SetTo(transaction, stream); 344 } 345 346 347 BPlusTree::BPlusTree(Inode *stream) 348 : 349 fStream(NULL), 350 fHeader(NULL), 351 fCachedHeader(this) 352 { 353 SetTo(stream); 354 } 355 356 357 BPlusTree::BPlusTree() 358 : 359 fStream(NULL), 360 fHeader(NULL), 361 fCachedHeader(this), 362 fNodeSize(BPLUSTREE_NODE_SIZE), 363 fAllowDuplicates(true), 364 fStatus(B_NO_INIT) 365 { 366 } 367 368 369 BPlusTree::~BPlusTree() 370 { 371 // if there are any TreeIterators left, we need to stop them 372 // (can happen when the tree's inode gets deleted while 373 // traversing the tree - a TreeIterator doesn't lock the inode) 374 if (fIteratorLock.Lock() < B_OK) 375 return; 376 377 TreeIterator *iterator = NULL; 378 while ((iterator = fIterators.Next(iterator)) != NULL) 379 iterator->Stop(); 380 381 fIteratorLock.Unlock(); 382 } 383 384 385 /** Create a new B+Tree on the specified stream */ 386 387 status_t 388 BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize) 389 { 390 // initializes in-memory B+Tree 391 392 fStream = stream; 393 394 bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction); 395 if (header == NULL) { 396 // allocate space for new header + node! 397 fStatus = stream->SetFileSize(transaction, nodeSize * 2); 398 if (fStatus < B_OK) 399 RETURN_ERROR(fStatus); 400 401 header = fCachedHeader.SetToWritableHeader(transaction); 402 if (header == NULL) 403 RETURN_ERROR(fStatus = B_ERROR); 404 } 405 406 fHeader = header; 407 fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR 408 && stream->BlockRun() != stream->Parent()) 409 || (stream->Mode() & S_ALLOW_DUPS) != 0; 410 411 fNodeSize = nodeSize; 412 413 // initialize b+tree header 414 header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC); 415 header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize); 416 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 417 header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode())); 418 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize); 419 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 420 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2); 421 422 // initialize b+tree root node 423 CachedNode cached(this); 424 cached.SetToWritable(transaction, header->RootNode(), false); 425 if (cached.Node() == NULL) 426 RETURN_ERROR(B_IO_ERROR); 427 428 cached.Node()->Initialize(); 429 430 return fStatus = B_OK; 431 } 432 433 434 status_t 435 BPlusTree::SetTo(Inode *stream) 436 { 437 if (stream == NULL) 438 RETURN_ERROR(fStatus = B_BAD_VALUE); 439 440 // get on-disk B+Tree header 441 442 fCachedHeader.Unset(); 443 fStream = stream; 444 445 fHeader = fCachedHeader.SetToHeader(); 446 if (fHeader == NULL) 447 RETURN_ERROR(fStatus = B_NO_INIT); 448 449 // is header valid? 450 451 if (fHeader->MaximumSize() != stream->Size()) { 452 FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n", 453 fHeader->MaximumSize(), stream->Size())); 454 // we can't change the header since we don't have a transaction 455 //fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size()); 456 } 457 if (fHeader->Magic() != BPLUSTREE_MAGIC 458 || (fHeader->RootNode() % fHeader->NodeSize()) != 0 459 || !fHeader->IsValidLink(fHeader->RootNode()) 460 || !fHeader->IsValidLink(fHeader->FreeNode())) { 461 #ifdef DEBUG 462 dump_bplustree_header(fHeader); 463 dump_block((const char *)fHeader, 128); 464 #endif 465 RETURN_ERROR(fStatus = B_BAD_DATA); 466 } 467 468 fNodeSize = fHeader->NodeSize(); 469 470 { 471 uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX, 472 S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX}; 473 uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 474 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX); 475 476 if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE 477 || (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->DataType()] != mode 478 || !stream->IsContainer()) { 479 D( dump_bplustree_header(fHeader); 480 dump_inode(&stream->Node()); 481 ); 482 RETURN_ERROR(fStatus = B_BAD_TYPE); 483 } 484 485 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 486 // in the original BFS code - we will honour it nevertheless 487 fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR 488 && stream->BlockRun() != stream->Parent()) 489 || (stream->Mode() & S_ALLOW_DUPS) != 0; 490 } 491 492 CachedNode cached(this, fHeader->RootNode()); 493 RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA); 494 } 495 496 497 status_t 498 BPlusTree::InitCheck() 499 { 500 return fStatus; 501 } 502 503 504 int32 505 BPlusTree::TypeCodeToKeyType(type_code code) 506 { 507 switch (code) { 508 case B_STRING_TYPE: 509 return BPLUSTREE_STRING_TYPE; 510 case B_SSIZE_T_TYPE: 511 case B_INT32_TYPE: 512 return BPLUSTREE_INT32_TYPE; 513 case B_SIZE_T_TYPE: 514 case B_UINT32_TYPE: 515 return BPLUSTREE_UINT32_TYPE; 516 case B_OFF_T_TYPE: 517 case B_INT64_TYPE: 518 return BPLUSTREE_INT64_TYPE; 519 case B_UINT64_TYPE: 520 return BPLUSTREE_UINT64_TYPE; 521 case B_FLOAT_TYPE: 522 return BPLUSTREE_FLOAT_TYPE; 523 case B_DOUBLE_TYPE: 524 return BPLUSTREE_DOUBLE_TYPE; 525 } 526 return -1; 527 } 528 529 530 int32 531 BPlusTree::ModeToKeyType(mode_t mode) 532 { 533 switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 534 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { 535 case S_INT_INDEX: 536 return BPLUSTREE_INT32_TYPE; 537 case S_UINT_INDEX: 538 return BPLUSTREE_UINT32_TYPE; 539 case S_LONG_LONG_INDEX: 540 return BPLUSTREE_INT64_TYPE; 541 case S_ULONG_LONG_INDEX: 542 return BPLUSTREE_UINT64_TYPE; 543 case S_FLOAT_INDEX: 544 return BPLUSTREE_FLOAT_TYPE; 545 case S_DOUBLE_INDEX: 546 return BPLUSTREE_DOUBLE_TYPE; 547 case S_STR_INDEX: 548 default: 549 // default is for standard directories 550 return BPLUSTREE_STRING_TYPE; 551 } 552 } 553 554 555 // #pragma mark - 556 557 558 void 559 BPlusTree::UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, 560 int8 change) 561 { 562 // Although every iterator which is affected by this update currently 563 // waits on a semaphore, other iterators could be added/removed at 564 // any time, so we need to protect this loop 565 if (fIteratorLock.Lock() < B_OK) 566 return; 567 568 TreeIterator *iterator = NULL; 569 while ((iterator = fIterators.Next(iterator)) != NULL) 570 iterator->Update(offset, nextOffset, keyIndex, splitAt, change); 571 572 fIteratorLock.Unlock(); 573 } 574 575 576 void 577 BPlusTree::AddIterator(TreeIterator *iterator) 578 { 579 if (fIteratorLock.Lock() < B_OK) 580 return; 581 582 fIterators.Add(iterator); 583 584 fIteratorLock.Unlock(); 585 } 586 587 588 void 589 BPlusTree::RemoveIterator(TreeIterator *iterator) 590 { 591 if (fIteratorLock.Lock() < B_OK) 592 return; 593 594 fIterators.Remove(iterator); 595 596 fIteratorLock.Unlock(); 597 } 598 599 600 int32 601 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2) 602 { 603 type_code type = 0; 604 switch (fHeader->data_type) { 605 case BPLUSTREE_STRING_TYPE: 606 type = B_STRING_TYPE; 607 break; 608 case BPLUSTREE_INT32_TYPE: 609 type = B_INT32_TYPE; 610 break; 611 case BPLUSTREE_UINT32_TYPE: 612 type = B_UINT32_TYPE; 613 break; 614 case BPLUSTREE_INT64_TYPE: 615 type = B_INT64_TYPE; 616 break; 617 case BPLUSTREE_UINT64_TYPE: 618 type = B_UINT64_TYPE; 619 break; 620 case BPLUSTREE_FLOAT_TYPE: 621 type = B_FLOAT_TYPE; 622 break; 623 case BPLUSTREE_DOUBLE_TYPE: 624 type = B_DOUBLE_TYPE; 625 break; 626 } 627 return compareKeys(type, key1, keyLength1, key2, keyLength2); 628 } 629 630 631 status_t 632 BPlusTree::FindKey(const bplustree_node *node, const uint8 *key, uint16 keyLength, 633 uint16 *_index, off_t *_next) 634 { 635 #ifdef DEBUG 636 NodeChecker checker(node, fNodeSize, "find"); 637 #endif 638 639 if (node->all_key_count == 0) { 640 if (_index) 641 *_index = 0; 642 if (_next) 643 *_next = node->OverflowLink(); 644 return B_ENTRY_NOT_FOUND; 645 } 646 647 off_t *values = node->Values(); 648 int16 saveIndex = -1; 649 650 // binary search in the key array 651 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { 652 uint16 i = (first + last) >> 1; 653 654 uint16 searchLength; 655 uint8 *searchKey = node->KeyAt(i, &searchLength); 656 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 657 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { 658 fStream->GetVolume()->Panic(); 659 RETURN_ERROR(B_BAD_DATA); 660 } 661 662 int32 cmp = CompareKeys(key, keyLength, searchKey, searchLength); 663 if (cmp < 0) { 664 last = i - 1; 665 saveIndex = i; 666 } else if (cmp > 0) { 667 saveIndex = first = i + 1; 668 } else { 669 if (_index) 670 *_index = i; 671 if (_next) 672 *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]); 673 return B_OK; 674 } 675 } 676 677 if (_index) 678 *_index = saveIndex; 679 if (_next) { 680 if (saveIndex == node->NumKeys()) 681 *_next = node->OverflowLink(); 682 else 683 *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); 684 } 685 return B_ENTRY_NOT_FOUND; 686 } 687 688 689 /** Prepares the stack to contain all nodes that were passed while 690 * following the key, from the root node to the leaf node that could 691 * or should contain that key. 692 */ 693 694 status_t 695 BPlusTree::SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength) 696 { 697 // set the root node to begin with 698 node_and_key nodeAndKey; 699 nodeAndKey.nodeOffset = fHeader->RootNode(); 700 701 CachedNode cached(this); 702 const bplustree_node *node; 703 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 704 // if we are already on leaf level, we're done 705 if (node->OverflowLink() == BPLUSTREE_NULL) { 706 // node that the keyIndex is not properly set here (but it's not 707 // needed in the calling functions anyway)! 708 nodeAndKey.keyIndex = 0; 709 stack.Push(nodeAndKey); 710 return B_OK; 711 } 712 713 off_t nextOffset; 714 status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset); 715 716 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 717 RETURN_ERROR(B_ERROR); 718 719 // put the node offset & the correct keyIndex on the stack 720 stack.Push(nodeAndKey); 721 722 nodeAndKey.nodeOffset = nextOffset; 723 } 724 725 FATAL(("BPlusTree::SeekDown() could not open node %Ld\n", nodeAndKey.nodeOffset)); 726 return B_ERROR; 727 } 728 729 730 /** This will find a free duplicate fragment in the given bplustree_node. 731 * The CachedNode will be set to the writable fragment on success. 732 */ 733 734 status_t 735 BPlusTree::FindFreeDuplicateFragment(Transaction &transaction, 736 const bplustree_node *node, CachedNode &cached, 737 off_t *_offset, bplustree_node **_fragment, uint32 *_index) 738 { 739 off_t *values = node->Values(); 740 for (int32 i = 0; i < node->NumKeys(); i++) { 741 off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]); 742 743 // does the value link to a duplicate fragment? 744 if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT) 745 continue; 746 747 const bplustree_node *fragment = cached.SetTo( 748 bplustree_node::FragmentOffset(value), false); 749 if (fragment == NULL) { 750 FATAL(("Could not get duplicate fragment at %Ld\n", value)); 751 continue; 752 } 753 754 // see if there is some space left for us 755 uint32 num = bplustree_node::MaxFragments(fNodeSize); 756 for (uint32 j = 0; j < num; j++) { 757 duplicate_array *array = fragment->FragmentAt(j); 758 759 if (array->count == 0) { 760 // found an unused fragment 761 *_fragment = cached.MakeWritable(transaction); 762 if (*_fragment == NULL) 763 return B_IO_ERROR; 764 765 *_offset = bplustree_node::FragmentOffset(value); 766 *_index = j; 767 return B_OK; 768 } 769 } 770 } 771 return B_ENTRY_NOT_FOUND; 772 } 773 774 775 status_t 776 BPlusTree::InsertDuplicate(Transaction &transaction, CachedNode &cached, 777 const bplustree_node *node, uint16 index, off_t value) 778 { 779 CachedNode cachedDuplicate(this); 780 off_t *values = node->Values(); 781 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 782 status_t status; 783 off_t offset; 784 785 if (bplustree_node::IsDuplicate(oldValue)) { 786 // If it's a duplicate fragment, try to insert it into that, or if it 787 // doesn't fit anymore, create a new duplicate node 788 789 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 790 bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction, 791 bplustree_node::FragmentOffset(oldValue), false); 792 if (duplicate == NULL) 793 return B_IO_ERROR; 794 795 duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue)); 796 if (array->count > NUM_FRAGMENT_VALUES 797 || array->count < 1) { 798 FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n", 799 bplustree_node::FragmentIndex(oldValue), 800 bplustree_node::FragmentOffset(oldValue), 801 array->count)); 802 return B_BAD_DATA; 803 } 804 805 if (array->count < NUM_FRAGMENT_VALUES) { 806 array->Insert(value); 807 } else { 808 // test if the fragment will be empty if we remove this key's values 809 if (duplicate->FragmentsUsed(fNodeSize) < 2) { 810 // the node will be empty without our values, so let us 811 // reuse it as a duplicate node 812 offset = bplustree_node::FragmentOffset(oldValue); 813 814 memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 815 duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64( 816 (uint64)BPLUSTREE_NULL); 817 818 array = duplicate->DuplicateArray(); 819 array->Insert(value); 820 } else { 821 // create a new duplicate node 822 823 cachedDuplicate.UnsetUnchanged(transaction); 824 // the old duplicate has not been touched, so we can reuse it 825 826 bplustree_node *newDuplicate; 827 status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset); 828 if (status < B_OK) 829 RETURN_ERROR(status); 830 831 // copy the array from the fragment node to the duplicate node 832 // and free the old entry (by zero'ing all values) 833 newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(array->count); 834 memcpy(&newDuplicate->all_key_count, &array->values[0], 835 array->count * sizeof(off_t)); 836 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 837 838 array = newDuplicate->DuplicateArray(); 839 array->Insert(value); 840 } 841 842 // update the main pointer to link to a duplicate node 843 if (cached.MakeWritable(transaction) == NULL) 844 return B_IO_ERROR; 845 846 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 847 BPLUSTREE_DUPLICATE_NODE, offset)); 848 } 849 850 return B_OK; 851 } 852 853 // Put the value into a dedicated duplicate node 854 855 // search for free space in the duplicate nodes of that key 856 duplicate_array *array; 857 const bplustree_node *duplicate; 858 off_t duplicateOffset; 859 do { 860 duplicateOffset = bplustree_node::FragmentOffset(oldValue); 861 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 862 if (duplicate == NULL) 863 return B_IO_ERROR; 864 865 array = duplicate->DuplicateArray(); 866 if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) { 867 FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n", 868 duplicateOffset, array->count)); 869 return B_BAD_DATA; 870 } 871 } while (array->count >= NUM_DUPLICATE_VALUES 872 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); 873 874 bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction); 875 if (writableDuplicate == NULL) 876 return B_IO_ERROR; 877 878 if (array->count < NUM_DUPLICATE_VALUES) { 879 array = writableDuplicate->DuplicateArray(); 880 array->Insert(value); 881 } else { 882 // no space left - add a new duplicate node 883 884 bplustree_node *newDuplicate; 885 status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset); 886 if (status < B_OK) 887 RETURN_ERROR(status); 888 889 // link the two nodes together 890 writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); 891 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); 892 893 array = newDuplicate->DuplicateArray(); 894 array->count = 0; 895 array->Insert(value); 896 } 897 return B_OK; 898 } 899 900 // Search for a free duplicate fragment or create a new one 901 // to insert the duplicate value into 902 903 uint32 fragmentIndex = 0; 904 bplustree_node *fragment; 905 if (FindFreeDuplicateFragment(transaction, node, cachedDuplicate, 906 &offset, &fragment, &fragmentIndex) != B_OK) { 907 // allocate a new duplicate fragment node 908 if ((status = cachedDuplicate.Allocate(transaction, &fragment, &offset)) < B_OK) 909 RETURN_ERROR(status); 910 911 memset(fragment, 0, fNodeSize); 912 } 913 914 duplicate_array *array = fragment->FragmentAt(fragmentIndex); 915 array->Insert(oldValue); 916 array->Insert(value); 917 918 if (cached.MakeWritable(transaction) == NULL) 919 return B_IO_ERROR; 920 921 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 922 BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex)); 923 924 return B_OK; 925 } 926 927 928 void 929 BPlusTree::InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength, 930 off_t value) 931 { 932 // should never happen, but who knows? 933 if (index > node->NumKeys()) 934 return; 935 936 off_t *values = node->Values(); 937 uint16 *keyLengths = node->KeyLengths(); 938 uint8 *keys = node->Keys(); 939 940 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); 941 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength); 942 943 off_t *newValues = node->Values(); 944 uint16 *newKeyLengths = node->KeyLengths(); 945 946 // move values and copy new value into them 947 memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index)); 948 memmove(newValues, values, sizeof(off_t) * index); 949 950 newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value); 951 952 // move and update key length index 953 for (uint16 i = node->NumKeys(); i-- > index + 1;) 954 newKeyLengths[i] = keyLengths[i - 1] + keyLength; 955 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); 956 957 int32 keyStart; 958 newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0); 959 960 // move keys and copy new key into them 961 int32 size = node->AllKeyLength() - newKeyLengths[index]; 962 if (size > 0) 963 memmove(keys + newKeyLengths[index], keys + newKeyLengths[index] - keyLength, size); 964 965 memcpy(keys + keyStart, key, keyLength); 966 } 967 968 969 /** Splits the \a node into two halves - the other half will be put into \a other. 970 * It also takes care to create a new overflow link if the node to split is an 971 * index node. 972 */ 973 974 status_t 975 BPlusTree::SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other, 976 off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value) 977 { 978 if (*_keyIndex > node->NumKeys() + 1) 979 return B_BAD_VALUE; 980 981 uint16 *inKeyLengths = node->KeyLengths(); 982 off_t *inKeyValues = node->Values(); 983 uint8 *inKeys = node->Keys(); 984 uint8 *outKeys = other->Keys(); 985 int32 keyIndex = *_keyIndex; // can become less than zero! 986 987 // how many keys will fit in one (half) page? 988 // that loop will find the answer to this question and 989 // change the key lengths indices for their new home 990 991 // "bytes" is the number of bytes written for the new key, 992 // "bytesBefore" are the bytes before that key 993 // "bytesAfter" are the bytes after the new key, if any 994 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 995 996 size_t size = fNodeSize >> 1; 997 int32 out, in; 998 for (in = out = 0; in < node->NumKeys() + 1;) { 999 if (!bytes) 1000 bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0; 1001 1002 if (in == keyIndex && !bytes) { 1003 bytes = *_keyLength; 1004 } else { 1005 if (keyIndex < out) 1006 bytesAfter = inKeyLengths[in] - bytesBefore; 1007 1008 in++; 1009 } 1010 out++; 1011 1012 if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) 1013 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1014 // we have found the number of keys in the new node! 1015 break; 1016 } 1017 } 1018 1019 // if the new key was not inserted, set the length of the keys 1020 // that can be copied directly 1021 if (keyIndex >= out && in > 0) 1022 bytesBefore = inKeyLengths[in - 1]; 1023 1024 if (bytesBefore < 0 || bytesAfter < 0) 1025 return B_BAD_DATA; 1026 1027 other->left_link = node->left_link; 1028 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1029 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter); 1030 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1031 1032 uint16 *outKeyLengths = other->KeyLengths(); 1033 off_t *outKeyValues = other->Values(); 1034 int32 keys = out > keyIndex ? keyIndex : out; 1035 1036 if (bytesBefore) { 1037 // copy the keys 1038 memcpy(outKeys, inKeys, bytesBefore); 1039 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1040 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1041 } 1042 if (bytes) { 1043 // copy the newly inserted key 1044 memcpy(outKeys + bytesBefore, key, bytes); 1045 outKeyLengths[keyIndex] = bytes + bytesBefore; 1046 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1047 1048 if (bytesAfter) { 1049 // copy the keys after the new key 1050 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter); 1051 keys = out - keyIndex - 1; 1052 for (int32 i = 0;i < keys;i++) 1053 outKeyLengths[keyIndex + i + 1] = inKeyLengths[keyIndex + i] + bytes; 1054 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t)); 1055 } 1056 } 1057 1058 // if the new key was already inserted, we shouldn't use it again 1059 if (in != out) 1060 keyIndex--; 1061 1062 int32 total = bytesBefore + bytesAfter; 1063 1064 // these variables are for the key that will be returned 1065 // to the parent node 1066 uint8 *newKey = NULL; 1067 uint16 newLength; 1068 bool newAllocated = false; 1069 1070 // If we have split an index node, we have to drop the first key 1071 // of the next node (which can also be the new key to insert). 1072 // The dropped key is also the one which has to be inserted in 1073 // the parent node, so we will set the "newKey" already here. 1074 if (node->OverflowLink() != BPLUSTREE_NULL) { 1075 if (in == keyIndex) { 1076 newKey = key; 1077 newLength = *_keyLength; 1078 1079 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1080 keyIndex--; 1081 } else { 1082 // If a key is dropped (is not the new key), we have to copy 1083 // it, because it would be lost if not. 1084 uint8 *droppedKey = node->KeyAt(in, &newLength); 1085 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 1086 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1087 fStream->GetVolume()->Panic(); 1088 RETURN_ERROR(B_BAD_DATA); 1089 } 1090 newKey = (uint8 *)malloc(newLength); 1091 if (newKey == NULL) 1092 return B_NO_MEMORY; 1093 memcpy(newKey, droppedKey, newLength); 1094 1095 other->overflow_link = inKeyValues[in]; 1096 total = inKeyLengths[in++]; 1097 } 1098 } 1099 1100 // and now the same game for the other page and the rest of the keys 1101 // (but with memmove() instead of memcpy(), because they may overlap) 1102 1103 bytesBefore = bytesAfter = bytes = 0; 1104 out = 0; 1105 int32 skip = in; 1106 while (in < node->NumKeys() + 1) { 1107 if (in == keyIndex && !bytes) { 1108 // it's enough to set bytesBefore once here, because we do 1109 // not need to know the exact length of all keys in this 1110 // loop 1111 bytesBefore = in > skip ? inKeyLengths[in - 1] : 0; 1112 bytes = *_keyLength; 1113 } else { 1114 if (in < node->NumKeys()) { 1115 inKeyLengths[in] -= total; 1116 if (bytes) { 1117 inKeyLengths[in] += bytes; 1118 bytesAfter = inKeyLengths[in] - bytesBefore - bytes; 1119 } 1120 } 1121 in++; 1122 } 1123 1124 out++; 1125 1126 // break out when all keys are done 1127 if (in > node->NumKeys() && keyIndex < in) 1128 break; 1129 } 1130 1131 // adjust the byte counts (since we were a bit lazy in the loop) 1132 if (keyIndex >= in && keyIndex - skip < out) 1133 bytesAfter = inKeyLengths[in] - bytesBefore - total; 1134 else if (keyIndex < skip) 1135 bytesBefore = node->AllKeyLength() - total; 1136 1137 if (bytesBefore < 0 || bytesAfter < 0) 1138 return B_BAD_DATA; 1139 1140 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1141 // right link, and overflow link can stay the same 1142 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter); 1143 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1); 1144 1145 // array positions have changed 1146 outKeyLengths = node->KeyLengths(); 1147 outKeyValues = node->Values(); 1148 1149 // move the keys in the old node: the order is important here, 1150 // because we don't want to overwrite any contents 1151 1152 keys = keyIndex <= skip ? out : keyIndex - skip; 1153 keyIndex -= skip; 1154 1155 if (bytesBefore) 1156 memmove(inKeys, inKeys + total, bytesBefore); 1157 if (bytesAfter) 1158 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter); 1159 1160 if (bytesBefore) 1161 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1162 in = out - keyIndex - 1; 1163 if (bytesAfter) 1164 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16)); 1165 1166 if (bytesBefore) 1167 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1168 if (bytesAfter) 1169 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t)); 1170 1171 if (bytes) { 1172 // finally, copy the newly inserted key (don't overwrite anything) 1173 memcpy(inKeys + bytesBefore, key, bytes); 1174 outKeyLengths[keyIndex] = bytes + bytesBefore; 1175 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1176 } 1177 1178 // Prepare the key that will be inserted in the parent node which 1179 // is either the dropped key or the last of the other node. 1180 // If it's the dropped key, "newKey" was already set earlier. 1181 1182 if (newKey == NULL) 1183 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1184 1185 memcpy(key, newKey, newLength); 1186 *_keyLength = newLength; 1187 *_value = otherOffset; 1188 1189 if (newAllocated) 1190 free(newKey); 1191 1192 return B_OK; 1193 } 1194 1195 1196 /** This inserts a key into the tree. The changes made to the tree will 1197 * all be part of the \a transaction. 1198 */ 1199 1200 status_t 1201 BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value) 1202 { 1203 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1204 RETURN_ERROR(B_BAD_VALUE); 1205 #ifdef DEBUG 1206 if (value < 0) 1207 panic("tried to insert invalid value %Ld!\n", value); 1208 #endif 1209 1210 // lock access to stream 1211 WriteLocked locked(fStream->Lock()); 1212 1213 Stack<node_and_key> stack; 1214 if (SeekDown(stack, key, keyLength) != B_OK) 1215 RETURN_ERROR(B_ERROR); 1216 1217 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 1218 1219 memcpy(keyBuffer, key, keyLength); 1220 keyBuffer[keyLength] = 0; 1221 1222 node_and_key nodeAndKey; 1223 const bplustree_node *node; 1224 1225 CachedNode cached(this); 1226 while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1227 #ifdef DEBUG 1228 NodeChecker checker(node, fNodeSize, "insert"); 1229 #endif 1230 if (node->IsLeaf()) { 1231 // first round, check for duplicate entries 1232 status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex); 1233 1234 // is this a duplicate entry? 1235 if (status == B_OK) { 1236 if (fAllowDuplicates) { 1237 status = InsertDuplicate(transaction, cached, node, nodeAndKey.keyIndex, value); 1238 if (status != B_OK) 1239 RETURN_ERROR(status); 1240 return B_OK; 1241 } 1242 1243 RETURN_ERROR(B_NAME_IN_USE); 1244 } 1245 } 1246 1247 bplustree_node *writableNode = cached.MakeWritable(transaction); 1248 if (writableNode == NULL) 1249 return B_IO_ERROR; 1250 1251 // is the node big enough to hold the pair? 1252 if (int32(round_up(sizeof(bplustree_node) + writableNode->AllKeyLength() + keyLength) 1253 + (writableNode->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize) 1254 { 1255 InsertKey(writableNode, nodeAndKey.keyIndex, keyBuffer, keyLength, value); 1256 UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1); 1257 1258 return B_OK; 1259 } else { 1260 CachedNode cachedNewRoot(this); 1261 CachedNode cachedOther(this); 1262 1263 // do we need to allocate a new root node? if so, then do 1264 // it now 1265 off_t newRoot = BPLUSTREE_NULL; 1266 if (nodeAndKey.nodeOffset == fHeader->RootNode()) { 1267 bplustree_node *root; 1268 status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot); 1269 if (status < B_OK) { 1270 // The tree is most likely corrupted! 1271 // But it's still sane at leaf level - we could set 1272 // a flag in the header that forces the tree to be 1273 // rebuild next time... 1274 // But since we will have journaling, that's not a big 1275 // problem anyway. 1276 RETURN_ERROR(status); 1277 } 1278 } 1279 1280 // reserve space for the other node 1281 bplustree_node *other; 1282 off_t otherOffset; 1283 status_t status = cachedOther.Allocate(transaction, &other, &otherOffset); 1284 if (status < B_OK) { 1285 cachedNewRoot.Free(transaction, newRoot); 1286 RETURN_ERROR(status); 1287 } 1288 1289 if (SplitNode(writableNode, nodeAndKey.nodeOffset, other, otherOffset, 1290 &nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) < B_OK) { 1291 // free root node & other node here 1292 cachedOther.Free(transaction, otherOffset); 1293 cachedNewRoot.Free(transaction, newRoot); 1294 1295 RETURN_ERROR(B_ERROR); 1296 } 1297 #ifdef DEBUG 1298 checker.Check("insert split"); 1299 NodeChecker otherChecker(other, fNodeSize, "insert split other"); 1300 #endif 1301 1302 UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex, 1303 writableNode->NumKeys(), 1); 1304 1305 // update the right link of the node in the left of the new node 1306 if ((other = cachedOther.SetToWritable(transaction, other->LeftLink())) != NULL) { 1307 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1308 } 1309 1310 // create a new root if necessary 1311 if (newRoot != BPLUSTREE_NULL) { 1312 bplustree_node *root = cachedNewRoot.Node(); 1313 1314 InsertKey(root, 0, keyBuffer, keyLength, writableNode->LeftLink()); 1315 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(nodeAndKey.nodeOffset); 1316 1317 bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction); 1318 if (header == NULL) 1319 return B_IO_ERROR; 1320 1321 // finally, update header to point to the new root 1322 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1323 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(header->MaxNumberOfLevels() + 1); 1324 1325 return B_OK; 1326 } 1327 } 1328 } 1329 RETURN_ERROR(B_ERROR); 1330 } 1331 1332 1333 /** Removes the duplicate index/value pair from the tree. 1334 * It's part of the private tree interface. 1335 */ 1336 1337 status_t 1338 BPlusTree::RemoveDuplicate(Transaction &transaction, const bplustree_node *node, 1339 CachedNode &cached, uint16 index, off_t value) 1340 { 1341 off_t *values = node->Values(); 1342 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1343 1344 CachedNode cachedDuplicate(this); 1345 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1346 bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction, 1347 duplicateOffset, false); 1348 if (duplicate == NULL) 1349 return B_IO_ERROR; 1350 1351 // if it's a duplicate fragment, remove the entry from there 1352 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1353 duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue)); 1354 1355 if (array->count > NUM_FRAGMENT_VALUES 1356 || array->count < 1) { 1357 FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n", 1358 bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count)); 1359 return B_BAD_DATA; 1360 } 1361 if (!array->Remove(value)) { 1362 FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n", 1363 value, duplicateOffset)); 1364 return B_ENTRY_NOT_FOUND; 1365 } 1366 1367 // remove the array from the fragment node if it is empty 1368 if (array->count == 1) { 1369 // set the link to the remaining value 1370 if (cached.MakeWritable(transaction) == NULL) 1371 return B_IO_ERROR; 1372 1373 values[index] = array->values[0]; 1374 1375 // Remove the whole fragment node, if this was the only array, 1376 // otherwise free just the array 1377 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1378 status_t status = cachedDuplicate.Free(transaction, duplicateOffset); 1379 if (status < B_OK) 1380 return status; 1381 } else 1382 array->count = 0; 1383 } 1384 return B_OK; 1385 } 1386 1387 // Remove value from a duplicate node! 1388 1389 duplicate_array *array = NULL; 1390 1391 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1392 FATAL(("invalid duplicate node: first left link points to %Ld!\n", 1393 duplicate->LeftLink())); 1394 return B_BAD_DATA; 1395 } 1396 1397 // Search the duplicate nodes until the entry could be found (and removed) 1398 while (duplicate != NULL) { 1399 array = duplicate->DuplicateArray(); 1400 if (array->count > NUM_DUPLICATE_VALUES 1401 || array->count < 0) { 1402 FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n", 1403 duplicateOffset, array->count)); 1404 return B_BAD_DATA; 1405 } 1406 1407 if (array->Remove(value)) 1408 break; 1409 1410 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1411 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1412 1413 cachedDuplicate.UnsetUnchanged(transaction); 1414 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false); 1415 } 1416 if (duplicate == NULL) 1417 RETURN_ERROR(B_IO_ERROR); 1418 1419 // The entry got removed from the duplicate node, but we might want to free 1420 // it now in case it's empty 1421 1422 while (true) { 1423 off_t left = duplicate->LeftLink(); 1424 off_t right = duplicate->RightLink(); 1425 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1426 1427 if ((isLast && array->count == 1) || array->count == 0) { 1428 // Free empty duplicate page, link their siblings together, and 1429 // update the duplicate link if needed (ie. when we either remove 1430 // the last duplicate node or have a new first one) 1431 1432 if (left == BPLUSTREE_NULL) { 1433 // the duplicate link points to us 1434 if (cached.MakeWritable(transaction) == NULL) 1435 return B_IO_ERROR; 1436 1437 if (array->count == 1) { 1438 // this is the last node, and there is only one value left; 1439 // replace the duplicate link with that value, it's no duplicate 1440 // anymore 1441 values[index] = array->values[0]; 1442 } else { 1443 // move the duplicate link to the next node 1444 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1445 BPLUSTREE_DUPLICATE_NODE, right)); 1446 } 1447 } 1448 1449 status_t status; 1450 if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK) 1451 return status; 1452 1453 if (left != BPLUSTREE_NULL 1454 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) { 1455 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1456 1457 // If the next node is the last node, we need to free that node 1458 // and convert the duplicate entry back into a normal entry 1459 array = duplicate->DuplicateArray(); 1460 if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL 1461 && array->count <= NUM_FRAGMENT_VALUES) { 1462 duplicateOffset = left; 1463 continue; 1464 } 1465 } 1466 if (right != BPLUSTREE_NULL 1467 && (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) { 1468 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1469 1470 // Again, we may need to turn the duplicate entry back into a normal entry 1471 array = duplicate->DuplicateArray(); 1472 if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL 1473 && array->count <= NUM_FRAGMENT_VALUES) { 1474 duplicateOffset = right; 1475 continue; 1476 } 1477 } 1478 return B_OK; 1479 } else if (isLast && array->count <= NUM_FRAGMENT_VALUES) { 1480 // If the number of entries fits in a duplicate fragment, then 1481 // either find a free fragment node, or convert this node to a 1482 // fragment node. 1483 CachedNode cachedOther(this); 1484 1485 bplustree_node *fragment = NULL; 1486 uint32 fragmentIndex = 0; 1487 off_t offset; 1488 if (FindFreeDuplicateFragment(transaction, node, cachedOther, 1489 &offset, &fragment, &fragmentIndex) == B_OK) { 1490 // move to other node 1491 duplicate_array *target = fragment->FragmentAt(fragmentIndex); 1492 memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1493 1494 cachedDuplicate.Free(transaction, duplicateOffset); 1495 duplicateOffset = offset; 1496 } else { 1497 // convert node 1498 memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1499 memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 1500 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1501 } 1502 1503 if (cached.MakeWritable(transaction) == NULL) 1504 return B_IO_ERROR; 1505 1506 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1507 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 1508 } 1509 return B_OK; 1510 } 1511 } 1512 1513 1514 /** Removes the key with the given index from the specified node. 1515 * Since it has to get the key from the node anyway (to obtain it's 1516 * pointer), it's not needed to pass the key & its length, although 1517 * the calling method (BPlusTree::Remove()) have this data. 1518 */ 1519 1520 void 1521 BPlusTree::RemoveKey(bplustree_node *node, uint16 index) 1522 { 1523 // should never happen, but who knows? 1524 if (index > node->NumKeys() && node->NumKeys() > 0) { 1525 FATAL(("Asked me to remove key outer limits: %u\n", index)); 1526 return; 1527 } 1528 1529 off_t *values = node->Values(); 1530 1531 // if we would have to drop the overflow link, drop 1532 // the last key instead and update the overflow link 1533 // to the value of that one 1534 if (!node->IsLeaf() && index == node->NumKeys()) 1535 node->overflow_link = values[--index]; 1536 1537 uint16 length; 1538 uint8 *key = node->KeyAt(index, &length); 1539 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 1540 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1541 FATAL(("Key length to long: %s, %u (inode at %ld,%u)\n", key, length, 1542 fStream->BlockRun().allocation_group, fStream->BlockRun().start)); 1543 fStream->GetVolume()->Panic(); 1544 return; 1545 } 1546 1547 uint16 *keyLengths = node->KeyLengths(); 1548 uint8 *keys = node->Keys(); 1549 1550 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 1551 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length); 1552 1553 off_t *newValues = node->Values(); 1554 uint16 *newKeyLengths = node->KeyLengths(); 1555 1556 // move key data 1557 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 1558 1559 // move and update key lengths 1560 if (index > 0 && newKeyLengths != keyLengths) 1561 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 1562 for (uint16 i = index; i < node->NumKeys(); i++) 1563 newKeyLengths[i] = keyLengths[i + 1] - length; 1564 1565 // move values 1566 if (index > 0) 1567 memmove(newValues, values, index * sizeof(off_t)); 1568 if (node->NumKeys() > index) 1569 memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t)); 1570 } 1571 1572 1573 /** Removes the specified key from the tree. The "value" parameter is only used 1574 * for trees which allow duplicates, so you may safely ignore it. 1575 * It's not an optional parameter, so at least you have to think about it. 1576 */ 1577 1578 status_t 1579 BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value) 1580 { 1581 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1582 RETURN_ERROR(B_BAD_VALUE); 1583 1584 // lock access to stream 1585 WriteLocked locked(fStream->Lock()); 1586 1587 Stack<node_and_key> stack; 1588 if (SeekDown(stack, key, keyLength) != B_OK) 1589 RETURN_ERROR(B_ERROR); 1590 1591 node_and_key nodeAndKey; 1592 const bplustree_node *node; 1593 1594 CachedNode cached(this); 1595 while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1596 #ifdef DEBUG 1597 NodeChecker checker(node, fNodeSize, "remove"); 1598 #endif 1599 if (node->IsLeaf()) { 1600 // first round, check for duplicate entries 1601 status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex); 1602 if (status < B_OK) 1603 RETURN_ERROR(status); 1604 1605 // If we will remove the last key, the iterator will be set 1606 // to the next node after the current - if there aren't any 1607 // more nodes, we need a way to prevent the TreeIterators to 1608 // touch the old node again, we use BPLUSTREE_FREE for this 1609 off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink(); 1610 UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ? 1611 next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 1612 1613 // is this a duplicate entry? 1614 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 1615 node->Values()[nodeAndKey.keyIndex]))) { 1616 if (fAllowDuplicates) { 1617 return RemoveDuplicate(transaction, node, cached, 1618 nodeAndKey.keyIndex, value); 1619 } else { 1620 FATAL(("dupliate node found where no duplicates are allowed!\n")); 1621 RETURN_ERROR(B_ERROR); 1622 } 1623 } 1624 } 1625 1626 bplustree_node *writableNode = cached.MakeWritable(transaction); 1627 if (writableNode == NULL) 1628 return B_IO_ERROR; 1629 1630 // if it's an empty root node, we have to convert it 1631 // to a leaf node by dropping the overflow link, or, 1632 // if it's a leaf node, just empty it 1633 if (nodeAndKey.nodeOffset == fHeader->RootNode() 1634 && node->NumKeys() == 0 1635 || node->NumKeys() == 1 && node->IsLeaf()) { 1636 writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1637 writableNode->all_key_count = 0; 1638 writableNode->all_key_length = 0; 1639 1640 // if we've cleared the root node, reset the maximum 1641 // number of levels in the header 1642 if (nodeAndKey.nodeOffset == fHeader->RootNode()) { 1643 bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction); 1644 if (header == NULL) 1645 return B_IO_ERROR; 1646 1647 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 1648 } 1649 return B_OK; 1650 } 1651 1652 // if there is only one key left, we don't have to remove 1653 // it, we can just dump the node (index nodes still have 1654 // the overflow link, so we have to drop the last key) 1655 if (writableNode->NumKeys() > 1 1656 || !writableNode->IsLeaf() && writableNode->NumKeys() == 1) { 1657 RemoveKey(writableNode, nodeAndKey.keyIndex); 1658 return B_OK; 1659 } 1660 1661 // when we are here, we can just free the node, but 1662 // we have to update the right/left link of the 1663 // siblings first 1664 CachedNode otherCached(this); 1665 bplustree_node *other = otherCached.SetToWritable(transaction, 1666 writableNode->LeftLink()); 1667 if (other != NULL) 1668 other->right_link = writableNode->right_link; 1669 1670 if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL) 1671 other->left_link = writableNode->left_link; 1672 1673 cached.Free(transaction, nodeAndKey.nodeOffset); 1674 } 1675 RETURN_ERROR(B_ERROR); 1676 } 1677 1678 1679 /** Replaces the value for the key in the tree. 1680 * Returns B_OK if the key could be found and its value replaced, 1681 * B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 1682 * to indicate that something went terribly wrong. 1683 * Note that this doesn't work with duplicates - it will just 1684 * return B_BAD_TYPE if you call this function on a tree where 1685 * duplicates are allowed. 1686 */ 1687 1688 status_t 1689 BPlusTree::Replace(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value) 1690 { 1691 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1692 || key == NULL) 1693 RETURN_ERROR(B_BAD_VALUE); 1694 1695 if (fAllowDuplicates) 1696 RETURN_ERROR(B_BAD_TYPE); 1697 1698 // lock access to stream (a read lock is okay for this purpose) 1699 ReadLocked locked(fStream->Lock()); 1700 1701 off_t nodeOffset = fHeader->RootNode(); 1702 CachedNode cached(this); 1703 const bplustree_node *node; 1704 1705 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1706 uint16 keyIndex = 0; 1707 off_t nextOffset; 1708 status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1709 1710 if (node->OverflowLink() == BPLUSTREE_NULL) { 1711 if (status == B_OK) { 1712 bplustree_node *writableNode = cached.MakeWritable(transaction); 1713 if (writableNode != NULL) 1714 writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value); 1715 else 1716 status = B_IO_ERROR; 1717 } 1718 1719 return status; 1720 } else if (nextOffset == nodeOffset) 1721 RETURN_ERROR(B_ERROR); 1722 1723 nodeOffset = nextOffset; 1724 } 1725 RETURN_ERROR(B_ERROR); 1726 } 1727 1728 1729 /** Searches the key in the tree, and stores the offset found in 1730 * _value, if successful. 1731 * It's very similar to BPlusTree::SeekDown(), but doesn't fill 1732 * a stack while it descends the tree. 1733 * Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND 1734 * if not. It can also return other errors to indicate that 1735 * something went wrong. 1736 * Note that this doesn't work with duplicates - it will just 1737 * return B_BAD_TYPE if you call this function on a tree where 1738 * duplicates are allowed. 1739 */ 1740 1741 status_t 1742 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value) 1743 { 1744 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1745 || key == NULL) 1746 RETURN_ERROR(B_BAD_VALUE); 1747 1748 if (fAllowDuplicates) 1749 RETURN_ERROR(B_BAD_TYPE); 1750 1751 // lock access to stream 1752 ReadLocked locked(fStream->Lock()); 1753 1754 off_t nodeOffset = fHeader->RootNode(); 1755 CachedNode cached(this); 1756 const bplustree_node *node; 1757 1758 #ifdef DEBUG 1759 int32 levels = 0; 1760 #endif 1761 1762 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1763 uint16 keyIndex = 0; 1764 off_t nextOffset; 1765 status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1766 1767 #ifdef DEBUG 1768 levels++; 1769 #endif 1770 if (node->OverflowLink() == BPLUSTREE_NULL) { 1771 if (status == B_OK && _value != NULL) 1772 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 1773 1774 #ifdef DEBUG 1775 if (levels != (int32)fHeader->MaxNumberOfLevels()) 1776 DEBUGGER(("levels don't match")); 1777 #endif 1778 return status; 1779 } else if (nextOffset == nodeOffset) 1780 RETURN_ERROR(B_ERROR); 1781 1782 nodeOffset = nextOffset; 1783 } 1784 FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset)); 1785 RETURN_ERROR(B_ERROR); 1786 } 1787 1788 1789 // #pragma mark - 1790 1791 1792 TreeIterator::TreeIterator(BPlusTree *tree) 1793 : 1794 fTree(tree), 1795 fCurrentNodeOffset(BPLUSTREE_NULL), 1796 fNext(NULL) 1797 { 1798 tree->AddIterator(this); 1799 } 1800 1801 1802 TreeIterator::~TreeIterator() 1803 { 1804 if (fTree) 1805 fTree->RemoveIterator(this); 1806 } 1807 1808 1809 status_t 1810 TreeIterator::Goto(int8 to) 1811 { 1812 if (fTree == NULL || fTree->fHeader == NULL) 1813 RETURN_ERROR(B_BAD_VALUE); 1814 1815 // lock access to stream 1816 ReadLocked locked(fTree->fStream->Lock()); 1817 1818 off_t nodeOffset = fTree->fHeader->RootNode(); 1819 CachedNode cached(fTree); 1820 const bplustree_node *node; 1821 1822 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1823 // is the node a leaf node? 1824 if (node->OverflowLink() == BPLUSTREE_NULL) { 1825 fCurrentNodeOffset = nodeOffset; 1826 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 1827 fDuplicateNode = BPLUSTREE_NULL; 1828 1829 return B_OK; 1830 } 1831 1832 // get the next node offset depending on the direction (and if there 1833 // are any keys in that node at all) 1834 off_t nextOffset; 1835 if (to == BPLUSTREE_END || node->all_key_count == 0) 1836 nextOffset = node->OverflowLink(); 1837 else { 1838 if (node->AllKeyLength() > fTree->fNodeSize 1839 || (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->NumKeys()) 1840 RETURN_ERROR(B_ERROR); 1841 1842 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 1843 } 1844 if (nextOffset == nodeOffset) 1845 break; 1846 1847 nodeOffset = nextOffset; 1848 } 1849 FATAL(("%s fails\n", __FUNCTION__)); 1850 1851 RETURN_ERROR(B_ERROR); 1852 } 1853 1854 1855 /** Iterates through the tree in the specified direction. 1856 * When it iterates through duplicates, the "key" is only updated for the 1857 * first entry - if you need to know when this happens, use the "duplicate" 1858 * parameter which is 0 for no duplicate, 1 for the first, and 2 for all 1859 * the other duplicates. 1860 * That's not too nice, but saves the 256 bytes that would be needed to 1861 * store the last key - if this will ever become an issue, it will be 1862 * easy to change. 1863 * The other advantage of this is, that the queries can skip all duplicates 1864 * at once when they are not relevant to them. 1865 */ 1866 1867 status_t 1868 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength, 1869 off_t *value, uint16 *duplicate) 1870 { 1871 if (fTree == NULL) 1872 return B_INTERRUPTED; 1873 if (fCurrentNodeOffset == BPLUSTREE_NULL 1874 && Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 1875 RETURN_ERROR(B_ERROR); 1876 1877 // if the tree was emptied since the last call 1878 if (fCurrentNodeOffset == BPLUSTREE_FREE) 1879 return B_ENTRY_NOT_FOUND; 1880 1881 // lock access to stream 1882 ReadLocked locked(fTree->fStream->Lock()); 1883 1884 CachedNode cached(fTree); 1885 const bplustree_node *node; 1886 1887 if (fDuplicateNode != BPLUSTREE_NULL) { 1888 // regardless of traverse direction the duplicates are always presented in 1889 // the same order; since they are all considered as equal, this shouldn't 1890 // cause any problems 1891 1892 if (!fIsFragment || fDuplicate < fNumDuplicates) 1893 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 1894 else 1895 node = NULL; 1896 1897 if (node != NULL) { 1898 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 1899 // if the node is out of duplicates, we go directly to the next one 1900 fDuplicateNode = node->RightLink(); 1901 if (fDuplicateNode != BPLUSTREE_NULL 1902 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 1903 fNumDuplicates = node->CountDuplicates(fDuplicateNode, false); 1904 fDuplicate = 0; 1905 } 1906 } 1907 if (fDuplicate < fNumDuplicates) { 1908 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++); 1909 if (duplicate) 1910 *duplicate = 2; 1911 return B_OK; 1912 } 1913 } 1914 fDuplicateNode = BPLUSTREE_NULL; 1915 } 1916 1917 off_t savedNodeOffset = fCurrentNodeOffset; 1918 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 1919 RETURN_ERROR(B_ERROR); 1920 1921 if (duplicate) 1922 *duplicate = 0; 1923 1924 fCurrentKey += direction; 1925 1926 // is the current key in the current node? 1927 while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->NumKeys()) 1928 || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) { 1929 fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->RightLink() : node->LeftLink(); 1930 1931 // are there any more nodes? 1932 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 1933 node = cached.SetTo(fCurrentNodeOffset); 1934 if (!node) 1935 RETURN_ERROR(B_ERROR); 1936 1937 // reset current key 1938 fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->NumKeys(); 1939 } else { 1940 // there are no nodes left, so turn back to the last key 1941 fCurrentNodeOffset = savedNodeOffset; 1942 fCurrentKey = direction == BPLUSTREE_FORWARD ? node->NumKeys() : -1; 1943 1944 return B_ENTRY_NOT_FOUND; 1945 } 1946 } 1947 1948 if (node->all_key_count == 0) 1949 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 1950 1951 uint16 length; 1952 uint8 *keyStart = node->KeyAt(fCurrentKey, &length); 1953 if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize 1954 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1955 fTree->fStream->GetVolume()->Panic(); 1956 RETURN_ERROR(B_BAD_DATA); 1957 } 1958 1959 length = min_c(length, maxLength); 1960 memcpy(key, keyStart, length); 1961 1962 if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) { 1963 // terminate string type 1964 if (length == maxLength) 1965 length--; 1966 ((char *)key)[length] = '\0'; 1967 } 1968 *keyLength = length; 1969 1970 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 1971 1972 // duplicate fragments? 1973 uint8 type = bplustree_node::LinkType(offset); 1974 if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) { 1975 fDuplicateNode = offset; 1976 1977 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 1978 if (node == NULL) 1979 RETURN_ERROR(B_ERROR); 1980 1981 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 1982 1983 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 1984 if (fNumDuplicates) { 1985 offset = node->DuplicateAt(offset, fIsFragment, 0); 1986 fDuplicate = 1; 1987 if (duplicate) 1988 *duplicate = 1; 1989 } else { 1990 // shouldn't happen, but we're dealing here with potentially corrupt disks... 1991 fDuplicateNode = BPLUSTREE_NULL; 1992 offset = 0; 1993 } 1994 } 1995 *value = offset; 1996 1997 return B_OK; 1998 } 1999 2000 2001 /** This is more or less a copy of BPlusTree::Find() - but it just 2002 * sets the current position in the iterator, regardless of if the 2003 * key could be found or not. 2004 */ 2005 2006 status_t 2007 TreeIterator::Find(const uint8 *key, uint16 keyLength) 2008 { 2009 if (fTree == NULL) 2010 return B_INTERRUPTED; 2011 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 2012 || key == NULL) 2013 RETURN_ERROR(B_BAD_VALUE); 2014 2015 // lock access to stream 2016 ReadLocked locked(fTree->fStream->Lock()); 2017 2018 off_t nodeOffset = fTree->fHeader->RootNode(); 2019 2020 CachedNode cached(fTree); 2021 const bplustree_node *node; 2022 while ((node = cached.SetTo(nodeOffset)) != NULL) { 2023 uint16 keyIndex = 0; 2024 off_t nextOffset; 2025 status_t status = fTree->FindKey(node, key, keyLength, &keyIndex, &nextOffset); 2026 2027 if (node->OverflowLink() == BPLUSTREE_NULL) { 2028 fCurrentNodeOffset = nodeOffset; 2029 fCurrentKey = keyIndex - 1; 2030 fDuplicateNode = BPLUSTREE_NULL; 2031 2032 return status; 2033 } else if (nextOffset == nodeOffset) 2034 RETURN_ERROR(B_ERROR); 2035 2036 nodeOffset = nextOffset; 2037 } 2038 RETURN_ERROR(B_ERROR); 2039 } 2040 2041 2042 void 2043 TreeIterator::SkipDuplicates() 2044 { 2045 fDuplicateNode = BPLUSTREE_NULL; 2046 } 2047 2048 2049 void 2050 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change) 2051 { 2052 if (offset != fCurrentNodeOffset) 2053 return; 2054 2055 if (nextOffset != BPLUSTREE_NULL) { 2056 fCurrentNodeOffset = nextOffset; 2057 if (splitAt <= fCurrentKey) { 2058 fCurrentKey -= splitAt; 2059 keyIndex -= splitAt; 2060 } 2061 } 2062 2063 // Adjust fCurrentKey to point to the same key as before. 2064 // Note, that if a key is inserted at the current position 2065 // it won't be included in this tree transition. 2066 if (keyIndex <= fCurrentKey) 2067 fCurrentKey += change; 2068 2069 // ToDo: duplicate handling! 2070 } 2071 2072 2073 void 2074 TreeIterator::Stop() 2075 { 2076 fTree = NULL; 2077 } 2078 2079 2080 #ifdef DEBUG 2081 void 2082 TreeIterator::Dump() 2083 { 2084 __out("TreeIterator at %p:\n", this); 2085 __out("\tfTree = %p\n", fTree); 2086 __out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset); 2087 __out("\tfCurrentKey = %ld\n", fCurrentKey); 2088 __out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode); 2089 __out("\tfDuplicate = %u\n", fDuplicate); 2090 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 2091 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); 2092 } 2093 #endif 2094 2095 2096 // #pragma mark - 2097 2098 2099 void 2100 bplustree_node::Initialize() 2101 { 2102 left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 2103 all_key_count = 0; 2104 all_key_length = 0; 2105 } 2106 2107 2108 uint8 * 2109 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const 2110 { 2111 if (index < 0 || index > NumKeys()) 2112 return NULL; 2113 2114 uint8 *keyStart = Keys(); 2115 uint16 *keyLengths = KeyLengths(); 2116 2117 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 2118 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 2119 if (index > 0) 2120 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 2121 2122 return keyStart; 2123 } 2124 2125 2126 uint8 2127 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 2128 { 2129 // the duplicate fragment handling is currently hard-coded to a node size 2130 // of 1024 bytes - with future versions of BFS, this may be a problem 2131 2132 if (isFragment) { 2133 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 2134 2135 return ((off_t *)this)[fragment]; 2136 } 2137 return OverflowLink(); 2138 } 2139 2140 2141 off_t 2142 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 2143 { 2144 uint32 start; 2145 if (isFragment) 2146 start = 8 * ((uint64)offset & 0x3ff); 2147 else 2148 start = 2; 2149 2150 return ((off_t *)this)[start + 1 + index]; 2151 } 2152 2153 2154 /** Although the name suggests it, this function doesn't return the real 2155 * used fragment count; at least, it can only count to two: it returns 2156 * 0, if there is no fragment used, 1 if there is only one fragment 2157 * used, and 2 if there are at least 2 fragments used. 2158 */ 2159 2160 uint32 2161 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2162 { 2163 uint32 used = 0; 2164 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2165 duplicate_array *array = FragmentAt(i); 2166 if (array->count > 0 && ++used > 1) 2167 return used; 2168 } 2169 return used; 2170 } 2171 2172 2173 #ifdef DEBUG 2174 status_t 2175 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2176 { 2177 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2178 DEBUGGER(("invalid node: key/length count")); 2179 2180 for (int32 i = 0; i < NumKeys(); i++) { 2181 uint16 length; 2182 uint8 *key = KeyAt(i, &length); 2183 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize 2184 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2185 dprintf("node %p, key %ld\n", this, i); 2186 DEBUGGER(("invalid node: keys corrupted")); 2187 return B_BAD_DATA; 2188 } 2189 if (Values()[i] == -1) { 2190 dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]); 2191 DEBUGGER(("invalid node: values corrupted")); 2192 return B_BAD_DATA; 2193 } 2194 } 2195 return B_OK; 2196 } 2197 #endif 2198 2199 2200 // #pragma mark - 2201 2202 2203 int32 2204 compareKeys(type_code type, const void *key1, int keyLength1, const void *key2, int keyLength2) 2205 { 2206 // if one of the keys is NULL, bail out gracefully 2207 if (key1 == NULL || key2 == NULL) { 2208 //#if USER 2209 // // that's here to see if it's reasonable to handle this case nicely at all 2210 // DEBUGGER(("compareKeys() got NULL key!")); 2211 //#endif 2212 // even if it's most probably a bug in the calling code, we try to 2213 // give a meaningful result 2214 if (key1 == NULL && key2 != NULL) 2215 return 1; 2216 else if (key2 == NULL && key1 != NULL) 2217 return -1; 2218 2219 return 0; 2220 } 2221 2222 switch (type) { 2223 case B_STRING_TYPE: 2224 { 2225 int len = min_c(keyLength1, keyLength2); 2226 int result = strncmp((const char *)key1, (const char *)key2, len); 2227 2228 if (result == 0 2229 && !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0')) 2230 result = keyLength1 - keyLength2; 2231 2232 return result; 2233 } 2234 2235 case B_SSIZE_T_TYPE: 2236 case B_INT32_TYPE: 2237 return *(int32 *)key1 - *(int32 *)key2; 2238 2239 case B_SIZE_T_TYPE: 2240 case B_UINT32_TYPE: 2241 if (*(uint32 *)key1 == *(uint32 *)key2) 2242 return 0; 2243 else if (*(uint32 *)key1 > *(uint32 *)key2) 2244 return 1; 2245 2246 return -1; 2247 2248 case B_OFF_T_TYPE: 2249 case B_INT64_TYPE: 2250 if (*(int64 *)key1 == *(int64 *)key2) 2251 return 0; 2252 else if (*(int64 *)key1 > *(int64 *)key2) 2253 return 1; 2254 2255 return -1; 2256 2257 case B_UINT64_TYPE: 2258 if (*(uint64 *)key1 == *(uint64 *)key2) 2259 return 0; 2260 else if (*(uint64 *)key1 > *(uint64 *)key2) 2261 return 1; 2262 2263 return -1; 2264 2265 case B_FLOAT_TYPE: 2266 { 2267 float result = *(float *)key1 - *(float *)key2; 2268 if (result == 0.0f) 2269 return 0; 2270 2271 return (result < 0.0f) ? -1 : 1; 2272 } 2273 2274 case B_DOUBLE_TYPE: 2275 { 2276 double result = *(double *)key1 - *(double *)key2; 2277 if (result == 0.0) 2278 return 0; 2279 2280 return (result < 0.0) ? -1 : 1; 2281 } 2282 } 2283 2284 // if the type is unknown, the entries don't match... 2285 return -1; 2286 } 2287 2288