/* BPlusTree - BFS B+Tree implementation
 *
 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
 * no code but achieves binary compatibility with the on disk format.
 *
 * Copyright 2001-2006, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */


#include "Debug.h"
#include "BPlusTree.h"
#include "Inode.h"
#include "Utility.h"

#include <util/kernel_cpp.h>
#include <util/Stack.h>
#include <TypeConstants.h>

#include <string.h>
#include <stdlib.h>
#include <stdio.h>


#ifdef DEBUG
/** Debug-only RAII helper: runs a node integrity check when it is
 *	constructed and again when it goes out of scope, so that a corruption
 *	introduced inside the guarded scope is reported close to its origin.
 *	It only reports via dprintf(); it never aborts the operation.
 */
class NodeChecker {
	public:
		NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text)
			:
			fNode(node),
			fSize(nodeSize),
			fText(text)
		{
			Check("integrity check failed on construction.");
		}

		~NodeChecker()
		{
			Check("integrity check failed on destruction.");
		}

		void
		Check(const char *message)
		{
			// a negative result from CheckIntegrity() means the node is
			// inconsistent - log it together with the caller-supplied tag
			if (fNode->CheckIntegrity(fSize) < B_OK)
				dprintf("%s: %s\n", fText, message);
		}

	private:
		const bplustree_node	*fNode;
		int32					fSize;
		const char				*fText;
};
#endif


// Node Caching for the BPlusTree class
//
// With write support, there is the need for a function that allocates new
// nodes by either returning empty nodes, or by growing the file's data stream
//
// !! The CachedNode class assumes that you have properly locked the stream
// !! before asking for nodes.
//
// Note: This code will fail if the block size is smaller than the node size!
// Since BFS supports block sizes of 1024 bytes or greater, and the node size
// is hard-coded to 1024 bytes, that's not an issue now.
/** Unsets the cached node and reverts any changes made to it in the
 *	given transaction: the block's dirty flag is cleared before the
 *	block cache reference is released, so nothing is written back.
 */
void
CachedNode::UnsetUnchanged(Transaction &transaction)
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		void *cache = fTree->fStream->GetVolume()->BlockCache();

		// mark the block clean in this transaction, then drop our reference
		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
		block_cache_put(cache, fBlockNumber);
		fNode = NULL;
	}
}


/** Releases the block cache reference held for the currently cached
 *	node, if any. Safe to call when nothing is cached.
 */
void
CachedNode::Unset()
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber);
		fNode = NULL;
	}
}


/** Sets the cache to the read-only node at the given byte offset in the
 *	tree stream and returns it, or NULL if the offset is out of range,
 *	not node-aligned, or (with \a check set) the node fails the header's
 *	sanity check. Any previously cached node is released first.
 */
const bplustree_node *
CachedNode::SetTo(off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(NULL, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node (%p) read from offset %Ld, inode at %Ld\n",
				fNode, offset, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


/** Writable counterpart of SetTo(): caches the node at \a offset via the
 *	block cache's writable path so changes become part of \a transaction.
 *	Same validation rules as SetTo().
 */
bplustree_node *
CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(&transaction, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader->CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %Ld, inode at %Ld\n",
				fNode, offset, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


/** Upgrades the currently cached (read-only) node to a writable one in
 *	the given transaction. Returns the node on success, NULL if nothing
 *	is cached or the block cache refuses the upgrade.
 */
bplustree_node *
CachedNode::MakeWritable(Transaction &transaction)
{
	if (fNode == NULL)
		return NULL;

	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
			fBlockNumber, transaction.ID()) == B_OK)
		return fNode;

	return NULL;
}


/** Convenience wrapper around MakeWritable() for the tree header, which
 *	lives at offset 0 and shares the node layout.
 */
bplustree_header *
CachedNode::MakeWritableHeader(Transaction &transaction)
{
	return (bplustree_header *)MakeWritable(transaction);
}


/** Sets the cache to the read-only B+tree header (offset 0) and returns
 *	it, or NULL on failure. The header cannot be fetched via SetTo().
 */
const bplustree_header *
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(NULL, 0LL);
	return (bplustree_header *)fNode;
}


/** Writable counterpart of SetToHeader(): fetches the header at offset 0
 *	through the block cache's writable path for the given transaction.
 */
bplustree_header *
CachedNode::SetToWritableHeader(Transaction &transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(&transaction, 0LL);
	return (bplustree_header *)fNode;
}


/** Maps the tree-stream byte \a offset to its on-disk block and gets that
 *	block from the block cache - writable if \a transaction is non-NULL,
 *	read-only otherwise. On success fNode points at the node inside the
 *	block and fWritable reflects how it was fetched; returns NULL on
 *	failure. No validation of the node contents is done here.
 */
bplustree_node *
CachedNode::InternalSetTo(Transaction *transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume *volume = fTree->fStream->GetVolume();

		// blockOffset = number of whole blocks between the run's start
		// (at stream position fileOffset) and the requested offset
		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8 *block;

		if (transaction != NULL) {
			block = (uint8 *)block_cache_get_writable(volume->BlockCache(), fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// the node is somewhere in that block... (confusing offset calculation)
			fNode = (bplustree_node *)(block + offset -
				(fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}


/** Frees the (currently cached) node at \a offset. If it is the last node
 *	of the tree the stream is truncated by one node; otherwise the node is
 *	pushed onto the header's free node list for later reuse by Allocate().
 *	The caller must have the node cached writable.
 */
status_t
CachedNode::Free(Transaction &transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// ToDo: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done everytime that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}

	// add the node to the free nodes list
	// (the node's left_link becomes the next-free pointer; it is stored
	// in disk byte order, as is the header's free_node_pointer)
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}


/** Returns a writable, initialized node in \a _node (its tree offset in
 *	\a _offset): the head of the free node list if one exists, otherwise a
 *	freshly appended node at the end of the stream. On append failure the
 *	header's maximum_size is reverted to its previous value.
 */
status_t
CachedNode::Allocate(Transaction &transaction, bplustree_node **_node, off_t *_offset)
{
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header *header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		// (left_link of a free node holds the next free node, already in
		// disk byte order - see Free())
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode *stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() + fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() - fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}


// #pragma mark -


/** Creates a new (empty) B+tree on the given stream - see
 *	SetTo(Transaction &, Inode *, int32); check InitCheck() afterwards.
 */
BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(transaction, stream);
}


/** Opens an existing B+tree from the given stream - see SetTo(Inode *);
 *	check InitCheck() afterwards.
 */
BPlusTree::BPlusTree(Inode *stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(stream);
}


/** Creates an uninitialized tree object; SetTo() must be called before
 *	use (InitCheck() reports B_NO_INIT until then).
 */
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
}


BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	if (fIteratorLock.Lock() < B_OK)
		return;

	TreeIterator *iterator = NULL;
	while ((iterator = fIterators.Next(iterator)) != NULL)
		iterator->Stop();

	fIteratorLock.Unlock();
}


/** Create a new B+Tree on the specified stream.
 *	Writes a fresh header and an empty root node (growing the stream to
 *	two nodes if needed), and derives the key type and duplicate policy
 *	from the stream's mode bits. Sets and returns fStatus.
 */
status_t
BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	// duplicates are allowed in all indices but the index directory
	// itself, or whenever S_ALLOW_DUPS is set
	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
		&& stream->BlockRun() != stream->Parent())
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}


/** Opens an existing B+tree from the given stream: reads the on-disk
 *	header, validates magic, link and type consistency against the
 *	stream's mode, and checks that the root node is readable. Sets and
 *	returns fStatus.
 */
status_t
BPlusTree::SetTo(Inode *stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?

	if (fHeader->MaximumSize() != stream->Size()) {
		FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size()));
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char *)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	{
		// map the header's data type back to the index mode bits and make
		// sure they agree with the stream's mode
		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
			S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);

		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->DataType()] != mode
			|| !stream->IsContainer()) {
			D(	dump_bplustree_header(fHeader);
				dump_inode(&stream->Node());
			);
			RETURN_ERROR(fStatus = B_BAD_TYPE);
		}

		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
		// in the original BFS code - we will honour it nevertheless
		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
			|| (stream->Mode() & S_ALLOW_DUPS) != 0;
	}

	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


/** Returns the status of the last SetTo() call (B_NO_INIT if none). */
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}


/** Maps a Be type_code to the corresponding on-disk B+tree key type,
 *	or -1 if the type is not indexable.
 */
int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
	switch (code) {
		case B_STRING_TYPE:
			return BPLUSTREE_STRING_TYPE;
		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return BPLUSTREE_INT32_TYPE;
		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			return BPLUSTREE_UINT32_TYPE;
		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			return BPLUSTREE_INT64_TYPE;
		case B_UINT64_TYPE:
			return BPLUSTREE_UINT64_TYPE;
		case B_FLOAT_TYPE:
			return BPLUSTREE_FLOAT_TYPE;
		case B_DOUBLE_TYPE:
			return BPLUSTREE_DOUBLE_TYPE;
	}
	return -1;
}


/** Maps the index-type bits of an inode mode to the corresponding
 *	on-disk B+tree key type; anything unrecognized (e.g. a standard
 *	directory) yields the string type.
 */
int32
BPlusTree::ModeToKeyType(mode_t mode)
{
	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
		case S_INT_INDEX:
			return BPLUSTREE_INT32_TYPE;
		case S_UINT_INDEX:
			return BPLUSTREE_UINT32_TYPE;
		case S_LONG_LONG_INDEX:
			return BPLUSTREE_INT64_TYPE;
		case S_ULONG_LONG_INDEX:
			return BPLUSTREE_UINT64_TYPE;
		case S_FLOAT_INDEX:
			return BPLUSTREE_FLOAT_TYPE;
		case S_DOUBLE_INDEX:
			return BPLUSTREE_DOUBLE_TYPE;
		case S_STR_INDEX:
		default:
			// default is for standard directories
			return BPLUSTREE_STRING_TYPE;
	}
}


// #pragma mark -


/** Notifies every registered TreeIterator about a structural change
 *	(insert/remove/split) so it can adjust its position accordingly.
 */
void
BPlusTree::UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt,
	int8 change)
{
	// Although every iterator which is affected by this update currently
	// waits on a semaphore, other iterators could be added/removed at
	// any time, so we need to protect this loop
	if (fIteratorLock.Lock() < B_OK)
		return;

	TreeIterator *iterator = NULL;
	while ((iterator = fIterators.Next(iterator)) != NULL)
		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);

	fIteratorLock.Unlock();
}


/** Registers an iterator so it receives Update()/Stop() notifications. */
void
BPlusTree::AddIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Add(iterator);

	fIteratorLock.Unlock();
}


/** Unregisters an iterator previously added with AddIterator(). */
void
BPlusTree::RemoveIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Remove(iterator);

	fIteratorLock.Unlock();
}


/** Compares the two keys according to the tree's data type (translated
 *	to the matching type_code); delegates to the global compareKeys().
 */
int32
BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
{
	type_code type = 0;
	switch (fHeader->data_type) {
		case BPLUSTREE_STRING_TYPE:
			type = B_STRING_TYPE;
			break;
		case BPLUSTREE_INT32_TYPE:
			type = B_INT32_TYPE;
			break;
		case BPLUSTREE_UINT32_TYPE:
			type = B_UINT32_TYPE;
			break;
		case BPLUSTREE_INT64_TYPE:
			type = B_INT64_TYPE;
			break;
		case BPLUSTREE_UINT64_TYPE:
			type = B_UINT64_TYPE;
			break;
		case BPLUSTREE_FLOAT_TYPE:
			type = B_FLOAT_TYPE;
			break;
		case BPLUSTREE_DOUBLE_TYPE:
			type = B_DOUBLE_TYPE;
			break;
	}
	return compareKeys(type, key1, keyLength1, key2, keyLength2);
}


/** Binary-searches \a node for \a key. On a match returns B_OK, with the
 *	key's position in \a _index and its value in \a _next. Otherwise
 *	returns B_ENTRY_NOT_FOUND, with \a _index set to where the key would
 *	be inserted and \a _next set to the child link to follow (the
 *	overflow link if the key sorts after all keys in the node).
 *	Returns B_BAD_DATA (and panics the volume) on a corrupt key entry.
 */
status_t
BPlusTree::FindKey(const bplustree_node *node, const uint8 *key, uint16 keyLength,
	uint16 *_index, off_t *_next)
{
#ifdef DEBUG
	NodeChecker checker(node, fNodeSize, "find");
#endif

	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t *values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8 *searchKey = node->KeyAt(i, &searchLength);
		// make sure the key data lies completely within the node
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	// not found: saveIndex is the insertion position
	if (_index)
		*_index = saveIndex;
	if (_next) {
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}


/** Prepares the stack to contain all nodes that were passed while
 *	following the key, from the root node to the leaf node that could
 *	or should contain that key.
 */

status_t
BPlusTree::SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node *node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset);

		// a child linking back to its own node would loop forever
		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::SeekDown() could not open node %Ld\n", nodeAndKey.nodeOffset));
	return B_ERROR;
}


/** This will find a free duplicate fragment in the given bplustree_node.
 *	The CachedNode will be set to the writable fragment on success.
 *	Scans every value of \a node that links to a duplicate fragment and
 *	returns the first fragment slot with a zero count; the fragment's
 *	offset and slot index are stored in \a _offset and \a _index.
 *	Returns B_ENTRY_NOT_FOUND if no fragment has room left.
 */

status_t
BPlusTree::FindFreeDuplicateFragment(Transaction &transaction,
	const bplustree_node *node, CachedNode &cached,
	off_t *_offset, bplustree_node **_fragment, uint32 *_index)
{
	off_t *values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node *fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array *array = fragment->FragmentAt(j);

			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}


/** Inserts \a value as a duplicate of the key at \a index in \a node.
 *	Depending on the current state of the key's value this either
 *	a) appends to the existing duplicate fragment (promoting it to a full
 *	   duplicate node when the fragment is full),
 *	b) appends to the key's chain of duplicate nodes (allocating and
 *	   linking a new node when the last one is full), or
 *	c) converts a plain value into a duplicate fragment holding the old
 *	   and the new value, updating the key's value to a fragment link.
 */
status_t
BPlusTree::InsertDuplicate(Transaction &transaction, CachedNode &cached,
	const bplustree_node *node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
				bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
					bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue),
					array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				// the fragment still has room for the new value
				array->Insert(value);
			} else {
				// test if the fragment will be empty if we remove this key's values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// the node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(
						(uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// the old duplicate has not been touched, so we can reuse it

					bplustree_node *newDuplicate;
					status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// copy the array from the fragment node to the duplicate node
					// and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
					BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		// (walk the right_link chain until a node with room, or its end,
		// is found)
		duplicate_array *array;
		const bplustree_node *duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
					duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
			&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node *newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
			if (status < B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node *fragment;
	if (FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		if ((status = cachedDuplicate.Allocate(transaction, &fragment, &offset)) < B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	// the fragment slot now holds both the previous single value and
	// the new one
	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}


/** Inserts the given \a key with \a value at \a index into \a node's
 *	packed key/length/value arrays, shifting the existing entries and
 *	adjusting the stored key-length prefix sums. The node must already
 *	have enough free space - no capacity check is done here (only the
 *	index is validated). Note that the node's counts are updated first,
 *	so all array base pointers are re-fetched afterwards.
 */
void
BPlusTree::InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength,
	off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t *values = node->Values();
	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength);

	// the arrays moved because the counts above changed
	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index
	// (lengths are cumulative, so entries after the new key grow by
	// keyLength)
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}


/** Splits the \a node into two halves - the other half will be put into \a other.
 *	It also takes care to create a new overflow link if the node to split is an
 *	index node.
976 */ 977 978 status_t 979 BPlusTree::SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other, 980 off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value) 981 { 982 if (*_keyIndex > node->NumKeys() + 1) 983 return B_BAD_VALUE; 984 985 uint16 *inKeyLengths = node->KeyLengths(); 986 off_t *inKeyValues = node->Values(); 987 uint8 *inKeys = node->Keys(); 988 uint8 *outKeys = other->Keys(); 989 int32 keyIndex = *_keyIndex; // can become less than zero! 990 991 // how many keys will fit in one (half) page? 992 // that loop will find the answer to this question and 993 // change the key lengths indices for their new home 994 995 // "bytes" is the number of bytes written for the new key, 996 // "bytesBefore" are the bytes before that key 997 // "bytesAfter" are the bytes after the new key, if any 998 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 999 1000 size_t size = fNodeSize >> 1; 1001 int32 out, in; 1002 for (in = out = 0; in < node->NumKeys() + 1;) { 1003 if (!bytes) 1004 bytesBefore = in > 0 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1005 1006 if (in == keyIndex && !bytes) { 1007 bytes = *_keyLength; 1008 } else { 1009 if (keyIndex < out) 1010 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - bytesBefore; 1011 1012 in++; 1013 } 1014 out++; 1015 1016 if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) 1017 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1018 // we have found the number of keys in the new node! 
1019 break; 1020 } 1021 } 1022 1023 // if the new key was not inserted, set the length of the keys 1024 // that can be copied directly 1025 if (keyIndex >= out && in > 0) 1026 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); 1027 1028 if (bytesBefore < 0 || bytesAfter < 0) 1029 return B_BAD_DATA; 1030 1031 other->left_link = node->left_link; 1032 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1033 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter); 1034 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1035 1036 uint16 *outKeyLengths = other->KeyLengths(); 1037 off_t *outKeyValues = other->Values(); 1038 int32 keys = out > keyIndex ? keyIndex : out; 1039 1040 if (bytesBefore) { 1041 // copy the keys 1042 memcpy(outKeys, inKeys, bytesBefore); 1043 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1044 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1045 } 1046 if (bytes) { 1047 // copy the newly inserted key 1048 memcpy(outKeys + bytesBefore, key, bytes); 1049 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1050 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1051 1052 if (bytesAfter) { 1053 // copy the keys after the new key 1054 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter); 1055 keys = out - keyIndex - 1; 1056 for (int32 i = 0;i < keys;i++) { 1057 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( 1058 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes); 1059 } 1060 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t)); 1061 } 1062 } 1063 1064 // if the new key was already inserted, we shouldn't use it again 1065 if (in != out) 1066 keyIndex--; 1067 1068 int32 total = bytesBefore + bytesAfter; 1069 1070 // these variables are for the key that will be returned 1071 // to the parent node 1072 uint8 *newKey = NULL; 1073 uint16 newLength; 1074 bool newAllocated = false; 1075 1076 
// If we have split an index node, we have to drop the first key 1077 // of the next node (which can also be the new key to insert). 1078 // The dropped key is also the one which has to be inserted in 1079 // the parent node, so we will set the "newKey" already here. 1080 if (node->OverflowLink() != BPLUSTREE_NULL) { 1081 if (in == keyIndex) { 1082 newKey = key; 1083 newLength = *_keyLength; 1084 1085 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1086 keyIndex--; 1087 } else { 1088 // If a key is dropped (is not the new key), we have to copy 1089 // it, because it would be lost if not. 1090 uint8 *droppedKey = node->KeyAt(in, &newLength); 1091 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 1092 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1093 fStream->GetVolume()->Panic(); 1094 RETURN_ERROR(B_BAD_DATA); 1095 } 1096 newKey = (uint8 *)malloc(newLength); 1097 if (newKey == NULL) 1098 return B_NO_MEMORY; 1099 memcpy(newKey, droppedKey, newLength); 1100 1101 other->overflow_link = inKeyValues[in]; 1102 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); 1103 } 1104 } 1105 1106 // and now the same game for the other page and the rest of the keys 1107 // (but with memmove() instead of memcpy(), because they may overlap) 1108 1109 bytesBefore = bytesAfter = bytes = 0; 1110 out = 0; 1111 int32 skip = in; 1112 while (in < node->NumKeys() + 1) { 1113 if (in == keyIndex && !bytes) { 1114 // it's enough to set bytesBefore once here, because we do 1115 // not need to know the exact length of all keys in this 1116 // loop 1117 bytesBefore = in > skip ? 
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1118 bytes = *_keyLength; 1119 } else { 1120 if (in < node->NumKeys()) { 1121 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1122 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); 1123 1124 if (bytes) { 1125 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1126 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); 1127 1128 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1129 - bytesBefore - bytes; 1130 } 1131 } 1132 in++; 1133 } 1134 1135 out++; 1136 1137 // break out when all keys are done 1138 if (in > node->NumKeys() && keyIndex < in) 1139 break; 1140 } 1141 1142 // adjust the byte counts (since we were a bit lazy in the loop) 1143 if (keyIndex >= in && keyIndex - skip < out) 1144 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - bytesBefore - total; 1145 else if (keyIndex < skip) 1146 bytesBefore = node->AllKeyLength() - total; 1147 1148 if (bytesBefore < 0 || bytesAfter < 0) 1149 return B_BAD_DATA; 1150 1151 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1152 // right link, and overflow link can stay the same 1153 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter); 1154 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1); 1155 1156 // array positions have changed 1157 outKeyLengths = node->KeyLengths(); 1158 outKeyValues = node->Values(); 1159 1160 // move the keys in the old node: the order is important here, 1161 // because we don't want to overwrite any contents 1162 1163 keys = keyIndex <= skip ? 
out : keyIndex - skip; 1164 keyIndex -= skip; 1165 1166 if (bytesBefore) 1167 memmove(inKeys, inKeys + total, bytesBefore); 1168 if (bytesAfter) 1169 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter); 1170 1171 if (bytesBefore) 1172 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1173 in = out - keyIndex - 1; 1174 if (bytesAfter) 1175 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16)); 1176 1177 if (bytesBefore) 1178 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1179 if (bytesAfter) 1180 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t)); 1181 1182 if (bytes) { 1183 // finally, copy the newly inserted key (don't overwrite anything) 1184 memcpy(inKeys + bytesBefore, key, bytes); 1185 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1186 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1187 } 1188 1189 // Prepare the key that will be inserted in the parent node which 1190 // is either the dropped key or the last of the other node. 1191 // If it's the dropped key, "newKey" was already set earlier. 1192 1193 if (newKey == NULL) 1194 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1195 1196 memcpy(key, newKey, newLength); 1197 *_keyLength = newLength; 1198 *_value = otherOffset; 1199 1200 if (newAllocated) 1201 free(newKey); 1202 1203 return B_OK; 1204 } 1205 1206 1207 /** This inserts a key into the tree. The changes made to the tree will 1208 * all be part of the \a transaction. 
 */

status_t
BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	// SeekDown() records the path from the root down to the leaf; we then
	// pop that path to insert bottom-up, splitting nodes as necessary
	Stack<node_and_key> stack;
	if (SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	// keyBuffer holds the key to be inserted at the current level; after a
	// split it is overwritten by SplitNode() with the key that has to go
	// into the parent node
	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node *node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = InsertDuplicate(transaction, cached, node, nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(round_up(sizeof(bplustree_node) + writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
		{
			// it fits - insert here and we're done; no further parent
			// updates are needed
			InsertKey(writableNode, nodeAndKey.keyIndex, keyBuffer, keyLength, value);
			UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node *root;
				status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node *other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other, &otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			// SplitNode() rewrites keyBuffer/keyLength/value in place with
			// the pair that must be inserted into the parent on the next
			// loop iteration
			if (SplitNode(writableNode, nodeAndKey.nodeOffset, other, otherOffset,
					&nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex,
				writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction, other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node *root = cachedNewRoot.Node();

				InsertKey(root, 0, keyBuffer, keyLength, writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(nodeAndKey.nodeOffset);

				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}

	// we only get here when the cached node could not be loaded (or the
	// stack ran dry unexpectedly)
	RETURN_ERROR(B_ERROR);
}


/** Removes the duplicate index/value pair from the tree.
 * It's part of the private tree interface.
1346 */ 1347 1348 status_t 1349 BPlusTree::RemoveDuplicate(Transaction &transaction, const bplustree_node *node, 1350 CachedNode &cached, uint16 index, off_t value) 1351 { 1352 off_t *values = node->Values(); 1353 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1354 1355 CachedNode cachedDuplicate(this); 1356 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1357 bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction, 1358 duplicateOffset, false); 1359 if (duplicate == NULL) 1360 return B_IO_ERROR; 1361 1362 // if it's a duplicate fragment, remove the entry from there 1363 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1364 duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue)); 1365 1366 if (array->count > NUM_FRAGMENT_VALUES 1367 || array->count < 1) { 1368 FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n", 1369 bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count)); 1370 return B_BAD_DATA; 1371 } 1372 if (!array->Remove(value)) { 1373 FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n", 1374 value, duplicateOffset)); 1375 return B_ENTRY_NOT_FOUND; 1376 } 1377 1378 // remove the array from the fragment node if it is empty 1379 if (array->count == 1) { 1380 // set the link to the remaining value 1381 if (cached.MakeWritable(transaction) == NULL) 1382 return B_IO_ERROR; 1383 1384 values[index] = array->values[0]; 1385 1386 // Remove the whole fragment node, if this was the only array, 1387 // otherwise free just the array 1388 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1389 status_t status = cachedDuplicate.Free(transaction, duplicateOffset); 1390 if (status < B_OK) 1391 return status; 1392 } else 1393 array->count = 0; 1394 } 1395 return B_OK; 1396 } 1397 1398 // Remove value from a duplicate node! 
1399 1400 duplicate_array *array = NULL; 1401 1402 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1403 FATAL(("invalid duplicate node: first left link points to %Ld!\n", 1404 duplicate->LeftLink())); 1405 return B_BAD_DATA; 1406 } 1407 1408 // Search the duplicate nodes until the entry could be found (and removed) 1409 while (duplicate != NULL) { 1410 array = duplicate->DuplicateArray(); 1411 if (array->count > NUM_DUPLICATE_VALUES 1412 || array->count < 0) { 1413 FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n", 1414 duplicateOffset, array->count)); 1415 return B_BAD_DATA; 1416 } 1417 1418 if (array->Remove(value)) 1419 break; 1420 1421 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1422 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1423 1424 cachedDuplicate.UnsetUnchanged(transaction); 1425 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false); 1426 } 1427 if (duplicate == NULL) 1428 RETURN_ERROR(B_IO_ERROR); 1429 1430 // The entry got removed from the duplicate node, but we might want to free 1431 // it now in case it's empty 1432 1433 while (true) { 1434 off_t left = duplicate->LeftLink(); 1435 off_t right = duplicate->RightLink(); 1436 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1437 1438 if ((isLast && array->count == 1) || array->count == 0) { 1439 // Free empty duplicate page, link their siblings together, and 1440 // update the duplicate link if needed (ie. 
when we either remove 1441 // the last duplicate node or have a new first one) 1442 1443 if (left == BPLUSTREE_NULL) { 1444 // the duplicate link points to us 1445 if (cached.MakeWritable(transaction) == NULL) 1446 return B_IO_ERROR; 1447 1448 if (array->count == 1) { 1449 // this is the last node, and there is only one value left; 1450 // replace the duplicate link with that value, it's no duplicate 1451 // anymore 1452 values[index] = array->values[0]; 1453 } else { 1454 // move the duplicate link to the next node 1455 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1456 BPLUSTREE_DUPLICATE_NODE, right)); 1457 } 1458 } 1459 1460 status_t status; 1461 if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK) 1462 return status; 1463 1464 if (left != BPLUSTREE_NULL 1465 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) { 1466 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1467 1468 // If the next node is the last node, we need to free that node 1469 // and convert the duplicate entry back into a normal entry 1470 array = duplicate->DuplicateArray(); 1471 if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL 1472 && array->count <= NUM_FRAGMENT_VALUES) { 1473 duplicateOffset = left; 1474 continue; 1475 } 1476 } 1477 if (right != BPLUSTREE_NULL 1478 && (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) { 1479 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1480 1481 // Again, we may need to turn the duplicate entry back into a normal entry 1482 array = duplicate->DuplicateArray(); 1483 if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL 1484 && array->count <= NUM_FRAGMENT_VALUES) { 1485 duplicateOffset = right; 1486 continue; 1487 } 1488 } 1489 return B_OK; 1490 } else if (isLast && array->count <= NUM_FRAGMENT_VALUES) { 1491 // If the number of entries fits in a duplicate fragment, then 1492 // either find a free 
fragment node, or convert this node to a 1493 // fragment node. 1494 CachedNode cachedOther(this); 1495 1496 bplustree_node *fragment = NULL; 1497 uint32 fragmentIndex = 0; 1498 off_t offset; 1499 if (FindFreeDuplicateFragment(transaction, node, cachedOther, 1500 &offset, &fragment, &fragmentIndex) == B_OK) { 1501 // move to other node 1502 duplicate_array *target = fragment->FragmentAt(fragmentIndex); 1503 memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1504 1505 cachedDuplicate.Free(transaction, duplicateOffset); 1506 duplicateOffset = offset; 1507 } else { 1508 // convert node 1509 memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1510 memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 1511 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1512 } 1513 1514 if (cached.MakeWritable(transaction) == NULL) 1515 return B_IO_ERROR; 1516 1517 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1518 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 1519 } 1520 return B_OK; 1521 } 1522 } 1523 1524 1525 /** Removes the key with the given index from the specified node. 1526 * Since it has to get the key from the node anyway (to obtain it's 1527 * pointer), it's not needed to pass the key & its length, although 1528 * the calling method (BPlusTree::Remove()) have this data. 1529 */ 1530 1531 void 1532 BPlusTree::RemoveKey(bplustree_node *node, uint16 index) 1533 { 1534 // should never happen, but who knows? 
1535 if (index > node->NumKeys() && node->NumKeys() > 0) { 1536 FATAL(("Asked me to remove key outer limits: %u\n", index)); 1537 return; 1538 } 1539 1540 off_t *values = node->Values(); 1541 1542 // if we would have to drop the overflow link, drop 1543 // the last key instead and update the overflow link 1544 // to the value of that one 1545 if (!node->IsLeaf() && index == node->NumKeys()) 1546 node->overflow_link = values[--index]; 1547 1548 uint16 length; 1549 uint8 *key = node->KeyAt(index, &length); 1550 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 1551 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1552 FATAL(("Key length to long: %s, %u (inode at %ld,%u)\n", key, length, 1553 fStream->BlockRun().allocation_group, fStream->BlockRun().start)); 1554 fStream->GetVolume()->Panic(); 1555 return; 1556 } 1557 1558 uint16 *keyLengths = node->KeyLengths(); 1559 uint8 *keys = node->Keys(); 1560 1561 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 1562 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length); 1563 1564 off_t *newValues = node->Values(); 1565 uint16 *newKeyLengths = node->KeyLengths(); 1566 1567 // move key data 1568 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 1569 1570 // move and update key lengths 1571 if (index > 0 && newKeyLengths != keyLengths) 1572 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 1573 for (uint16 i = index; i < node->NumKeys(); i++) { 1574 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1575 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 1576 } 1577 1578 // move values 1579 if (index > 0) 1580 memmove(newValues, values, index * sizeof(off_t)); 1581 if (node->NumKeys() > index) 1582 memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t)); 1583 } 1584 1585 1586 /** Removes the specified key from the tree. 
The "value" parameter is only used
 * for trees which allow duplicates, so you may safely ignore it.
 * It's not an optional parameter, so at least you have to think about it.
 */

status_t
BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	// record the path from the root to the leaf so that emptied nodes can
	// be removed bottom-up from their parents
	Stack<node_and_key> stack;
	if (SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	node_and_key nodeAndKey;
	const bplustree_node *node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "remove");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
			if (status < B_OK)
				RETURN_ERROR(status);

			// If we will remove the last key, the iterator will be set
			// to the next node after the current - if there aren't any
			// more nodes, we need a way to prevent the TreeIterators to
			// touch the old node again, we use BPLUSTREE_FREE for this
			off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink();
			UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ?
				next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);

			// is this a duplicate entry?
			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
					node->Values()[nodeAndKey.keyIndex]))) {
				if (fAllowDuplicates) {
					return RemoveDuplicate(transaction, node, cached,
						nodeAndKey.keyIndex, value);
				} else {
					FATAL(("dupliate node found where no duplicates are allowed!\n"));
					RETURN_ERROR(B_ERROR);
				}
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// if it's an empty root node, we have to convert it
		// to a leaf node by dropping the overflow link, or,
		// if it's a leaf node, just empty it
		// (operator precedence: the two && clauses bind tighter than ||)
		if (nodeAndKey.nodeOffset == fHeader->RootNode()
			&& node->NumKeys() == 0
			|| node->NumKeys() == 1 && node->IsLeaf()) {
			writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
			writableNode->all_key_count = 0;
			writableNode->all_key_length = 0;

			// if we've cleared the root node, reset the maximum
			// number of levels in the header
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
			}
			return B_OK;
		}

		// if there is only one key left, we don't have to remove
		// it, we can just dump the node (index nodes still have
		// the overflow link, so we have to drop the last key)
		if (writableNode->NumKeys() > 1
			|| !writableNode->IsLeaf() && writableNode->NumKeys() == 1) {
			RemoveKey(writableNode, nodeAndKey.keyIndex);
			return B_OK;
		}

		// when we are here, we can just free the node, but
		// we have to update the right/left link of the
		// siblings first
		CachedNode otherCached(this);
		bplustree_node *other = otherCached.SetToWritable(transaction,
			writableNode->LeftLink());
		if (other != NULL)
			other->right_link = writableNode->right_link;

		if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL)
			other->left_link = writableNode->left_link;

		cached.Free(transaction, nodeAndKey.nodeOffset);
			// the node is gone; the loop continues with its parent, which
			// has its own key pointing to this node removed next round
	}
	RETURN_ERROR(B_ERROR);
}


/** Replaces the value for the key in the tree.
 * Returns B_OK if the key could be found and its value replaced,
 * B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
 * to indicate that something went terribly wrong.
 * Note that this doesn't work with duplicates - it will just
 * return B_BAD_TYPE if you call this function on a tree where
 * duplicates are allowed.
 */

status_t
BPlusTree::Replace(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	// lock access to stream (a read lock is okay for this purpose)
	ReadLocked locked(fStream->Lock());

	// descend from the root to the leaf that would contain the key
	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	const bplustree_node *node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// we reached a leaf; replace the value in place if found
			if (status == B_OK) {
				bplustree_node *writableNode = cached.MakeWritable(transaction);
				if (writableNode != NULL)
					writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value);
				else
					status = B_IO_ERROR;
			}

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);
				// guards against self-referencing (corrupt) index nodes

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


/** Searches the key in the tree, 
and stores the offset found in 1743 * _value, if successful. 1744 * It's very similar to BPlusTree::SeekDown(), but doesn't fill 1745 * a stack while it descends the tree. 1746 * Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND 1747 * if not. It can also return other errors to indicate that 1748 * something went wrong. 1749 * Note that this doesn't work with duplicates - it will just 1750 * return B_BAD_TYPE if you call this function on a tree where 1751 * duplicates are allowed. 1752 */ 1753 1754 status_t 1755 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value) 1756 { 1757 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1758 || key == NULL) 1759 RETURN_ERROR(B_BAD_VALUE); 1760 1761 if (fAllowDuplicates) 1762 RETURN_ERROR(B_BAD_TYPE); 1763 1764 // lock access to stream 1765 ReadLocked locked(fStream->Lock()); 1766 1767 off_t nodeOffset = fHeader->RootNode(); 1768 CachedNode cached(this); 1769 const bplustree_node *node; 1770 1771 #ifdef DEBUG 1772 int32 levels = 0; 1773 #endif 1774 1775 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1776 uint16 keyIndex = 0; 1777 off_t nextOffset; 1778 status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1779 1780 #ifdef DEBUG 1781 levels++; 1782 #endif 1783 if (node->OverflowLink() == BPLUSTREE_NULL) { 1784 if (status == B_OK && _value != NULL) 1785 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 1786 1787 #ifdef DEBUG 1788 if (levels != (int32)fHeader->MaxNumberOfLevels()) 1789 DEBUGGER(("levels don't match")); 1790 #endif 1791 return status; 1792 } else if (nextOffset == nodeOffset) 1793 RETURN_ERROR(B_ERROR); 1794 1795 nodeOffset = nextOffset; 1796 } 1797 FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset)); 1798 RETURN_ERROR(B_ERROR); 1799 } 1800 1801 1802 // #pragma mark - 1803 1804 1805 TreeIterator::TreeIterator(BPlusTree *tree) 1806 : 1807 fTree(tree), 1808 fCurrentNodeOffset(BPLUSTREE_NULL), 1809 fNext(NULL) 1810 { 
1811 tree->AddIterator(this); 1812 } 1813 1814 1815 TreeIterator::~TreeIterator() 1816 { 1817 if (fTree) 1818 fTree->RemoveIterator(this); 1819 } 1820 1821 1822 status_t 1823 TreeIterator::Goto(int8 to) 1824 { 1825 if (fTree == NULL || fTree->fHeader == NULL) 1826 RETURN_ERROR(B_BAD_VALUE); 1827 1828 // lock access to stream 1829 ReadLocked locked(fTree->fStream->Lock()); 1830 1831 off_t nodeOffset = fTree->fHeader->RootNode(); 1832 CachedNode cached(fTree); 1833 const bplustree_node *node; 1834 1835 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1836 // is the node a leaf node? 1837 if (node->OverflowLink() == BPLUSTREE_NULL) { 1838 fCurrentNodeOffset = nodeOffset; 1839 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 1840 fDuplicateNode = BPLUSTREE_NULL; 1841 1842 return B_OK; 1843 } 1844 1845 // get the next node offset depending on the direction (and if there 1846 // are any keys in that node at all) 1847 off_t nextOffset; 1848 if (to == BPLUSTREE_END || node->all_key_count == 0) 1849 nextOffset = node->OverflowLink(); 1850 else { 1851 if (node->AllKeyLength() > fTree->fNodeSize 1852 || (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->NumKeys()) 1853 RETURN_ERROR(B_ERROR); 1854 1855 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 1856 } 1857 if (nextOffset == nodeOffset) 1858 break; 1859 1860 nodeOffset = nextOffset; 1861 } 1862 FATAL(("%s fails\n", __FUNCTION__)); 1863 1864 RETURN_ERROR(B_ERROR); 1865 } 1866 1867 1868 /** Iterates through the tree in the specified direction. 1869 * When it iterates through duplicates, the "key" is only updated for the 1870 * first entry - if you need to know when this happens, use the "duplicate" 1871 * parameter which is 0 for no duplicate, 1 for the first, and 2 for all 1872 * the other duplicates. 1873 * That's not too nice, but saves the 256 bytes that would be needed to 1874 * store the last key - if this will ever become an issue, it will be 1875 * easy to change. 
1876 * The other advantage of this is, that the queries can skip all duplicates 1877 * at once when they are not relevant to them. 1878 */ 1879 1880 status_t 1881 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength, 1882 off_t *value, uint16 *duplicate) 1883 { 1884 if (fTree == NULL) 1885 return B_INTERRUPTED; 1886 if (fCurrentNodeOffset == BPLUSTREE_NULL 1887 && Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 1888 RETURN_ERROR(B_ERROR); 1889 1890 // if the tree was emptied since the last call 1891 if (fCurrentNodeOffset == BPLUSTREE_FREE) 1892 return B_ENTRY_NOT_FOUND; 1893 1894 // lock access to stream 1895 ReadLocked locked(fTree->fStream->Lock()); 1896 1897 CachedNode cached(fTree); 1898 const bplustree_node *node; 1899 1900 if (fDuplicateNode != BPLUSTREE_NULL) { 1901 // regardless of traverse direction the duplicates are always presented in 1902 // the same order; since they are all considered as equal, this shouldn't 1903 // cause any problems 1904 1905 if (!fIsFragment || fDuplicate < fNumDuplicates) 1906 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 1907 else 1908 node = NULL; 1909 1910 if (node != NULL) { 1911 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 1912 // if the node is out of duplicates, we go directly to the next one 1913 fDuplicateNode = node->RightLink(); 1914 if (fDuplicateNode != BPLUSTREE_NULL 1915 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 1916 fNumDuplicates = node->CountDuplicates(fDuplicateNode, false); 1917 fDuplicate = 0; 1918 } 1919 } 1920 if (fDuplicate < fNumDuplicates) { 1921 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++); 1922 if (duplicate) 1923 *duplicate = 2; 1924 return B_OK; 1925 } 1926 } 1927 fDuplicateNode = BPLUSTREE_NULL; 1928 } 1929 1930 off_t savedNodeOffset = fCurrentNodeOffset; 1931 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 1932 RETURN_ERROR(B_ERROR); 1933 1934 if 
(duplicate) 1935 *duplicate = 0; 1936 1937 fCurrentKey += direction; 1938 1939 // is the current key in the current node? 1940 while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->NumKeys()) 1941 || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) { 1942 fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->RightLink() : node->LeftLink(); 1943 1944 // are there any more nodes? 1945 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 1946 node = cached.SetTo(fCurrentNodeOffset); 1947 if (!node) 1948 RETURN_ERROR(B_ERROR); 1949 1950 // reset current key 1951 fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->NumKeys(); 1952 } else { 1953 // there are no nodes left, so turn back to the last key 1954 fCurrentNodeOffset = savedNodeOffset; 1955 fCurrentKey = direction == BPLUSTREE_FORWARD ? node->NumKeys() : -1; 1956 1957 return B_ENTRY_NOT_FOUND; 1958 } 1959 } 1960 1961 if (node->all_key_count == 0) 1962 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 1963 1964 uint16 length; 1965 uint8 *keyStart = node->KeyAt(fCurrentKey, &length); 1966 if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize 1967 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1968 fTree->fStream->GetVolume()->Panic(); 1969 RETURN_ERROR(B_BAD_DATA); 1970 } 1971 1972 length = min_c(length, maxLength); 1973 memcpy(key, keyStart, length); 1974 1975 if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) { 1976 // terminate string type 1977 if (length == maxLength) 1978 length--; 1979 ((char *)key)[length] = '\0'; 1980 } 1981 *keyLength = length; 1982 1983 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 1984 1985 // duplicate fragments? 
	// The value may not point to an inode directly, but to a chain of
	// duplicates instead - the link type is encoded in the offset itself
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			// report the first duplicate value; the iterator state set
			// above lets subsequent calls walk the remaining ones
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// shouldn't happen, but we're dealing here with potentially corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}


/** This is more or less a copy of BPlusTree::Find() - but it just
 *	sets the current position in the iterator, regardless of if the
 *	key could be found or not.
 *	Returns the status of BPlusTree::FindKey() on the leaf node, so
 *	the caller can tell whether the key actually exists there.
 */

status_t
TreeIterator::Find(const uint8 *key, uint16 keyLength)
{
	// a stopped iterator (see Stop()) has fTree == NULL
	if (fTree == NULL)
		return B_INTERRUPTED;
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	off_t nodeOffset = fTree->fHeader->RootNode();

	// walk down from the root until we reach a leaf (no overflow link)
	CachedNode cached(fTree);
	const bplustree_node *node;
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = fTree->FindKey(node, key, keyLength, &keyIndex, &nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// leaf node - remember the position, even if the key was
			// not found (status then reports that to the caller)
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = keyIndex - 1;
			fDuplicateNode = BPLUSTREE_NULL;

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);
				// self-referencing link - the tree is corrupt

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


/** Makes the iterator leave the duplicate chain it is currently in, so
 *	that the next call continues with the following (distinct) key.
 */

void
TreeIterator::SkipDuplicates()
{
	fDuplicateNode = BPLUSTREE_NULL;
}


/** Keeps the iterator's position valid after the tree has been changed:
 *	\a offset is the node the change happened in, \a nextOffset is the
 *	new node after a split (or BPLUSTREE_NULL), \a splitAt the index the
 *	node was split at, and \a change the number of keys inserted (+1) or
 *	removed (-1) at \a keyIndex.
 */

void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
{
	// we only care about changes in the node we currently point into
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node was split - if our key moved to the new node,
		// follow it and rebase the indices
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// ToDo: duplicate handling!
}


/** Detaches the iterator from its tree; subsequent calls will return
 *	B_INTERRUPTED (see Find()).
 */

void
TreeIterator::Stop()
{
	fTree = NULL;
}


#ifdef DEBUG
void
TreeIterator::Dump()
{
	// dumps the complete iterator state for debugging purposes
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif


// #pragma mark -


/** Resets an on-disk node to its empty state: no links, no keys. */

void
bplustree_node::Initialize()
{
	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	all_key_count = 0;
	all_key_length = 0;
}


/** Returns a pointer to the key at \a index within the node's packed key
 *	area, and stores its length in \a keyLength.
 *	The key lengths array holds cumulative (end) offsets, so a key's
 *	length is the difference to its predecessor's entry.
 */

uint8 *
bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
{
	if (index < 0 || index > NumKeys())
		return NULL;

	uint8 *keyStart = Keys();
	uint16 *keyLengths = KeyLengths();

	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
	if (index > 0)
		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);

	return keyStart;
}


/** Returns the number of duplicate values stored for the duplicate
 *	node/fragment the given \a offset refers to.
 *	For fragments, the count is the first value of the fragment slot
 *	selected by the lower bits of \a offset; for full duplicate nodes,
 *	the count is kept in the overflow link.
 */

uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t *)this)[fragment];
	}
	return OverflowLink();
}


/** Returns the duplicate value at \a index from the duplicate
 *	node/fragment the given \a offset refers to.
 *	NOTE(review): like CountDuplicates(), the fragment math assumes the
 *	hard-coded 1024 byte node size, and the value is returned without an
 *	endian conversion here - presumably handled by the caller; verify.
 */

off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
	else
		start = 2;

	return ((off_t *)this)[start + 1 + index];
}


/** Although the name suggests it, this function doesn't return the real
 *	used fragment count; at least, it can only count to two: it returns
 *	0, if there is no fragment used, 1 if there is only one fragment
 *	used, and 2 if there are at least 2 fragments used.
2171 */ 2172 2173 uint32 2174 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2175 { 2176 uint32 used = 0; 2177 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2178 duplicate_array *array = FragmentAt(i); 2179 if (array->count > 0 && ++used > 1) 2180 return used; 2181 } 2182 return used; 2183 } 2184 2185 2186 #ifdef DEBUG 2187 status_t 2188 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2189 { 2190 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2191 DEBUGGER(("invalid node: key/length count")); 2192 2193 for (int32 i = 0; i < NumKeys(); i++) { 2194 uint16 length; 2195 uint8 *key = KeyAt(i, &length); 2196 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize 2197 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2198 dprintf("node %p, key %ld\n", this, i); 2199 DEBUGGER(("invalid node: keys corrupted")); 2200 return B_BAD_DATA; 2201 } 2202 if (Values()[i] == -1) { 2203 dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]); 2204 DEBUGGER(("invalid node: values corrupted")); 2205 return B_BAD_DATA; 2206 } 2207 } 2208 return B_OK; 2209 } 2210 #endif 2211 2212 2213 // #pragma mark - 2214 2215 2216 int32 2217 compareKeys(type_code type, const void *key1, int keyLength1, const void *key2, int keyLength2) 2218 { 2219 // if one of the keys is NULL, bail out gracefully 2220 if (key1 == NULL || key2 == NULL) { 2221 //#if USER 2222 // // that's here to see if it's reasonable to handle this case nicely at all 2223 // DEBUGGER(("compareKeys() got NULL key!")); 2224 //#endif 2225 // even if it's most probably a bug in the calling code, we try to 2226 // give a meaningful result 2227 if (key1 == NULL && key2 != NULL) 2228 return 1; 2229 else if (key2 == NULL && key1 != NULL) 2230 return -1; 2231 2232 return 0; 2233 } 2234 2235 switch (type) { 2236 case B_STRING_TYPE: 2237 { 2238 int len = min_c(keyLength1, keyLength2); 2239 int result = strncmp((const char *)key1, (const char *)key2, len); 2240 2241 if (result == 0 2242 && !(((const char 
*)key1)[len] == '\0' && ((const char *)key2)[len] == '\0')) 2243 result = keyLength1 - keyLength2; 2244 2245 return result; 2246 } 2247 2248 case B_SSIZE_T_TYPE: 2249 case B_INT32_TYPE: 2250 return *(int32 *)key1 - *(int32 *)key2; 2251 2252 case B_SIZE_T_TYPE: 2253 case B_UINT32_TYPE: 2254 if (*(uint32 *)key1 == *(uint32 *)key2) 2255 return 0; 2256 else if (*(uint32 *)key1 > *(uint32 *)key2) 2257 return 1; 2258 2259 return -1; 2260 2261 case B_OFF_T_TYPE: 2262 case B_INT64_TYPE: 2263 if (*(int64 *)key1 == *(int64 *)key2) 2264 return 0; 2265 else if (*(int64 *)key1 > *(int64 *)key2) 2266 return 1; 2267 2268 return -1; 2269 2270 case B_UINT64_TYPE: 2271 if (*(uint64 *)key1 == *(uint64 *)key2) 2272 return 0; 2273 else if (*(uint64 *)key1 > *(uint64 *)key2) 2274 return 1; 2275 2276 return -1; 2277 2278 case B_FLOAT_TYPE: 2279 { 2280 float result = *(float *)key1 - *(float *)key2; 2281 if (result == 0.0f) 2282 return 0; 2283 2284 return (result < 0.0f) ? -1 : 1; 2285 } 2286 2287 case B_DOUBLE_TYPE: 2288 { 2289 double result = *(double *)key1 - *(double *)key2; 2290 if (result == 0.0) 2291 return 0; 2292 2293 return (result < 0.0) ? -1 : 1; 2294 } 2295 } 2296 2297 // if the type is unknown, the entries don't match... 2298 return -1; 2299 } 2300 2301