1 /* 2 * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 * 5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares 6 * no code but achieves binary compatibility with the on disk format. 7 */ 8 9 10 //! B+Tree implementation 11 12 13 #include "Debug.h" 14 #include "BPlusTree.h" 15 #include "Inode.h" 16 #include "Utility.h" 17 18 19 #ifdef DEBUG 20 class NodeChecker { 21 public: 22 NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text) 23 : 24 fNode(node), 25 fSize(nodeSize), 26 fText(text) 27 { 28 Check("integrity check failed on construction."); 29 } 30 31 ~NodeChecker() 32 { 33 Check("integrity check failed on destruction."); 34 } 35 36 void 37 Check(const char* message) 38 { 39 if (fNode->CheckIntegrity(fSize) < B_OK) 40 dprintf("%s: %s\n", fText, message); 41 } 42 43 private: 44 const bplustree_node* fNode; 45 int32 fSize; 46 const char* fText; 47 }; 48 #endif 49 50 51 // Node Caching for the BPlusTree class 52 // 53 // With write support, there is the need for a function that allocates new 54 // nodes by either returning empty nodes, or by growing the file's data stream 55 // 56 // !! The CachedNode class assumes that you have properly locked the stream 57 // !! before asking for nodes. 58 // 59 // Note: This code will fail if the block size is smaller than the node size! 60 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 61 // is hard-coded to 1024 bytes, that's not an issue now. 
/*!	Releases the cached block and reverts the "dirty" marking it received
	in this transaction - used when it turns out the node was not actually
	modified after all.
*/
void
CachedNode::UnsetUnchanged(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		void* cache = fTree->fStream->GetVolume()->BlockCache();

		// undo the dirty marking before handing the block back to the cache
		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
		block_cache_put(cache, fBlockNumber);
		fNode = NULL;
	}
}


/*!	Returns the cached block to the block cache. If the writable header
	(offset 0) was cached, the tree's in-memory header copy is refreshed
	from it first, so both stay in sync.
*/
void
CachedNode::Unset()
{
	if (fTree == NULL || fTree->fStream == NULL)
		return;

	if (fNode != NULL) {
		if (fWritable && fOffset == 0) {
			// The B+tree header has been updated - we need to update the
			// BPlusTrees copy of it, as well.
			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
		}

		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
			fBlockNumber);
		fNode = NULL;
	}
}


/*!	Points the cache to the read-only node at \a offset. If \a check is
	\c true, the node's links and key count are sanity checked after
	reading. Returns NULL if the offset is out of bounds, unaligned, or
	the node could not be read (or failed the check).
*/
const bplustree_node*
CachedNode::SetTo(off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(NULL, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader.CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
				fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


/*!	Same as SetTo(), but retrieves the node writably within the given
	\a transaction.
*/
bplustree_node*
CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// You can only ask for nodes at valid positions - you can't
	// even access the b+tree header with this method (use SetToHeader()
	// instead)
	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
		|| offset <= 0
		|| (offset % fTree->fNodeSize) != 0)
		return NULL;

	if (InternalSetTo(&transaction, offset) != NULL && check) {
		// sanity checks (links, all_key_count)
		if (!fTree->fHeader.CheckNode(fNode)) {
			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
				fBlockNumber, fTree->fStream->ID()));
			return NULL;
		}
	}
	return fNode;
}


/*!	Upgrades an already cached read-only node to a writable one within the
	given \a transaction. Returns NULL if nothing is cached or the block
	cache refuses.
*/
bplustree_node*
CachedNode::MakeWritable(Transaction& transaction)
{
	if (fNode == NULL)
		return NULL;

	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
			fBlockNumber, transaction.ID()) == B_OK)
		return fNode;

	return NULL;
}


/*!	Points the cache to the read-only b+tree header (the node at offset 0).
*/
const bplustree_header*
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(NULL, 0LL);
	return (bplustree_header*)fNode;
}


/*!	Retrieves the b+tree header writably. On first use within a
	transaction this also registers the tree as a transaction listener
	(and pins the inode via acquire_vnode()), so the in-memory header can
	be reverted if the transaction is aborted.
*/
bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	InternalSetTo(&transaction, 0LL);

	if (fNode != NULL && !fTree->fInTransaction) {
		transaction.AddListener(fTree);
		fTree->fInTransaction = true;

		if (!transaction.GetVolume()->IsInitializing()) {
			acquire_vnode(transaction.GetVolume()->FSVolume(),
				fTree->fStream->ID());
		}
	}

	return (bplustree_header*)fNode;
}


/*!	Maps the tree-relative \a offset to the disk block containing it and
	fetches that block from the block cache - writably if \a transaction
	is not NULL. Sets and returns fNode (NULL on failure).
*/
bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume* volume = fTree->fStream->GetVolume();

		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8* block;

		if (transaction != NULL) {
			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block != NULL) {
			// The node is somewhere in that block...
			// (confusing offset calculation)
			fNode = (bplustree_node*)(block + offset
				- (fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}


/*!	Puts the currently cached node (at \a offset) onto the tree's free
	node list; the node is chained in via its left_link, and marked free
	through its overflow_link.
*/
status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// TODO: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done everytime that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	CachedNode cached(fTree);
	bplustree_header* header = cached.SetToWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

#if 0
	// TODO: temporarily disabled because CheckNode() doesn't like this...
	// 	Also, it's such an edge case that it's almost useless, anyway.
	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}
#endif

	// add the node to the free nodes list
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}


/*!	Hands out a writable, initialized node: recycles the head of the free
	node list if there is one, otherwise grows the stream by one node and
	bumps the header's maximum_size accordingly. On success, \a _node and
	\a _offset describe the new node.
*/
status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
	off_t* _offset)
{
	if (fTree == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header* header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
		CachedNode cached(fTree);
		header = cached.SetToWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode* stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	CachedNode cached(fTree);
	header = cached.SetToWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	cached.Unset();
		// SetToWritable() below needs the new values in the tree's header

	if (SetToWritable(transaction, offset, false) == NULL)
		RETURN_ERROR(B_ERROR);

	fNode->Initialize();

	*_offset = offset;
	*_node = fNode;
	return B_OK;
}


// #pragma mark -


BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
	:
	fStream(NULL),
	fInTransaction(false)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(transaction, stream);
}


BPlusTree::BPlusTree(Inode* stream)
	:
	fStream(NULL),
	fInTransaction(false)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(stream);
}


BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fInTransaction(false),
	fStatus(B_NO_INIT)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
}


BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	mutex_lock(&fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Stop();

	mutex_destroy(&fIteratorLock);
}


/*! Create a new B+Tree on the specified stream */
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	CachedNode cached(this);
	bplustree_header* header = cached.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = cached.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	cached.Unset();
		// Unset() copies the written header into fHeader; the root node
		// lookup below relies on that

	// initialize b+tree root node
	cached.SetToWritable(transaction, fHeader.RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}


/*!	Opens an existing B+Tree from the specified \a stream, reading and
	validating its on-disk header (magic, link alignment, data type versus
	the inode's index mode).
*/
status_t
BPlusTree::SetTo(Inode* stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	fStream = stream;

	// get on-disk B+Tree header

	CachedNode cached(this);
	const bplustree_header* header = cached.SetToHeader();
	if (header != NULL)
		memcpy(&fHeader, header, sizeof(bplustree_header));
	else
		RETURN_ERROR(fStatus = B_IO_ERROR);

	// is header valid?

	if (fHeader.MaximumSize() != stream->Size()) {
		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
		// we can't change the header since we don't have a transaction
		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader.Magic() != BPLUSTREE_MAGIC
		|| (fHeader.RootNode() % fHeader.NodeSize()) != 0
		|| !fHeader.IsValidLink(fHeader.RootNode())
		|| !fHeader.IsValidLink(fHeader.FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(&fHeader);
		dump_block((const char*)&fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader.NodeSize();

	// validity check
	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
		S_DOUBLE_INDEX};
	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
		| S_FLOAT_INDEX | S_DOUBLE_INDEX);

	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
		|| ((stream->Mode() & S_INDEX_DIR) != 0
			&& kToMode[fHeader.DataType()] != mode)
		|| !stream->IsContainer()) {
		D(	dump_bplustree_header(&fHeader);
			dump_inode(&stream->Node());
		);
		RETURN_ERROR(fStatus = B_BAD_TYPE);
	}

	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
	// in the original BFS code - we will honour it nevertheless
	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	cached.SetTo(fHeader.RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}


/*!	Returns the status of the last SetTo() call (B_NO_INIT if none). */
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}


/*!	Maps a BeOS type_code to the matching on-disk b+tree key type,
	or -1 if the type cannot be indexed.
*/
int32
BPlusTree::TypeCodeToKeyType(type_code code)
{
	switch (code) {
		case B_STRING_TYPE:
			return BPLUSTREE_STRING_TYPE;
		case B_SSIZE_T_TYPE:
		case B_INT32_TYPE:
			return BPLUSTREE_INT32_TYPE;
		case B_SIZE_T_TYPE:
		case B_UINT32_TYPE:
			return BPLUSTREE_UINT32_TYPE;
		case B_OFF_T_TYPE:
		case B_INT64_TYPE:
			return BPLUSTREE_INT64_TYPE;
		case B_UINT64_TYPE:
			return BPLUSTREE_UINT64_TYPE;
		case B_FLOAT_TYPE:
			return BPLUSTREE_FLOAT_TYPE;
		case B_DOUBLE_TYPE:
			return BPLUSTREE_DOUBLE_TYPE;
	}
	return -1;
}


/*!	Maps an inode's index mode bits to the matching on-disk b+tree key
	type; plain directories fall back to string keys.
*/
int32
BPlusTree::ModeToKeyType(mode_t mode)
{
	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
		case S_INT_INDEX:
			return BPLUSTREE_INT32_TYPE;
		case S_UINT_INDEX:
			return BPLUSTREE_UINT32_TYPE;
		case S_LONG_LONG_INDEX:
			return BPLUSTREE_INT64_TYPE;
		case S_ULONG_LONG_INDEX:
			return BPLUSTREE_UINT64_TYPE;
		case S_FLOAT_INDEX:
			return BPLUSTREE_FLOAT_TYPE;
		case S_DOUBLE_INDEX:
			return BPLUSTREE_DOUBLE_TYPE;
		case S_STR_INDEX:
		default:
			// default is for standard directories
			return BPLUSTREE_STRING_TYPE;
	}
}


// #pragma mark - TransactionListener implementation


/*!	Called when the transaction the tree registered with ends; on abort,
	the in-memory header is re-read from disk to discard stale changes.
*/
void
BPlusTree::TransactionDone(bool success)
{
	if (!success) {
		// update header from disk
		CachedNode cached(this);
		const bplustree_header* header = cached.SetToHeader();
		if (header != NULL)
			memcpy(&fHeader, header, sizeof(bplustree_header));
	}
}


/*!	Drops the vnode reference acquired in SetToWritableHeader() once the
	tree is detached from its transaction.
*/
void
BPlusTree::RemovedFromTransaction()
{
	fInTransaction = false;

	if (!fStream->GetVolume()->IsInitializing())
		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
}


// #pragma mark -
605 void 606 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, 607 uint16 splitAt, int8 change) 608 { 609 // Although every iterator which is affected by this update currently 610 // waits on a semaphore, other iterators could be added/removed at 611 // any time, so we need to protect this loop 612 MutexLocker _(fIteratorLock); 613 614 SinglyLinkedList<TreeIterator>::Iterator iterator 615 = fIterators.GetIterator(); 616 while (iterator.HasNext()) 617 iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change); 618 } 619 620 621 void 622 BPlusTree::_AddIterator(TreeIterator* iterator) 623 { 624 MutexLocker _(fIteratorLock); 625 fIterators.Add(iterator); 626 } 627 628 629 void 630 BPlusTree::_RemoveIterator(TreeIterator* iterator) 631 { 632 MutexLocker _(fIteratorLock); 633 fIterators.Remove(iterator); 634 } 635 636 637 int32 638 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2, 639 int keyLength2) 640 { 641 type_code type = 0; 642 switch (fHeader.DataType()) { 643 case BPLUSTREE_STRING_TYPE: 644 type = B_STRING_TYPE; 645 break; 646 case BPLUSTREE_INT32_TYPE: 647 type = B_INT32_TYPE; 648 break; 649 case BPLUSTREE_UINT32_TYPE: 650 type = B_UINT32_TYPE; 651 break; 652 case BPLUSTREE_INT64_TYPE: 653 type = B_INT64_TYPE; 654 break; 655 case BPLUSTREE_UINT64_TYPE: 656 type = B_UINT64_TYPE; 657 break; 658 case BPLUSTREE_FLOAT_TYPE: 659 type = B_FLOAT_TYPE; 660 break; 661 case BPLUSTREE_DOUBLE_TYPE: 662 type = B_DOUBLE_TYPE; 663 break; 664 } 665 return compareKeys(type, key1, keyLength1, key2, keyLength2); 666 } 667 668 669 status_t 670 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key, 671 uint16 keyLength, uint16* _index, off_t* _next) 672 { 673 #ifdef DEBUG 674 NodeChecker checker(node, fNodeSize, "find"); 675 #endif 676 677 if (node->all_key_count == 0) { 678 if (_index) 679 *_index = 0; 680 if (_next) 681 *_next = node->OverflowLink(); 682 return B_ENTRY_NOT_FOUND; 683 } 684 685 off_t* 
values = node->Values(); 686 int16 saveIndex = -1; 687 688 // binary search in the key array 689 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { 690 uint16 i = (first + last) >> 1; 691 692 uint16 searchLength; 693 uint8* searchKey = node->KeyAt(i, &searchLength); 694 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) 695 > (uint8*)node + fNodeSize 696 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { 697 fStream->GetVolume()->Panic(); 698 RETURN_ERROR(B_BAD_DATA); 699 } 700 701 int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength); 702 if (cmp < 0) { 703 last = i - 1; 704 saveIndex = i; 705 } else if (cmp > 0) { 706 saveIndex = first = i + 1; 707 } else { 708 if (_index) 709 *_index = i; 710 if (_next) 711 *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]); 712 return B_OK; 713 } 714 } 715 716 if (_index) 717 *_index = saveIndex; 718 if (_next) { 719 if (saveIndex == node->NumKeys()) 720 *_next = node->OverflowLink(); 721 else 722 *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); 723 } 724 return B_ENTRY_NOT_FOUND; 725 } 726 727 728 /*! Prepares the stack to contain all nodes that were passed while 729 following the key, from the root node to the leaf node that could 730 or should contain that key. 731 */ 732 status_t 733 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key, 734 uint16 keyLength) 735 { 736 // set the root node to begin with 737 node_and_key nodeAndKey; 738 nodeAndKey.nodeOffset = fHeader.RootNode(); 739 740 CachedNode cached(this); 741 const bplustree_node* node; 742 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 743 // if we are already on leaf level, we're done 744 if (node->OverflowLink() == BPLUSTREE_NULL) { 745 // node that the keyIndex is not properly set here (but it's not 746 // needed in the calling functions anyway)! 
747 nodeAndKey.keyIndex = 0; 748 stack.Push(nodeAndKey); 749 return B_OK; 750 } 751 752 off_t nextOffset; 753 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex, 754 &nextOffset); 755 756 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 757 RETURN_ERROR(B_ERROR); 758 759 if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) { 760 dprintf("BPlusTree::_SeekDown() node walked too deep.\n"); 761 break; 762 } 763 764 // put the node offset & the correct keyIndex on the stack 765 stack.Push(nodeAndKey); 766 767 nodeAndKey.nodeOffset = nextOffset; 768 } 769 770 FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF "\n", 771 nodeAndKey.nodeOffset)); 772 return B_ERROR; 773 } 774 775 776 /*! This will find a free duplicate fragment in the given bplustree_node. 777 The CachedNode will be set to the writable fragment on success. 778 */ 779 status_t 780 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction, 781 const bplustree_node* node, CachedNode& cached, 782 off_t* _offset, bplustree_node** _fragment, uint32* _index) 783 { 784 off_t* values = node->Values(); 785 for (int32 i = 0; i < node->NumKeys(); i++) { 786 off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]); 787 788 // does the value link to a duplicate fragment? 
789 if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT) 790 continue; 791 792 const bplustree_node* fragment = cached.SetTo( 793 bplustree_node::FragmentOffset(value), false); 794 if (fragment == NULL) { 795 FATAL(("Could not get duplicate fragment at %" B_PRIdOFF "\n", 796 value)); 797 continue; 798 } 799 800 // see if there is some space left for us 801 uint32 num = bplustree_node::MaxFragments(fNodeSize); 802 for (uint32 j = 0; j < num; j++) { 803 duplicate_array* array = fragment->FragmentAt(j); 804 805 if (array->count == 0) { 806 // found an unused fragment 807 *_fragment = cached.MakeWritable(transaction); 808 if (*_fragment == NULL) 809 return B_IO_ERROR; 810 811 *_offset = bplustree_node::FragmentOffset(value); 812 *_index = j; 813 return B_OK; 814 } 815 } 816 } 817 return B_ENTRY_NOT_FOUND; 818 } 819 820 821 status_t 822 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached, 823 const bplustree_node* node, uint16 index, off_t value) 824 { 825 CachedNode cachedDuplicate(this); 826 off_t* values = node->Values(); 827 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 828 status_t status; 829 off_t offset; 830 831 if (bplustree_node::IsDuplicate(oldValue)) { 832 // If it's a duplicate fragment, try to insert it into that, or if it 833 // doesn't fit anymore, create a new duplicate node 834 835 if (bplustree_node::LinkType(oldValue) 836 == BPLUSTREE_DUPLICATE_FRAGMENT) { 837 bplustree_node* duplicate 838 = cachedDuplicate.SetToWritable(transaction, 839 bplustree_node::FragmentOffset(oldValue), false); 840 if (duplicate == NULL) 841 return B_IO_ERROR; 842 843 duplicate_array* array = duplicate->FragmentAt( 844 bplustree_node::FragmentIndex(oldValue)); 845 if (array->count > NUM_FRAGMENT_VALUES 846 || array->count < 1) { 847 FATAL(("insertDuplicate: Invalid array[%d] size in fragment " 848 "%" B_PRIdOFF " == %" B_PRIdOFF "!\n", 849 (int)bplustree_node::FragmentIndex(oldValue), 850 
bplustree_node::FragmentOffset(oldValue), array->count)); 851 return B_BAD_DATA; 852 } 853 854 if (array->count < NUM_FRAGMENT_VALUES) { 855 array->Insert(value); 856 } else { 857 // Test if the fragment will be empty if we remove this key's 858 // values 859 if (duplicate->FragmentsUsed(fNodeSize) < 2) { 860 // The node will be empty without our values, so let us 861 // reuse it as a duplicate node 862 offset = bplustree_node::FragmentOffset(oldValue); 863 864 memmove(duplicate->DuplicateArray(), array, 865 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 866 duplicate->left_link = duplicate->right_link 867 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 868 869 array = duplicate->DuplicateArray(); 870 array->Insert(value); 871 } else { 872 // Create a new duplicate node 873 874 cachedDuplicate.UnsetUnchanged(transaction); 875 // The old duplicate has not been touched, so we can 876 // reuse it 877 878 bplustree_node* newDuplicate; 879 status = cachedDuplicate.Allocate(transaction, 880 &newDuplicate, &offset); 881 if (status < B_OK) 882 RETURN_ERROR(status); 883 884 // Copy the array from the fragment node to the duplicate 885 // node and free the old entry (by zero'ing all values) 886 newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 887 array->count); 888 memcpy(&newDuplicate->all_key_count, &array->values[0], 889 array->count * sizeof(off_t)); 890 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 891 892 array = newDuplicate->DuplicateArray(); 893 array->Insert(value); 894 } 895 896 // Update the main pointer to link to a duplicate node 897 if (cached.MakeWritable(transaction) == NULL) 898 return B_IO_ERROR; 899 900 values[index] 901 = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 902 BPLUSTREE_DUPLICATE_NODE, offset)); 903 } 904 905 return B_OK; 906 } 907 908 // Put the value into a dedicated duplicate node 909 910 // search for free space in the duplicate nodes of that key 911 duplicate_array* array; 912 const bplustree_node* duplicate; 
913 off_t duplicateOffset; 914 do { 915 duplicateOffset = bplustree_node::FragmentOffset(oldValue); 916 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 917 if (duplicate == NULL) 918 return B_IO_ERROR; 919 920 array = duplicate->DuplicateArray(); 921 if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) { 922 FATAL(("removeDuplicate: Invalid array size in duplicate %" 923 B_PRIdOFF " == %" B_PRIdOFF "!\n", duplicateOffset, 924 array->count)); 925 return B_BAD_DATA; 926 } 927 } while (array->count >= NUM_DUPLICATE_VALUES 928 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); 929 930 bplustree_node* writableDuplicate 931 = cachedDuplicate.MakeWritable(transaction); 932 if (writableDuplicate == NULL) 933 return B_IO_ERROR; 934 935 if (array->count < NUM_DUPLICATE_VALUES) { 936 array = writableDuplicate->DuplicateArray(); 937 array->Insert(value); 938 } else { 939 // no space left - add a new duplicate node 940 941 bplustree_node* newDuplicate; 942 status = cachedDuplicate.Allocate(transaction, &newDuplicate, 943 &offset); 944 if (status != B_OK) 945 RETURN_ERROR(status); 946 947 // link the two nodes together 948 writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); 949 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); 950 951 array = newDuplicate->DuplicateArray(); 952 array->count = 0; 953 array->Insert(value); 954 } 955 return B_OK; 956 } 957 958 // Search for a free duplicate fragment or create a new one 959 // to insert the duplicate value into 960 961 uint32 fragmentIndex = 0; 962 bplustree_node* fragment; 963 if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate, 964 &offset, &fragment, &fragmentIndex) != B_OK) { 965 // allocate a new duplicate fragment node 966 status = cachedDuplicate.Allocate(transaction, &fragment, &offset); 967 if (status != B_OK) 968 RETURN_ERROR(status); 969 970 memset(fragment, 0, fNodeSize); 971 } 972 973 duplicate_array* array = fragment->FragmentAt(fragmentIndex); 
974 array->Insert(oldValue); 975 array->Insert(value); 976 977 if (cached.MakeWritable(transaction) == NULL) 978 return B_IO_ERROR; 979 980 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 981 BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex)); 982 983 return B_OK; 984 } 985 986 987 void 988 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key, 989 uint16 keyLength, off_t value) 990 { 991 // should never happen, but who knows? 992 if (index > node->NumKeys()) 993 return; 994 995 off_t* values = node->Values(); 996 uint16* keyLengths = node->KeyLengths(); 997 uint8* keys = node->Keys(); 998 999 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); 1000 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() 1001 + keyLength); 1002 1003 off_t* newValues = node->Values(); 1004 uint16* newKeyLengths = node->KeyLengths(); 1005 1006 // move values and copy new value into them 1007 memmove(newValues + index + 1, values + index, 1008 sizeof(off_t) * (node->NumKeys() - 1 - index)); 1009 memmove(newValues, values, sizeof(off_t) * index); 1010 1011 newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value); 1012 1013 // move and update key length index 1014 for (uint16 i = node->NumKeys(); i-- > index + 1;) { 1015 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1016 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength); 1017 } 1018 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); 1019 1020 int32 keyStart; 1021 newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength 1022 + (keyStart = index > 0 1023 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0)); 1024 1025 // move keys and copy new key into them 1026 uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]); 1027 int32 size = node->AllKeyLength() - length; 1028 if (size > 0) 1029 memmove(keys + length, keys + length - keyLength, size); 1030 1031 memcpy(keys + keyStart, key, keyLength); 1032 } 1033 1034 1035 /*! 
Splits the \a node into two halves - the other half will be put into 1036 \a other. It also takes care to create a new overflow link if the node 1037 to split is an index node. 1038 */ 1039 status_t 1040 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset, 1041 bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key, 1042 uint16* _keyLength, off_t* _value) 1043 { 1044 if (*_keyIndex > node->NumKeys() + 1) 1045 return B_BAD_VALUE; 1046 1047 uint16* inKeyLengths = node->KeyLengths(); 1048 off_t* inKeyValues = node->Values(); 1049 uint8* inKeys = node->Keys(); 1050 uint8* outKeys = other->Keys(); 1051 int32 keyIndex = *_keyIndex; // can become less than zero! 1052 1053 if (keyIndex > node->NumKeys()) { 1054 FATAL(("key index out of bounds: %d, num keys: %u\n", (int)keyIndex, 1055 node->NumKeys())); 1056 return B_BAD_VALUE; 1057 } 1058 1059 // how many keys will fit in one (half) page? 1060 // that loop will find the answer to this question and 1061 // change the key lengths indices for their new home 1062 1063 // "bytes" is the number of bytes written for the new key, 1064 // "bytesBefore" are the bytes before that key 1065 // "bytesAfter" are the bytes after the new key, if any 1066 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 1067 1068 size_t size = fNodeSize >> 1; 1069 int32 out, in; 1070 for (in = out = 0; in < node->NumKeys() + 1;) { 1071 if (!bytes) { 1072 bytesBefore = in > 0 1073 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1074 } 1075 1076 if (in == keyIndex && !bytes) { 1077 bytes = *_keyLength; 1078 } else { 1079 if (keyIndex < out) { 1080 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1081 - bytesBefore; 1082 } 1083 1084 in++; 1085 } 1086 out++; 1087 1088 if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) 1089 + out * (sizeof(uint16) + sizeof(off_t)) >= size) { 1090 // we have found the number of keys in the new node! 
1091 break; 1092 } 1093 } 1094 1095 // if the new key was not inserted, set the length of the keys 1096 // that can be copied directly 1097 if (keyIndex >= out && in > 0) 1098 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]); 1099 1100 if (bytesBefore < 0 || bytesAfter < 0) 1101 return B_BAD_DATA; 1102 1103 other->left_link = node->left_link; 1104 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 1105 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1106 + bytesAfter); 1107 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1108 1109 uint16* outKeyLengths = other->KeyLengths(); 1110 off_t* outKeyValues = other->Values(); 1111 int32 keys = out > keyIndex ? keyIndex : out; 1112 1113 if (bytesBefore) { 1114 // copy the keys 1115 memcpy(outKeys, inKeys, bytesBefore); 1116 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 1117 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 1118 } 1119 if (bytes) { 1120 // copy the newly inserted key 1121 memcpy(outKeys + bytesBefore, key, bytes); 1122 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1123 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1124 1125 if (bytesAfter) { 1126 // copy the keys after the new key 1127 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, 1128 bytesAfter); 1129 keys = out - keyIndex - 1; 1130 for (int32 i = 0;i < keys;i++) { 1131 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16( 1132 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) 1133 + bytes); 1134 } 1135 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, 1136 keys * sizeof(off_t)); 1137 } 1138 } 1139 1140 // if the new key was already inserted, we shouldn't use it again 1141 if (in != out) 1142 keyIndex--; 1143 1144 int32 total = bytesBefore + bytesAfter; 1145 1146 // these variables are for the key that will be returned 1147 // to the parent node 1148 uint8* newKey = NULL; 1149 uint16 newLength; 1150 bool newAllocated 
= false; 1151 1152 // If we have split an index node, we have to drop the first key 1153 // of the next node (which can also be the new key to insert). 1154 // The dropped key is also the one which has to be inserted in 1155 // the parent node, so we will set the "newKey" already here. 1156 if (node->OverflowLink() != BPLUSTREE_NULL) { 1157 if (in == keyIndex) { 1158 newKey = key; 1159 newLength = *_keyLength; 1160 1161 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 1162 keyIndex--; 1163 } else { 1164 // If a key is dropped (is not the new key), we have to copy 1165 // it, because it would be lost if not. 1166 uint8* droppedKey = node->KeyAt(in, &newLength); 1167 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) 1168 > (uint8*)node + fNodeSize 1169 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 1170 fStream->GetVolume()->Panic(); 1171 RETURN_ERROR(B_BAD_DATA); 1172 } 1173 newKey = (uint8*)malloc(newLength); 1174 if (newKey == NULL) 1175 return B_NO_MEMORY; 1176 1177 newAllocated = true; 1178 memcpy(newKey, droppedKey, newLength); 1179 1180 other->overflow_link = inKeyValues[in]; 1181 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]); 1182 } 1183 } 1184 1185 // and now the same game for the other page and the rest of the keys 1186 // (but with memmove() instead of memcpy(), because they may overlap) 1187 1188 bytesBefore = bytesAfter = bytes = 0; 1189 out = 0; 1190 int32 skip = in; 1191 while (in < node->NumKeys() + 1) { 1192 if (in == keyIndex && !bytes) { 1193 // it's enough to set bytesBefore once here, because we do 1194 // not need to know the exact length of all keys in this 1195 // loop 1196 bytesBefore = in > skip 1197 ? 
BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0; 1198 bytes = *_keyLength; 1199 out++; 1200 } else { 1201 if (in < node->NumKeys()) { 1202 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1203 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total); 1204 1205 if (bytes) { 1206 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16( 1207 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes); 1208 1209 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) 1210 - bytesBefore - bytes; 1211 } 1212 out++; 1213 } 1214 in++; 1215 } 1216 } 1217 1218 // adjust the byte counts (since we were a bit lazy in the loop) 1219 if (keyIndex < skip) 1220 bytesBefore = node->AllKeyLength() - total; 1221 1222 if (bytesBefore < 0 || bytesAfter < 0) { 1223 if (newAllocated) 1224 free(newKey); 1225 return B_BAD_DATA; 1226 } 1227 1228 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1229 // right link, and overflow link can stay the same 1230 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore 1231 + bytesAfter); 1232 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 1233 1234 // array positions have changed 1235 outKeyLengths = node->KeyLengths(); 1236 outKeyValues = node->Values(); 1237 1238 // move the keys in the old node: the order is important here, 1239 // because we don't want to overwrite any contents 1240 1241 keys = keyIndex <= skip ? out : keyIndex - skip; 1242 keyIndex -= skip; 1243 in = out - keyIndex - 1; 1244 // Note: keyIndex and in will contain invalid values when the new key 1245 // went to the other node. But in this case bytes and bytesAfter are 1246 // 0 and subsequently we never use keyIndex and in. 
1247 1248 if (bytesBefore) 1249 memmove(inKeys, inKeys + total, bytesBefore); 1250 if (bytesAfter) { 1251 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, 1252 bytesAfter); 1253 } 1254 1255 if (bytesBefore) 1256 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1257 if (bytesAfter) { 1258 // if byteAfter is > 0, keyIndex is larger than skip 1259 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, 1260 in * sizeof(uint16)); 1261 } 1262 1263 if (bytesBefore) 1264 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1265 if (bytesAfter) { 1266 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, 1267 in * sizeof(off_t)); 1268 } 1269 1270 if (bytes) { 1271 // finally, copy the newly inserted key (don't overwrite anything) 1272 memcpy(inKeys + bytesBefore, key, bytes); 1273 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore); 1274 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value); 1275 } 1276 1277 // Prepare the key that will be inserted in the parent node which 1278 // is either the dropped key or the last of the other node. 1279 // If it's the dropped key, "newKey" was already set earlier. 1280 1281 if (newKey == NULL) 1282 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1283 1284 memcpy(key, newKey, newLength); 1285 *_keyLength = newLength; 1286 *_value = otherOffset; 1287 1288 if (newAllocated) 1289 free(newKey); 1290 1291 return B_OK; 1292 } 1293 1294 1295 /*! This inserts a key into the tree. The changes made to the tree will 1296 all be part of the \a transaction. 1297 You need to have the inode write locked. 
1298 */ 1299 status_t 1300 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength, 1301 off_t value) 1302 { 1303 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1304 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1305 RETURN_ERROR(B_BAD_VALUE); 1306 #ifdef DEBUG 1307 if (value < 0) 1308 panic("tried to insert invalid value %Ld!\n", value); 1309 #endif 1310 1311 ASSERT_WRITE_LOCKED_INODE(fStream); 1312 1313 Stack<node_and_key> stack; 1314 if (_SeekDown(stack, key, keyLength) != B_OK) 1315 RETURN_ERROR(B_ERROR); 1316 1317 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 1318 1319 memcpy(keyBuffer, key, keyLength); 1320 keyBuffer[keyLength] = 0; 1321 1322 node_and_key nodeAndKey; 1323 const bplustree_node* node; 1324 1325 CachedNode cached(this); 1326 while (stack.Pop(&nodeAndKey) 1327 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1328 #ifdef DEBUG 1329 NodeChecker checker(node, fNodeSize, "insert"); 1330 #endif 1331 if (node->IsLeaf()) { 1332 // first round, check for duplicate entries 1333 status_t status = _FindKey(node, key, keyLength, 1334 &nodeAndKey.keyIndex); 1335 1336 // is this a duplicate entry? 1337 if (status == B_OK) { 1338 if (fAllowDuplicates) { 1339 status = _InsertDuplicate(transaction, cached, node, 1340 nodeAndKey.keyIndex, value); 1341 if (status != B_OK) 1342 RETURN_ERROR(status); 1343 return B_OK; 1344 } 1345 1346 return B_NAME_IN_USE; 1347 } 1348 } 1349 1350 bplustree_node* writableNode = cached.MakeWritable(transaction); 1351 if (writableNode == NULL) 1352 return B_IO_ERROR; 1353 1354 // is the node big enough to hold the pair? 
1355 if (int32(key_align(sizeof(bplustree_node) 1356 + writableNode->AllKeyLength() + keyLength) 1357 + (writableNode->NumKeys() + 1) * (sizeof(uint16) 1358 + sizeof(off_t))) < fNodeSize) { 1359 _InsertKey(writableNode, nodeAndKey.keyIndex, 1360 keyBuffer, keyLength, value); 1361 _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, 1362 nodeAndKey.keyIndex, 0, 1); 1363 1364 return B_OK; 1365 } else { 1366 CachedNode cachedNewRoot(this); 1367 CachedNode cachedOther(this); 1368 1369 // do we need to allocate a new root node? if so, then do 1370 // it now 1371 off_t newRoot = BPLUSTREE_NULL; 1372 if (nodeAndKey.nodeOffset == fHeader.RootNode()) { 1373 bplustree_node* root; 1374 status_t status = cachedNewRoot.Allocate(transaction, &root, 1375 &newRoot); 1376 if (status < B_OK) { 1377 // The tree is most likely corrupted! 1378 // But it's still sane at leaf level - we could set 1379 // a flag in the header that forces the tree to be 1380 // rebuild next time... 1381 // But since we will have journaling, that's not a big 1382 // problem anyway. 
1383 RETURN_ERROR(status); 1384 } 1385 } 1386 1387 // reserve space for the other node 1388 bplustree_node* other; 1389 off_t otherOffset; 1390 status_t status = cachedOther.Allocate(transaction, &other, 1391 &otherOffset); 1392 if (status < B_OK) { 1393 cachedNewRoot.Free(transaction, newRoot); 1394 RETURN_ERROR(status); 1395 } 1396 1397 if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other, 1398 otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength, 1399 &value) < B_OK) { 1400 // free root node & other node here 1401 cachedOther.Free(transaction, otherOffset); 1402 cachedNewRoot.Free(transaction, newRoot); 1403 1404 RETURN_ERROR(B_ERROR); 1405 } 1406 #ifdef DEBUG 1407 checker.Check("insert split"); 1408 NodeChecker otherChecker(other, fNodeSize, "insert split other"); 1409 #endif 1410 1411 _UpdateIterators(nodeAndKey.nodeOffset, otherOffset, 1412 nodeAndKey.keyIndex, writableNode->NumKeys(), 1); 1413 1414 // update the right link of the node in the left of the new node 1415 if ((other = cachedOther.SetToWritable(transaction, 1416 other->LeftLink())) != NULL) { 1417 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1418 } 1419 1420 // create a new root if necessary 1421 if (newRoot != BPLUSTREE_NULL) { 1422 bplustree_node* root = cachedNewRoot.Node(); 1423 1424 _InsertKey(root, 0, keyBuffer, keyLength, 1425 writableNode->LeftLink()); 1426 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64( 1427 nodeAndKey.nodeOffset); 1428 1429 CachedNode cached(this); 1430 bplustree_header* header 1431 = cached.SetToWritableHeader(transaction); 1432 if (header == NULL) 1433 return B_IO_ERROR; 1434 1435 // finally, update header to point to the new root 1436 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1437 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32( 1438 header->MaxNumberOfLevels() + 1); 1439 1440 return B_OK; 1441 } 1442 } 1443 } 1444 RETURN_ERROR(B_ERROR); 1445 } 1446 1447 1448 /*! Removes the duplicate index/value pair from the tree. 
1449 It's part of the private tree interface. 1450 */ 1451 status_t 1452 BPlusTree::_RemoveDuplicate(Transaction& transaction, 1453 const bplustree_node* node, CachedNode& cached, uint16 index, 1454 off_t value) 1455 { 1456 off_t* values = node->Values(); 1457 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]); 1458 1459 CachedNode cachedDuplicate(this); 1460 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1461 bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction, 1462 duplicateOffset, false); 1463 if (duplicate == NULL) 1464 return B_IO_ERROR; 1465 1466 // if it's a duplicate fragment, remove the entry from there 1467 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1468 duplicate_array* array = duplicate->FragmentAt( 1469 bplustree_node::FragmentIndex(oldValue)); 1470 1471 if (array->count > NUM_FRAGMENT_VALUES 1472 || array->count < 1) { 1473 FATAL(("removeDuplicate: Invalid array[%d] size in fragment %" 1474 B_PRIdOFF " == %" B_PRIdOFF "!\n", 1475 (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, 1476 array->count)); 1477 return B_BAD_DATA; 1478 } 1479 if (!array->Remove(value)) { 1480 FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node " 1481 "%" B_PRIdOFF "...\n", value, duplicateOffset)); 1482 return B_ENTRY_NOT_FOUND; 1483 } 1484 1485 // remove the array from the fragment node if it is empty 1486 if (array->count == 1) { 1487 // set the link to the remaining value 1488 if (cached.MakeWritable(transaction) == NULL) 1489 return B_IO_ERROR; 1490 1491 values[index] = array->values[0]; 1492 1493 // Remove the whole fragment node, if this was the only array, 1494 // otherwise free just the array 1495 if (duplicate->FragmentsUsed(fNodeSize) == 1) { 1496 status_t status = cachedDuplicate.Free(transaction, 1497 duplicateOffset); 1498 if (status < B_OK) 1499 return status; 1500 } else 1501 array->count = 0; 1502 } 1503 return B_OK; 1504 } 1505 1506 // Remove value from a 
duplicate node! 1507 1508 duplicate_array* array = NULL; 1509 1510 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1511 FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF 1512 "!\n", duplicate->LeftLink())); 1513 return B_BAD_DATA; 1514 } 1515 1516 // Search the duplicate nodes until the entry could be found (and removed) 1517 while (duplicate != NULL) { 1518 array = duplicate->DuplicateArray(); 1519 if (array->count > NUM_DUPLICATE_VALUES 1520 || array->count < 0) { 1521 FATAL(("removeDuplicate: Invalid array size in duplicate %" 1522 B_PRIdOFF " == %" B_PRIdOFF "!\n", duplicateOffset, 1523 array->count)); 1524 return B_BAD_DATA; 1525 } 1526 1527 if (array->Remove(value)) 1528 break; 1529 1530 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1531 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1532 1533 cachedDuplicate.UnsetUnchanged(transaction); 1534 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, 1535 false); 1536 } 1537 if (duplicate == NULL) 1538 RETURN_ERROR(B_IO_ERROR); 1539 1540 // The entry got removed from the duplicate node, but we might want to free 1541 // it now in case it's empty 1542 1543 while (true) { 1544 off_t left = duplicate->LeftLink(); 1545 off_t right = duplicate->RightLink(); 1546 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1547 1548 if ((isLast && array->count == 1) || array->count == 0) { 1549 // Free empty duplicate page, link their siblings together, and 1550 // update the duplicate link if needed (ie. 
when we either remove 1551 // the last duplicate node or have a new first one) 1552 1553 if (left == BPLUSTREE_NULL) { 1554 // the duplicate link points to us 1555 if (cached.MakeWritable(transaction) == NULL) 1556 return B_IO_ERROR; 1557 1558 if (array->count == 1) { 1559 // This is the last node, and there is only one value left; 1560 // replace the duplicate link with that value, it's no 1561 // duplicate anymore 1562 values[index] = array->values[0]; 1563 } else { 1564 // Move the duplicate link to the next node 1565 values[index] = HOST_ENDIAN_TO_BFS_INT64( 1566 bplustree_node::MakeLink( 1567 BPLUSTREE_DUPLICATE_NODE, right)); 1568 } 1569 } 1570 1571 status_t status = cachedDuplicate.Free(transaction, 1572 duplicateOffset); 1573 if (status < B_OK) 1574 return status; 1575 1576 if (left != BPLUSTREE_NULL 1577 && (duplicate = cachedDuplicate.SetToWritable(transaction, left, 1578 false)) != NULL) { 1579 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1580 1581 // If the next node is the last node, we need to free that node 1582 // and convert the duplicate entry back into a normal entry 1583 array = duplicate->DuplicateArray(); 1584 if (right == BPLUSTREE_NULL 1585 && duplicate->LeftLink() == BPLUSTREE_NULL 1586 && array->count <= NUM_FRAGMENT_VALUES) { 1587 duplicateOffset = left; 1588 continue; 1589 } 1590 } 1591 if (right != BPLUSTREE_NULL 1592 && (duplicate = cachedDuplicate.SetToWritable(transaction, 1593 right, false)) != NULL) { 1594 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1595 1596 // Again, we may need to turn the duplicate entry back into a 1597 // normal entry 1598 array = duplicate->DuplicateArray(); 1599 if (left == BPLUSTREE_NULL 1600 && duplicate->RightLink() == BPLUSTREE_NULL 1601 && array->count <= NUM_FRAGMENT_VALUES) { 1602 duplicateOffset = right; 1603 continue; 1604 } 1605 } 1606 return B_OK; 1607 } else if (isLast && array->count <= NUM_FRAGMENT_VALUES) { 1608 // If the number of entries fits in a duplicate fragment, 
then 1609 // either find a free fragment node, or convert this node to a 1610 // fragment node. 1611 CachedNode cachedOther(this); 1612 1613 bplustree_node* fragment = NULL; 1614 uint32 fragmentIndex = 0; 1615 off_t offset; 1616 if (_FindFreeDuplicateFragment(transaction, node, cachedOther, 1617 &offset, &fragment, &fragmentIndex) == B_OK) { 1618 // move to other node 1619 duplicate_array* target = fragment->FragmentAt(fragmentIndex); 1620 memcpy(target, array, 1621 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1622 1623 cachedDuplicate.Free(transaction, duplicateOffset); 1624 duplicateOffset = offset; 1625 } else { 1626 // convert node 1627 memmove(duplicate, array, 1628 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1629 memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 1630 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1631 } 1632 1633 if (cached.MakeWritable(transaction) == NULL) 1634 return B_IO_ERROR; 1635 1636 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink( 1637 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex)); 1638 } 1639 return B_OK; 1640 } 1641 } 1642 1643 1644 /*! Removes the key with the given index from the specified node. 1645 Since it has to get the key from the node anyway (to obtain it's 1646 pointer), it's not needed to pass the key & its length, although 1647 the calling method (BPlusTree::Remove()) have this data. 1648 */ 1649 void 1650 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index) 1651 { 1652 // should never happen, but who knows? 
1653 if (index > node->NumKeys() && node->NumKeys() > 0) { 1654 FATAL(("Asked me to remove key outer limits: %u\n", index)); 1655 return; 1656 } 1657 1658 off_t* values = node->Values(); 1659 1660 // if we would have to drop the overflow link, drop 1661 // the last key instead and update the overflow link 1662 // to the value of that one 1663 if (!node->IsLeaf() && index == node->NumKeys()) 1664 node->overflow_link = values[--index]; 1665 1666 uint16 length; 1667 uint8* key = node->KeyAt(index, &length); 1668 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize 1669 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1670 FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length, 1671 (int)fStream->BlockRun().allocation_group, 1672 fStream->BlockRun().start)); 1673 fStream->GetVolume()->Panic(); 1674 return; 1675 } 1676 1677 uint16* keyLengths = node->KeyLengths(); 1678 uint8* keys = node->Keys(); 1679 1680 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 1681 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64( 1682 node->AllKeyLength() - length); 1683 1684 off_t* newValues = node->Values(); 1685 uint16* newKeyLengths = node->KeyLengths(); 1686 1687 // move key data 1688 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 1689 1690 // move and update key lengths 1691 if (index > 0 && newKeyLengths != keyLengths) 1692 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 1693 for (uint16 i = index; i < node->NumKeys(); i++) { 1694 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16( 1695 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length); 1696 } 1697 1698 // move values 1699 if (index > 0) 1700 memmove(newValues, values, index * sizeof(off_t)); 1701 if (node->NumKeys() > index) { 1702 memmove(newValues + index, values + index + 1, 1703 (node->NumKeys() - index) * sizeof(off_t)); 1704 } 1705 } 1706 1707 1708 /*! Removes the specified key from the tree. 
The "value" parameter is only used 1709 for trees which allow duplicates, so you may safely ignore it. 1710 It's not an optional parameter, so at least you have to think about it. 1711 You need to have the inode write locked. 1712 */ 1713 status_t 1714 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength, 1715 off_t value) 1716 { 1717 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1718 || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1719 RETURN_ERROR(B_BAD_VALUE); 1720 1721 ASSERT_WRITE_LOCKED_INODE(fStream); 1722 1723 Stack<node_and_key> stack; 1724 if (_SeekDown(stack, key, keyLength) != B_OK) 1725 RETURN_ERROR(B_ERROR); 1726 1727 node_and_key nodeAndKey; 1728 const bplustree_node* node; 1729 1730 CachedNode cached(this); 1731 while (stack.Pop(&nodeAndKey) 1732 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1733 #ifdef DEBUG 1734 NodeChecker checker(node, fNodeSize, "remove"); 1735 #endif 1736 if (node->IsLeaf()) { 1737 // first round, check for duplicate entries 1738 status_t status = _FindKey(node, key, keyLength, 1739 &nodeAndKey.keyIndex); 1740 if (status != B_OK) 1741 RETURN_ERROR(status); 1742 1743 // Is this a duplicate entry? 1744 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64( 1745 node->Values()[nodeAndKey.keyIndex]))) { 1746 if (fAllowDuplicates) { 1747 return _RemoveDuplicate(transaction, node, cached, 1748 nodeAndKey.keyIndex, value); 1749 } 1750 1751 FATAL(("dupliate node found where no duplicates are " 1752 "allowed!\n")); 1753 RETURN_ERROR(B_ERROR); 1754 } else { 1755 if (node->Values()[nodeAndKey.keyIndex] != value) 1756 return B_ENTRY_NOT_FOUND; 1757 1758 // If we will remove the last key, the iterator will be set 1759 // to the next node after the current - if there aren't any 1760 // more nodes, we need a way to prevent the TreeIterators to 1761 // touch the old node again, we use BPLUSTREE_FREE for this 1762 off_t next = node->RightLink() == BPLUSTREE_NULL 1763 ? 
BPLUSTREE_FREE : node->RightLink(); 1764 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 1765 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 1766 } 1767 } 1768 1769 bplustree_node* writableNode = cached.MakeWritable(transaction); 1770 if (writableNode == NULL) 1771 return B_IO_ERROR; 1772 1773 // if it's an empty root node, we have to convert it 1774 // to a leaf node by dropping the overflow link, or, 1775 // if it's already a leaf node, just empty it 1776 if (nodeAndKey.nodeOffset == fHeader.RootNode() 1777 && (node->NumKeys() == 0 1778 || (node->NumKeys() == 1 && node->IsLeaf()))) { 1779 writableNode->overflow_link 1780 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1781 writableNode->all_key_count = 0; 1782 writableNode->all_key_length = 0; 1783 1784 // if we've made a leaf node out of the root node, we need 1785 // to reset the maximum number of levels in the header 1786 if (fHeader.MaxNumberOfLevels() != 1) { 1787 CachedNode cached(this); 1788 bplustree_header* header 1789 = cached.SetToWritableHeader(transaction); 1790 if (header == NULL) 1791 return B_IO_ERROR; 1792 1793 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 1794 } 1795 return B_OK; 1796 } 1797 1798 // if there is only one key left, we don't have to remove 1799 // it, we can just dump the node (index nodes still have 1800 // the overflow link, so we have to drop the last key) 1801 if (writableNode->NumKeys() > 1 1802 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) { 1803 _RemoveKey(writableNode, nodeAndKey.keyIndex); 1804 return B_OK; 1805 } 1806 1807 // when we are here, we can just free the node, but 1808 // we have to update the right/left link of the 1809 // siblings first 1810 CachedNode otherCached(this); 1811 bplustree_node* other = otherCached.SetToWritable(transaction, 1812 writableNode->LeftLink()); 1813 if (other != NULL) 1814 other->right_link = writableNode->right_link; 1815 1816 if ((other = otherCached.SetToWritable(transaction, 
node->RightLink())) 1817 != NULL) { 1818 other->left_link = writableNode->left_link; 1819 } 1820 1821 cached.Free(transaction, nodeAndKey.nodeOffset); 1822 } 1823 RETURN_ERROR(B_ERROR); 1824 } 1825 1826 1827 /*! Replaces the value for the key in the tree. 1828 Returns B_OK if the key could be found and its value replaced, 1829 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 1830 to indicate that something went terribly wrong. 1831 Note that this doesn't work with duplicates - it will just 1832 return B_BAD_TYPE if you call this function on a tree where 1833 duplicates are allowed. 1834 You need to have the inode write locked. 1835 */ 1836 status_t 1837 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength, 1838 off_t value) 1839 { 1840 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1841 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1842 || key == NULL) 1843 RETURN_ERROR(B_BAD_VALUE); 1844 1845 if (fAllowDuplicates) 1846 RETURN_ERROR(B_BAD_TYPE); 1847 1848 ASSERT_WRITE_LOCKED_INODE(fStream); 1849 1850 off_t nodeOffset = fHeader.RootNode(); 1851 CachedNode cached(this); 1852 const bplustree_node* node; 1853 1854 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1855 uint16 keyIndex = 0; 1856 off_t nextOffset; 1857 status_t status = _FindKey(node, key, keyLength, &keyIndex, 1858 &nextOffset); 1859 1860 if (node->OverflowLink() == BPLUSTREE_NULL) { 1861 if (status == B_OK) { 1862 bplustree_node* writableNode = cached.MakeWritable(transaction); 1863 if (writableNode != NULL) { 1864 writableNode->Values()[keyIndex] 1865 = HOST_ENDIAN_TO_BFS_INT64(value); 1866 } else 1867 status = B_IO_ERROR; 1868 } 1869 1870 return status; 1871 } else if (nextOffset == nodeOffset) 1872 RETURN_ERROR(B_ERROR); 1873 1874 nodeOffset = nextOffset; 1875 } 1876 RETURN_ERROR(B_ERROR); 1877 } 1878 1879 1880 /*! Searches the key in the tree, and stores the offset found in _value, 1881 if successful. 
1882 It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack 1883 while it descends the tree. 1884 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not. 1885 It can also return other errors to indicate that something went wrong. 1886 Note that this doesn't work with duplicates - it will just return 1887 B_BAD_TYPE if you call this function on a tree where duplicates are 1888 allowed. 1889 You need to have the inode read or write locked. 1890 */ 1891 status_t 1892 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value) 1893 { 1894 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH 1895 || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1896 || key == NULL) 1897 RETURN_ERROR(B_BAD_VALUE); 1898 1899 if (fAllowDuplicates) 1900 RETURN_ERROR(B_BAD_TYPE); 1901 1902 ASSERT_READ_LOCKED_INODE(fStream); 1903 1904 off_t nodeOffset = fHeader.RootNode(); 1905 CachedNode cached(this); 1906 const bplustree_node* node; 1907 1908 #ifdef DEBUG 1909 int32 levels = 0; 1910 #endif 1911 1912 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1913 uint16 keyIndex = 0; 1914 off_t nextOffset; 1915 status_t status = _FindKey(node, key, keyLength, &keyIndex, 1916 &nextOffset); 1917 1918 #ifdef DEBUG 1919 levels++; 1920 #endif 1921 if (node->OverflowLink() == BPLUSTREE_NULL) { 1922 if (status == B_OK && _value != NULL) 1923 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 1924 1925 #ifdef DEBUG 1926 if (levels != (int32)fHeader.MaxNumberOfLevels()) 1927 DEBUGGER(("levels don't match")); 1928 #endif 1929 return status; 1930 } else if (nextOffset == nodeOffset) 1931 RETURN_ERROR(B_ERROR); 1932 1933 nodeOffset = nextOffset; 1934 } 1935 FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded\n", nodeOffset)); 1936 RETURN_ERROR(B_ERROR); 1937 } 1938 1939 1940 // #pragma mark - 1941 1942 1943 TreeIterator::TreeIterator(BPlusTree* tree) 1944 : 1945 fTree(tree), 1946 fCurrentNodeOffset(BPLUSTREE_NULL) 1947 { 1948 tree->_AddIterator(this); 1949 } 1950 1951 1952 
TreeIterator::~TreeIterator()
{
	// fTree may have been cleared if the tree went away before us
	if (fTree)
		fTree->_RemoveIterator(this);
}


/*! Positions the iterator at one end of the tree, either before the first
	key (\a to == BPLUSTREE_BEGIN) or after the last one
	(\a to == BPLUSTREE_END), by descending from the root to the outermost
	leaf in that direction.
*/
status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	off_t nodeOffset = fTree->fHeader.RootNode();
	CachedNode cached(fTree);
	const bplustree_node* node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			fCurrentNodeOffset = nodeOffset;
			// -1/NumKeys() sit just outside the node, so the next
			// Traverse() step lands on the first/last key
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			// sanity check the on-disk values before dereferencing them
			// (8 bytes per value slot, i.e. sizeof(off_t))
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
					- 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		// self-referencing nodes indicate a corrupted tree
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}


/*! Iterates through the tree in the specified direction.
	When it iterates through duplicates, the "key" is only updated for the
	first entry - if you need to know when this happens, use the "duplicate"
	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
	the other duplicates.
	That's not too nice, but saves the 256 bytes that would be needed to
	store the last key - if this will ever become an issue, it will be
	easy to change.
	The other advantage of this is, that the queries can skip all duplicates
	at once when they are not relevant to them.
*/
status_t
TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
	uint16 maxLength, off_t* value, uint16* duplicate)
{
	// fTree is cleared when the tree goes away while iterators are active
	if (fTree == NULL)
		return B_INTERRUPTED;

	bool forward = direction == BPLUSTREE_FORWARD;

	// start at the matching end of the tree on the first call
	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
		RETURN_ERROR(B_ERROR);

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	CachedNode cached(fTree);
	const bplustree_node* node;

	// If we are in the middle of a duplicate run, deliver the next
	// duplicate value before advancing to the next key
	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented
		// in the same order; since they are all considered as equal, this
		// shouldn't cause any problems

		if (!fIsFragment || fDuplicate < fNumDuplicates) {
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
				false);
		} else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// If the node is out of duplicates, we go directly to the next
				// one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
						false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
					fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		// the duplicate run is exhausted; fall through to normal iteration
		fDuplicateNode = BPLUSTREE_NULL;
	}

	// remember the position so we can restore it if we hit the end
	off_t savedNodeOffset = fCurrentNodeOffset;
	int32 savedKey = fCurrentKey;

	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	// BPLUSTREE_FORWARD/BACKWARD are +1/-1 respectively
	fCurrentKey += direction;

	// is the current key in the current node?
	while ((forward && fCurrentKey >= node->NumKeys())
		|| (!forward && fCurrentKey < 0)) {
		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			fCurrentKey = forward ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = savedKey;

			return B_ENTRY_NOT_FOUND;
		}
	}

	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ?

	// sanity check the key before copying it out (we may be reading a
	// corrupted disk)
	uint16 length;
	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
			> (uint8*)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);

	if (fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE) {
		// terminate string type
		if (length == maxLength)
			length--;
		((char*)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
		|| type == BPLUSTREE_DUPLICATE_NODE) {
		// the value is a link to a duplicate fragment/node - set up the
		// duplicate iteration state and return the first duplicate value
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
			false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// Shouldn't happen, but we're dealing here with potentially
			// corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}


/*! This is more or less a copy of BPlusTree::Find() - but it just
	sets the current position in the iterator, regardless of if the
	key could be found or not.
*/
status_t
TreeIterator::Find(const uint8* key, uint16 keyLength)
{
	// Stop() resets fTree to NULL; the iteration was aborted
	if (fTree == NULL)
		return B_INTERRUPTED;
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	// descend from the root towards the leaf level
	off_t nodeOffset = fTree->fHeader.RootNode();

	CachedNode cached(fTree);
	const bplustree_node* node;
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
			&nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// a node without an overflow link is a leaf - record the
			// position here; fCurrentKey points just before keyIndex, so
			// the next forward Traverse() returns the key at keyIndex
			// (Traverse() advances fCurrentKey before reading)
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = keyIndex - 1;
			fDuplicateNode = BPLUSTREE_NULL;

			// status is passed through unchanged, whether or not the key
			// was actually found
			return status;
		} else if (nextOffset == nodeOffset) {
			// guard against a corrupted tree where a node links to itself,
			// which would loop forever
			RETURN_ERROR(B_ERROR);
		}

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}


//!	Makes the next Traverse() call skip the remaining duplicates of the key.
void
TreeIterator::SkipDuplicates()
{
	fDuplicateNode = BPLUSTREE_NULL;
}


/*!	Keeps the iterator's position valid across a tree modification.
	Presumably called by BPlusTree when keys are inserted/removed or a node
	is split - TODO confirm against the callers (not visible here).
	\a offset is the node the change happened in (ignored unless it is the
	iterator's current node), \a nextOffset/\a splitAt describe a split,
	and \a change is the key-count delta at \a keyIndex.
*/
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node was split; if our position moved to the new node,
		// follow it and rebase the key indices
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// TODO: duplicate handling!
}


//!	Aborts the iteration; subsequent Traverse()/Find() return B_INTERRUPTED.
void
TreeIterator::Stop()
{
	fTree = NULL;
}


#ifdef DEBUG
//!	Dumps the complete iterator state for debugging purposes.
void
TreeIterator::Dump()
{
	// note: %Ld/%Lx are non-standard (gcc/Haiku) 64-bit length modifiers
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",
		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
		fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif


// #pragma mark -


//!	Resets the node to an empty one: no siblings, no overflow node, no keys.
void
bplustree_node::Initialize()
{
	left_link = right_link = overflow_link
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	all_key_count = 0;
	all_key_length = 0;
}


/*!	Returns a pointer to the \a index'th key of this node, and its length
	in \a keyLength. The key data is a packed array; the start of a key is
	the previous key's cumulative end offset from the key-length array.
*/
uint8*
bplustree_node::KeyAt(int32 index, uint16* keyLength) const
{
	// NOTE(review): "index > NumKeys()" lets index == NumKeys() through,
	// which reads one entry past the key-length array below - looks like
	// an off-by-one; confirm no caller relies on this before tightening.
	if (index < 0 || index > NumKeys())
		return NULL;

	uint8* keyStart = Keys();
	uint16* keyLengths = KeyLengths();

	// key lengths are stored cumulatively; subtract the previous entry to
	// get this key's own length
	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
		- (index != 0 ?
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
	if (index > 0)
		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);

	return keyStart;
}


/*!	Returns the number of duplicate values stored at \a offset. For a whole
	duplicate node the count lives in the overflow-link field; for a fragment
	it is the first off_t slot of that fragment.
*/
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		// the low 10 bits of the offset select the fragment; each fragment
		// occupies NUM_FRAGMENT_VALUES + 1 off_t entries (count + values)
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		// NOTE(review): the count is read without BFS_ENDIAN_TO_HOST
		// conversion - presumably harmless for the small counts stored
		// here on little-endian hosts; verify for big-endian builds.
		return ((off_t*)this)[fragment];
	}
	return OverflowLink();
}


/*!	Returns the \a index'th duplicate value stored at \a offset.
	The "+ 1" below skips the count entry that precedes the values.
*/
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	uint32 start;
	if (isFragment) {
		// 8 == NUM_FRAGMENT_VALUES + 1 hard-coded for 1024 byte nodes
		// (see the note in CountDuplicates() above)
		start = 8 * ((uint64)offset & 0x3ff);
	} else
		start = 2;

	return ((off_t*)this)[start + 1 + index];
}


/*! Although the name suggests it, this function doesn't return the real
	used fragment count; at least, it can only count to two: it returns
	0, if there is no fragment used, 1 if there is only one fragment
	used, and 2 if there are at least 2 fragments used.
2326 */ 2327 uint32 2328 bplustree_node::FragmentsUsed(uint32 nodeSize) const 2329 { 2330 uint32 used = 0; 2331 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) { 2332 duplicate_array* array = FragmentAt(i); 2333 if (array->count > 0 && ++used > 1) 2334 return used; 2335 } 2336 return used; 2337 } 2338 2339 2340 #ifdef DEBUG 2341 status_t 2342 bplustree_node::CheckIntegrity(uint32 nodeSize) const 2343 { 2344 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2345 DEBUGGER(("invalid node: key/length count")); 2346 2347 for (int32 i = 0; i < NumKeys(); i++) { 2348 uint16 length; 2349 uint8* key = KeyAt(i, &length); 2350 if (key + length + sizeof(off_t) + sizeof(uint16) 2351 > (uint8*)this + nodeSize 2352 || length > BPLUSTREE_MAX_KEY_LENGTH) { 2353 dprintf("node %p, key %d\n", this, (int)i); 2354 DEBUGGER(("invalid node: keys corrupted")); 2355 return B_BAD_DATA; 2356 } 2357 if (Values()[i] == -1) { 2358 dprintf("node %p, value %d: %lld\n", this, (int)i, Values()[i]); 2359 DEBUGGER(("invalid node: values corrupted")); 2360 return B_BAD_DATA; 2361 } 2362 } 2363 return B_OK; 2364 } 2365 #endif 2366 2367 2368 // #pragma mark - 2369 2370 2371 int32 2372 compareKeys(type_code type, const void* key1, int keyLength1, 2373 const void* key2, int keyLength2) 2374 { 2375 // if one of the keys is NULL, bail out gracefully 2376 if (key1 == NULL || key2 == NULL) { 2377 // even if it's most probably a bug in the calling code, we try to 2378 // give a meaningful result 2379 if (key1 == NULL && key2 != NULL) 2380 return 1; 2381 if (key2 == NULL && key1 != NULL) 2382 return -1; 2383 2384 return 0; 2385 } 2386 2387 switch (type) { 2388 case B_STRING_TYPE: 2389 { 2390 int result = memcmp(key1, key2, min_c(keyLength1, keyLength2)); 2391 if (result == 0) { 2392 // ignore trailing null bytes 2393 if ((keyLength1 == keyLength2 + 1 2394 && ((uint8*)key1)[keyLength2] == '\0') 2395 || (keyLength2 == keyLength1 + 1 2396 && ((uint8*)key2)[keyLength1] == '\0')) 2397 return 0; 2398 2399 
result = keyLength1 - keyLength2; 2400 } 2401 2402 return result; 2403 } 2404 2405 case B_SSIZE_T_TYPE: 2406 case B_INT32_TYPE: 2407 return *(int32*)key1 - *(int32*)key2; 2408 2409 case B_SIZE_T_TYPE: 2410 case B_UINT32_TYPE: 2411 if (*(uint32*)key1 == *(uint32*)key2) 2412 return 0; 2413 if (*(uint32*)key1 > *(uint32*)key2) 2414 return 1; 2415 2416 return -1; 2417 2418 case B_OFF_T_TYPE: 2419 case B_INT64_TYPE: 2420 if (*(int64*)key1 == *(int64*)key2) 2421 return 0; 2422 if (*(int64*)key1 > *(int64*)key2) 2423 return 1; 2424 2425 return -1; 2426 2427 case B_UINT64_TYPE: 2428 if (*(uint64*)key1 == *(uint64*)key2) 2429 return 0; 2430 if (*(uint64*)key1 > *(uint64*)key2) 2431 return 1; 2432 2433 return -1; 2434 2435 case B_FLOAT_TYPE: 2436 { 2437 float result = *(float*)key1 - *(float*)key2; 2438 if (result == 0.0f) 2439 return 0; 2440 2441 return (result < 0.0f) ? -1 : 1; 2442 } 2443 2444 case B_DOUBLE_TYPE: 2445 { 2446 double result = *(double*)key1 - *(double*)key2; 2447 if (result == 0.0) 2448 return 0; 2449 2450 return (result < 0.0) ? -1 : 1; 2451 } 2452 } 2453 2454 // if the type is unknown, the entries don't match... 2455 return -1; 2456 } 2457 2458