1 /* BPlusTree - BFS B+Tree implementation 2 ** 3 ** Initial version by Axel Dörfler, axeld@pinc-software.de 4 ** Roughly based on 'btlib' written by Marcus J. Ranum 5 ** 6 ** Copyright (c) 2001-2004 pinc Software. All Rights Reserved. 7 ** This file may be used under the terms of the OpenBeOS License. 8 */ 9 10 11 #include "Debug.h" 12 #include "BPlusTree.h" 13 #include "Inode.h" 14 #include "Utility.h" 15 #include "Stack.h" 16 17 #include <util/kernel_cpp.h> 18 #include <TypeConstants.h> 19 20 #include <string.h> 21 #include <stdlib.h> 22 #include <stdio.h> 23 24 25 #ifdef DEBUG 26 class NodeChecker { 27 public: 28 NodeChecker(bplustree_node *node, int32 nodeSize) 29 : 30 fNode(node), 31 fSize(nodeSize) 32 { 33 } 34 35 ~NodeChecker() 36 { 37 fNode->CheckIntegrity(fSize); 38 } 39 40 private: 41 bplustree_node *fNode; 42 int32 fSize; 43 }; 44 #endif 45 46 // Node Caching for the BPlusTree class 47 // 48 // With write support, there is the need for a function that allocates new 49 // nodes by either returning empty nodes, or by growing the file's data stream 50 // 51 // !! The CachedNode class assumes that you have properly locked the stream 52 // !! before asking for nodes. 53 // 54 // Note: This code will fail if the block size is smaller than the node size! 55 // Since BFS supports block sizes of 1024 bytes or greater, and the node size 56 // is hard-coded to 1024 bytes, that's not an issue now. 57 58 void 59 CachedNode::Unset() 60 { 61 if (fTree == NULL || fTree->fStream == NULL) 62 return; 63 64 if (fBlock != NULL) { 65 release_block(fTree->fStream->GetVolume()->Device(), fBlockNumber); 66 67 fBlock = NULL; 68 fNode = NULL; 69 } 70 } 71 72 73 bplustree_node * 74 CachedNode::SetTo(off_t offset, bool check) 75 { 76 if (fTree == NULL || fTree->fStream == NULL) { 77 REPORT_ERROR(B_BAD_VALUE); 78 return NULL; 79 } 80 81 Unset(); 82 83 // You can only ask for nodes at valid positions - you can't 84 // even access the b+tree header with this method (use SetToHeader() 85 // instead) 86 if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize 87 || offset <= 0 88 || (offset % fTree->fNodeSize) != 0) 89 return NULL; 90 91 if (InternalSetTo(offset) != NULL && check) { 92 // sanity checks (links, all_key_count) 93 bplustree_header *header = fTree->fHeader; 94 if (!header->IsValidLink(fNode->LeftLink()) 95 || !header->IsValidLink(fNode->RightLink()) 96 || !header->IsValidLink(fNode->OverflowLink()) 97 || (int8 *)fNode->Values() + fNode->NumKeys() * sizeof(off_t) > 98 (int8 *)fNode + fTree->fNodeSize) { 99 FATAL(("invalid node read from offset %Ld, inode at %Ld\n", 100 offset, fTree->fStream->ID())); 101 return NULL; 102 } 103 } 104 return fNode; 105 } 106 107 108 bplustree_header * 109 CachedNode::SetToHeader() 110 { 111 if (fTree == NULL || fTree->fStream == NULL) { 112 REPORT_ERROR(B_BAD_VALUE); 113 return NULL; 114 } 115 116 Unset(); 117 118 InternalSetTo(0LL); 119 return (bplustree_header *)fNode; 120 } 121 122 123 bplustree_node * 124 CachedNode::InternalSetTo(off_t offset) 125 { 126 fNode = NULL; 127 128 off_t fileOffset; 129 block_run run; 130 if (offset < fTree->fStream->Size() 131 && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) { 132 Volume *volume = fTree->fStream->GetVolume(); 133 134 int32 blockOffset = (offset - fileOffset) / volume->BlockSize(); 135 fBlockNumber = volume->ToBlock(run) + blockOffset; 136 137 fBlock = (uint8 *)get_block(volume->Device(), fBlockNumber, volume->BlockSize()); 138 if (fBlock) { 139 // the node is somewhere in that block... (confusing offset calculation) 140 fNode = (bplustree_node *)(fBlock + offset - 141 (fileOffset + blockOffset * volume->BlockSize())); 142 } else 143 REPORT_ERROR(B_IO_ERROR); 144 } 145 return fNode; 146 } 147 148 149 status_t 150 CachedNode::Free(Transaction *transaction, off_t offset) 151 { 152 if (transaction == NULL || fTree == NULL || fTree->fStream == NULL 153 || offset == BPLUSTREE_NULL) 154 RETURN_ERROR(B_BAD_VALUE); 155 156 // ToDo: scan the free nodes list and remove all nodes at the end 157 // of the tree - perhaps that shouldn't be done everytime that 158 // function is called, perhaps it should be done when the directory 159 // inode is closed or based on some calculation or whatever... 160 161 // if the node is the last one in the tree, we shrink 162 // the tree and file size by one node 163 off_t lastOffset = fTree->fHeader->MaximumSize() - fTree->fNodeSize; 164 if (offset == lastOffset) { 165 fTree->fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset); 166 167 status_t status = fTree->fStream->SetFileSize(transaction, lastOffset); 168 if (status < B_OK) 169 return status; 170 171 return fTree->fCachedHeader.WriteBack(transaction); 172 } 173 174 // add the node to the free nodes list 175 fNode->left_link = fTree->fHeader->free_node_pointer; 176 fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE); 177 178 if (WriteBack(transaction) == B_OK) { 179 fTree->fHeader->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset); 180 return fTree->fCachedHeader.WriteBack(transaction); 181 } 182 return B_ERROR; 183 } 184 185 186 status_t 187 CachedNode::Allocate(Transaction *transaction, bplustree_node **_node, off_t *_offset) 188 { 189 if (transaction == NULL || fTree == NULL || fTree->fHeader == NULL 190 || fTree->fStream == NULL) { 191 RETURN_ERROR(B_BAD_VALUE); 192 } 193 194 status_t status; 195 196 // if there are any free nodes, recycle them 197 if (SetTo(fTree->fHeader->FreeNode(), false) != NULL) { 198 *_offset = fTree->fHeader->FreeNode(); 199 200 // set new free node pointer 201 fTree->fHeader->free_node_pointer = fNode->left_link; 202 if ((status = fTree->fCachedHeader.WriteBack(transaction)) == B_OK) { 203 fNode->Initialize(); 204 *_node = fNode; 205 return B_OK; 206 } 207 return status; 208 } 209 // allocate space for a new node 210 Inode *stream = fTree->fStream; 211 if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK) 212 return status; 213 214 // the maximum_size has to be changed before the call to SetTo() - or 215 // else it will fail because the requested node is out of bounds 216 off_t offset = fTree->fHeader->MaximumSize(); 217 fTree->fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(fTree->fHeader->MaximumSize() + fTree->fNodeSize); 218 219 if (SetTo(offset, false) != NULL) { 220 *_offset = offset; 221 222 if (fTree->fCachedHeader.WriteBack(transaction) >= B_OK) { 223 fNode->Initialize(); 224 *_node = fNode; 225 return B_OK; 226 } 227 } 228 RETURN_ERROR(B_ERROR); 229 } 230 231 232 status_t 233 CachedNode::WriteBack(Transaction *transaction) 234 { 235 if (transaction == NULL || fTree == NULL || fTree->fStream == NULL || fNode == NULL) 236 RETURN_ERROR(B_BAD_VALUE); 237 238 return transaction->WriteBlocks(fBlockNumber, fBlock); 239 } 240 241 242 // #pragma mark - 243 244 245 BPlusTree::BPlusTree(Transaction *transaction, Inode *stream, int32 nodeSize) 246 : 247 fStream(NULL), 248 fHeader(NULL), 249 fCachedHeader(this) 250 { 251 SetTo(transaction, stream); 252 } 253 254 255 BPlusTree::BPlusTree(Inode *stream) 256 : 257 fStream(NULL), 258 fHeader(NULL), 259 fCachedHeader(this) 260 { 261 SetTo(stream); 262 } 263 264 265 BPlusTree::BPlusTree() 266 : 267 fStream(NULL), 268 fHeader(NULL), 269 fCachedHeader(this), 270 fNodeSize(BPLUSTREE_NODE_SIZE), 271 fAllowDuplicates(true), 272 fStatus(B_NO_INIT) 273 { 274 } 275 276 277 BPlusTree::~BPlusTree() 278 { 279 // if there are any TreeIterators left, we need to stop them 280 // (can happen when the tree's inode gets deleted while 281 // traversing the tree - a TreeIterator doesn't lock the inode) 282 if (fIteratorLock.Lock() < B_OK) 283 return; 284 285 TreeIterator *iterator = NULL; 286 while ((iterator = fIterators.Next(iterator)) != NULL) 287 iterator->Stop(); 288 289 fIteratorLock.Unlock(); 290 } 291 292 293 /** Create a new B+Tree on the specified stream */ 294 295 status_t 296 BPlusTree::SetTo(Transaction *transaction, Inode *stream, int32 nodeSize) 297 { 298 // initializes in-memory B+Tree 299 300 fCachedHeader.Unset(); 301 fStream = stream; 302 303 fHeader = fCachedHeader.SetToHeader(); 304 if (fHeader == NULL) { 305 // allocate space for new header + node! 306 fStatus = stream->SetFileSize(transaction, nodeSize * 2); 307 if (fStatus < B_OK) 308 RETURN_ERROR(fStatus); 309 310 fHeader = fCachedHeader.SetToHeader(); 311 if (fHeader == NULL) 312 RETURN_ERROR(fStatus = B_ERROR); 313 } 314 315 fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR 316 && stream->BlockRun() != stream->Parent()) 317 || (stream->Mode() & S_ALLOW_DUPS) != 0; 318 319 fNodeSize = nodeSize; 320 321 // initialize b+tree header 322 fHeader->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC); 323 fHeader->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize); 324 fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 325 fHeader->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode())); 326 fHeader->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize); 327 fHeader->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 328 fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2); 329 330 if (fCachedHeader.WriteBack(transaction) < B_OK) 331 RETURN_ERROR(fStatus = B_ERROR); 332 333 // initialize b+tree root node 334 CachedNode cached(this, fHeader->RootNode(), false); 335 if (cached.Node() == NULL) 336 RETURN_ERROR(B_ERROR); 337 338 cached.Node()->Initialize(); 339 return fStatus = cached.WriteBack(transaction); 340 } 341 342 343 status_t 344 BPlusTree::SetTo(Inode *stream) 345 { 346 if (stream == NULL || stream->Node() == NULL) 347 RETURN_ERROR(fStatus = B_BAD_VALUE); 348 349 // get on-disk B+Tree header 350 351 fCachedHeader.Unset(); 352 fStream = stream; 353 354 fHeader = fCachedHeader.SetToHeader(); 355 if (fHeader == NULL) 356 RETURN_ERROR(fStatus = B_NO_INIT); 357 358 // is header valid? 359 360 if (fHeader->Magic() != BPLUSTREE_MAGIC 361 || fHeader->MaximumSize() != stream->Size() 362 || (fHeader->RootNode() % fHeader->NodeSize()) != 0 363 || !fHeader->IsValidLink(fHeader->RootNode()) 364 || !fHeader->IsValidLink(fHeader->FreeNode())) 365 RETURN_ERROR(fStatus = B_BAD_DATA); 366 367 fNodeSize = fHeader->NodeSize(); 368 369 { 370 uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX, 371 S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX}; 372 uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 373 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX); 374 375 if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE 376 || (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->DataType()] != mode 377 || !stream->IsContainer()) { 378 D( dump_bplustree_header(fHeader); 379 dump_inode(stream->Node()); 380 ); 381 RETURN_ERROR(fStatus = B_BAD_TYPE); 382 } 383 384 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 385 // in the original BFS code - we will honour it nevertheless 386 fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR 387 && stream->BlockRun() != stream->Parent()) 388 || (stream->Mode() & S_ALLOW_DUPS) != 0; 389 } 390 391 CachedNode cached(this, fHeader->RootNode()); 392 RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA); 393 } 394 395 396 status_t 397 BPlusTree::InitCheck() 398 { 399 return fStatus; 400 } 401 402 403 int32 404 BPlusTree::TypeCodeToKeyType(type_code code) 405 { 406 switch (code) { 407 case B_STRING_TYPE: 408 return BPLUSTREE_STRING_TYPE; 409 case B_SSIZE_T_TYPE: 410 case B_INT32_TYPE: 411 return BPLUSTREE_INT32_TYPE; 412 case B_SIZE_T_TYPE: 413 case B_UINT32_TYPE: 414 return BPLUSTREE_UINT32_TYPE; 415 case B_OFF_T_TYPE: 416 case B_INT64_TYPE: 417 return BPLUSTREE_INT64_TYPE; 418 case B_UINT64_TYPE: 419 return BPLUSTREE_UINT64_TYPE; 420 case B_FLOAT_TYPE: 421 return BPLUSTREE_FLOAT_TYPE; 422 case B_DOUBLE_TYPE: 423 return BPLUSTREE_DOUBLE_TYPE; 424 } 425 return -1; 426 } 427 428 429 int32 430 BPlusTree::ModeToKeyType(mode_t mode) 431 { 432 switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 433 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) { 434 case S_INT_INDEX: 435 return BPLUSTREE_INT32_TYPE; 436 case S_UINT_INDEX: 437 return BPLUSTREE_UINT32_TYPE; 438 case S_LONG_LONG_INDEX: 439 return BPLUSTREE_INT64_TYPE; 440 case S_ULONG_LONG_INDEX: 441 return BPLUSTREE_UINT64_TYPE; 442 case S_FLOAT_INDEX: 443 return BPLUSTREE_FLOAT_TYPE; 444 case S_DOUBLE_INDEX: 445 return BPLUSTREE_DOUBLE_TYPE; 446 case S_STR_INDEX: 447 default: 448 // default is for standard directories 449 return BPLUSTREE_STRING_TYPE; 450 } 451 } 452 453 454 // #pragma mark - 455 456 457 void 458 BPlusTree::UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, 459 int8 change) 460 { 461 // Although every iterator which is affected by this update currently 462 // waits on a semaphore, other iterators could be added/removed at 463 // any time, so we need to protect this loop 464 if (fIteratorLock.Lock() < B_OK) 465 return; 466 467 TreeIterator *iterator = NULL; 468 while ((iterator = fIterators.Next(iterator)) != NULL) 469 iterator->Update(offset, nextOffset, keyIndex, splitAt, change); 470 471 fIteratorLock.Unlock(); 472 } 473 474 475 void 476 BPlusTree::AddIterator(TreeIterator *iterator) 477 { 478 if (fIteratorLock.Lock() < B_OK) 479 return; 480 481 fIterators.Add(iterator); 482 483 fIteratorLock.Unlock(); 484 } 485 486 487 void 488 BPlusTree::RemoveIterator(TreeIterator *iterator) 489 { 490 if (fIteratorLock.Lock() < B_OK) 491 return; 492 493 fIterators.Remove(iterator); 494 495 fIteratorLock.Unlock(); 496 } 497 498 499 int32 500 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2) 501 { 502 type_code type = 0; 503 switch (fHeader->data_type) { 504 case BPLUSTREE_STRING_TYPE: 505 type = B_STRING_TYPE; 506 break; 507 case BPLUSTREE_INT32_TYPE: 508 type = B_INT32_TYPE; 509 break; 510 case BPLUSTREE_UINT32_TYPE: 511 type = B_UINT32_TYPE; 512 break; 513 case BPLUSTREE_INT64_TYPE: 514 type = B_INT64_TYPE; 515 break; 516 case BPLUSTREE_UINT64_TYPE: 517 type = B_UINT64_TYPE; 518 break; 519 case BPLUSTREE_FLOAT_TYPE: 520 type = B_FLOAT_TYPE; 521 break; 522 case BPLUSTREE_DOUBLE_TYPE: 523 type = B_DOUBLE_TYPE; 524 break; 525 } 526 return compareKeys(type, key1, keyLength1, key2, keyLength2); 527 } 528 529 530 status_t 531 BPlusTree::FindKey(bplustree_node *node, const uint8 *key, uint16 keyLength, uint16 *index, 532 off_t *next) 533 { 534 if (node->all_key_count == 0) { 535 if (index) 536 *index = 0; 537 if (next) 538 *next = node->OverflowLink(); 539 return B_ENTRY_NOT_FOUND; 540 } 541 542 off_t *values = node->Values(); 543 int16 saveIndex = -1; 544 545 // binary search in the key array 546 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) { 547 uint16 i = (first + last) >> 1; 548 549 uint16 searchLength; 550 uint8 *searchKey = node->KeyAt(i, &searchLength); 551 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 552 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) { 553 fStream->GetVolume()->Panic(); 554 RETURN_ERROR(B_BAD_DATA); 555 } 556 557 int32 cmp = CompareKeys(key, keyLength, searchKey, searchLength); 558 if (cmp < 0) { 559 last = i - 1; 560 saveIndex = i; 561 } else if (cmp > 0) { 562 saveIndex = first = i + 1; 563 } else { 564 if (index) 565 *index = i; 566 if (next) 567 *next = BFS_ENDIAN_TO_HOST_INT64(values[i]); 568 return B_OK; 569 } 570 } 571 572 if (index) 573 *index = saveIndex; 574 if (next) { 575 if (saveIndex == node->NumKeys()) 576 *next = node->OverflowLink(); 577 else 578 *next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]); 579 } 580 return B_ENTRY_NOT_FOUND; 581 } 582 583 584 /** Prepares the stack to contain all nodes that were passed while 585 * following the key, from the root node to the leaf node that could 586 * or should contain that key. 587 */ 588 589 status_t 590 BPlusTree::SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength) 591 { 592 // set the root node to begin with 593 node_and_key nodeAndKey; 594 nodeAndKey.nodeOffset = fHeader->RootNode(); 595 596 CachedNode cached(this); 597 bplustree_node *node; 598 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 599 // if we are already on leaf level, we're done 600 if (node->OverflowLink() == BPLUSTREE_NULL) { 601 // node that the keyIndex is not properly set here (but it's not 602 // needed in the calling functions anyway)! 603 nodeAndKey.keyIndex = 0; 604 stack.Push(nodeAndKey); 605 return B_OK; 606 } 607 608 off_t nextOffset; 609 status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset); 610 611 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 612 RETURN_ERROR(B_ERROR); 613 614 // put the node offset & the correct keyIndex on the stack 615 stack.Push(nodeAndKey); 616 617 nodeAndKey.nodeOffset = nextOffset; 618 } 619 RETURN_ERROR(B_ERROR); 620 } 621 622 623 status_t 624 BPlusTree::FindFreeDuplicateFragment(bplustree_node *node, CachedNode *cached, off_t *_offset, 625 bplustree_node **_fragment, uint32 *_index) 626 { 627 off_t *values = node->Values(); 628 for (int32 i = 0; i < node->NumKeys(); i++) { 629 // does the value link to a duplicate fragment? 630 if (bplustree_node::LinkType(values[i]) != BPLUSTREE_DUPLICATE_FRAGMENT) 631 continue; 632 633 bplustree_node *fragment = cached->SetTo(bplustree_node::FragmentOffset(values[i]), false); 634 if (fragment == NULL) { 635 FATAL(("Could not get duplicate fragment at %Ld\n", values[i])); 636 continue; 637 } 638 639 // see if there is some space left for us 640 int32 num = (fNodeSize >> 3) / (NUM_FRAGMENT_VALUES + 1); 641 for (int32 j = 0;j < num;j++) { 642 duplicate_array *array = fragment->FragmentAt(j); 643 644 if (array->count == 0) { 645 *_offset = bplustree_node::FragmentOffset(values[i]); 646 *_fragment = fragment; 647 *_index = j; 648 return B_OK; 649 } 650 } 651 } 652 return B_ENTRY_NOT_FOUND; 653 } 654 655 656 status_t 657 BPlusTree::InsertDuplicate(Transaction *transaction, CachedNode *cached, bplustree_node *node, 658 uint16 index, off_t value) 659 { 660 CachedNode cachedDuplicate(this); 661 off_t *values = node->Values(); 662 off_t oldValue = values[index]; 663 status_t status; 664 off_t offset; 665 666 if (bplustree_node::IsDuplicate(oldValue)) { 667 // 668 // If it's a duplicate fragment, try to insert it into that, or if it 669 // doesn't fit anymore, create a new duplicate node 670 // 671 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 672 bplustree_node *duplicate = cachedDuplicate.SetTo(bplustree_node::FragmentOffset(oldValue), false); 673 if (duplicate == NULL) 674 return B_IO_ERROR; 675 676 duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue)); 677 if (array->count > NUM_FRAGMENT_VALUES 678 || array->count < 1) { 679 FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n", 680 bplustree_node::FragmentIndex(oldValue), 681 bplustree_node::FragmentOffset(oldValue), 682 array->count)); 683 return B_BAD_DATA; 684 } 685 686 if (array->count < NUM_FRAGMENT_VALUES) { 687 array->Insert(value); 688 } else { 689 // test if the fragment will be empty if we remove this key's values 690 if (duplicate->FragmentsUsed(fNodeSize) < 2) { 691 // the node will be empty without our values, so let us 692 // reuse it as a duplicate node 693 offset = bplustree_node::FragmentOffset(oldValue); 694 695 memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 696 duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 697 698 array = duplicate->DuplicateArray(); 699 array->Insert(value); 700 } else { 701 // create a new duplicate node 702 CachedNode cachedNewDuplicate(this); 703 bplustree_node *newDuplicate; 704 status = cachedNewDuplicate.Allocate(transaction, &newDuplicate, &offset); 705 if (status < B_OK) 706 return status; 707 708 // copy the array from the fragment node to the duplicate node 709 // and free the old entry (by zero'ing all values) 710 newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(array->count); 711 memcpy(&newDuplicate->all_key_count, &array->values[0], 712 array->count * sizeof(off_t)); 713 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 714 715 array = newDuplicate->DuplicateArray(); 716 array->Insert(value); 717 718 // if this fails, the old fragments node will contain wrong 719 // data... (but since it couldn't be written, it shouldn't 720 // be fatal) 721 if ((status = cachedNewDuplicate.WriteBack(transaction)) < B_OK) 722 return status; 723 } 724 725 // update the main pointer to link to a duplicate node 726 values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE, offset); 727 if ((status = cached->WriteBack(transaction)) < B_OK) 728 return status; 729 } 730 731 return cachedDuplicate.WriteBack(transaction); 732 } 733 734 // 735 // Put the value into a dedicated duplicate node 736 // 737 738 // search for free space in the duplicate nodes of that key 739 duplicate_array *array; 740 bplustree_node *duplicate; 741 off_t duplicateOffset; 742 do { 743 duplicateOffset = bplustree_node::FragmentOffset(oldValue); 744 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 745 if (duplicate == NULL) 746 return B_IO_ERROR; 747 748 array = duplicate->DuplicateArray(); 749 if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) { 750 FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n", 751 duplicateOffset, array->count)); 752 return B_BAD_DATA; 753 } 754 } while (array->count >= NUM_DUPLICATE_VALUES 755 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL); 756 757 if (array->count < NUM_DUPLICATE_VALUES) { 758 array->Insert(value); 759 } else { 760 // no space left - add a new duplicate node 761 762 CachedNode cachedNewDuplicate(this); 763 bplustree_node *newDuplicate; 764 status = cachedNewDuplicate.Allocate(transaction, &newDuplicate, &offset); 765 if (status < B_OK) 766 return status; 767 768 // link the two nodes together 769 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset); 770 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset); 771 772 array = newDuplicate->DuplicateArray(); 773 array->count = 0; 774 array->Insert(value); 775 776 status = cachedNewDuplicate.WriteBack(transaction); 777 if (status < B_OK) 778 return status; 779 } 780 return cachedDuplicate.WriteBack(transaction); 781 } 782 783 // 784 // Search for a free duplicate fragment or create a new one 785 // to insert the duplicate value into 786 // 787 788 uint32 fragmentIndex = 0; 789 bplustree_node *fragment; 790 if (FindFreeDuplicateFragment(node, &cachedDuplicate, &offset, &fragment, &fragmentIndex) < B_OK) { 791 // allocate a new duplicate fragment node 792 if ((status = cachedDuplicate.Allocate(transaction, &fragment, &offset)) < B_OK) 793 return status; 794 795 memset(fragment, 0, fNodeSize); 796 } 797 duplicate_array *array = fragment->FragmentAt(fragmentIndex); 798 array->Insert(oldValue); 799 array->Insert(value); 800 801 if ((status = cachedDuplicate.WriteBack(transaction)) < B_OK) 802 return status; 803 804 values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex); 805 806 return cached->WriteBack(transaction); 807 } 808 809 810 void 811 BPlusTree::InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength, 812 off_t value) 813 { 814 // should never happen, but who knows? 815 if (index > node->NumKeys()) 816 return; 817 818 off_t *values = node->Values(); 819 uint16 *keyLengths = node->KeyLengths(); 820 uint8 *keys = node->Keys(); 821 822 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1); 823 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength); 824 825 off_t *newValues = node->Values(); 826 uint16 *newKeyLengths = node->KeyLengths(); 827 828 // move values and copy new value into them 829 memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index)); 830 memmove(newValues, values, sizeof(off_t) * index); 831 832 newValues[index] = value; 833 834 // move and update key length index 835 for (uint16 i = node->NumKeys(); i-- > index + 1;) 836 newKeyLengths[i] = keyLengths[i - 1] + keyLength; 837 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index); 838 839 int32 keyStart; 840 newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0); 841 842 // move keys and copy new key into them 843 int32 size = node->AllKeyLength() - newKeyLengths[index]; 844 if (size > 0) 845 memmove(keys + newKeyLengths[index], keys + newKeyLengths[index] - keyLength, size); 846 847 memcpy(keys + keyStart, key, keyLength); 848 } 849 850 851 status_t 852 BPlusTree::SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other, 853 off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value) 854 { 855 if (*_keyIndex > node->NumKeys() + 1) 856 return B_BAD_VALUE; 857 858 uint16 *inKeyLengths = node->KeyLengths(); 859 off_t *inKeyValues = node->Values(); 860 uint8 *inKeys = node->Keys(); 861 uint8 *outKeys = other->Keys(); 862 int32 keyIndex = *_keyIndex; // can become less than zero! 863 864 // how many keys will fit in one (half) page? 865 // that loop will find the answer to this question and 866 // change the key lengths indices for their new home 867 868 // "bytes" is the number of bytes written for the new key, 869 // "bytesBefore" are the bytes before that key 870 // "bytesAfter" are the bytes after the new key, if any 871 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0; 872 873 size_t size = fNodeSize >> 1; 874 int32 out, in; 875 for (in = out = 0; in < node->NumKeys() + 1;) { 876 if (!bytes) 877 bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0; 878 879 if (in == keyIndex && !bytes) { 880 bytes = *_keyLength; 881 } else { 882 if (keyIndex < out) 883 bytesAfter = inKeyLengths[in] - bytesBefore; 884 885 in++; 886 } 887 out++; 888 889 if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) + 890 out * (sizeof(uint16) + sizeof(off_t)) >= size) { 891 // we have found the number of keys in the new node! 892 break; 893 } 894 } 895 896 // if the new key was not inserted, set the length of the keys 897 // that can be copied directly 898 if (keyIndex >= out && in > 0) 899 bytesBefore = inKeyLengths[in - 1]; 900 901 if (bytesBefore < 0 || bytesAfter < 0) 902 return B_BAD_DATA; 903 904 other->left_link = node->left_link; 905 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset); 906 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter); 907 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out); 908 909 uint16 *outKeyLengths = other->KeyLengths(); 910 off_t *outKeyValues = other->Values(); 911 int32 keys = out > keyIndex ? keyIndex : out; 912 913 if (bytesBefore) { 914 // copy the keys 915 memcpy(outKeys, inKeys, bytesBefore); 916 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16)); 917 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t)); 918 } 919 if (bytes) { 920 // copy the newly inserted key 921 memcpy(outKeys + bytesBefore, key, bytes); 922 outKeyLengths[keyIndex] = bytes + bytesBefore; 923 outKeyValues[keyIndex] = *_value; 924 925 if (bytesAfter) { 926 // copy the keys after the new key 927 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter); 928 keys = out - keyIndex - 1; 929 for (int32 i = 0;i < keys;i++) 930 outKeyLengths[keyIndex + i + 1] = inKeyLengths[keyIndex + i] + bytes; 931 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t)); 932 } 933 } 934 935 // if the new key was already inserted, we shouldn't use it again 936 if (in != out) 937 keyIndex--; 938 939 int32 total = bytesBefore + bytesAfter; 940 941 // these variables are for the key that will be returned 942 // to the parent node 943 uint8 *newKey = NULL; 944 uint16 newLength; 945 bool newAllocated = false; 946 947 // If we have split an index node, we have to drop the first key 948 // of the next node (which can also be the new key to insert). 949 // The dropped key is also the one which has to be inserted in 950 // the parent node, so we will set the "newKey" already here. 951 if (node->OverflowLink() != BPLUSTREE_NULL) { 952 if (in == keyIndex) { 953 newKey = key; 954 newLength = *_keyLength; 955 956 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value); 957 keyIndex--; 958 } else { 959 // If a key is dropped (is not the new key), we have to copy 960 // it, because it would be lost if not. 961 uint8 *droppedKey = node->KeyAt(in, &newLength); 962 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 963 || newLength > BPLUSTREE_MAX_KEY_LENGTH) { 964 fStream->GetVolume()->Panic(); 965 RETURN_ERROR(B_BAD_DATA); 966 } 967 newKey = (uint8 *)malloc(newLength); 968 if (newKey == NULL) 969 return B_NO_MEMORY; 970 memcpy(newKey, droppedKey, newLength); 971 972 other->overflow_link = inKeyValues[in]; 973 total = inKeyLengths[in++]; 974 } 975 } 976 977 // and now the same game for the other page and the rest of the keys 978 // (but with memmove() instead of memcpy(), because they may overlap) 979 980 bytesBefore = bytesAfter = bytes = 0; 981 out = 0; 982 int32 skip = in; 983 while (in < node->NumKeys() + 1) { 984 if (in == keyIndex && !bytes) { 985 // it's enough to set bytesBefore once here, because we do 986 // not need to know the exact length of all keys in this 987 // loop 988 bytesBefore = in > skip ? inKeyLengths[in - 1] : 0; 989 bytes = *_keyLength; 990 } else { 991 if (in < node->NumKeys()) { 992 inKeyLengths[in] -= total; 993 if (bytes) { 994 inKeyLengths[in] += bytes; 995 bytesAfter = inKeyLengths[in] - bytesBefore - bytes; 996 } 997 } 998 in++; 999 } 1000 1001 out++; 1002 1003 // break out when all keys are done 1004 if (in > node->NumKeys() && keyIndex < in) 1005 break; 1006 } 1007 1008 // adjust the byte counts (since we were a bit lazy in the loop) 1009 if (keyIndex >= in && keyIndex - skip < out) 1010 bytesAfter = inKeyLengths[in] - bytesBefore - total; 1011 else if (keyIndex < skip) 1012 bytesBefore = node->AllKeyLength() - total; 1013 1014 if (bytesBefore < 0 || bytesAfter < 0) 1015 return B_BAD_DATA; 1016 1017 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1018 // right link, and overflow link can stay the same 1019 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter); 1020 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1); 1021 1022 // array positions have changed 1023 outKeyLengths = node->KeyLengths(); 1024 outKeyValues = node->Values(); 1025 1026 // move the keys in the old node: the order is important here, 1027 // because we don't want to overwrite any contents 1028 1029 keys = keyIndex <= skip ? out : keyIndex - skip; 1030 keyIndex -= skip; 1031 1032 if (bytesBefore) 1033 memmove(inKeys, inKeys + total, bytesBefore); 1034 if (bytesAfter) 1035 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter); 1036 1037 if (bytesBefore) 1038 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16)); 1039 in = out - keyIndex - 1; 1040 if (bytesAfter) 1041 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16)); 1042 1043 if (bytesBefore) 1044 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t)); 1045 if (bytesAfter) 1046 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t)); 1047 1048 if (bytes) { 1049 // finally, copy the newly inserted key (don't overwrite anything) 1050 memcpy(inKeys + bytesBefore, key, bytes); 1051 outKeyLengths[keyIndex] = bytes + bytesBefore; 1052 outKeyValues[keyIndex] = *_value; 1053 } 1054 1055 // Prepare the key that will be inserted in the parent node which 1056 // is either the dropped key or the last of the other node. 1057 // If it's the dropped key, "newKey" was already set earlier. 1058 1059 if (newKey == NULL) 1060 newKey = other->KeyAt(other->NumKeys() - 1, &newLength); 1061 1062 memcpy(key, newKey, newLength); 1063 *_keyLength = newLength; 1064 *_value = otherOffset; 1065 1066 if (newAllocated) 1067 free(newKey); 1068 1069 return B_OK; 1070 } 1071 1072 1073 status_t 1074 BPlusTree::Insert(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value) 1075 { 1076 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1077 RETURN_ERROR(B_BAD_VALUE); 1078 1079 // lock access to stream 1080 WriteLocked locked(fStream->Lock()); 1081 1082 Stack<node_and_key> stack; 1083 if (SeekDown(stack, key, keyLength) != B_OK) 1084 RETURN_ERROR(B_ERROR); 1085 1086 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 1087 1088 memcpy(keyBuffer, key, keyLength); 1089 keyBuffer[keyLength] = 0; 1090 1091 node_and_key nodeAndKey; 1092 bplustree_node *node; 1093 1094 CachedNode cached(this); 1095 while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1096 #ifdef DEBUG 1097 NodeChecker checker(node, fNodeSize); 1098 #endif 1099 if (node->IsLeaf()) { 1100 // first round, check for duplicate entries 1101 status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex); 1102 1103 // is this a duplicate entry? 1104 if (status == B_OK) { 1105 if (fAllowDuplicates) 1106 return InsertDuplicate(transaction, &cached, node, nodeAndKey.keyIndex, value); 1107 1108 RETURN_ERROR(B_NAME_IN_USE); 1109 } 1110 } 1111 1112 // is the node big enough to hold the pair? 1113 if (int32(round_up(sizeof(bplustree_node) + node->AllKeyLength() + keyLength) 1114 + (node->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize) 1115 { 1116 InsertKey(node, nodeAndKey.keyIndex, keyBuffer, keyLength, value); 1117 UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1); 1118 1119 return cached.WriteBack(transaction); 1120 } else { 1121 CachedNode cachedNewRoot(this); 1122 CachedNode cachedOther(this); 1123 1124 // do we need to allocate a new root node? if so, then do 1125 // it now 1126 off_t newRoot = BPLUSTREE_NULL; 1127 if (nodeAndKey.nodeOffset == fHeader->RootNode()) { 1128 bplustree_node *root; 1129 status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot); 1130 if (status < B_OK) { 1131 // The tree is most likely corrupted! 1132 // But it's still sane at leaf level - we could set 1133 // a flag in the header that forces the tree to be 1134 // rebuild next time... 1135 // But since we will have journaling, that's not a big 1136 // problem anyway. 1137 RETURN_ERROR(status); 1138 } 1139 } 1140 1141 // reserve space for the other node 1142 bplustree_node *other; 1143 off_t otherOffset; 1144 status_t status = cachedOther.Allocate(transaction, &other, &otherOffset); 1145 if (status < B_OK) { 1146 cachedNewRoot.Free(transaction, newRoot); 1147 RETURN_ERROR(status); 1148 } 1149 1150 if (SplitNode(node, nodeAndKey.nodeOffset, other, otherOffset, 1151 &nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) < B_OK) { 1152 // free root node & other node here 1153 cachedNewRoot.Free(transaction, newRoot); 1154 cachedOther.Free(transaction, otherOffset); 1155 1156 RETURN_ERROR(B_ERROR); 1157 } 1158 1159 // write the updated nodes back 1160 1161 if (cached.WriteBack(transaction) < B_OK 1162 || cachedOther.WriteBack(transaction) < B_OK) 1163 RETURN_ERROR(B_ERROR); 1164 1165 UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex, 1166 node->NumKeys(), 1); 1167 1168 // update the right link of the node in the left of the new node 1169 if ((other = cachedOther.SetTo(other->LeftLink())) != NULL) { 1170 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset); 1171 if (cachedOther.WriteBack(transaction) < B_OK) 1172 RETURN_ERROR(B_ERROR); 1173 } 1174 1175 // create a new root if necessary 1176 if (newRoot != BPLUSTREE_NULL) { 1177 bplustree_node *root = cachedNewRoot.Node(); 1178 1179 InsertKey(root, 0, keyBuffer, keyLength, node->LeftLink()); 1180 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(nodeAndKey.nodeOffset); 1181 1182 if (cachedNewRoot.WriteBack(transaction) < B_OK) 1183 RETURN_ERROR(B_ERROR); 1184 1185 // finally, update header to point to the new root 1186 fHeader->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot); 1187 fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(fHeader->MaxNumberOfLevels() + 1); 1188 1189 return fCachedHeader.WriteBack(transaction); 1190 } 1191 } 1192 } 1193 RETURN_ERROR(B_ERROR); 1194 } 1195 1196 1197 status_t 1198 BPlusTree::RemoveDuplicate(Transaction *transaction, bplustree_node *node, CachedNode *cached, 1199 uint16 index, off_t value) 1200 { 1201 CachedNode cachedDuplicate(this); 1202 off_t *values = node->Values(); 1203 off_t oldValue = values[index]; 1204 status_t status; 1205 1206 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue); 1207 bplustree_node *duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 1208 if (duplicate == NULL) 1209 return B_IO_ERROR; 1210 1211 // if it's a duplicate fragment, remove the entry from there 1212 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) { 1213 duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue)); 1214 1215 if (array->count > NUM_FRAGMENT_VALUES 1216 || array->count < 1) { 1217 FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n", 1218 bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count)); 1219 return B_BAD_DATA; 1220 } 1221 if (!array->Remove(value)) 1222 FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n", 1223 value, duplicateOffset)); 1224 1225 // remove the array from the fragment node if it is empty 1226 if (array->count == 1) { 1227 // set the link to the remaining value 1228 values[index] = array->values[0]; 1229 1230 // Remove the whole fragment node, if this was the only array, 1231 // otherwise free the array and write the changes back 1232 if (duplicate->FragmentsUsed(fNodeSize) == 1) 1233 status = cachedDuplicate.Free(transaction, duplicateOffset); 1234 else { 1235 array->count = 0; 1236 status = cachedDuplicate.WriteBack(transaction); 1237 } 1238 if (status < B_OK) 1239 return status; 1240 1241 return cached->WriteBack(transaction); 1242 } 1243 return cachedDuplicate.WriteBack(transaction); 1244 } 1245 1246 // 1247 // Remove value from a duplicate node! 1248 // 1249 1250 duplicate_array *array; 1251 1252 if (duplicate->LeftLink() != BPLUSTREE_NULL) { 1253 FATAL(("invalid duplicate node: first left link points to %Ld!\n", duplicate->LeftLink())); 1254 return B_BAD_DATA; 1255 } 1256 1257 // Search the duplicate nodes until the entry could be found (and removed) 1258 while (duplicate != NULL) { 1259 array = duplicate->DuplicateArray(); 1260 if (array->count > NUM_DUPLICATE_VALUES 1261 || array->count < 0) { 1262 FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n", 1263 duplicateOffset, array->count)); 1264 return B_BAD_DATA; 1265 } 1266 1267 if (array->Remove(value)) 1268 break; 1269 1270 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL) 1271 RETURN_ERROR(B_ENTRY_NOT_FOUND); 1272 1273 duplicate = cachedDuplicate.SetTo(duplicateOffset, false); 1274 } 1275 if (duplicate == NULL) 1276 RETURN_ERROR(B_IO_ERROR); 1277 1278 while (true) { 1279 off_t left = duplicate->LeftLink(); 1280 off_t right = duplicate->RightLink(); 1281 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL; 1282 1283 if (isLast && array->count == 1 || array->count == 0) { 1284 // Free empty duplicate page, link their siblings together, and 1285 // update the duplicate link if needed (which should not be, if 1286 // we are the only one working on that tree...) 1287 1288 if (duplicateOffset == bplustree_node::FragmentOffset(oldValue) 1289 || array->count == 1) { 1290 if (array->count == 1 && isLast) 1291 values[index] = array->values[0]; 1292 else if (isLast) { 1293 FATAL(("removed last value from duplicate!\n")); 1294 } else 1295 values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE, right); 1296 1297 if ((status = cached->WriteBack(transaction)) < B_OK) 1298 return status; 1299 } 1300 1301 if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK) 1302 return status; 1303 1304 if (left != BPLUSTREE_NULL 1305 && (duplicate = cachedDuplicate.SetTo(left, false)) != NULL) { 1306 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right); 1307 1308 // If the next node is the last node, we need to free that node 1309 // and convert the duplicate entry back into a normal entry 1310 if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL 1311 && duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) { 1312 duplicateOffset = left; 1313 continue; 1314 } 1315 1316 status = cachedDuplicate.WriteBack(transaction); 1317 if (status < B_OK) 1318 return status; 1319 } 1320 if (right != BPLUSTREE_NULL 1321 && (duplicate = cachedDuplicate.SetTo(right, false)) != NULL) { 1322 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left); 1323 1324 // Again, we may need to turn the duplicate entry back into a normal entry 1325 array = duplicate->DuplicateArray(); 1326 if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL 1327 && duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) { 1328 duplicateOffset = right; 1329 continue; 1330 } 1331 1332 return cachedDuplicate.WriteBack(transaction); 1333 } 1334 return status; 1335 } else if (isLast && array->count <= NUM_FRAGMENT_VALUES) { 1336 // If the number of entries fits in a duplicate fragment, then 1337 // either find a free fragment node, or convert this node to a 1338 // fragment node. 1339 CachedNode cachedOther(this); 1340 1341 bplustree_node *fragment = NULL; 1342 uint32 fragmentIndex = 0; 1343 off_t offset; 1344 if (FindFreeDuplicateFragment(node, &cachedOther, &offset, 1345 &fragment, &fragmentIndex) < B_OK) { 1346 // convert node 1347 memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1348 memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0, 1349 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1350 } else { 1351 // move to other node 1352 duplicate_array *target = fragment->FragmentAt(fragmentIndex); 1353 memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); 1354 1355 cachedDuplicate.Free(transaction, duplicateOffset); 1356 duplicateOffset = offset; 1357 } 1358 values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT, 1359 duplicateOffset, fragmentIndex); 1360 1361 if ((status = cached->WriteBack(transaction)) < B_OK) 1362 return status; 1363 1364 if (fragment != NULL) 1365 return cachedOther.WriteBack(transaction); 1366 } 1367 return cachedDuplicate.WriteBack(transaction); 1368 } 1369 } 1370 1371 1372 /** Removes the key with the given index from the specified node. 1373 * Since it has to get the key from the node anyway (to obtain it's 1374 * pointer), it's not needed to pass the key & its length, although 1375 * the calling method (BPlusTree::Remove()) have this data. 1376 */ 1377 1378 void 1379 BPlusTree::RemoveKey(bplustree_node *node, uint16 index) 1380 { 1381 // should never happen, but who knows? 1382 if (index > node->NumKeys() && node->NumKeys() > 0) { 1383 FATAL(("Asked me to remove key outer limits: %u\n", index)); 1384 return; 1385 } 1386 1387 off_t *values = node->Values(); 1388 1389 // if we would have to drop the overflow link, drop 1390 // the last key instead and update the overflow link 1391 // to the value of that one 1392 if (!node->IsLeaf() && index == node->NumKeys()) 1393 node->overflow_link = values[--index]; 1394 1395 uint16 length; 1396 uint8 *key = node->KeyAt(index, &length); 1397 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize 1398 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1399 FATAL(("Key length to long: %s, %u (inode at %ld,%u)\n", key, length, 1400 fStream->BlockRun().allocation_group, fStream->BlockRun().start)); 1401 fStream->GetVolume()->Panic(); 1402 return; 1403 } 1404 1405 uint16 *keyLengths = node->KeyLengths(); 1406 uint8 *keys = node->Keys(); 1407 1408 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1); 1409 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length); 1410 1411 off_t *newValues = node->Values(); 1412 uint16 *newKeyLengths = node->KeyLengths(); 1413 1414 // move key data 1415 memmove(key, key + length, node->AllKeyLength() - (key - keys)); 1416 1417 // move and update key lengths 1418 if (index > 0 && newKeyLengths != keyLengths) 1419 memmove(newKeyLengths, keyLengths, index * sizeof(uint16)); 1420 for (uint16 i = index; i < node->NumKeys(); i++) 1421 newKeyLengths[i] = keyLengths[i + 1] - length; 1422 1423 // move values 1424 if (index > 0) 1425 memmove(newValues, values, index * sizeof(off_t)); 1426 if (node->NumKeys() > index) 1427 memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t)); 1428 } 1429 1430 1431 /** Removes the specified key from the tree. The "value" parameter is only used 1432 * for trees which allow duplicates, so you may safely ignore it. 1433 * It's not an optional parameter, so at least you have to think about it. 1434 */ 1435 1436 status_t 1437 BPlusTree::Remove(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value) 1438 { 1439 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1440 RETURN_ERROR(B_BAD_VALUE); 1441 1442 // lock access to stream 1443 WriteLocked locked(fStream->Lock()); 1444 1445 Stack<node_and_key> stack; 1446 if (SeekDown(stack, key, keyLength) != B_OK) 1447 RETURN_ERROR(B_ERROR); 1448 1449 node_and_key nodeAndKey; 1450 bplustree_node *node; 1451 1452 CachedNode cached(this); 1453 while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) { 1454 #ifdef DEBUG 1455 NodeChecker checker(node, fNodeSize); 1456 #endif 1457 if (node->IsLeaf()) { 1458 // first round, check for duplicate entries 1459 status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex); 1460 if (status < B_OK) 1461 RETURN_ERROR(status); 1462 1463 // If we will remove the last key, the iterator will be set 1464 // to the next node after the current - if there aren't any 1465 // more nodes, we need a way to prevent the TreeIterators to 1466 // touch the old node again, we use BPLUSTREE_FREE for this 1467 off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink(); 1468 UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ? 1469 next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1); 1470 1471 // is this a duplicate entry? 1472 if (bplustree_node::IsDuplicate(node->Values()[nodeAndKey.keyIndex])) { 1473 if (fAllowDuplicates) 1474 return RemoveDuplicate(transaction, node, &cached, nodeAndKey.keyIndex, value); 1475 else 1476 RETURN_ERROR(B_NAME_IN_USE); 1477 } 1478 } 1479 1480 // if it's an empty root node, we have to convert it 1481 // to a leaf node by dropping the overflow link, or, 1482 // if it's a leaf node, just empty it 1483 if (nodeAndKey.nodeOffset == fHeader->RootNode() 1484 && node->NumKeys() == 0 1485 || node->NumKeys() == 1 && node->IsLeaf()) { 1486 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1487 node->all_key_count = 0; 1488 node->all_key_length = 0; 1489 1490 if (cached.WriteBack(transaction) < B_OK) 1491 return B_IO_ERROR; 1492 1493 // if we've cleared the root node, reset the maximum 1494 // number of levels in the header 1495 if (nodeAndKey.nodeOffset == fHeader->RootNode()) { 1496 fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1); 1497 return fCachedHeader.WriteBack(transaction); 1498 } 1499 return B_OK; 1500 } 1501 1502 // if there is only one key left, we don't have to remove 1503 // it, we can just dump the node (index nodes still have 1504 // the overflow link, so we have to drop the last key) 1505 if (node->NumKeys() > 1 1506 || !node->IsLeaf() && node->NumKeys() == 1) { 1507 RemoveKey(node, nodeAndKey.keyIndex); 1508 return cached.WriteBack(transaction); 1509 } 1510 1511 // when we are here, we can just free the node, but 1512 // we have to update the right/left link of the 1513 // siblings first 1514 CachedNode otherCached(this); 1515 bplustree_node *other = otherCached.SetTo(node->LeftLink()); 1516 if (other != NULL) { 1517 other->right_link = node->right_link; 1518 if (otherCached.WriteBack(transaction) < B_OK) 1519 return B_IO_ERROR; 1520 } 1521 1522 if ((other = otherCached.SetTo(node->RightLink())) != NULL) { 1523 other->left_link = node->left_link; 1524 if (otherCached.WriteBack(transaction) < B_OK) 1525 return B_IO_ERROR; 1526 } 1527 1528 cached.Free(transaction, nodeAndKey.nodeOffset); 1529 } 1530 RETURN_ERROR(B_ERROR); 1531 } 1532 1533 1534 /** Replaces the value for the key in the tree. 1535 * Returns B_OK if the key could be found and its value replaced, 1536 * B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors 1537 * to indicate that something went terribly wrong. 1538 * Note that this doesn't work with duplicates - it will just 1539 * return B_BAD_TYPE if you call this function on a tree where 1540 * duplicates are allowed. 1541 */ 1542 1543 status_t 1544 BPlusTree::Replace(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value) 1545 { 1546 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1547 || key == NULL) 1548 RETURN_ERROR(B_BAD_VALUE); 1549 1550 if (fAllowDuplicates) 1551 RETURN_ERROR(B_BAD_TYPE); 1552 1553 // lock access to stream (a read lock is okay for this purpose) 1554 ReadLocked locked(fStream->Lock()); 1555 1556 off_t nodeOffset = fHeader->RootNode(); 1557 CachedNode cached(this); 1558 bplustree_node *node; 1559 1560 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1561 uint16 keyIndex = 0; 1562 off_t nextOffset; 1563 status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1564 1565 if (node->OverflowLink() == BPLUSTREE_NULL) { 1566 if (status == B_OK) { 1567 node->Values()[keyIndex] = value; 1568 return cached.WriteBack(transaction); 1569 } 1570 1571 return status; 1572 } else if (nextOffset == nodeOffset) 1573 RETURN_ERROR(B_ERROR); 1574 1575 nodeOffset = nextOffset; 1576 } 1577 RETURN_ERROR(B_ERROR); 1578 } 1579 1580 1581 /** Searches the key in the tree, and stores the offset found in 1582 * _value, if successful. 1583 * It's very similar to BPlusTree::SeekDown(), but doesn't fill 1584 * a stack while it descends the tree. 1585 * Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND 1586 * if not. It can also return other errors to indicate that 1587 * something went wrong. 1588 * Note that this doesn't work with duplicates - it will just 1589 * return B_BAD_TYPE if you call this function on a tree where 1590 * duplicates are allowed. 1591 */ 1592 1593 status_t 1594 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value) 1595 { 1596 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1597 || key == NULL) 1598 RETURN_ERROR(B_BAD_VALUE); 1599 1600 if (fAllowDuplicates) 1601 RETURN_ERROR(B_BAD_TYPE); 1602 1603 // lock access to stream 1604 ReadLocked locked(fStream->Lock()); 1605 1606 off_t nodeOffset = fHeader->RootNode(); 1607 CachedNode cached(this); 1608 bplustree_node *node; 1609 1610 #ifdef DEBUG 1611 int32 levels = 0; 1612 #endif 1613 1614 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1615 uint16 keyIndex = 0; 1616 off_t nextOffset; 1617 status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1618 1619 #ifdef DEBUG 1620 levels++; 1621 #endif 1622 if (node->OverflowLink() == BPLUSTREE_NULL) { 1623 if (status == B_OK && _value != NULL) 1624 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]); 1625 1626 #ifdef DEBUG 1627 if (levels != (int32)fHeader->MaxNumberOfLevels()) 1628 DEBUGGER(("levels don't match")); 1629 #endif 1630 return status; 1631 } else if (nextOffset == nodeOffset) 1632 RETURN_ERROR(B_ERROR); 1633 1634 nodeOffset = nextOffset; 1635 } 1636 FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset)); 1637 RETURN_ERROR(B_ERROR); 1638 } 1639 1640 1641 // #pragma mark - 1642 1643 1644 TreeIterator::TreeIterator(BPlusTree *tree) 1645 : 1646 fTree(tree), 1647 fCurrentNodeOffset(BPLUSTREE_NULL), 1648 fNext(NULL) 1649 { 1650 tree->AddIterator(this); 1651 } 1652 1653 1654 TreeIterator::~TreeIterator() 1655 { 1656 if (fTree) 1657 fTree->RemoveIterator(this); 1658 } 1659 1660 1661 status_t 1662 TreeIterator::Goto(int8 to) 1663 { 1664 if (fTree == NULL || fTree->fHeader == NULL) 1665 RETURN_ERROR(B_BAD_VALUE); 1666 1667 // lock access to stream 1668 ReadLocked locked(fTree->fStream->Lock()); 1669 1670 off_t nodeOffset = fTree->fHeader->RootNode(); 1671 CachedNode cached(fTree); 1672 bplustree_node *node; 1673 1674 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1675 // is the node a leaf node? 1676 if (node->OverflowLink() == BPLUSTREE_NULL) { 1677 fCurrentNodeOffset = nodeOffset; 1678 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys(); 1679 fDuplicateNode = BPLUSTREE_NULL; 1680 1681 return B_OK; 1682 } 1683 1684 // get the next node offset depending on the direction (and if there 1685 // are any keys in that node at all) 1686 off_t nextOffset; 1687 if (to == BPLUSTREE_END || node->all_key_count == 0) 1688 nextOffset = node->OverflowLink(); 1689 else { 1690 if (node->AllKeyLength() > fTree->fNodeSize 1691 || (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->NumKeys()) 1692 RETURN_ERROR(B_ERROR); 1693 1694 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]); 1695 } 1696 if (nextOffset == nodeOffset) 1697 break; 1698 1699 nodeOffset = nextOffset; 1700 } 1701 FATAL(("%s fails\n", __FUNCTION__)); 1702 1703 RETURN_ERROR(B_ERROR); 1704 } 1705 1706 1707 /** Iterates through the tree in the specified direction. 1708 * When it iterates through duplicates, the "key" is only updated for the 1709 * first entry - if you need to know when this happens, use the "duplicate" 1710 * parameter which is 0 for no duplicate, 1 for the first, and 2 for all 1711 * the other duplicates. 1712 * That's not too nice, but saves the 256 bytes that would be needed to 1713 * store the last key - if this will ever become an issue, it will be 1714 * easy to change. 1715 * The other advantage of this is, that the queries can skip all duplicates 1716 * at once when they are not relevant to them. 1717 */ 1718 1719 status_t 1720 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength, 1721 off_t *value, uint16 *duplicate) 1722 { 1723 if (fTree == NULL) 1724 return B_INTERRUPTED; 1725 if (fCurrentNodeOffset == BPLUSTREE_NULL 1726 && Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 1727 RETURN_ERROR(B_ERROR); 1728 1729 // if the tree was emptied since the last call 1730 if (fCurrentNodeOffset == BPLUSTREE_FREE) 1731 return B_ENTRY_NOT_FOUND; 1732 1733 // lock access to stream 1734 ReadLocked locked(fTree->fStream->Lock()); 1735 1736 CachedNode cached(fTree); 1737 bplustree_node *node; 1738 1739 if (fDuplicateNode != BPLUSTREE_NULL) { 1740 // regardless of traverse direction the duplicates are always presented in 1741 // the same order; since they are all considered as equal, this shouldn't 1742 // cause any problems 1743 1744 if (!fIsFragment || fDuplicate < fNumDuplicates) 1745 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 1746 else 1747 node = NULL; 1748 1749 if (node != NULL) { 1750 if (!fIsFragment && fDuplicate >= fNumDuplicates) { 1751 // if the node is out of duplicates, we go directly to the next one 1752 fDuplicateNode = node->RightLink(); 1753 if (fDuplicateNode != BPLUSTREE_NULL 1754 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) { 1755 fNumDuplicates = node->CountDuplicates(fDuplicateNode, false); 1756 fDuplicate = 0; 1757 } 1758 } 1759 if (fDuplicate < fNumDuplicates) { 1760 *value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++); 1761 if (duplicate) 1762 *duplicate = 2; 1763 return B_OK; 1764 } 1765 } 1766 fDuplicateNode = BPLUSTREE_NULL; 1767 } 1768 1769 off_t savedNodeOffset = fCurrentNodeOffset; 1770 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL) 1771 RETURN_ERROR(B_ERROR); 1772 1773 if (duplicate) 1774 *duplicate = 0; 1775 1776 fCurrentKey += direction; 1777 1778 // is the current key in the current node? 1779 while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->NumKeys()) 1780 || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) { 1781 fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->RightLink() : node->LeftLink(); 1782 1783 // are there any more nodes? 1784 if (fCurrentNodeOffset != BPLUSTREE_NULL) { 1785 node = cached.SetTo(fCurrentNodeOffset); 1786 if (!node) 1787 RETURN_ERROR(B_ERROR); 1788 1789 // reset current key 1790 fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->NumKeys(); 1791 } else { 1792 // there are no nodes left, so turn back to the last key 1793 fCurrentNodeOffset = savedNodeOffset; 1794 fCurrentKey = direction == BPLUSTREE_FORWARD ? node->NumKeys() : -1; 1795 1796 return B_ENTRY_NOT_FOUND; 1797 } 1798 } 1799 1800 if (node->all_key_count == 0) 1801 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ? 1802 1803 uint16 length; 1804 uint8 *keyStart = node->KeyAt(fCurrentKey, &length); 1805 if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize 1806 || length > BPLUSTREE_MAX_KEY_LENGTH) { 1807 fTree->fStream->GetVolume()->Panic(); 1808 RETURN_ERROR(B_BAD_DATA); 1809 } 1810 1811 length = min_c(length, maxLength); 1812 memcpy(key, keyStart, length); 1813 1814 if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) { 1815 // terminate string type 1816 if (length == maxLength) 1817 length--; 1818 ((char *)key)[length] = '\0'; 1819 } 1820 *keyLength = length; 1821 1822 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]); 1823 1824 // duplicate fragments? 1825 uint8 type = bplustree_node::LinkType(offset); 1826 if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) { 1827 fDuplicateNode = offset; 1828 1829 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false); 1830 if (node == NULL) 1831 RETURN_ERROR(B_ERROR); 1832 1833 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 1834 1835 fNumDuplicates = node->CountDuplicates(offset, fIsFragment); 1836 if (fNumDuplicates) { 1837 offset = node->DuplicateAt(offset, fIsFragment, 0); 1838 fDuplicate = 1; 1839 if (duplicate) 1840 *duplicate = 1; 1841 } else { 1842 // shouldn't happen, but we're dealing here with potentially corrupt disks... 1843 fDuplicateNode = BPLUSTREE_NULL; 1844 offset = 0; 1845 } 1846 } 1847 *value = offset; 1848 1849 return B_OK; 1850 } 1851 1852 1853 /** This is more or less a copy of BPlusTree::Find() - but it just 1854 * sets the current position in the iterator, regardless of if the 1855 * key could be found or not. 1856 */ 1857 1858 status_t 1859 TreeIterator::Find(const uint8 *key, uint16 keyLength) 1860 { 1861 if (fTree == NULL) 1862 return B_INTERRUPTED; 1863 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH 1864 || key == NULL) 1865 RETURN_ERROR(B_BAD_VALUE); 1866 1867 // lock access to stream 1868 ReadLocked locked(fTree->fStream->Lock()); 1869 1870 off_t nodeOffset = fTree->fHeader->RootNode(); 1871 1872 CachedNode cached(fTree); 1873 bplustree_node *node; 1874 while ((node = cached.SetTo(nodeOffset)) != NULL) { 1875 uint16 keyIndex = 0; 1876 off_t nextOffset; 1877 status_t status = fTree->FindKey(node, key, keyLength, &keyIndex, &nextOffset); 1878 1879 if (node->OverflowLink() == BPLUSTREE_NULL) { 1880 fCurrentNodeOffset = nodeOffset; 1881 fCurrentKey = keyIndex - 1; 1882 fDuplicateNode = BPLUSTREE_NULL; 1883 1884 return status; 1885 } else if (nextOffset == nodeOffset) 1886 RETURN_ERROR(B_ERROR); 1887 1888 nodeOffset = nextOffset; 1889 } 1890 RETURN_ERROR(B_ERROR); 1891 } 1892 1893 1894 void 1895 TreeIterator::SkipDuplicates() 1896 { 1897 fDuplicateNode = BPLUSTREE_NULL; 1898 } 1899 1900 1901 void 1902 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change) 1903 { 1904 if (offset != fCurrentNodeOffset) 1905 return; 1906 1907 if (nextOffset != BPLUSTREE_NULL) { 1908 fCurrentNodeOffset = nextOffset; 1909 if (splitAt <= fCurrentKey) { 1910 fCurrentKey -= splitAt; 1911 keyIndex -= splitAt; 1912 } 1913 } 1914 1915 // Adjust fCurrentKey to point to the same key as before. 1916 // Note, that if a key is inserted at the current position 1917 // it won't be included in this tree transition. 1918 if (keyIndex <= fCurrentKey) 1919 fCurrentKey += change; 1920 1921 // ToDo: duplicate handling! 1922 } 1923 1924 1925 void 1926 TreeIterator::Stop() 1927 { 1928 fTree = NULL; 1929 } 1930 1931 1932 #ifdef DEBUG 1933 void 1934 TreeIterator::Dump() 1935 { 1936 __out("TreeIterator at %p:\n", this); 1937 __out("\tfTree = %p\n", fTree); 1938 __out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset); 1939 __out("\tfCurrentKey = %ld\n", fCurrentKey); 1940 __out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode); 1941 __out("\tfDuplicate = %u\n", fDuplicate); 1942 __out("\tfNumDuplicates = %u\n", fNumDuplicates); 1943 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false"); 1944 } 1945 #endif 1946 1947 1948 // #pragma mark - 1949 1950 1951 void 1952 bplustree_node::Initialize() 1953 { 1954 left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL); 1955 all_key_count = 0; 1956 all_key_length = 0; 1957 } 1958 1959 1960 uint8 * 1961 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const 1962 { 1963 if (index < 0 || index > NumKeys()) 1964 return NULL; 1965 1966 uint8 *keyStart = Keys(); 1967 uint16 *keyLengths = KeyLengths(); 1968 1969 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index]) 1970 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0); 1971 if (index > 0) 1972 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]); 1973 1974 return keyStart; 1975 } 1976 1977 1978 uint8 1979 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const 1980 { 1981 // the duplicate fragment handling is currently hard-coded to a node size 1982 // of 1024 bytes - with future versions of BFS, this may be a problem 1983 1984 if (isFragment) { 1985 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff); 1986 1987 return ((off_t *)this)[fragment]; 1988 } 1989 return OverflowLink(); 1990 } 1991 1992 1993 off_t 1994 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const 1995 { 1996 uint32 start; 1997 if (isFragment) 1998 start = 8 * ((uint64)offset & 0x3ff); 1999 else 2000 start = 2; 2001 2002 return ((off_t *)this)[start + 1 + index]; 2003 } 2004 2005 2006 /** Although the name suggests it, this function doesn't return the real 2007 * used fragment count; at least, it can only count to two: it returns 2008 * 0, if there is no fragment used, 1 if there is only one fragment 2009 * used, and 2 if there are at least 2 fragments used. 2010 */ 2011 2012 int32 2013 bplustree_node::FragmentsUsed(uint32 nodeSize) 2014 { 2015 uint32 used = 0; 2016 for (uint32 i = 0; i < nodeSize / ((NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); i++) { 2017 duplicate_array *array = FragmentAt(i); 2018 if (array->count > 0 && ++used > 1) 2019 return used; 2020 } 2021 return used; 2022 } 2023 2024 2025 #ifdef DEBUG 2026 void 2027 bplustree_node::CheckIntegrity(uint32 nodeSize) 2028 { 2029 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize) 2030 DEBUGGER(("invalid node: key/length count")); 2031 2032 for (int32 i = 0; i < NumKeys(); i++) { 2033 uint16 length; 2034 uint8 *key = KeyAt(i, &length); 2035 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize 2036 || length > BPLUSTREE_MAX_KEY_LENGTH) 2037 DEBUGGER(("invalid node: keys corrupted")); 2038 } 2039 } 2040 #endif 2041 2042 2043 // #pragma mark - 2044 2045 2046 int32 2047 compareKeys(type_code type, const void *key1, int keyLength1, const void *key2, int keyLength2) 2048 { 2049 // if one of the keys is NULL, bail out gracefully 2050 if (key1 == NULL || key2 == NULL) { 2051 //#if USER 2052 // // that's here to see if it's reasonable to handle this case nicely at all 2053 // DEBUGGER(("compareKeys() got NULL key!")); 2054 //#endif 2055 // even if it's most probably a bug in the calling code, we try to 2056 // give a meaningful result 2057 if (key1 == NULL && key2 != NULL) 2058 return 1; 2059 else if (key2 == NULL && key1 != NULL) 2060 return -1; 2061 2062 return 0; 2063 } 2064 2065 switch (type) { 2066 case B_STRING_TYPE: 2067 { 2068 int len = min_c(keyLength1, keyLength2); 2069 int result = strncmp((const char *)key1, (const char *)key2, len); 2070 2071 if (result == 0 2072 && !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0')) 2073 result = keyLength1 - keyLength2; 2074 2075 return result; 2076 } 2077 2078 case B_SSIZE_T_TYPE: 2079 case B_INT32_TYPE: 2080 return *(int32 *)key1 - *(int32 *)key2; 2081 2082 case B_SIZE_T_TYPE: 2083 case B_UINT32_TYPE: 2084 if (*(uint32 *)key1 == *(uint32 *)key2) 2085 return 0; 2086 else if (*(uint32 *)key1 > *(uint32 *)key2) 2087 return 1; 2088 2089 return -1; 2090 2091 case B_OFF_T_TYPE: 2092 case B_INT64_TYPE: 2093 if (*(int64 *)key1 == *(int64 *)key2) 2094 return 0; 2095 else if (*(int64 *)key1 > *(int64 *)key2) 2096 return 1; 2097 2098 return -1; 2099 2100 case B_UINT64_TYPE: 2101 if (*(uint64 *)key1 == *(uint64 *)key2) 2102 return 0; 2103 else if (*(uint64 *)key1 > *(uint64 *)key2) 2104 return 1; 2105 2106 return -1; 2107 2108 case B_FLOAT_TYPE: 2109 { 2110 float result = *(float *)key1 - *(float *)key2; 2111 if (result == 0.0f) 2112 return 0; 2113 2114 return (result < 0.0f) ? -1 : 1; 2115 } 2116 2117 case B_DOUBLE_TYPE: 2118 { 2119 double result = *(double *)key1 - *(double *)key2; 2120 if (result == 0.0) 2121 return 0; 2122 2123 return (result < 0.0) ? -1 : 1; 2124 } 2125 } 2126 2127 // if the type is unknown, the entries don't match... 2128 return -1; 2129 } 2130 2131