1 /* BPlusTree - BFS B+Tree implementation 2 ** 3 ** Copyright 2001-2002 pinc Software. All Rights Reserved. 4 ** Released under the terms of the MIT license. 5 */ 6 7 8 #include "BPlusTree.h" 9 #include "Inode.h" 10 #include "Stack.h" 11 #include "dump.h" 12 13 #include <Debug.h> 14 15 #include <string.h> 16 #include <stdlib.h> 17 #include <stdio.h> 18 19 20 #define MAX_NODES_IN_CACHE 20 21 22 class CacheableNode : public NodeCache::Cacheable 23 { 24 public: 25 CacheableNode(off_t offset,bplustree_node *node) 26 : 27 fOffset(offset), 28 fNode(node) 29 { 30 } 31 32 virtual ~CacheableNode() 33 { 34 if (fNode) 35 free(fNode); 36 } 37 38 virtual bool Equals(off_t offset) 39 { 40 return offset == fOffset; 41 } 42 43 off_t fOffset; 44 bplustree_node *fNode; 45 }; 46 47 48 NodeCache::NodeCache(BPlusTree *tree) 49 : Cache<off_t>(), 50 fTree(tree) 51 { 52 } 53 54 55 Cache<off_t>::Cacheable * 56 NodeCache::NewCacheable(off_t offset) 57 { 58 return new CacheableNode(offset,fTree->Node(offset,false)); 59 } 60 61 62 bplustree_node * 63 NodeCache::Get(off_t offset) 64 { 65 CacheableNode *node = (CacheableNode *)Cache<off_t>::Get(offset); 66 return node->fNode; 67 } 68 69 70 // #pragma mark - 71 72 73 BPlusTree::BPlusTree(int32 keyType,int32 nodeSize,bool allowDuplicates) 74 : 75 fStream(NULL), 76 fHeader(NULL), 77 fCache(this), 78 fCurrentNodeOffset(BPLUSTREE_NULL) 79 { 80 SetTo(keyType,nodeSize,allowDuplicates); 81 } 82 83 84 BPlusTree::BPlusTree(BPositionIO *stream,bool allowDuplicates) 85 : 86 fStream(NULL), 87 fHeader(NULL), 88 fCache(this), 89 fCurrentNodeOffset(BPLUSTREE_NULL) 90 { 91 SetTo(stream,allowDuplicates); 92 } 93 94 95 BPlusTree::BPlusTree() 96 : 97 fStream(NULL), 98 fHeader(NULL), 99 fNodeSize(BPLUSTREE_NODE_SIZE), 100 fAllowDuplicates(true), 101 fStatus(B_NO_INIT), 102 fCache(this), 103 fCurrentNodeOffset(BPLUSTREE_NULL) 104 { 105 } 106 107 108 BPlusTree::~BPlusTree() 109 { 110 fCache.Clear(); 111 } 112 113 114 void BPlusTree::Initialize(int32 nodeSize) 115 { 116 // free old data 117 fCache.Clear(0,true); 118 119 if (fHeader) 120 free(fHeader); 121 122 fStream = NULL; 123 124 fNodeSize = nodeSize; 125 fHeader = (bplustree_header *)malloc(fNodeSize); 126 memset(fHeader,0,fNodeSize); 127 128 fCurrentNodeOffset = BPLUSTREE_NULL; 129 } 130 131 132 status_t BPlusTree::SetTo(int32 keyType,int32 nodeSize,bool allowDuplicates) 133 { 134 // initializes in-memory B+Tree 135 136 Initialize(nodeSize); 137 138 fAllowDuplicates = allowDuplicates; 139 fCache.SetHoldChanges(true); 140 141 // initialize b+tree header 142 fHeader->magic = BPLUSTREE_MAGIC; 143 fHeader->node_size = fNodeSize; 144 fHeader->max_number_of_levels = 1; 145 fHeader->data_type = keyType; 146 fHeader->root_node_pointer = fNodeSize; 147 fHeader->free_node_pointer = BPLUSTREE_NULL; 148 fHeader->maximum_size = fNodeSize * 2; 149 150 return fStatus = B_OK; 151 } 152 153 154 status_t BPlusTree::SetTo(BPositionIO *stream,bool allowDuplicates) 155 { 156 // initializes on-disk B+Tree 157 158 bplustree_header header; 159 ssize_t read = stream->ReadAt(0,&header,sizeof(bplustree_header)); 160 if (read < 0) 161 return fStatus = read; 162 163 // is header valid? 164 165 stream->Seek(0,SEEK_END); 166 off_t size = stream->Position(); 167 stream->Seek(0,SEEK_SET); 168 169 //dump_bplustree_header(&header); 170 171 if (header.magic != BPLUSTREE_MAGIC 172 || header.maximum_size != size 173 || (header.root_node_pointer % header.node_size) != 0 174 || !header.IsValidLink(header.root_node_pointer) 175 || !header.IsValidLink(header.free_node_pointer)) 176 return fStatus = B_BAD_DATA; 177 178 fAllowDuplicates = allowDuplicates; 179 180 if (DataStream *dataStream = dynamic_cast<DataStream *>(stream)) 181 { 182 uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX, 183 S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX}; 184 uint32 mode = dataStream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 185 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX); 186 187 if (header.data_type > BPLUSTREE_DOUBLE_TYPE 188 || (dataStream->Mode() & S_INDEX_DIR) && toMode[header.data_type] != mode 189 || !dataStream->IsDirectory()) 190 return fStatus = B_BAD_TYPE; 191 192 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 193 fAllowDuplicates = (dataStream->Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR; 194 195 //printf("allows duplicates? %s\n",fAllowDuplicates ? "yes" : "no"); 196 } 197 198 Initialize(header.node_size); 199 200 memcpy(fHeader,&header,sizeof(bplustree_header)); 201 202 fStream = stream; 203 204 bplustree_node *node = fCache.Get(header.root_node_pointer); 205 //if (node) 206 // dump_bplustree_node(node); 207 208 return fStatus = node && CheckNode(node) ? B_OK : B_BAD_DATA; 209 } 210 211 212 status_t BPlusTree::InitCheck() 213 { 214 return fStatus; 215 } 216 217 218 struct validate_info { 219 off_t offset; 220 off_t from; 221 bool free; 222 }; 223 224 225 status_t 226 BPlusTree::Validate(bool verbose) 227 { 228 // validates the on-disk b+tree 229 // (for now only the node integrity is checked) 230 231 if (!fStream) 232 return B_OK; 233 234 struct validate_info info; 235 236 Stack<validate_info> stack; 237 info.offset = fHeader->root_node_pointer; 238 info.from = BPLUSTREE_NULL; 239 info.free = false; 240 stack.Push(info); 241 242 if (fHeader->free_node_pointer != BPLUSTREE_NULL) { 243 info.offset = fHeader->free_node_pointer; 244 info.free = true; 245 stack.Push(info); 246 } 247 248 char escape[3] = {0x1b, '[', 0}; 249 int32 count = 0,freeCount = 0; 250 251 while (true) { 252 if (!stack.Pop(&info)) { 253 if (verbose) { 254 printf(" %" B_PRId32 " B+tree node(s) processed (%" B_PRId32 255 " free node(s)).\n", count, freeCount); 256 } 257 return B_OK; 258 } 259 260 if (!info.free && verbose && ++count % 10 == 0) 261 printf(" %" B_PRId32 "%s1A\n", count, escape); 262 else if (info.free) 263 freeCount++; 264 265 bplustree_node *node; 266 if ((node = fCache.Get(info.offset)) == NULL 267 || !info.free && !CheckNode(node)) { 268 if (verbose) { 269 fprintf(stderr," B+Tree: Could not get data at position %" 270 B_PRIdOFF " (referenced by node at %" B_PRIdOFF ")\n", 271 info.offset, info.from); 272 if ((node = fCache.Get(info.from)) != NULL) 273 dump_bplustree_node(node,fHeader); 274 } 275 return B_BAD_DATA; 276 } 277 278 info.from = info.offset; 279 280 if (node->overflow_link != BPLUSTREE_NULL && !info.free) { 281 info.offset = node->overflow_link; 282 stack.Push(info); 283 284 //dump_bplustree_node(node,fHeader); 285 off_t *values = node->Values(); 286 for (int32 i = 0;i < node->all_key_count;i++) { 287 info.offset = values[i]; 288 stack.Push(info); 289 } 290 } else if (node->left_link != BPLUSTREE_NULL && info.free) { 291 info.offset = node->left_link; 292 stack.Push(info); 293 } 294 } 295 } 296 297 298 status_t 299 BPlusTree::WriteTo(BPositionIO *stream) 300 { 301 ssize_t written = stream->WriteAt(0,fHeader,fNodeSize); 302 if (written < fNodeSize) 303 return written < B_OK ? written : B_IO_ERROR; 304 305 for (off_t offset = fNodeSize;offset < fHeader->maximum_size;offset += fNodeSize) 306 { 307 bplustree_node *node = fCache.Get(offset); 308 if (node == NULL) 309 return B_ERROR; 310 311 written = stream->WriteAt(offset,node,fNodeSize); 312 if (written < fNodeSize) 313 return written < B_OK ? written : B_IO_ERROR; 314 } 315 return stream->SetSize(fHeader->maximum_size); 316 } 317 318 319 // #pragma mark - 320 321 322 void BPlusTree::SetCurrentNode(bplustree_node *node,off_t offset,int8 to) 323 { 324 fCurrentNodeOffset = offset; 325 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->all_key_count; 326 fDuplicateNode = BPLUSTREE_NULL; 327 } 328 329 330 status_t BPlusTree::Goto(int8 to) 331 { 332 if (fHeader == NULL) 333 return B_BAD_VALUE; 334 335 Stack<off_t> stack; 336 if (stack.Push(fHeader->root_node_pointer) < B_OK) 337 return B_NO_MEMORY; 338 339 bplustree_node *node; 340 off_t pos; 341 while (stack.Pop(&pos) && (node = fCache.Get(pos)) != NULL && CheckNode(node)) 342 { 343 // is the node a leaf node? 344 if (node->overflow_link == BPLUSTREE_NULL) 345 { 346 SetCurrentNode(node,pos,to); 347 348 return B_OK; 349 } 350 351 if (to == BPLUSTREE_END || node->all_key_count == 0) 352 pos = node->overflow_link; 353 else 354 { 355 if (node->all_key_length > fNodeSize 356 || (addr_t)node->Values() > (addr_t)node + fNodeSize 357 - 8 * node->all_key_count) 358 return B_ERROR; 359 360 pos = *node->Values(); 361 } 362 363 if (stack.Push(pos) < B_OK) 364 break; 365 } 366 return B_ERROR; 367 } 368 369 370 status_t BPlusTree::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value) 371 { 372 if (fCurrentNodeOffset == BPLUSTREE_NULL 373 && Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 374 return B_ERROR; 375 376 bplustree_node *node; 377 378 if (fDuplicateNode != BPLUSTREE_NULL) 379 { 380 // regardless of traverse direction the duplicates are always presented in 381 // the same order; since they are all considered as equal, this shouldn't 382 // cause any problems 383 384 if (!fIsFragment || fDuplicate < fNumDuplicates) 385 node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode)); 386 else 387 node = NULL; 388 389 if (node != NULL) 390 { 391 if (!fIsFragment && fDuplicate >= fNumDuplicates) 392 { 393 // if the node is out of duplicates, we go directly to the next one 394 fDuplicateNode = node->right_link; 395 if (fDuplicateNode != BPLUSTREE_NULL 396 && (node = fCache.Get(fDuplicateNode)) != NULL) 397 { 398 fNumDuplicates = node->CountDuplicates(fDuplicateNode,false); 399 fDuplicate = 0; 400 } 401 } 402 if (fDuplicate < fNumDuplicates) 403 { 404 *value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++); 405 return B_OK; 406 } 407 } 408 fDuplicateNode = BPLUSTREE_NULL; 409 } 410 411 off_t savedNodeOffset = fCurrentNodeOffset; 412 if ((node = fCache.Get(fCurrentNodeOffset)) == NULL || !CheckNode(node)) 413 return B_ERROR; 414 415 fCurrentKey += direction; 416 417 // is the current key in the current node? 418 while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count) 419 || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) 420 { 421 fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link; 422 423 // are there any more nodes? 424 if (fCurrentNodeOffset != BPLUSTREE_NULL) 425 { 426 node = fCache.Get(fCurrentNodeOffset); 427 if (!node || !CheckNode(node)) 428 return B_ERROR; 429 430 // reset current key 431 fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->all_key_count; 432 } 433 else 434 { 435 // there are no nodes left, so turn back to the last key 436 fCurrentNodeOffset = savedNodeOffset; 437 fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1; 438 439 return B_ENTRY_NOT_FOUND; 440 } 441 } 442 443 if (node->all_key_count == 0) 444 return B_ERROR; //B_ENTRY_NOT_FOUND; 445 446 uint16 length; 447 uint8 *keyStart = node->KeyAt(fCurrentKey,&length); 448 449 length = min_c(length,maxLength); 450 memcpy(key,keyStart,length); 451 452 if (fHeader->data_type == BPLUSTREE_STRING_TYPE) // terminate string type 453 { 454 if (length == maxLength) 455 length--; 456 ((char *)key)[length] = '\0'; 457 } 458 *keyLength = length; 459 460 off_t offset = node->Values()[fCurrentKey]; 461 462 // duplicate fragments? 463 uint8 type = bplustree_node::LinkType(offset); 464 if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) 465 { 466 fDuplicateNode = offset; 467 468 node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode)); 469 if (node == NULL) 470 return B_ERROR; 471 472 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 473 474 fNumDuplicates = node->CountDuplicates(offset,fIsFragment); 475 if (fNumDuplicates) 476 { 477 // give back first duplicate 478 fDuplicate = 1; 479 offset = node->DuplicateAt(offset,fIsFragment,0); 480 } 481 else 482 { 483 // shouldn't happen, but we're dealing here with corrupt disks... 484 fDuplicateNode = BPLUSTREE_NULL; 485 offset = 0; 486 } 487 } 488 *value = offset; 489 490 return B_OK; 491 } 492 493 494 int32 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2) 495 { 496 switch (fHeader->data_type) 497 { 498 case BPLUSTREE_STRING_TYPE: 499 { 500 int len = min_c(keyLength1,keyLength2); 501 int result = strncmp((const char *)key1,(const char *)key2,len); 502 503 if (result == 0) 504 result = keyLength1 - keyLength2; 505 506 return result; 507 } 508 509 case BPLUSTREE_INT32_TYPE: 510 return *(int32 *)key1 - *(int32 *)key2; 511 512 case BPLUSTREE_UINT32_TYPE: 513 { 514 if (*(uint32 *)key1 == *(uint32 *)key2) 515 return 0; 516 else if (*(uint32 *)key1 > *(uint32 *)key2) 517 return 1; 518 519 return -1; 520 } 521 522 case BPLUSTREE_INT64_TYPE: 523 { 524 if (*(int64 *)key1 == *(int64 *)key2) 525 return 0; 526 else if (*(int64 *)key1 > *(int64 *)key2) 527 return 1; 528 529 return -1; 530 } 531 532 case BPLUSTREE_UINT64_TYPE: 533 { 534 if (*(uint64 *)key1 == *(uint64 *)key2) 535 return 0; 536 else if (*(uint64 *)key1 > *(uint64 *)key2) 537 return 1; 538 539 return -1; 540 } 541 542 case BPLUSTREE_FLOAT_TYPE: 543 { 544 float result = *(float *)key1 - *(float *)key2; 545 if (result == 0.0f) 546 return 0; 547 548 return (result < 0.0f) ? -1 : 1; 549 } 550 551 case BPLUSTREE_DOUBLE_TYPE: 552 { 553 double result = *(double *)key1 - *(double *)key2; 554 if (result == 0.0) 555 return 0; 556 557 return (result < 0.0) ? -1 : 1; 558 } 559 } 560 return 0; 561 } 562 563 564 status_t BPlusTree::FindKey(bplustree_node *node,uint8 *key,uint16 keyLength,uint16 *index,off_t *next) 565 { 566 if (node->all_key_count == 0) 567 { 568 if (index) 569 *index = 0; 570 if (next) 571 *next = node->overflow_link; 572 return B_ENTRY_NOT_FOUND; 573 } 574 575 off_t *values = node->Values(); 576 int16 saveIndex = 0; 577 578 // binary search in the key array 579 for (int16 first = 0,last = node->all_key_count - 1;first <= last;) 580 { 581 uint16 i = (first + last) >> 1; 582 583 uint16 searchLength; 584 uint8 *searchKey = node->KeyAt(i,&searchLength); 585 586 int32 cmp = CompareKeys(key,keyLength,searchKey,searchLength); 587 if (cmp < 0) 588 { 589 last = i - 1; 590 saveIndex = i; 591 } 592 else if (cmp > 0) 593 { 594 saveIndex = first = i + 1; 595 } 596 else 597 { 598 if (index) 599 *index = i; 600 if (next) 601 *next = values[i]; 602 return B_OK; 603 } 604 } 605 606 if (index) 607 *index = saveIndex; 608 if (next) 609 { 610 if (saveIndex == node->all_key_count) 611 *next = node->overflow_link; 612 else 613 *next = values[saveIndex]; 614 } 615 return B_ENTRY_NOT_FOUND; 616 } 617 618 619 status_t BPlusTree::SeekDown(Stack<node_and_key> &stack,uint8 *key,uint16 keyLength) 620 { 621 node_and_key nodeAndKey; 622 nodeAndKey.nodeOffset = fHeader->root_node_pointer; 623 nodeAndKey.keyIndex = 0; 624 625 bplustree_node *node; 626 while ((node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) { 627 // if we are already on leaf level, we're done 628 if (node->overflow_link == BPLUSTREE_NULL) { 629 // put the node on the stack 630 // node that the keyIndex is not properly set here! 631 nodeAndKey.keyIndex = 0; 632 stack.Push(nodeAndKey); 633 return B_OK; 634 } 635 636 off_t nextOffset; 637 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset); 638 639 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 640 return B_ERROR; 641 642 // put the node & the correct keyIndex on the stack 643 stack.Push(nodeAndKey); 644 645 nodeAndKey.nodeOffset = nextOffset; 646 } 647 return B_ERROR; 648 } 649 650 651 void BPlusTree::InsertKey(bplustree_node *node,uint8 *key,uint16 keyLength,off_t value,uint16 index) 652 { 653 // should never happen, but who knows? 654 if (index > node->all_key_count) 655 return; 656 657 off_t *values = node->Values(); 658 uint16 *keyLengths = node->KeyLengths(); 659 uint8 *keys = node->Keys(); 660 661 node->all_key_count++; 662 node->all_key_length += keyLength; 663 664 off_t *newValues = node->Values(); 665 uint16 *newKeyLengths = node->KeyLengths(); 666 667 // move values and copy new value into them 668 memmove(newValues + index + 1,values + index,sizeof(off_t) * (node->all_key_count - 1 - index)); 669 memmove(newValues,values,sizeof(off_t) * index); 670 671 newValues[index] = value; 672 673 // move and update key length index 674 for (uint16 i = node->all_key_count;i-- > index + 1;) 675 newKeyLengths[i] = keyLengths[i - 1] + keyLength; 676 memmove(newKeyLengths,keyLengths,sizeof(uint16) * index); 677 678 int32 keyStart; 679 newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0); 680 681 // move keys and copy new key into them 682 int32 size = node->all_key_length - newKeyLengths[index]; 683 if (size > 0) 684 memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size); 685 686 memcpy(keys + keyStart,key,keyLength); 687 } 688 689 690 status_t BPlusTree::InsertDuplicate(bplustree_node */*node*/,uint16 /*index*/) 691 { 692 printf("DUPLICATE ENTRY!!\n"); 693 694 // /* handle dup keys */ 695 // if (dupflg == 0) 696 // { 697 // bt_errno(b) = BT_DUPKEY; 698 // goto bombout; 699 // } 700 // else 701 // { 702 // /* paste new data ptr in page */ 703 // /* and write it out again. */ 704 // off_t *p; 705 // p = (off_t *)KEYCLD(op->p); 706 // *(p + keyat) = rrn; 707 // op->flags = BT_CHE_DIRTY; 708 // if(bt_wpage(b,op) == BT_ERR || 709 // bt_wpage(b,kp) == BT_ERR) 710 // return(BT_ERR); 711 // 712 // /* mark all as well with tree */ 713 // bt_clearerr(b); 714 // return(BT_OK); 715 // } 716 717 return B_OK; 718 } 719 720 721 status_t BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value) 722 { 723 if (*_keyIndex > node->all_key_count + 1) 724 return B_BAD_VALUE; 725 726 //printf("before (insert \"%s\" (%d bytes) at %d):\n\n",key,*_keyLength,*_keyIndex); 727 //dump_bplustree_node(node,fHeader); 728 //hexdump(node,fNodeSize); 729 730 off_t otherOffset; 731 bplustree_node *other = fCache.Get(otherOffset = fHeader->maximum_size); //Node(otherOffset = fHeader->maximum_size/*,false*/); 732 if (other == NULL) 733 return B_NO_MEMORY; 734 735 uint16 *inKeyLengths = node->KeyLengths(); 736 off_t *inKeyValues = node->Values(); 737 uint8 *inKeys = node->Keys(); 738 uint8 *outKeys = other->Keys(); 739 uint16 keyIndex = *_keyIndex; 740 741 // how many keys will fit in one (half) page? 742 743 // "bytes" is the number of bytes written for the new key, 744 // "bytesBefore" are the bytes before that key 745 // "bytesAfter" are the bytes after the new key, if any 746 int32 bytes = 0,bytesBefore = 0,bytesAfter = 0; 747 748 size_t size = fNodeSize >> 1; 749 int32 out,in; 750 for (in = out = 0;in < node->all_key_count + 1;) { 751 if (!bytes) 752 bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0; 753 if (in == keyIndex && !bytes) { 754 bytes = *_keyLength; 755 } else { 756 if (keyIndex < out) { 757 bytesAfter = inKeyLengths[in] - bytesBefore; 758 // fix the key lengths for the new node 759 inKeyLengths[in] = bytesAfter + bytesBefore + bytes; 760 } //else 761 in++; 762 } 763 out++; 764 765 if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) + 766 out * (sizeof(uint16) + sizeof(off_t)) >= size) { 767 // we have found the number of keys in the new node! 768 break; 769 } 770 } 771 772 // if the new key was not inserted, set the length of the keys 773 // that can be copied directly 774 if (keyIndex >= out && in > 0) 775 bytesBefore = inKeyLengths[in - 1]; 776 777 if (bytesBefore < 0 || bytesAfter < 0) 778 return B_BAD_DATA; 779 780 //printf("put %ld keys in the other node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in); 781 782 other->left_link = node->left_link; 783 other->right_link = nodeOffset; 784 other->all_key_length = bytes + bytesBefore + bytesAfter; 785 other->all_key_count = out; 786 //printf("-> used = %ld (bplustree_node = %ld bytes)\n",other->Used(),sizeof(bplustree_node)); 787 788 uint16 *outKeyLengths = other->KeyLengths(); 789 off_t *outKeyValues = other->Values(); 790 int32 keys = out > keyIndex ? keyIndex : out; 791 792 if (bytesBefore) { 793 // copy the keys 794 memcpy(outKeys,inKeys,bytesBefore); 795 memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16)); 796 memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t)); 797 } 798 if (bytes) { 799 // copy the newly inserted key 800 memcpy(outKeys + bytesBefore,key,bytes); 801 outKeyLengths[keyIndex] = bytes + bytesBefore; 802 outKeyValues[keyIndex] = *_value; 803 804 if (bytesAfter) { 805 // copy the keys after the new key 806 memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter); 807 keys = out - keyIndex - 1; 808 memcpy(outKeyLengths + keyIndex + 1,inKeyLengths + keyIndex,keys * sizeof(uint16)); 809 memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t)); 810 } 811 } 812 813 // if the new key was already inserted, we shouldn't use it again 814 if (in != out) 815 keyIndex--; 816 817 int32 total = bytesBefore + bytesAfter; 818 819 // if we have split an index node, we have to drop the first key 820 // of the next node (which can also be the new key to insert) 821 if (node->overflow_link != BPLUSTREE_NULL) { 822 if (in == keyIndex) { 823 other->overflow_link = *_value; 824 keyIndex--; 825 } else { 826 other->overflow_link = inKeyValues[in]; 827 total = inKeyLengths[in++]; 828 } 829 } 830 831 // and now the same game for the other page and the rest of the keys 832 // (but with memmove() instead of memcpy(), because they may overlap) 833 834 bytesBefore = bytesAfter = bytes = 0; 835 out = 0; 836 int32 skip = in; 837 while (in < node->all_key_count + 1) { 838 //printf("in = %ld, keyIndex = %d, bytes = %ld\n",in,keyIndex,bytes); 839 if (in == keyIndex && !bytes) { 840 bytesBefore = in > skip ? inKeyLengths[in - 1] : 0; 841 //printf("bytesBefore = %ld\n",bytesBefore); 842 bytes = *_keyLength; 843 } else if (in < node->all_key_count) { 844 //printf("1.inKeyLength[%ld] = %u\n",in,inKeyLengths[in]); 845 inKeyLengths[in] -= total; 846 if (bytes) { 847 inKeyLengths[in] += bytes; 848 bytesAfter = inKeyLengths[in] - bytesBefore - bytes; 849 //printf("2.inKeyLength[%ld] = %u, bytesAfter = %ld, bytesBefore = %ld\n",in,inKeyLengths[in],bytesAfter,bytesBefore); 850 } 851 852 in++; 853 } else 854 in++; 855 856 out++; 857 858 //printf("-> out = %ld, keylen = %ld, %ld bytes total\n",out,bytesBefore,round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) + 859 // out * (sizeof(uint16) + sizeof(off_t))); 860 if (in > node->all_key_count && keyIndex < in) 861 break; 862 } 863 864 //printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total); 865 866 if (keyIndex >= in && keyIndex - skip < out) { 867 bytesAfter = inKeyLengths[in] - bytesBefore - total; 868 } else if (keyIndex < skip) 869 bytesBefore = node->all_key_length - total; 870 871 //printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total); 872 873 if (bytesBefore < 0 || bytesAfter < 0) 874 return B_BAD_DATA; 875 876 node->left_link = otherOffset; 877 // right link, and overflow link can stay the same 878 node->all_key_length = bytes + bytesBefore + bytesAfter; 879 node->all_key_count = out - 1; 880 881 // array positions have changed 882 outKeyLengths = node->KeyLengths(); 883 outKeyValues = node->Values(); 884 885 //printf("put %ld keys in the first node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in); 886 887 // move the keys in the old node: the order is important here, 888 // because we don't want to overwrite any contents 889 890 keys = keyIndex <= skip ? out : keyIndex - skip; 891 keyIndex -= skip; 892 893 if (bytesBefore) 894 memmove(inKeys,inKeys + total,bytesBefore); 895 if (bytesAfter) 896 memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter); 897 898 if (bytesBefore) 899 memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16)); 900 in = out - keyIndex - 1; 901 if (bytesAfter) 902 memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16)); 903 904 if (bytesBefore) 905 memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t)); 906 if (bytesAfter) 907 memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t)); 908 909 if (bytes) { 910 // finally, copy the newly inserted key (don't overwrite anything) 911 memcpy(inKeys + bytesBefore,key,bytes); 912 outKeyLengths[keyIndex] = bytes + bytesBefore; 913 outKeyValues[keyIndex] = *_value; 914 } 915 916 //puts("\n!!!!!!!!!!!!!!! after: !!!!!!!!!!!!!!!\n"); 917 //dump_bplustree_node(other,fHeader); 918 //hexdump(other,fNodeSize); 919 //puts("\n"); 920 //dump_bplustree_node(node,fHeader); 921 //hexdump(node,fNodeSize); 922 923 // write the updated nodes back 924 925 fCache.SetDirty(otherOffset,true); 926 fCache.SetDirty(nodeOffset,true); 927 928 // update the right link of the node in the left of the new node 929 if (other->left_link != BPLUSTREE_NULL) { 930 bplustree_node *left = fCache.Get(other->left_link); 931 if (left != NULL) { 932 left->right_link = otherOffset; 933 fCache.SetDirty(other->left_link,true); 934 } 935 } 936 937 // prepare key to insert in the parent node 938 939 //printf("skip: %ld, count-1: %u\n",skip,other->all_key_count - 1); 940 uint16 length; 941 uint8 *lastKey = other->KeyAt(other->all_key_count - 1,&length); 942 memcpy(key,lastKey,length); 943 *_keyLength = length; 944 *_value = otherOffset; 945 946 return B_OK; 947 } 948 949 950 status_t BPlusTree::Insert(uint8 *key,uint16 keyLength,off_t value) 951 { 952 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 953 return B_BAD_VALUE; 954 955 Stack<node_and_key> stack; 956 if (SeekDown(stack,key,keyLength) != B_OK) 957 return B_ERROR; 958 959 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 960 961 memcpy(keyBuffer,key,keyLength); 962 keyBuffer[keyLength] = 0; 963 964 off_t valueToInsert = value; 965 966 fCurrentNodeOffset = BPLUSTREE_NULL; 967 968 node_and_key nodeAndKey; 969 bplustree_node *node; 970 uint32 count = 0; 971 972 while (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) 973 { 974 if (count++ == 0) // first round, check for duplicate entries 975 { 976 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex); 977 if (status == B_ERROR) 978 return B_ERROR; 979 980 // is this a duplicate entry? 981 if (status == B_OK && node->overflow_link == BPLUSTREE_NULL) 982 { 983 if (fAllowDuplicates) 984 return InsertDuplicate(node,nodeAndKey.keyIndex); 985 else 986 return B_NAME_IN_USE; 987 } 988 } 989 990 // is the node big enough to hold the pair? 991 if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength) 992 + (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize) 993 { 994 InsertKey(node,keyBuffer,keyLength,valueToInsert,nodeAndKey.keyIndex); 995 fCache.SetDirty(nodeAndKey.nodeOffset,true); 996 997 return B_OK; 998 } 999 else 1000 { 1001 // do we need to allocate a new root node? if so, then do 1002 // it now 1003 bplustree_node *rootNode = NULL; 1004 off_t newRoot = BPLUSTREE_NULL; 1005 if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) { 1006 rootNode = fCache.Get(newRoot = fHeader->maximum_size); 1007 if (rootNode == NULL) { 1008 // the tree is most likely dead 1009 return B_NO_MEMORY; 1010 } 1011 } 1012 1013 if (SplitNode(node,nodeAndKey.nodeOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&valueToInsert) < B_OK) { 1014 if (newRoot != BPLUSTREE_NULL) { 1015 // free root node 1016 } 1017 return B_ERROR; 1018 } 1019 1020 if (newRoot != BPLUSTREE_NULL) { 1021 rootNode->overflow_link = nodeAndKey.nodeOffset; 1022 1023 InsertKey(rootNode,keyBuffer,keyLength,node->left_link,0); 1024 1025 fHeader->root_node_pointer = newRoot; 1026 fHeader->max_number_of_levels++; 1027 // write header 1028 1029 fCache.SetDirty(newRoot,true); 1030 1031 return B_OK; 1032 } 1033 } 1034 } 1035 1036 return B_ERROR; 1037 } 1038 1039 1040 status_t BPlusTree::Find(uint8 *key,uint16 keyLength,off_t *value) 1041 { 1042 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1043 return B_BAD_VALUE; 1044 1045 Stack<node_and_key> stack; 1046 if (SeekDown(stack,key,keyLength) != B_OK) 1047 return B_ERROR; 1048 1049 fCurrentNodeOffset = BPLUSTREE_NULL; 1050 1051 node_and_key nodeAndKey; 1052 bplustree_node *node; 1053 1054 if (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) 1055 { 1056 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex); 1057 if (status == B_ERROR) 1058 return B_ERROR; 1059 1060 SetCurrentNode(node,nodeAndKey.nodeOffset); 1061 1062 if (status == B_OK && node->overflow_link == BPLUSTREE_NULL) 1063 { 1064 *value = node->Values()[nodeAndKey.keyIndex]; 1065 return B_OK; 1066 } 1067 } 1068 return B_ENTRY_NOT_FOUND; 1069 } 1070 1071 1072 // #pragma mark - 1073 1074 1075 bool 1076 BPlusTree::CheckNode(bplustree_node *node) 1077 { 1078 if (!fHeader->IsValidLink(node->left_link) 1079 || !fHeader->IsValidLink(node->right_link) 1080 || !fHeader->IsValidLink(node->overflow_link) 1081 || (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize) 1082 return false; 1083 1084 return true; 1085 } 1086 1087 1088 bplustree_node *BPlusTree::Node(off_t nodeOffset,bool check) 1089 { 1090 /*printf("1: %d,%d,%d\n", 1091 nodeOffset > fHeader->maximum_size - fNodeSize, 1092 nodeOffset < 0 && nodeOffset != BPLUSTREE_NULL, 1093 (nodeOffset % fNodeSize) != 0); 1094 */ 1095 // the super node is always in memory, and shouldn't 1096 // never be taken out of the cache 1097 if (nodeOffset > fHeader->maximum_size /*- fNodeSize*/ 1098 || nodeOffset <= 0 && nodeOffset != BPLUSTREE_NULL 1099 || (nodeOffset % fNodeSize) != 0) 1100 return NULL; 1101 1102 bplustree_node *node = (bplustree_node *)malloc(fNodeSize); 1103 if (node == NULL) 1104 return NULL; 1105 1106 if (nodeOffset == BPLUSTREE_NULL || !fStream) 1107 { 1108 node->left_link = BPLUSTREE_NULL; 1109 node->right_link = BPLUSTREE_NULL; 1110 node->overflow_link = BPLUSTREE_NULL; 1111 node->all_key_count = 0; 1112 node->all_key_length = 0; 1113 } 1114 else if (fStream && fStream->ReadAt(nodeOffset,node,fNodeSize) < fNodeSize) 1115 { 1116 free(node); 1117 return NULL; 1118 } 1119 1120 if (check && node != NULL) 1121 { 1122 // sanity checks (links, all_key_count) 1123 if (!fHeader->IsValidLink(node->left_link) 1124 || !fHeader->IsValidLink(node->right_link) 1125 || !fHeader->IsValidLink(node->overflow_link) 1126 || (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize) 1127 return NULL; 1128 } 1129 1130 if (!fStream && nodeOffset > fHeader->maximum_size - fNodeSize) { 1131 // do some hacks here 1132 fHeader->maximum_size += fNodeSize; 1133 } 1134 1135 return node; 1136 } 1137 1138 1139 void BPlusTree::SetHoldChanges(bool hold) 1140 { 1141 fCache.SetHoldChanges(hold); 1142 } 1143 1144 1145 // #pragma mark - 1146 1147 1148 uint8 *bplustree_node::KeyAt(int32 index,uint16 *keyLength) const 1149 { 1150 if (index < 0 || index > all_key_count) 1151 return NULL; 1152 1153 uint8 *keyStart = Keys(); 1154 uint16 *keyLengths = KeyLengths(); 1155 1156 *keyLength = keyLengths[index] - (index != 0 ? keyLengths[index - 1] : 0); 1157 if (index > 0) 1158 keyStart += keyLengths[index - 1]; 1159 1160 return keyStart; 1161 } 1162 1163 1164 uint8 bplustree_node::CountDuplicates(off_t offset,bool isFragment) const 1165 { 1166 // the duplicate fragment handling is currently hard-coded to a node size 1167 // of 1024 bytes - with future versions of BFS, this may be a problem 1168 1169 if (isFragment) 1170 { 1171 uint32 fragment = 8 * ((uint64)offset & 0x3ff); 1172 1173 return ((off_t *)this)[fragment]; 1174 } 1175 return overflow_link; 1176 } 1177 1178 1179 off_t bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const 1180 { 1181 uint32 start; 1182 if (isFragment) 1183 start = 8 * ((uint64)offset & 0x3ff); 1184 else 1185 start = 2; 1186 1187 return ((off_t *)this)[start + 1 + index]; 1188 } 1189 1190