1 /* BPlusTree - BFS B+Tree implementation 2 ** 3 ** Copyright 2001-2002 pinc Software. All Rights Reserved. 4 */ 5 6 7 #include "BPlusTree.h" 8 #include "Inode.h" 9 #include "Stack.h" 10 #include "dump.h" 11 12 #include <Debug.h> 13 14 #include <string.h> 15 #include <stdlib.h> 16 #include <stdio.h> 17 18 19 #define MAX_NODES_IN_CACHE 20 20 21 class CacheableNode : public NodeCache::Cacheable 22 { 23 public: 24 CacheableNode(off_t offset,bplustree_node *node) 25 : 26 fOffset(offset), 27 fNode(node) 28 { 29 } 30 31 virtual ~CacheableNode() 32 { 33 if (fNode) 34 free(fNode); 35 } 36 37 virtual bool Equals(off_t offset) 38 { 39 return offset == fOffset; 40 } 41 42 off_t fOffset; 43 bplustree_node *fNode; 44 }; 45 46 47 NodeCache::NodeCache(BPlusTree *tree) 48 : Cache<off_t>(), 49 fTree(tree) 50 { 51 } 52 53 54 Cache<off_t>::Cacheable * 55 NodeCache::NewCacheable(off_t offset) 56 { 57 return new CacheableNode(offset,fTree->Node(offset,false)); 58 } 59 60 61 bplustree_node * 62 NodeCache::Get(off_t offset) 63 { 64 CacheableNode *node = (CacheableNode *)Cache<off_t>::Get(offset); 65 return node->fNode; 66 } 67 68 69 // #pragma mark - 70 71 72 BPlusTree::BPlusTree(int32 keyType,int32 nodeSize,bool allowDuplicates) 73 : 74 fStream(NULL), 75 fHeader(NULL), 76 fCache(this), 77 fCurrentNodeOffset(BPLUSTREE_NULL) 78 { 79 SetTo(keyType,nodeSize,allowDuplicates); 80 } 81 82 83 BPlusTree::BPlusTree(BPositionIO *stream,bool allowDuplicates) 84 : 85 fStream(NULL), 86 fHeader(NULL), 87 fCache(this), 88 fCurrentNodeOffset(BPLUSTREE_NULL) 89 { 90 SetTo(stream,allowDuplicates); 91 } 92 93 94 BPlusTree::BPlusTree() 95 : 96 fStream(NULL), 97 fHeader(NULL), 98 fNodeSize(BPLUSTREE_NODE_SIZE), 99 fAllowDuplicates(true), 100 fStatus(B_NO_INIT), 101 fCache(this), 102 fCurrentNodeOffset(BPLUSTREE_NULL) 103 { 104 } 105 106 107 BPlusTree::~BPlusTree() 108 { 109 fCache.Clear(); 110 } 111 112 113 void BPlusTree::Initialize(int32 nodeSize) 114 { 115 // free old data 116 fCache.Clear(0,true); 117 118 if (fHeader) 119 free(fHeader); 120 121 fStream = NULL; 122 123 fNodeSize = nodeSize; 124 fHeader = (bplustree_header *)malloc(fNodeSize); 125 memset(fHeader,0,fNodeSize); 126 127 fCurrentNodeOffset = BPLUSTREE_NULL; 128 } 129 130 131 status_t BPlusTree::SetTo(int32 keyType,int32 nodeSize,bool allowDuplicates) 132 { 133 // initializes in-memory B+Tree 134 135 Initialize(nodeSize); 136 137 fAllowDuplicates = allowDuplicates; 138 fCache.SetHoldChanges(true); 139 140 // initialize b+tree header 141 fHeader->magic = BPLUSTREE_MAGIC; 142 fHeader->node_size = fNodeSize; 143 fHeader->max_number_of_levels = 1; 144 fHeader->data_type = keyType; 145 fHeader->root_node_pointer = fNodeSize; 146 fHeader->free_node_pointer = BPLUSTREE_NULL; 147 fHeader->maximum_size = fNodeSize * 2; 148 149 return fStatus = B_OK; 150 } 151 152 153 status_t BPlusTree::SetTo(BPositionIO *stream,bool allowDuplicates) 154 { 155 // initializes on-disk B+Tree 156 157 bplustree_header header; 158 ssize_t read = stream->ReadAt(0,&header,sizeof(bplustree_header)); 159 if (read < 0) 160 return fStatus = read; 161 162 // is header valid? 163 164 stream->Seek(0,SEEK_END); 165 off_t size = stream->Position(); 166 stream->Seek(0,SEEK_SET); 167 168 //dump_bplustree_header(&header); 169 170 if (header.magic != BPLUSTREE_MAGIC 171 || header.maximum_size != size 172 || (header.root_node_pointer % header.node_size) != 0 173 || !header.IsValidLink(header.root_node_pointer) 174 || !header.IsValidLink(header.free_node_pointer)) 175 return fStatus = B_BAD_DATA; 176 177 fAllowDuplicates = allowDuplicates; 178 179 if (DataStream *dataStream = dynamic_cast<DataStream *>(stream)) 180 { 181 uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX, 182 S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX}; 183 uint32 mode = dataStream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 184 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX); 185 186 if (header.data_type > BPLUSTREE_DOUBLE_TYPE 187 || (dataStream->Mode() & S_INDEX_DIR) && toMode[header.data_type] != mode 188 || !dataStream->IsDirectory()) 189 return fStatus = B_BAD_TYPE; 190 191 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 192 fAllowDuplicates = (dataStream->Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR; 193 194 //printf("allows duplicates? %s\n",fAllowDuplicates ? "yes" : "no"); 195 } 196 197 Initialize(header.node_size); 198 199 memcpy(fHeader,&header,sizeof(bplustree_header)); 200 201 fStream = stream; 202 203 bplustree_node *node = fCache.Get(header.root_node_pointer); 204 //if (node) 205 // dump_bplustree_node(node); 206 207 return fStatus = node && CheckNode(node) ? B_OK : B_BAD_DATA; 208 } 209 210 211 status_t BPlusTree::InitCheck() 212 { 213 return fStatus; 214 } 215 216 217 struct validate_info { 218 off_t offset; 219 off_t from; 220 bool free; 221 }; 222 223 224 status_t 225 BPlusTree::Validate(bool verbose) 226 { 227 // validates the on-disk b+tree 228 // (for now only the node integrity is checked) 229 230 if (!fStream) 231 return B_OK; 232 233 struct validate_info info; 234 235 Stack<validate_info> stack; 236 info.offset = fHeader->root_node_pointer; 237 info.from = BPLUSTREE_NULL; 238 info.free = false; 239 stack.Push(info); 240 241 if (fHeader->free_node_pointer != BPLUSTREE_NULL) { 242 info.offset = fHeader->free_node_pointer; 243 info.free = true; 244 stack.Push(info); 245 } 246 247 char escape[3] = {0x1b, '[', 0}; 248 int32 count = 0,freeCount = 0; 249 250 while (true) { 251 if (!stack.Pop(&info)) { 252 if (verbose) { 253 printf(" %ld B+tree node(s) processed (%ld free node(s)).\n", 254 count, freeCount); 255 } 256 return B_OK; 257 } 258 259 if (!info.free && verbose && ++count % 10 == 0) 260 printf(" %ld%s1A\n", count, escape); 261 else if (info.free) 262 freeCount++; 263 264 bplustree_node *node; 265 if ((node = fCache.Get(info.offset)) == NULL 266 || !info.free && !CheckNode(node)) { 267 if (verbose) { 268 fprintf(stderr," B+Tree: Could not get data at position %Ld (referenced by node at %Ld)\n",info.offset,info.from); 269 if ((node = fCache.Get(info.from)) != NULL) 270 dump_bplustree_node(node,fHeader); 271 } 272 return B_BAD_DATA; 273 } 274 275 info.from = info.offset; 276 277 if (node->overflow_link != BPLUSTREE_NULL && !info.free) { 278 info.offset = node->overflow_link; 279 stack.Push(info); 280 281 //dump_bplustree_node(node,fHeader); 282 off_t *values = node->Values(); 283 for (int32 i = 0;i < node->all_key_count;i++) { 284 info.offset = values[i]; 285 stack.Push(info); 286 } 287 } else if (node->left_link != BPLUSTREE_NULL && info.free) { 288 info.offset = node->left_link; 289 stack.Push(info); 290 } 291 } 292 } 293 294 295 status_t 296 BPlusTree::WriteTo(BPositionIO *stream) 297 { 298 ssize_t written = stream->WriteAt(0,fHeader,fNodeSize); 299 if (written < fNodeSize) 300 return written < B_OK ? written : B_IO_ERROR; 301 302 for (off_t offset = fNodeSize;offset < fHeader->maximum_size;offset += fNodeSize) 303 { 304 bplustree_node *node = fCache.Get(offset); 305 if (node == NULL) 306 return B_ERROR; 307 308 written = stream->WriteAt(offset,node,fNodeSize); 309 if (written < fNodeSize) 310 return written < B_OK ? written : B_IO_ERROR; 311 } 312 return stream->SetSize(fHeader->maximum_size); 313 } 314 315 316 // #pragma mark - 317 318 319 void BPlusTree::SetCurrentNode(bplustree_node *node,off_t offset,int8 to) 320 { 321 fCurrentNodeOffset = offset; 322 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->all_key_count; 323 fDuplicateNode = BPLUSTREE_NULL; 324 } 325 326 327 status_t BPlusTree::Goto(int8 to) 328 { 329 if (fHeader == NULL) 330 return B_BAD_VALUE; 331 332 Stack<off_t> stack; 333 if (stack.Push(fHeader->root_node_pointer) < B_OK) 334 return B_NO_MEMORY; 335 336 bplustree_node *node; 337 off_t pos; 338 while (stack.Pop(&pos) && (node = fCache.Get(pos)) != NULL && CheckNode(node)) 339 { 340 // is the node a leaf node? 341 if (node->overflow_link == BPLUSTREE_NULL) 342 { 343 SetCurrentNode(node,pos,to); 344 345 return B_OK; 346 } 347 348 if (to == BPLUSTREE_END || node->all_key_count == 0) 349 pos = node->overflow_link; 350 else 351 { 352 if (node->all_key_length > fNodeSize 353 || (uint32)node->Values() > (uint32)node + fNodeSize - 8 * node->all_key_count) 354 return B_ERROR; 355 356 pos = *node->Values(); 357 } 358 359 if (stack.Push(pos) < B_OK) 360 break; 361 } 362 return B_ERROR; 363 } 364 365 366 status_t BPlusTree::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value) 367 { 368 if (fCurrentNodeOffset == BPLUSTREE_NULL 369 && Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 370 return B_ERROR; 371 372 bplustree_node *node; 373 374 if (fDuplicateNode != BPLUSTREE_NULL) 375 { 376 // regardless of traverse direction the duplicates are always presented in 377 // the same order; since they are all considered as equal, this shouldn't 378 // cause any problems 379 380 if (!fIsFragment || fDuplicate < fNumDuplicates) 381 node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode)); 382 else 383 node = NULL; 384 385 if (node != NULL) 386 { 387 if (!fIsFragment && fDuplicate >= fNumDuplicates) 388 { 389 // if the node is out of duplicates, we go directly to the next one 390 fDuplicateNode = node->right_link; 391 if (fDuplicateNode != BPLUSTREE_NULL 392 && (node = fCache.Get(fDuplicateNode)) != NULL) 393 { 394 fNumDuplicates = node->CountDuplicates(fDuplicateNode,false); 395 fDuplicate = 0; 396 } 397 } 398 if (fDuplicate < fNumDuplicates) 399 { 400 *value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++); 401 return B_OK; 402 } 403 } 404 fDuplicateNode = BPLUSTREE_NULL; 405 } 406 407 off_t savedNodeOffset = fCurrentNodeOffset; 408 if ((node = fCache.Get(fCurrentNodeOffset)) == NULL || !CheckNode(node)) 409 return B_ERROR; 410 411 fCurrentKey += direction; 412 413 // is the current key in the current node? 414 while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count) 415 || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) 416 { 417 fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link; 418 419 // are there any more nodes? 420 if (fCurrentNodeOffset != BPLUSTREE_NULL) 421 { 422 node = fCache.Get(fCurrentNodeOffset); 423 if (!node || !CheckNode(node)) 424 return B_ERROR; 425 426 // reset current key 427 fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->all_key_count; 428 } 429 else 430 { 431 // there are no nodes left, so turn back to the last key 432 fCurrentNodeOffset = savedNodeOffset; 433 fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1; 434 435 return B_ENTRY_NOT_FOUND; 436 } 437 } 438 439 if (node->all_key_count == 0) 440 return B_ERROR; //B_ENTRY_NOT_FOUND; 441 442 uint16 length; 443 uint8 *keyStart = node->KeyAt(fCurrentKey,&length); 444 445 length = min_c(length,maxLength); 446 memcpy(key,keyStart,length); 447 448 if (fHeader->data_type == BPLUSTREE_STRING_TYPE) // terminate string type 449 { 450 if (length == maxLength) 451 length--; 452 ((char *)key)[length] = '\0'; 453 } 454 *keyLength = length; 455 456 off_t offset = node->Values()[fCurrentKey]; 457 458 // duplicate fragments? 459 uint8 type = bplustree_node::LinkType(offset); 460 if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) 461 { 462 fDuplicateNode = offset; 463 464 node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode)); 465 if (node == NULL) 466 return B_ERROR; 467 468 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 469 470 fNumDuplicates = node->CountDuplicates(offset,fIsFragment); 471 if (fNumDuplicates) 472 { 473 // give back first duplicate 474 fDuplicate = 1; 475 offset = node->DuplicateAt(offset,fIsFragment,0); 476 } 477 else 478 { 479 // shouldn't happen, but we're dealing here with corrupt disks... 480 fDuplicateNode = BPLUSTREE_NULL; 481 offset = 0; 482 } 483 } 484 *value = offset; 485 486 return B_OK; 487 } 488 489 490 int32 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2) 491 { 492 switch (fHeader->data_type) 493 { 494 case BPLUSTREE_STRING_TYPE: 495 { 496 int len = min_c(keyLength1,keyLength2); 497 int result = strncmp((const char *)key1,(const char *)key2,len); 498 499 if (result == 0) 500 result = keyLength1 - keyLength2; 501 502 return result; 503 } 504 505 case BPLUSTREE_INT32_TYPE: 506 return *(int32 *)key1 - *(int32 *)key2; 507 508 case BPLUSTREE_UINT32_TYPE: 509 { 510 if (*(uint32 *)key1 == *(uint32 *)key2) 511 return 0; 512 else if (*(uint32 *)key1 > *(uint32 *)key2) 513 return 1; 514 515 return -1; 516 } 517 518 case BPLUSTREE_INT64_TYPE: 519 { 520 if (*(int64 *)key1 == *(int64 *)key2) 521 return 0; 522 else if (*(int64 *)key1 > *(int64 *)key2) 523 return 1; 524 525 return -1; 526 } 527 528 case BPLUSTREE_UINT64_TYPE: 529 { 530 if (*(uint64 *)key1 == *(uint64 *)key2) 531 return 0; 532 else if (*(uint64 *)key1 > *(uint64 *)key2) 533 return 1; 534 535 return -1; 536 } 537 538 case BPLUSTREE_FLOAT_TYPE: 539 { 540 float result = *(float *)key1 - *(float *)key2; 541 if (result == 0.0f) 542 return 0; 543 544 return (result < 0.0f) ? -1 : 1; 545 } 546 547 case BPLUSTREE_DOUBLE_TYPE: 548 { 549 double result = *(double *)key1 - *(double *)key2; 550 if (result == 0.0) 551 return 0; 552 553 return (result < 0.0) ? -1 : 1; 554 } 555 } 556 return 0; 557 } 558 559 560 status_t BPlusTree::FindKey(bplustree_node *node,uint8 *key,uint16 keyLength,uint16 *index,off_t *next) 561 { 562 if (node->all_key_count == 0) 563 { 564 if (index) 565 *index = 0; 566 if (next) 567 *next = node->overflow_link; 568 return B_ENTRY_NOT_FOUND; 569 } 570 571 off_t *values = node->Values(); 572 int16 saveIndex = 0; 573 574 // binary search in the key array 575 for (int16 first = 0,last = node->all_key_count - 1;first <= last;) 576 { 577 uint16 i = (first + last) >> 1; 578 579 uint16 searchLength; 580 uint8 *searchKey = node->KeyAt(i,&searchLength); 581 582 int32 cmp = CompareKeys(key,keyLength,searchKey,searchLength); 583 if (cmp < 0) 584 { 585 last = i - 1; 586 saveIndex = i; 587 } 588 else if (cmp > 0) 589 { 590 saveIndex = first = i + 1; 591 } 592 else 593 { 594 if (index) 595 *index = i; 596 if (next) 597 *next = values[i]; 598 return B_OK; 599 } 600 } 601 602 if (index) 603 *index = saveIndex; 604 if (next) 605 { 606 if (saveIndex == node->all_key_count) 607 *next = node->overflow_link; 608 else 609 *next = values[saveIndex]; 610 } 611 return B_ENTRY_NOT_FOUND; 612 } 613 614 615 status_t BPlusTree::SeekDown(Stack<node_and_key> &stack,uint8 *key,uint16 keyLength) 616 { 617 node_and_key nodeAndKey; 618 nodeAndKey.nodeOffset = fHeader->root_node_pointer; 619 nodeAndKey.keyIndex = 0; 620 621 bplustree_node *node; 622 while ((node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) { 623 // if we are already on leaf level, we're done 624 if (node->overflow_link == BPLUSTREE_NULL) { 625 // put the node on the stack 626 // node that the keyIndex is not properly set here! 627 nodeAndKey.keyIndex = 0; 628 stack.Push(nodeAndKey); 629 return B_OK; 630 } 631 632 off_t nextOffset; 633 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset); 634 635 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 636 return B_ERROR; 637 638 // put the node & the correct keyIndex on the stack 639 stack.Push(nodeAndKey); 640 641 nodeAndKey.nodeOffset = nextOffset; 642 } 643 return B_ERROR; 644 } 645 646 647 void BPlusTree::InsertKey(bplustree_node *node,uint8 *key,uint16 keyLength,off_t value,uint16 index) 648 { 649 // should never happen, but who knows? 650 if (index > node->all_key_count) 651 return; 652 653 off_t *values = node->Values(); 654 uint16 *keyLengths = node->KeyLengths(); 655 uint8 *keys = node->Keys(); 656 657 node->all_key_count++; 658 node->all_key_length += keyLength; 659 660 off_t *newValues = node->Values(); 661 uint16 *newKeyLengths = node->KeyLengths(); 662 663 // move values and copy new value into them 664 memmove(newValues + index + 1,values + index,sizeof(off_t) * (node->all_key_count - 1 - index)); 665 memmove(newValues,values,sizeof(off_t) * index); 666 667 newValues[index] = value; 668 669 // move and update key length index 670 for (uint16 i = node->all_key_count;i-- > index + 1;) 671 newKeyLengths[i] = keyLengths[i - 1] + keyLength; 672 memmove(newKeyLengths,keyLengths,sizeof(uint16) * index); 673 674 int32 keyStart; 675 newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0); 676 677 // move keys and copy new key into them 678 int32 size = node->all_key_length - newKeyLengths[index]; 679 if (size > 0) 680 memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size); 681 682 memcpy(keys + keyStart,key,keyLength); 683 } 684 685 686 status_t BPlusTree::InsertDuplicate(bplustree_node */*node*/,uint16 /*index*/) 687 { 688 printf("DUPLICATE ENTRY!!\n"); 689 690 // /* handle dup keys */ 691 // if (dupflg == 0) 692 // { 693 // bt_errno(b) = BT_DUPKEY; 694 // goto bombout; 695 // } 696 // else 697 // { 698 // /* paste new data ptr in page */ 699 // /* and write it out again. */ 700 // off_t *p; 701 // p = (off_t *)KEYCLD(op->p); 702 // *(p + keyat) = rrn; 703 // op->flags = BT_CHE_DIRTY; 704 // if(bt_wpage(b,op) == BT_ERR || 705 // bt_wpage(b,kp) == BT_ERR) 706 // return(BT_ERR); 707 // 708 // /* mark all as well with tree */ 709 // bt_clearerr(b); 710 // return(BT_OK); 711 // } 712 713 return B_OK; 714 } 715 716 717 status_t BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value) 718 { 719 if (*_keyIndex > node->all_key_count + 1) 720 return B_BAD_VALUE; 721 722 //printf("before (insert \"%s\" (%d bytes) at %d):\n\n",key,*_keyLength,*_keyIndex); 723 //dump_bplustree_node(node,fHeader); 724 //hexdump(node,fNodeSize); 725 726 off_t otherOffset; 727 bplustree_node *other = fCache.Get(otherOffset = fHeader->maximum_size); //Node(otherOffset = fHeader->maximum_size/*,false*/); 728 if (other == NULL) 729 return B_NO_MEMORY; 730 731 uint16 *inKeyLengths = node->KeyLengths(); 732 off_t *inKeyValues = node->Values(); 733 uint8 *inKeys = node->Keys(); 734 uint8 *outKeys = other->Keys(); 735 uint16 keyIndex = *_keyIndex; 736 737 // how many keys will fit in one (half) page? 738 739 // "bytes" is the number of bytes written for the new key, 740 // "bytesBefore" are the bytes before that key 741 // "bytesAfter" are the bytes after the new key, if any 742 int32 bytes = 0,bytesBefore = 0,bytesAfter = 0; 743 744 size_t size = fNodeSize >> 1; 745 int32 out,in; 746 for (in = out = 0;in < node->all_key_count + 1;) { 747 if (!bytes) 748 bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0; 749 if (in == keyIndex && !bytes) { 750 bytes = *_keyLength; 751 } else { 752 if (keyIndex < out) { 753 bytesAfter = inKeyLengths[in] - bytesBefore; 754 // fix the key lengths for the new node 755 inKeyLengths[in] = bytesAfter + bytesBefore + bytes; 756 } //else 757 in++; 758 } 759 out++; 760 761 if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) + 762 out * (sizeof(uint16) + sizeof(off_t)) >= size) { 763 // we have found the number of keys in the new node! 764 break; 765 } 766 } 767 768 // if the new key was not inserted, set the length of the keys 769 // that can be copied directly 770 if (keyIndex >= out && in > 0) 771 bytesBefore = inKeyLengths[in - 1]; 772 773 if (bytesBefore < 0 || bytesAfter < 0) 774 return B_BAD_DATA; 775 776 //printf("put %ld keys in the other node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in); 777 778 other->left_link = node->left_link; 779 other->right_link = nodeOffset; 780 other->all_key_length = bytes + bytesBefore + bytesAfter; 781 other->all_key_count = out; 782 //printf("-> used = %ld (bplustree_node = %ld bytes)\n",other->Used(),sizeof(bplustree_node)); 783 784 uint16 *outKeyLengths = other->KeyLengths(); 785 off_t *outKeyValues = other->Values(); 786 int32 keys = out > keyIndex ? keyIndex : out; 787 788 if (bytesBefore) { 789 // copy the keys 790 memcpy(outKeys,inKeys,bytesBefore); 791 memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16)); 792 memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t)); 793 } 794 if (bytes) { 795 // copy the newly inserted key 796 memcpy(outKeys + bytesBefore,key,bytes); 797 outKeyLengths[keyIndex] = bytes + bytesBefore; 798 outKeyValues[keyIndex] = *_value; 799 800 if (bytesAfter) { 801 // copy the keys after the new key 802 memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter); 803 keys = out - keyIndex - 1; 804 memcpy(outKeyLengths + keyIndex + 1,inKeyLengths + keyIndex,keys * sizeof(uint16)); 805 memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t)); 806 } 807 } 808 809 // if the new key was already inserted, we shouldn't use it again 810 if (in != out) 811 keyIndex--; 812 813 int32 total = bytesBefore + bytesAfter; 814 815 // if we have split an index node, we have to drop the first key 816 // of the next node (which can also be the new key to insert) 817 if (node->overflow_link != BPLUSTREE_NULL) { 818 if (in == keyIndex) { 819 other->overflow_link = *_value; 820 keyIndex--; 821 } else { 822 other->overflow_link = inKeyValues[in]; 823 total = inKeyLengths[in++]; 824 } 825 } 826 827 // and now the same game for the other page and the rest of the keys 828 // (but with memmove() instead of memcpy(), because they may overlap) 829 830 bytesBefore = bytesAfter = bytes = 0; 831 out = 0; 832 int32 skip = in; 833 while (in < node->all_key_count + 1) { 834 //printf("in = %ld, keyIndex = %d, bytes = %ld\n",in,keyIndex,bytes); 835 if (in == keyIndex && !bytes) { 836 bytesBefore = in > skip ? inKeyLengths[in - 1] : 0; 837 //printf("bytesBefore = %ld\n",bytesBefore); 838 bytes = *_keyLength; 839 } else if (in < node->all_key_count) { 840 //printf("1.inKeyLength[%ld] = %u\n",in,inKeyLengths[in]); 841 inKeyLengths[in] -= total; 842 if (bytes) { 843 inKeyLengths[in] += bytes; 844 bytesAfter = inKeyLengths[in] - bytesBefore - bytes; 845 //printf("2.inKeyLength[%ld] = %u, bytesAfter = %ld, bytesBefore = %ld\n",in,inKeyLengths[in],bytesAfter,bytesBefore); 846 } 847 848 in++; 849 } else 850 in++; 851 852 out++; 853 854 //printf("-> out = %ld, keylen = %ld, %ld bytes total\n",out,bytesBefore,round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) + 855 // out * (sizeof(uint16) + sizeof(off_t))); 856 if (in > node->all_key_count && keyIndex < in) 857 break; 858 } 859 860 //printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total); 861 862 if (keyIndex >= in && keyIndex - skip < out) { 863 bytesAfter = inKeyLengths[in] - bytesBefore - total; 864 } else if (keyIndex < skip) 865 bytesBefore = node->all_key_length - total; 866 867 //printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total); 868 869 if (bytesBefore < 0 || bytesAfter < 0) 870 return B_BAD_DATA; 871 872 node->left_link = otherOffset; 873 // right link, and overflow link can stay the same 874 node->all_key_length = bytes + bytesBefore + bytesAfter; 875 node->all_key_count = out - 1; 876 877 // array positions have changed 878 outKeyLengths = node->KeyLengths(); 879 outKeyValues = node->Values(); 880 881 //printf("put %ld keys in the first node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in); 882 883 // move the keys in the old node: the order is important here, 884 // because we don't want to overwrite any contents 885 886 keys = keyIndex <= skip ? out : keyIndex - skip; 887 keyIndex -= skip; 888 889 if (bytesBefore) 890 memmove(inKeys,inKeys + total,bytesBefore); 891 if (bytesAfter) 892 memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter); 893 894 if (bytesBefore) 895 memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16)); 896 in = out - keyIndex - 1; 897 if (bytesAfter) 898 memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16)); 899 900 if (bytesBefore) 901 memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t)); 902 if (bytesAfter) 903 memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t)); 904 905 if (bytes) { 906 // finally, copy the newly inserted key (don't overwrite anything) 907 memcpy(inKeys + bytesBefore,key,bytes); 908 outKeyLengths[keyIndex] = bytes + bytesBefore; 909 outKeyValues[keyIndex] = *_value; 910 } 911 912 //puts("\n!!!!!!!!!!!!!!! after: !!!!!!!!!!!!!!!\n"); 913 //dump_bplustree_node(other,fHeader); 914 //hexdump(other,fNodeSize); 915 //puts("\n"); 916 //dump_bplustree_node(node,fHeader); 917 //hexdump(node,fNodeSize); 918 919 // write the updated nodes back 920 921 fCache.SetDirty(otherOffset,true); 922 fCache.SetDirty(nodeOffset,true); 923 924 // update the right link of the node in the left of the new node 925 if (other->left_link != BPLUSTREE_NULL) { 926 bplustree_node *left = fCache.Get(other->left_link); 927 if (left != NULL) { 928 left->right_link = otherOffset; 929 fCache.SetDirty(other->left_link,true); 930 } 931 } 932 933 // prepare key to insert in the parent node 934 935 //printf("skip: %ld, count-1: %u\n",skip,other->all_key_count - 1); 936 uint16 length; 937 uint8 *lastKey = other->KeyAt(other->all_key_count - 1,&length); 938 memcpy(key,lastKey,length); 939 *_keyLength = length; 940 *_value = otherOffset; 941 942 return B_OK; 943 } 944 945 946 status_t BPlusTree::Insert(uint8 *key,uint16 keyLength,off_t value) 947 { 948 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 949 return B_BAD_VALUE; 950 951 Stack<node_and_key> stack; 952 if (SeekDown(stack,key,keyLength) != B_OK) 953 return B_ERROR; 954 955 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 956 957 memcpy(keyBuffer,key,keyLength); 958 keyBuffer[keyLength] = 0; 959 960 off_t valueToInsert = value; 961 962 fCurrentNodeOffset = BPLUSTREE_NULL; 963 964 node_and_key nodeAndKey; 965 bplustree_node *node; 966 uint32 count = 0; 967 968 while (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) 969 { 970 if (count++ == 0) // first round, check for duplicate entries 971 { 972 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex); 973 if (status == B_ERROR) 974 return B_ERROR; 975 976 // is this a duplicate entry? 977 if (status == B_OK && node->overflow_link == BPLUSTREE_NULL) 978 { 979 if (fAllowDuplicates) 980 return InsertDuplicate(node,nodeAndKey.keyIndex); 981 else 982 return B_NAME_IN_USE; 983 } 984 } 985 986 // is the node big enough to hold the pair? 987 if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength) 988 + (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize) 989 { 990 InsertKey(node,keyBuffer,keyLength,valueToInsert,nodeAndKey.keyIndex); 991 fCache.SetDirty(nodeAndKey.nodeOffset,true); 992 993 return B_OK; 994 } 995 else 996 { 997 // do we need to allocate a new root node? if so, then do 998 // it now 999 bplustree_node *rootNode = NULL; 1000 off_t newRoot = BPLUSTREE_NULL; 1001 if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) { 1002 rootNode = fCache.Get(newRoot = fHeader->maximum_size); 1003 if (rootNode == NULL) { 1004 // the tree is most likely dead 1005 return B_NO_MEMORY; 1006 } 1007 } 1008 1009 if (SplitNode(node,nodeAndKey.nodeOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&valueToInsert) < B_OK) { 1010 if (newRoot != BPLUSTREE_NULL) { 1011 // free root node 1012 } 1013 return B_ERROR; 1014 } 1015 1016 if (newRoot != BPLUSTREE_NULL) { 1017 rootNode->overflow_link = nodeAndKey.nodeOffset; 1018 1019 InsertKey(rootNode,keyBuffer,keyLength,node->left_link,0); 1020 1021 fHeader->root_node_pointer = newRoot; 1022 fHeader->max_number_of_levels++; 1023 // write header 1024 1025 fCache.SetDirty(newRoot,true); 1026 1027 return B_OK; 1028 } 1029 } 1030 } 1031 1032 return B_ERROR; 1033 } 1034 1035 1036 status_t BPlusTree::Find(uint8 *key,uint16 keyLength,off_t *value) 1037 { 1038 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1039 return B_BAD_VALUE; 1040 1041 Stack<node_and_key> stack; 1042 if (SeekDown(stack,key,keyLength) != B_OK) 1043 return B_ERROR; 1044 1045 fCurrentNodeOffset = BPLUSTREE_NULL; 1046 1047 node_and_key nodeAndKey; 1048 bplustree_node *node; 1049 1050 if (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) 1051 { 1052 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex); 1053 if (status == B_ERROR) 1054 return B_ERROR; 1055 1056 SetCurrentNode(node,nodeAndKey.nodeOffset); 1057 1058 if (status == B_OK && node->overflow_link == BPLUSTREE_NULL) 1059 { 1060 *value = node->Values()[nodeAndKey.keyIndex]; 1061 return B_OK; 1062 } 1063 } 1064 return B_ENTRY_NOT_FOUND; 1065 } 1066 1067 1068 // #pragma mark - 1069 1070 1071 bool 1072 BPlusTree::CheckNode(bplustree_node *node) 1073 { 1074 if (!fHeader->IsValidLink(node->left_link) 1075 || !fHeader->IsValidLink(node->right_link) 1076 || !fHeader->IsValidLink(node->overflow_link) 1077 || (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize) 1078 return false; 1079 1080 return true; 1081 } 1082 1083 1084 bplustree_node *BPlusTree::Node(off_t nodeOffset,bool check) 1085 { 1086 /*printf("1: %d,%d,%d\n", 1087 nodeOffset > fHeader->maximum_size - fNodeSize, 1088 nodeOffset < 0 && nodeOffset != BPLUSTREE_NULL, 1089 (nodeOffset % fNodeSize) != 0); 1090 */ 1091 // the super node is always in memory, and shouldn't 1092 // never be taken out of the cache 1093 if (nodeOffset > fHeader->maximum_size /*- fNodeSize*/ 1094 || nodeOffset <= 0 && nodeOffset != BPLUSTREE_NULL 1095 || (nodeOffset % fNodeSize) != 0) 1096 return NULL; 1097 1098 bplustree_node *node = (bplustree_node *)malloc(fNodeSize); 1099 if (node == NULL) 1100 return NULL; 1101 1102 if (nodeOffset == BPLUSTREE_NULL || !fStream) 1103 { 1104 node->left_link = BPLUSTREE_NULL; 1105 node->right_link = BPLUSTREE_NULL; 1106 node->overflow_link = BPLUSTREE_NULL; 1107 node->all_key_count = 0; 1108 node->all_key_length = 0; 1109 } 1110 else if (fStream && fStream->ReadAt(nodeOffset,node,fNodeSize) < fNodeSize) 1111 { 1112 free(node); 1113 return NULL; 1114 } 1115 1116 if (check && node != NULL) 1117 { 1118 // sanity checks (links, all_key_count) 1119 if (!fHeader->IsValidLink(node->left_link) 1120 || !fHeader->IsValidLink(node->right_link) 1121 || !fHeader->IsValidLink(node->overflow_link) 1122 || (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize) 1123 return NULL; 1124 } 1125 1126 if (!fStream && nodeOffset > fHeader->maximum_size - fNodeSize) { 1127 // do some hacks here 1128 fHeader->maximum_size += fNodeSize; 1129 } 1130 1131 return node; 1132 } 1133 1134 1135 void BPlusTree::SetHoldChanges(bool hold) 1136 { 1137 fCache.SetHoldChanges(hold); 1138 } 1139 1140 1141 // #pragma mark - 1142 1143 1144 uint8 *bplustree_node::KeyAt(int32 index,uint16 *keyLength) const 1145 { 1146 if (index < 0 || index > all_key_count) 1147 return NULL; 1148 1149 uint8 *keyStart = Keys(); 1150 uint16 *keyLengths = KeyLengths(); 1151 1152 *keyLength = keyLengths[index] - (index != 0 ? keyLengths[index - 1] : 0); 1153 if (index > 0) 1154 keyStart += keyLengths[index - 1]; 1155 1156 return keyStart; 1157 } 1158 1159 1160 uint8 bplustree_node::CountDuplicates(off_t offset,bool isFragment) const 1161 { 1162 // the duplicate fragment handling is currently hard-coded to a node size 1163 // of 1024 bytes - with future versions of BFS, this may be a problem 1164 1165 if (isFragment) 1166 { 1167 uint32 fragment = 8 * ((uint64)offset & 0x3ff); 1168 1169 return ((off_t *)this)[fragment]; 1170 } 1171 return overflow_link; 1172 } 1173 1174 1175 off_t bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const 1176 { 1177 uint32 start; 1178 if (isFragment) 1179 start = 8 * ((uint64)offset & 0x3ff); 1180 else 1181 start = 2; 1182 1183 return ((off_t *)this)[start + 1 + index]; 1184 } 1185 1186