1 /* BPlusTree - BFS B+Tree implementation 2 ** 3 ** Copyright 2001-2002 pinc Software. All Rights Reserved. 4 */ 5 6 7 #include "BPlusTree.h" 8 #include "Inode.h" 9 #include "Stack.h" 10 #include "dump.h" 11 12 #include <Debug.h> 13 14 #include <string.h> 15 #include <stdlib.h> 16 #include <stdio.h> 17 18 19 #define MAX_NODES_IN_CACHE 20 20 21 class CacheableNode : public NodeCache::Cacheable 22 { 23 public: 24 CacheableNode(off_t offset,bplustree_node *node) 25 : 26 fOffset(offset), 27 fNode(node) 28 { 29 } 30 31 virtual ~CacheableNode() 32 { 33 if (fNode) 34 free(fNode); 35 } 36 37 virtual bool Equals(off_t offset) 38 { 39 return offset == fOffset; 40 } 41 42 off_t fOffset; 43 bplustree_node *fNode; 44 }; 45 46 47 NodeCache::NodeCache(BPlusTree *tree) 48 : Cache<off_t>(), 49 fTree(tree) 50 { 51 } 52 53 54 Cache<off_t>::Cacheable * 55 NodeCache::NewCacheable(off_t offset) 56 { 57 return new CacheableNode(offset,fTree->Node(offset,false)); 58 } 59 60 61 bplustree_node * 62 NodeCache::Get(off_t offset) 63 { 64 CacheableNode *node = (CacheableNode *)Cache<off_t>::Get(offset); 65 return node->fNode; 66 } 67 68 69 // #pragma mark - 70 71 72 BPlusTree::BPlusTree(int32 keyType,int32 nodeSize,bool allowDuplicates) 73 : 74 fStream(NULL), 75 fHeader(NULL), 76 fCache(this), 77 fCurrentNodeOffset(BPLUSTREE_NULL) 78 { 79 SetTo(keyType,nodeSize,allowDuplicates); 80 } 81 82 83 BPlusTree::BPlusTree(BPositionIO *stream,bool allowDuplicates) 84 : 85 fStream(NULL), 86 fHeader(NULL), 87 fCache(this), 88 fCurrentNodeOffset(BPLUSTREE_NULL) 89 { 90 SetTo(stream,allowDuplicates); 91 } 92 93 94 BPlusTree::BPlusTree() 95 : 96 fStream(NULL), 97 fHeader(NULL), 98 fNodeSize(BPLUSTREE_NODE_SIZE), 99 fAllowDuplicates(true), 100 fStatus(B_NO_INIT), 101 fCache(this), 102 fCurrentNodeOffset(BPLUSTREE_NULL) 103 { 104 } 105 106 107 BPlusTree::~BPlusTree() 108 { 109 fCache.Clear(); 110 } 111 112 113 void BPlusTree::Initialize(int32 nodeSize) 114 { 115 // free old data 116 fCache.Clear(0,true); 117 118 if (fHeader) 119 free(fHeader); 120 121 fStream = NULL; 122 123 fNodeSize = nodeSize; 124 fHeader = (bplustree_header *)malloc(fNodeSize); 125 memset(fHeader,0,fNodeSize); 126 127 fCurrentNodeOffset = BPLUSTREE_NULL; 128 } 129 130 131 status_t BPlusTree::SetTo(int32 keyType,int32 nodeSize,bool allowDuplicates) 132 { 133 // initializes in-memory B+Tree 134 135 Initialize(nodeSize); 136 137 fAllowDuplicates = allowDuplicates; 138 fCache.SetHoldChanges(true); 139 140 // initialize b+tree header 141 fHeader->magic = BPLUSTREE_MAGIC; 142 fHeader->node_size = fNodeSize; 143 fHeader->max_number_of_levels = 1; 144 fHeader->data_type = keyType; 145 fHeader->root_node_pointer = fNodeSize; 146 fHeader->free_node_pointer = BPLUSTREE_NULL; 147 fHeader->maximum_size = fNodeSize * 2; 148 149 return fStatus = B_OK; 150 } 151 152 153 status_t BPlusTree::SetTo(BPositionIO *stream,bool allowDuplicates) 154 { 155 // initializes on-disk B+Tree 156 157 bplustree_header header; 158 ssize_t read = stream->ReadAt(0,&header,sizeof(bplustree_header)); 159 if (read < 0) 160 return fStatus = read; 161 162 // is header valid? 163 164 stream->Seek(0,SEEK_END); 165 off_t size = stream->Position(); 166 stream->Seek(0,SEEK_SET); 167 168 //dump_bplustree_header(&header); 169 170 if (header.magic != BPLUSTREE_MAGIC 171 || header.maximum_size != size 172 || (header.root_node_pointer % header.node_size) != 0 173 || !header.IsValidLink(header.root_node_pointer) 174 || !header.IsValidLink(header.free_node_pointer)) 175 return fStatus = B_BAD_DATA; 176 177 fAllowDuplicates = allowDuplicates; 178 179 if (DataStream *dataStream = dynamic_cast<DataStream *>(stream)) 180 { 181 uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX, 182 S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX}; 183 uint32 mode = dataStream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX 184 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX); 185 186 if (header.data_type > BPLUSTREE_DOUBLE_TYPE 187 || (dataStream->Mode() & S_INDEX_DIR) && toMode[header.data_type] != mode 188 || !dataStream->IsDirectory()) 189 return fStatus = B_BAD_TYPE; 190 191 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused 192 fAllowDuplicates = (dataStream->Mode() & (S_INDEX_DIR | 0777)) == S_INDEX_DIR; 193 194 //printf("allows duplicates? %s\n",fAllowDuplicates ? "yes" : "no"); 195 } 196 197 Initialize(header.node_size); 198 199 memcpy(fHeader,&header,sizeof(bplustree_header)); 200 201 fStream = stream; 202 203 bplustree_node *node = fCache.Get(header.root_node_pointer); 204 //if (node) 205 // dump_bplustree_node(node); 206 207 return fStatus = node && CheckNode(node) ? B_OK : B_BAD_DATA; 208 } 209 210 211 status_t BPlusTree::InitCheck() 212 { 213 return fStatus; 214 } 215 216 217 struct validate_info { 218 off_t offset; 219 off_t from; 220 bool free; 221 }; 222 223 224 status_t 225 BPlusTree::Validate(bool verbose) 226 { 227 // validates the on-disk b+tree 228 // (for now only the node integrity is checked) 229 230 if (!fStream) 231 return B_OK; 232 233 struct validate_info info; 234 235 Stack<validate_info> stack; 236 info.offset = fHeader->root_node_pointer; 237 info.from = BPLUSTREE_NULL; 238 info.free = false; 239 stack.Push(info); 240 241 if (fHeader->free_node_pointer != BPLUSTREE_NULL) { 242 info.offset = fHeader->free_node_pointer; 243 info.free = true; 244 stack.Push(info); 245 } 246 247 char escape[3] = {0x1b, '[', 0}; 248 int32 count = 0,freeCount = 0; 249 250 while (true) { 251 if (!stack.Pop(&info)) { 252 if (verbose) { 253 printf(" %" B_PRId32 " B+tree node(s) processed (%" B_PRId32 254 " free node(s)).\n", count, freeCount); 255 } 256 return B_OK; 257 } 258 259 if (!info.free && verbose && ++count % 10 == 0) 260 printf(" %" B_PRId32 "%s1A\n", count, escape); 261 else if (info.free) 262 freeCount++; 263 264 bplustree_node *node; 265 if ((node = fCache.Get(info.offset)) == NULL 266 || !info.free && !CheckNode(node)) { 267 if (verbose) { 268 fprintf(stderr," B+Tree: Could not get data at position %" 269 B_PRIdOFF " (referenced by node at %" B_PRIdOFF ")\n", 270 info.offset, info.from); 271 if ((node = fCache.Get(info.from)) != NULL) 272 dump_bplustree_node(node,fHeader); 273 } 274 return B_BAD_DATA; 275 } 276 277 info.from = info.offset; 278 279 if (node->overflow_link != BPLUSTREE_NULL && !info.free) { 280 info.offset = node->overflow_link; 281 stack.Push(info); 282 283 //dump_bplustree_node(node,fHeader); 284 off_t *values = node->Values(); 285 for (int32 i = 0;i < node->all_key_count;i++) { 286 info.offset = values[i]; 287 stack.Push(info); 288 } 289 } else if (node->left_link != BPLUSTREE_NULL && info.free) { 290 info.offset = node->left_link; 291 stack.Push(info); 292 } 293 } 294 } 295 296 297 status_t 298 BPlusTree::WriteTo(BPositionIO *stream) 299 { 300 ssize_t written = stream->WriteAt(0,fHeader,fNodeSize); 301 if (written < fNodeSize) 302 return written < B_OK ? written : B_IO_ERROR; 303 304 for (off_t offset = fNodeSize;offset < fHeader->maximum_size;offset += fNodeSize) 305 { 306 bplustree_node *node = fCache.Get(offset); 307 if (node == NULL) 308 return B_ERROR; 309 310 written = stream->WriteAt(offset,node,fNodeSize); 311 if (written < fNodeSize) 312 return written < B_OK ? written : B_IO_ERROR; 313 } 314 return stream->SetSize(fHeader->maximum_size); 315 } 316 317 318 // #pragma mark - 319 320 321 void BPlusTree::SetCurrentNode(bplustree_node *node,off_t offset,int8 to) 322 { 323 fCurrentNodeOffset = offset; 324 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->all_key_count; 325 fDuplicateNode = BPLUSTREE_NULL; 326 } 327 328 329 status_t BPlusTree::Goto(int8 to) 330 { 331 if (fHeader == NULL) 332 return B_BAD_VALUE; 333 334 Stack<off_t> stack; 335 if (stack.Push(fHeader->root_node_pointer) < B_OK) 336 return B_NO_MEMORY; 337 338 bplustree_node *node; 339 off_t pos; 340 while (stack.Pop(&pos) && (node = fCache.Get(pos)) != NULL && CheckNode(node)) 341 { 342 // is the node a leaf node? 343 if (node->overflow_link == BPLUSTREE_NULL) 344 { 345 SetCurrentNode(node,pos,to); 346 347 return B_OK; 348 } 349 350 if (to == BPLUSTREE_END || node->all_key_count == 0) 351 pos = node->overflow_link; 352 else 353 { 354 if (node->all_key_length > fNodeSize 355 || (addr_t)node->Values() > (addr_t)node + fNodeSize 356 - 8 * node->all_key_count) 357 return B_ERROR; 358 359 pos = *node->Values(); 360 } 361 362 if (stack.Push(pos) < B_OK) 363 break; 364 } 365 return B_ERROR; 366 } 367 368 369 status_t BPlusTree::Traverse(int8 direction,void *key,uint16 *keyLength,uint16 maxLength,off_t *value) 370 { 371 if (fCurrentNodeOffset == BPLUSTREE_NULL 372 && Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK) 373 return B_ERROR; 374 375 bplustree_node *node; 376 377 if (fDuplicateNode != BPLUSTREE_NULL) 378 { 379 // regardless of traverse direction the duplicates are always presented in 380 // the same order; since they are all considered as equal, this shouldn't 381 // cause any problems 382 383 if (!fIsFragment || fDuplicate < fNumDuplicates) 384 node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode)); 385 else 386 node = NULL; 387 388 if (node != NULL) 389 { 390 if (!fIsFragment && fDuplicate >= fNumDuplicates) 391 { 392 // if the node is out of duplicates, we go directly to the next one 393 fDuplicateNode = node->right_link; 394 if (fDuplicateNode != BPLUSTREE_NULL 395 && (node = fCache.Get(fDuplicateNode)) != NULL) 396 { 397 fNumDuplicates = node->CountDuplicates(fDuplicateNode,false); 398 fDuplicate = 0; 399 } 400 } 401 if (fDuplicate < fNumDuplicates) 402 { 403 *value = node->DuplicateAt(fDuplicateNode,fIsFragment,fDuplicate++); 404 return B_OK; 405 } 406 } 407 fDuplicateNode = BPLUSTREE_NULL; 408 } 409 410 off_t savedNodeOffset = fCurrentNodeOffset; 411 if ((node = fCache.Get(fCurrentNodeOffset)) == NULL || !CheckNode(node)) 412 return B_ERROR; 413 414 fCurrentKey += direction; 415 416 // is the current key in the current node? 417 while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->all_key_count) 418 || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) 419 { 420 fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->right_link : node->left_link; 421 422 // are there any more nodes? 423 if (fCurrentNodeOffset != BPLUSTREE_NULL) 424 { 425 node = fCache.Get(fCurrentNodeOffset); 426 if (!node || !CheckNode(node)) 427 return B_ERROR; 428 429 // reset current key 430 fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->all_key_count; 431 } 432 else 433 { 434 // there are no nodes left, so turn back to the last key 435 fCurrentNodeOffset = savedNodeOffset; 436 fCurrentKey = direction == BPLUSTREE_FORWARD ? node->all_key_count : -1; 437 438 return B_ENTRY_NOT_FOUND; 439 } 440 } 441 442 if (node->all_key_count == 0) 443 return B_ERROR; //B_ENTRY_NOT_FOUND; 444 445 uint16 length; 446 uint8 *keyStart = node->KeyAt(fCurrentKey,&length); 447 448 length = min_c(length,maxLength); 449 memcpy(key,keyStart,length); 450 451 if (fHeader->data_type == BPLUSTREE_STRING_TYPE) // terminate string type 452 { 453 if (length == maxLength) 454 length--; 455 ((char *)key)[length] = '\0'; 456 } 457 *keyLength = length; 458 459 off_t offset = node->Values()[fCurrentKey]; 460 461 // duplicate fragments? 462 uint8 type = bplustree_node::LinkType(offset); 463 if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) 464 { 465 fDuplicateNode = offset; 466 467 node = fCache.Get(bplustree_node::FragmentOffset(fDuplicateNode)); 468 if (node == NULL) 469 return B_ERROR; 470 471 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT; 472 473 fNumDuplicates = node->CountDuplicates(offset,fIsFragment); 474 if (fNumDuplicates) 475 { 476 // give back first duplicate 477 fDuplicate = 1; 478 offset = node->DuplicateAt(offset,fIsFragment,0); 479 } 480 else 481 { 482 // shouldn't happen, but we're dealing here with corrupt disks... 483 fDuplicateNode = BPLUSTREE_NULL; 484 offset = 0; 485 } 486 } 487 *value = offset; 488 489 return B_OK; 490 } 491 492 493 int32 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2) 494 { 495 switch (fHeader->data_type) 496 { 497 case BPLUSTREE_STRING_TYPE: 498 { 499 int len = min_c(keyLength1,keyLength2); 500 int result = strncmp((const char *)key1,(const char *)key2,len); 501 502 if (result == 0) 503 result = keyLength1 - keyLength2; 504 505 return result; 506 } 507 508 case BPLUSTREE_INT32_TYPE: 509 return *(int32 *)key1 - *(int32 *)key2; 510 511 case BPLUSTREE_UINT32_TYPE: 512 { 513 if (*(uint32 *)key1 == *(uint32 *)key2) 514 return 0; 515 else if (*(uint32 *)key1 > *(uint32 *)key2) 516 return 1; 517 518 return -1; 519 } 520 521 case BPLUSTREE_INT64_TYPE: 522 { 523 if (*(int64 *)key1 == *(int64 *)key2) 524 return 0; 525 else if (*(int64 *)key1 > *(int64 *)key2) 526 return 1; 527 528 return -1; 529 } 530 531 case BPLUSTREE_UINT64_TYPE: 532 { 533 if (*(uint64 *)key1 == *(uint64 *)key2) 534 return 0; 535 else if (*(uint64 *)key1 > *(uint64 *)key2) 536 return 1; 537 538 return -1; 539 } 540 541 case BPLUSTREE_FLOAT_TYPE: 542 { 543 float result = *(float *)key1 - *(float *)key2; 544 if (result == 0.0f) 545 return 0; 546 547 return (result < 0.0f) ? -1 : 1; 548 } 549 550 case BPLUSTREE_DOUBLE_TYPE: 551 { 552 double result = *(double *)key1 - *(double *)key2; 553 if (result == 0.0) 554 return 0; 555 556 return (result < 0.0) ? -1 : 1; 557 } 558 } 559 return 0; 560 } 561 562 563 status_t BPlusTree::FindKey(bplustree_node *node,uint8 *key,uint16 keyLength,uint16 *index,off_t *next) 564 { 565 if (node->all_key_count == 0) 566 { 567 if (index) 568 *index = 0; 569 if (next) 570 *next = node->overflow_link; 571 return B_ENTRY_NOT_FOUND; 572 } 573 574 off_t *values = node->Values(); 575 int16 saveIndex = 0; 576 577 // binary search in the key array 578 for (int16 first = 0,last = node->all_key_count - 1;first <= last;) 579 { 580 uint16 i = (first + last) >> 1; 581 582 uint16 searchLength; 583 uint8 *searchKey = node->KeyAt(i,&searchLength); 584 585 int32 cmp = CompareKeys(key,keyLength,searchKey,searchLength); 586 if (cmp < 0) 587 { 588 last = i - 1; 589 saveIndex = i; 590 } 591 else if (cmp > 0) 592 { 593 saveIndex = first = i + 1; 594 } 595 else 596 { 597 if (index) 598 *index = i; 599 if (next) 600 *next = values[i]; 601 return B_OK; 602 } 603 } 604 605 if (index) 606 *index = saveIndex; 607 if (next) 608 { 609 if (saveIndex == node->all_key_count) 610 *next = node->overflow_link; 611 else 612 *next = values[saveIndex]; 613 } 614 return B_ENTRY_NOT_FOUND; 615 } 616 617 618 status_t BPlusTree::SeekDown(Stack<node_and_key> &stack,uint8 *key,uint16 keyLength) 619 { 620 node_and_key nodeAndKey; 621 nodeAndKey.nodeOffset = fHeader->root_node_pointer; 622 nodeAndKey.keyIndex = 0; 623 624 bplustree_node *node; 625 while ((node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) { 626 // if we are already on leaf level, we're done 627 if (node->overflow_link == BPLUSTREE_NULL) { 628 // put the node on the stack 629 // node that the keyIndex is not properly set here! 630 nodeAndKey.keyIndex = 0; 631 stack.Push(nodeAndKey); 632 return B_OK; 633 } 634 635 off_t nextOffset; 636 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex,&nextOffset); 637 638 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset) 639 return B_ERROR; 640 641 // put the node & the correct keyIndex on the stack 642 stack.Push(nodeAndKey); 643 644 nodeAndKey.nodeOffset = nextOffset; 645 } 646 return B_ERROR; 647 } 648 649 650 void BPlusTree::InsertKey(bplustree_node *node,uint8 *key,uint16 keyLength,off_t value,uint16 index) 651 { 652 // should never happen, but who knows? 653 if (index > node->all_key_count) 654 return; 655 656 off_t *values = node->Values(); 657 uint16 *keyLengths = node->KeyLengths(); 658 uint8 *keys = node->Keys(); 659 660 node->all_key_count++; 661 node->all_key_length += keyLength; 662 663 off_t *newValues = node->Values(); 664 uint16 *newKeyLengths = node->KeyLengths(); 665 666 // move values and copy new value into them 667 memmove(newValues + index + 1,values + index,sizeof(off_t) * (node->all_key_count - 1 - index)); 668 memmove(newValues,values,sizeof(off_t) * index); 669 670 newValues[index] = value; 671 672 // move and update key length index 673 for (uint16 i = node->all_key_count;i-- > index + 1;) 674 newKeyLengths[i] = keyLengths[i - 1] + keyLength; 675 memmove(newKeyLengths,keyLengths,sizeof(uint16) * index); 676 677 int32 keyStart; 678 newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0); 679 680 // move keys and copy new key into them 681 int32 size = node->all_key_length - newKeyLengths[index]; 682 if (size > 0) 683 memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size); 684 685 memcpy(keys + keyStart,key,keyLength); 686 } 687 688 689 status_t BPlusTree::InsertDuplicate(bplustree_node */*node*/,uint16 /*index*/) 690 { 691 printf("DUPLICATE ENTRY!!\n"); 692 693 // /* handle dup keys */ 694 // if (dupflg == 0) 695 // { 696 // bt_errno(b) = BT_DUPKEY; 697 // goto bombout; 698 // } 699 // else 700 // { 701 // /* paste new data ptr in page */ 702 // /* and write it out again. */ 703 // off_t *p; 704 // p = (off_t *)KEYCLD(op->p); 705 // *(p + keyat) = rrn; 706 // op->flags = BT_CHE_DIRTY; 707 // if(bt_wpage(b,op) == BT_ERR || 708 // bt_wpage(b,kp) == BT_ERR) 709 // return(BT_ERR); 710 // 711 // /* mark all as well with tree */ 712 // bt_clearerr(b); 713 // return(BT_OK); 714 // } 715 716 return B_OK; 717 } 718 719 720 status_t BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,uint16 *_keyIndex,uint8 *key,uint16 *_keyLength,off_t *_value) 721 { 722 if (*_keyIndex > node->all_key_count + 1) 723 return B_BAD_VALUE; 724 725 //printf("before (insert \"%s\" (%d bytes) at %d):\n\n",key,*_keyLength,*_keyIndex); 726 //dump_bplustree_node(node,fHeader); 727 //hexdump(node,fNodeSize); 728 729 off_t otherOffset; 730 bplustree_node *other = fCache.Get(otherOffset = fHeader->maximum_size); //Node(otherOffset = fHeader->maximum_size/*,false*/); 731 if (other == NULL) 732 return B_NO_MEMORY; 733 734 uint16 *inKeyLengths = node->KeyLengths(); 735 off_t *inKeyValues = node->Values(); 736 uint8 *inKeys = node->Keys(); 737 uint8 *outKeys = other->Keys(); 738 uint16 keyIndex = *_keyIndex; 739 740 // how many keys will fit in one (half) page? 741 742 // "bytes" is the number of bytes written for the new key, 743 // "bytesBefore" are the bytes before that key 744 // "bytesAfter" are the bytes after the new key, if any 745 int32 bytes = 0,bytesBefore = 0,bytesAfter = 0; 746 747 size_t size = fNodeSize >> 1; 748 int32 out,in; 749 for (in = out = 0;in < node->all_key_count + 1;) { 750 if (!bytes) 751 bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0; 752 if (in == keyIndex && !bytes) { 753 bytes = *_keyLength; 754 } else { 755 if (keyIndex < out) { 756 bytesAfter = inKeyLengths[in] - bytesBefore; 757 // fix the key lengths for the new node 758 inKeyLengths[in] = bytesAfter + bytesBefore + bytes; 759 } //else 760 in++; 761 } 762 out++; 763 764 if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) + 765 out * (sizeof(uint16) + sizeof(off_t)) >= size) { 766 // we have found the number of keys in the new node! 767 break; 768 } 769 } 770 771 // if the new key was not inserted, set the length of the keys 772 // that can be copied directly 773 if (keyIndex >= out && in > 0) 774 bytesBefore = inKeyLengths[in - 1]; 775 776 if (bytesBefore < 0 || bytesAfter < 0) 777 return B_BAD_DATA; 778 779 //printf("put %ld keys in the other node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in); 780 781 other->left_link = node->left_link; 782 other->right_link = nodeOffset; 783 other->all_key_length = bytes + bytesBefore + bytesAfter; 784 other->all_key_count = out; 785 //printf("-> used = %ld (bplustree_node = %ld bytes)\n",other->Used(),sizeof(bplustree_node)); 786 787 uint16 *outKeyLengths = other->KeyLengths(); 788 off_t *outKeyValues = other->Values(); 789 int32 keys = out > keyIndex ? keyIndex : out; 790 791 if (bytesBefore) { 792 // copy the keys 793 memcpy(outKeys,inKeys,bytesBefore); 794 memcpy(outKeyLengths,inKeyLengths,keys * sizeof(uint16)); 795 memcpy(outKeyValues,inKeyValues,keys * sizeof(off_t)); 796 } 797 if (bytes) { 798 // copy the newly inserted key 799 memcpy(outKeys + bytesBefore,key,bytes); 800 outKeyLengths[keyIndex] = bytes + bytesBefore; 801 outKeyValues[keyIndex] = *_value; 802 803 if (bytesAfter) { 804 // copy the keys after the new key 805 memcpy(outKeys + bytesBefore + bytes,inKeys + bytesBefore,bytesAfter); 806 keys = out - keyIndex - 1; 807 memcpy(outKeyLengths + keyIndex + 1,inKeyLengths + keyIndex,keys * sizeof(uint16)); 808 memcpy(outKeyValues + keyIndex + 1,inKeyValues + keyIndex,keys * sizeof(off_t)); 809 } 810 } 811 812 // if the new key was already inserted, we shouldn't use it again 813 if (in != out) 814 keyIndex--; 815 816 int32 total = bytesBefore + bytesAfter; 817 818 // if we have split an index node, we have to drop the first key 819 // of the next node (which can also be the new key to insert) 820 if (node->overflow_link != BPLUSTREE_NULL) { 821 if (in == keyIndex) { 822 other->overflow_link = *_value; 823 keyIndex--; 824 } else { 825 other->overflow_link = inKeyValues[in]; 826 total = inKeyLengths[in++]; 827 } 828 } 829 830 // and now the same game for the other page and the rest of the keys 831 // (but with memmove() instead of memcpy(), because they may overlap) 832 833 bytesBefore = bytesAfter = bytes = 0; 834 out = 0; 835 int32 skip = in; 836 while (in < node->all_key_count + 1) { 837 //printf("in = %ld, keyIndex = %d, bytes = %ld\n",in,keyIndex,bytes); 838 if (in == keyIndex && !bytes) { 839 bytesBefore = in > skip ? inKeyLengths[in - 1] : 0; 840 //printf("bytesBefore = %ld\n",bytesBefore); 841 bytes = *_keyLength; 842 } else if (in < node->all_key_count) { 843 //printf("1.inKeyLength[%ld] = %u\n",in,inKeyLengths[in]); 844 inKeyLengths[in] -= total; 845 if (bytes) { 846 inKeyLengths[in] += bytes; 847 bytesAfter = inKeyLengths[in] - bytesBefore - bytes; 848 //printf("2.inKeyLength[%ld] = %u, bytesAfter = %ld, bytesBefore = %ld\n",in,inKeyLengths[in],bytesAfter,bytesBefore); 849 } 850 851 in++; 852 } else 853 in++; 854 855 out++; 856 857 //printf("-> out = %ld, keylen = %ld, %ld bytes total\n",out,bytesBefore,round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) + 858 // out * (sizeof(uint16) + sizeof(off_t))); 859 if (in > node->all_key_count && keyIndex < in) 860 break; 861 } 862 863 //printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total); 864 865 if (keyIndex >= in && keyIndex - skip < out) { 866 bytesAfter = inKeyLengths[in] - bytesBefore - total; 867 } else if (keyIndex < skip) 868 bytesBefore = node->all_key_length - total; 869 870 //printf("bytes = (%ld, %ld, %ld), in = %ld, total = %ld\n",bytesBefore,bytes,bytesAfter,in,total); 871 872 if (bytesBefore < 0 || bytesAfter < 0) 873 return B_BAD_DATA; 874 875 node->left_link = otherOffset; 876 // right link, and overflow link can stay the same 877 node->all_key_length = bytes + bytesBefore + bytesAfter; 878 node->all_key_count = out - 1; 879 880 // array positions have changed 881 outKeyLengths = node->KeyLengths(); 882 outKeyValues = node->Values(); 883 884 //printf("put %ld keys in the first node (%ld, %ld, %ld) (in = %ld)\n",out,bytesBefore,bytes,bytesAfter,in); 885 886 // move the keys in the old node: the order is important here, 887 // because we don't want to overwrite any contents 888 889 keys = keyIndex <= skip ? out : keyIndex - skip; 890 keyIndex -= skip; 891 892 if (bytesBefore) 893 memmove(inKeys,inKeys + total,bytesBefore); 894 if (bytesAfter) 895 memmove(inKeys + bytesBefore + bytes,inKeys + total + bytesBefore,bytesAfter); 896 897 if (bytesBefore) 898 memmove(outKeyLengths,inKeyLengths + skip,keys * sizeof(uint16)); 899 in = out - keyIndex - 1; 900 if (bytesAfter) 901 memmove(outKeyLengths + keyIndex + 1,inKeyLengths + skip + keyIndex,in * sizeof(uint16)); 902 903 if (bytesBefore) 904 memmove(outKeyValues,inKeyValues + skip,keys * sizeof(off_t)); 905 if (bytesAfter) 906 memmove(outKeyValues + keyIndex + 1,inKeyValues + skip + keyIndex,in * sizeof(off_t)); 907 908 if (bytes) { 909 // finally, copy the newly inserted key (don't overwrite anything) 910 memcpy(inKeys + bytesBefore,key,bytes); 911 outKeyLengths[keyIndex] = bytes + bytesBefore; 912 outKeyValues[keyIndex] = *_value; 913 } 914 915 //puts("\n!!!!!!!!!!!!!!! after: !!!!!!!!!!!!!!!\n"); 916 //dump_bplustree_node(other,fHeader); 917 //hexdump(other,fNodeSize); 918 //puts("\n"); 919 //dump_bplustree_node(node,fHeader); 920 //hexdump(node,fNodeSize); 921 922 // write the updated nodes back 923 924 fCache.SetDirty(otherOffset,true); 925 fCache.SetDirty(nodeOffset,true); 926 927 // update the right link of the node in the left of the new node 928 if (other->left_link != BPLUSTREE_NULL) { 929 bplustree_node *left = fCache.Get(other->left_link); 930 if (left != NULL) { 931 left->right_link = otherOffset; 932 fCache.SetDirty(other->left_link,true); 933 } 934 } 935 936 // prepare key to insert in the parent node 937 938 //printf("skip: %ld, count-1: %u\n",skip,other->all_key_count - 1); 939 uint16 length; 940 uint8 *lastKey = other->KeyAt(other->all_key_count - 1,&length); 941 memcpy(key,lastKey,length); 942 *_keyLength = length; 943 *_value = otherOffset; 944 945 return B_OK; 946 } 947 948 949 status_t BPlusTree::Insert(uint8 *key,uint16 keyLength,off_t value) 950 { 951 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 952 return B_BAD_VALUE; 953 954 Stack<node_and_key> stack; 955 if (SeekDown(stack,key,keyLength) != B_OK) 956 return B_ERROR; 957 958 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1]; 959 960 memcpy(keyBuffer,key,keyLength); 961 keyBuffer[keyLength] = 0; 962 963 off_t valueToInsert = value; 964 965 fCurrentNodeOffset = BPLUSTREE_NULL; 966 967 node_and_key nodeAndKey; 968 bplustree_node *node; 969 uint32 count = 0; 970 971 while (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) 972 { 973 if (count++ == 0) // first round, check for duplicate entries 974 { 975 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex); 976 if (status == B_ERROR) 977 return B_ERROR; 978 979 // is this a duplicate entry? 980 if (status == B_OK && node->overflow_link == BPLUSTREE_NULL) 981 { 982 if (fAllowDuplicates) 983 return InsertDuplicate(node,nodeAndKey.keyIndex); 984 else 985 return B_NAME_IN_USE; 986 } 987 } 988 989 // is the node big enough to hold the pair? 990 if (int32(round_up(sizeof(bplustree_node) + node->all_key_length + keyLength) 991 + (node->all_key_count + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize) 992 { 993 InsertKey(node,keyBuffer,keyLength,valueToInsert,nodeAndKey.keyIndex); 994 fCache.SetDirty(nodeAndKey.nodeOffset,true); 995 996 return B_OK; 997 } 998 else 999 { 1000 // do we need to allocate a new root node? if so, then do 1001 // it now 1002 bplustree_node *rootNode = NULL; 1003 off_t newRoot = BPLUSTREE_NULL; 1004 if (nodeAndKey.nodeOffset == fHeader->root_node_pointer) { 1005 rootNode = fCache.Get(newRoot = fHeader->maximum_size); 1006 if (rootNode == NULL) { 1007 // the tree is most likely dead 1008 return B_NO_MEMORY; 1009 } 1010 } 1011 1012 if (SplitNode(node,nodeAndKey.nodeOffset,&nodeAndKey.keyIndex,keyBuffer,&keyLength,&valueToInsert) < B_OK) { 1013 if (newRoot != BPLUSTREE_NULL) { 1014 // free root node 1015 } 1016 return B_ERROR; 1017 } 1018 1019 if (newRoot != BPLUSTREE_NULL) { 1020 rootNode->overflow_link = nodeAndKey.nodeOffset; 1021 1022 InsertKey(rootNode,keyBuffer,keyLength,node->left_link,0); 1023 1024 fHeader->root_node_pointer = newRoot; 1025 fHeader->max_number_of_levels++; 1026 // write header 1027 1028 fCache.SetDirty(newRoot,true); 1029 1030 return B_OK; 1031 } 1032 } 1033 } 1034 1035 return B_ERROR; 1036 } 1037 1038 1039 status_t BPlusTree::Find(uint8 *key,uint16 keyLength,off_t *value) 1040 { 1041 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH) 1042 return B_BAD_VALUE; 1043 1044 Stack<node_and_key> stack; 1045 if (SeekDown(stack,key,keyLength) != B_OK) 1046 return B_ERROR; 1047 1048 fCurrentNodeOffset = BPLUSTREE_NULL; 1049 1050 node_and_key nodeAndKey; 1051 bplustree_node *node; 1052 1053 if (stack.Pop(&nodeAndKey) && (node = fCache.Get(nodeAndKey.nodeOffset)) != NULL && CheckNode(node)) 1054 { 1055 status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex); 1056 if (status == B_ERROR) 1057 return B_ERROR; 1058 1059 SetCurrentNode(node,nodeAndKey.nodeOffset); 1060 1061 if (status == B_OK && node->overflow_link == BPLUSTREE_NULL) 1062 { 1063 *value = node->Values()[nodeAndKey.keyIndex]; 1064 return B_OK; 1065 } 1066 } 1067 return B_ENTRY_NOT_FOUND; 1068 } 1069 1070 1071 // #pragma mark - 1072 1073 1074 bool 1075 BPlusTree::CheckNode(bplustree_node *node) 1076 { 1077 if (!fHeader->IsValidLink(node->left_link) 1078 || !fHeader->IsValidLink(node->right_link) 1079 || !fHeader->IsValidLink(node->overflow_link) 1080 || (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize) 1081 return false; 1082 1083 return true; 1084 } 1085 1086 1087 bplustree_node *BPlusTree::Node(off_t nodeOffset,bool check) 1088 { 1089 /*printf("1: %d,%d,%d\n", 1090 nodeOffset > fHeader->maximum_size - fNodeSize, 1091 nodeOffset < 0 && nodeOffset != BPLUSTREE_NULL, 1092 (nodeOffset % fNodeSize) != 0); 1093 */ 1094 // the super node is always in memory, and shouldn't 1095 // never be taken out of the cache 1096 if (nodeOffset > fHeader->maximum_size /*- fNodeSize*/ 1097 || nodeOffset <= 0 && nodeOffset != BPLUSTREE_NULL 1098 || (nodeOffset % fNodeSize) != 0) 1099 return NULL; 1100 1101 bplustree_node *node = (bplustree_node *)malloc(fNodeSize); 1102 if (node == NULL) 1103 return NULL; 1104 1105 if (nodeOffset == BPLUSTREE_NULL || !fStream) 1106 { 1107 node->left_link = BPLUSTREE_NULL; 1108 node->right_link = BPLUSTREE_NULL; 1109 node->overflow_link = BPLUSTREE_NULL; 1110 node->all_key_count = 0; 1111 node->all_key_length = 0; 1112 } 1113 else if (fStream && fStream->ReadAt(nodeOffset,node,fNodeSize) < fNodeSize) 1114 { 1115 free(node); 1116 return NULL; 1117 } 1118 1119 if (check && node != NULL) 1120 { 1121 // sanity checks (links, all_key_count) 1122 if (!fHeader->IsValidLink(node->left_link) 1123 || !fHeader->IsValidLink(node->right_link) 1124 || !fHeader->IsValidLink(node->overflow_link) 1125 || (int8 *)node->Values() + node->all_key_count * sizeof(off_t) > (int8 *)node + fNodeSize) 1126 return NULL; 1127 } 1128 1129 if (!fStream && nodeOffset > fHeader->maximum_size - fNodeSize) { 1130 // do some hacks here 1131 fHeader->maximum_size += fNodeSize; 1132 } 1133 1134 return node; 1135 } 1136 1137 1138 void BPlusTree::SetHoldChanges(bool hold) 1139 { 1140 fCache.SetHoldChanges(hold); 1141 } 1142 1143 1144 // #pragma mark - 1145 1146 1147 uint8 *bplustree_node::KeyAt(int32 index,uint16 *keyLength) const 1148 { 1149 if (index < 0 || index > all_key_count) 1150 return NULL; 1151 1152 uint8 *keyStart = Keys(); 1153 uint16 *keyLengths = KeyLengths(); 1154 1155 *keyLength = keyLengths[index] - (index != 0 ? keyLengths[index - 1] : 0); 1156 if (index > 0) 1157 keyStart += keyLengths[index - 1]; 1158 1159 return keyStart; 1160 } 1161 1162 1163 uint8 bplustree_node::CountDuplicates(off_t offset,bool isFragment) const 1164 { 1165 // the duplicate fragment handling is currently hard-coded to a node size 1166 // of 1024 bytes - with future versions of BFS, this may be a problem 1167 1168 if (isFragment) 1169 { 1170 uint32 fragment = 8 * ((uint64)offset & 0x3ff); 1171 1172 return ((off_t *)this)[fragment]; 1173 } 1174 return overflow_link; 1175 } 1176 1177 1178 off_t bplustree_node::DuplicateAt(off_t offset,bool isFragment,int8 index) const 1179 { 1180 uint32 start; 1181 if (isFragment) 1182 start = 8 * ((uint64)offset & 0x3ff); 1183 else 1184 start = 2; 1185 1186 return ((off_t *)this)[start + 1 + index]; 1187 } 1188 1189