1 /*
2 * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 *
5 * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6 * no code but achieves binary compatibility with the on disk format.
7 */
8
9
10 //! B+Tree implementation
11
12
13 #include "BPlusTree.h"
14
15 #include <file_systems/QueryParserUtils.h>
16
17 #include "Debug.h"
18 #include "Utility.h"
19
20 #if !_BOOT_MODE
21 # include "Inode.h"
22 #else
23 # include "Stream.h"
24
25 // BFS::Stream from the bootloader has the same API as Inode.
26 # define Inode BFS::Stream
27
28 # define strerror(x) "error message unavailable"
29
30 namespace BFS {
31 #endif
32
33
34 /*! Simple array used for the duplicate handling in the B+Tree. This is an
35 on disk structure.
36 */
37 struct duplicate_array {
38 off_t count;
39 off_t values[0];
40
IsEmptyBFS::duplicate_array41 inline bool IsEmpty() const
42 {
43 return count == 0;
44 }
45
CountBFS::duplicate_array46 inline int32 Count() const
47 {
48 return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
49 }
50
ValueAtBFS::duplicate_array51 inline off_t ValueAt(uint32 index) const
52 {
53 return BFS_ENDIAN_TO_HOST_INT64(values[index]);
54 }
55
SetValueAtBFS::duplicate_array56 inline void SetValueAt(uint32 index, off_t value)
57 {
58 values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
59 }
60
FindBFS::duplicate_array61 inline int32 Find(off_t value) const
62 {
63 int32 i;
64 return _FindInternal(value, i) ? i : -1;
65 }
66
67 void Insert(off_t value);
68 bool Remove(off_t value);
69
70 private:
71 bool _FindInternal(off_t value, int32& index) const;
72 } _PACKED;
73
74
75 #ifdef DEBUG
76 class NodeChecker {
77 public:
NodeChecker(const bplustree_node * node,int32 nodeSize,const char * text)78 NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
79 :
80 fNode(node),
81 fSize(nodeSize),
82 fText(text)
83 {
84 Check("integrity check failed on construction.");
85 }
86
~NodeChecker()87 ~NodeChecker()
88 {
89 Check("integrity check failed on destruction.");
90 }
91
92 void
Check(const char * message)93 Check(const char* message)
94 {
95 if (fNode->CheckIntegrity(fSize) != B_OK) {
96 dprintf("%s: %s\n", fText, message);
97 DEBUGGER(("NodeChecker integrity check failed!"));
98 }
99 }
100
101 private:
102 const bplustree_node* fNode;
103 int32 fSize;
104 const char* fText;
105 };
106 #endif // DEBUG
107
108
109 #if !_BOOT_MODE
110 class BitmapArray {
111 public:
112 BitmapArray(size_t numBits);
113 ~BitmapArray();
114
115 status_t InitCheck() const;
116
117 bool IsSet(size_t index) const;
118 void Set(size_t index, bool set);
119
CountSet() const120 size_t CountSet() const { return fCountSet; }
121
122 private:
123 uint8* fBitmap;
124 size_t fSize;
125 size_t fCountSet;
126 };
127
128
129 struct TreeCheck {
TreeCheckBFS::TreeCheck130 TreeCheck(BPlusTree* tree)
131 :
132 fLevelCount(0),
133 fFreeCount(0),
134 fNodeSize(tree->NodeSize()),
135 fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
136 fFoundErrors(0),
137 fVisited(tree->Stream()->Size() / tree->NodeSize()),
138 fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
139 {
140 fPreviousOffsets = (off_t*)malloc(
141 sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
142 if (fPreviousOffsets != NULL) {
143 for (size_t i = 0; i < fMaxLevels; i++)
144 fPreviousOffsets[i] = BPLUSTREE_NULL;
145 }
146 }
147
~TreeCheckBFS::TreeCheck148 ~TreeCheck()
149 {
150 free(fPreviousOffsets);
151 }
152
InitCheckBFS::TreeCheck153 status_t InitCheck() const
154 {
155 if (fPreviousOffsets == NULL)
156 return B_NO_MEMORY;
157
158 status_t status = fVisited.InitCheck();
159 if (status != B_OK)
160 return status;
161
162 return fVisitedFragment.InitCheck();
163 }
164
VisitedBFS::TreeCheck165 bool Visited(off_t offset) const
166 {
167 return fVisited.IsSet(offset / fNodeSize);
168 }
169
SetVisitedBFS::TreeCheck170 void SetVisited(off_t offset)
171 {
172 fVisited.Set(offset / fNodeSize, true);
173 }
174
VisitedCountBFS::TreeCheck175 size_t VisitedCount() const
176 {
177 return fVisited.CountSet();
178 }
179
VisitedFragmentBFS::TreeCheck180 bool VisitedFragment(off_t offset) const
181 {
182 return fVisitedFragment.IsSet(offset / fNodeSize);
183 }
184
SetVisitedFragmentBFS::TreeCheck185 void SetVisitedFragment(off_t offset)
186 {
187 fVisitedFragment.Set(offset / fNodeSize, true);
188 }
189
MaxLevelsBFS::TreeCheck190 uint32 MaxLevels() const
191 {
192 return fLevelCount;
193 }
194
SetLevelBFS::TreeCheck195 void SetLevel(uint32 level)
196 {
197 if (fLevelCount < level)
198 fLevelCount = level;
199 }
200
PreviousOffsetBFS::TreeCheck201 off_t PreviousOffset(uint32 level)
202 {
203 return fPreviousOffsets[level];
204 }
205
SetPreviousOffsetBFS::TreeCheck206 void SetPreviousOffset(uint32 level, off_t offset)
207 {
208 fPreviousOffsets[level] = offset;
209 }
210
FoundErrorBFS::TreeCheck211 void FoundError()
212 {
213 fFoundErrors++;
214 }
215
ErrorsFoundBFS::TreeCheck216 bool ErrorsFound()
217 {
218 return fFoundErrors != 0;
219 }
220
221 private:
222 uint32 fLevelCount;
223 uint32 fFreeCount;
224 uint32 fNodeSize;
225 uint32 fMaxLevels;
226 uint32 fFoundErrors;
227 BitmapArray fVisited;
228 BitmapArray fVisitedFragment;
229 off_t* fPreviousOffsets;
230 };
231
232
233 // #pragma mark -
234
235
236 // Node Caching for the BPlusTree class
237 //
238 // With write support, there is the need for a function that allocates new
239 // nodes by either returning empty nodes, or by growing the file's data stream
240 //
241 // !! The CachedNode class assumes that you have properly locked the stream
242 // !! before asking for nodes.
243 //
244 // Note: This code will fail if the block size is smaller than the node size!
245 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
246 // is hard-coded to 1024 bytes, that's not an issue now.
247
248 void
UnsetUnchanged(Transaction & transaction)249 CachedNode::UnsetUnchanged(Transaction& transaction)
250 {
251 if (fTree == NULL || fTree->fStream == NULL)
252 return;
253
254 if (fNode != NULL) {
255 void* cache = fTree->fStream->GetVolume()->BlockCache();
256
257 block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
258 block_cache_put(cache, fBlockNumber);
259 fNode = NULL;
260 }
261 }
262 #endif // !_BOOT_MODE
263
264
265 void
Unset()266 CachedNode::Unset()
267 {
268 if (fTree == NULL || fTree->fStream == NULL)
269 return;
270
271 if (fNode != NULL) {
272 #if !_BOOT_MODE
273 if (fWritable && fOffset == 0) {
274 // The B+tree header has been updated - we need to update the
275 // BPlusTrees copy of it, as well.
276 memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
277 }
278
279 block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
280 fBlockNumber);
281 #endif // !_BOOT_MODE
282
283 fNode = NULL;
284 }
285 }
286
287
288 const bplustree_node*
SetTo(off_t offset,bool check)289 CachedNode::SetTo(off_t offset, bool check)
290 {
291 const bplustree_node* node;
292 if (SetTo(offset, &node, check) == B_OK)
293 return node;
294
295 return NULL;
296 }
297
298
299 status_t
SetTo(off_t offset,const bplustree_node ** _node,bool check)300 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
301 {
302 if (fTree == NULL || fTree->fStream == NULL)
303 RETURN_ERROR(B_BAD_VALUE);
304
305 Unset();
306
307 // You can only ask for nodes at valid positions - you can't
308 // even access the b+tree header with this method (use SetToHeader()
309 // instead)
310 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
311 || offset <= 0
312 || (offset % fTree->fNodeSize) != 0) {
313 RETURN_ERROR(B_BAD_VALUE);
314 }
315
316 if (InternalSetTo(NULL, offset) != NULL && check) {
317 // sanity checks (links, all_key_count)
318 if (!fTree->fHeader.CheckNode(fNode)) {
319 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
320 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
321 fBlockNumber, fTree->fStream->ID()));
322 return B_BAD_DATA;
323 }
324 }
325
326 *_node = fNode;
327 return B_OK;
328 }
329
330
331 #if !_BOOT_MODE
332 bplustree_node*
SetToWritable(Transaction & transaction,off_t offset,bool check)333 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
334 {
335 if (fTree == NULL || fTree->fStream == NULL) {
336 REPORT_ERROR(B_BAD_VALUE);
337 return NULL;
338 }
339
340 Unset();
341
342 // You can only ask for nodes at valid positions - you can't
343 // even access the b+tree header with this method (use SetToHeader()
344 // instead)
345 if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
346 || offset <= 0
347 || (offset % fTree->fNodeSize) != 0)
348 return NULL;
349
350 if (InternalSetTo(&transaction, offset) != NULL && check) {
351 // sanity checks (links, all_key_count)
352 if (!fTree->fHeader.CheckNode(fNode)) {
353 FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
354 B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
355 fBlockNumber, fTree->fStream->ID()));
356 return NULL;
357 }
358 }
359 return fNode;
360 }
361
362
363 bplustree_node*
MakeWritable(Transaction & transaction)364 CachedNode::MakeWritable(Transaction& transaction)
365 {
366 if (fNode == NULL)
367 return NULL;
368
369 if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
370 fBlockNumber, transaction.ID()) == B_OK) {
371 return fNode;
372 }
373
374 return NULL;
375 }
376 #endif // !_BOOT_MODE
377
378
379 const bplustree_header*
SetToHeader()380 CachedNode::SetToHeader()
381 {
382 if (fTree == NULL || fTree->fStream == NULL) {
383 REPORT_ERROR(B_BAD_VALUE);
384 return NULL;
385 }
386
387 Unset();
388
389 InternalSetTo(NULL, 0LL);
390 return (bplustree_header*)fNode;
391 }
392
393
394 #if !_BOOT_MODE
395 bplustree_header*
SetToWritableHeader(Transaction & transaction)396 CachedNode::SetToWritableHeader(Transaction& transaction)
397 {
398 if (fTree == NULL || fTree->fStream == NULL) {
399 REPORT_ERROR(B_BAD_VALUE);
400 return NULL;
401 }
402
403 Unset();
404
405 InternalSetTo(&transaction, 0LL);
406
407 if (fNode != NULL && !fTree->fInTransaction) {
408 transaction.AddListener(fTree);
409 fTree->fInTransaction = true;
410
411 if (!transaction.GetVolume()->IsInitializing()) {
412 acquire_vnode(transaction.GetVolume()->FSVolume(),
413 fTree->fStream->ID());
414 }
415 }
416
417 return (bplustree_header*)fNode;
418 }
419 #endif // !_BOOT_MODE
420
421
422 bplustree_node*
InternalSetTo(Transaction * transaction,off_t offset)423 CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
424 {
425 fNode = NULL;
426 fOffset = offset;
427
428 off_t fileOffset;
429 block_run run;
430 if (offset < fTree->fStream->Size()
431 && fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
432
433 #if !_BOOT_MODE
434 Volume* volume = fTree->fStream->GetVolume();
435 #else
436 Volume* volume = &fTree->fStream->GetVolume();
437 #endif
438
439 int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
440 fBlockNumber = volume->ToBlock(run) + blockOffset;
441 uint8* block = NULL;
442
443 #if !_BOOT_MODE
444 if (transaction != NULL) {
445 block = (uint8*)block_cache_get_writable(volume->BlockCache(),
446 fBlockNumber, transaction->ID());
447 fWritable = true;
448 } else {
449 block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
450 fWritable = false;
451 }
452 #else // !_BOOT_MODE
453 if (fBlock == NULL) {
454 fBlock = (uint8*)malloc(volume->BlockSize());
455 if (fBlock == NULL)
456 return NULL;
457 }
458
459 if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
460 fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
461 block = fBlock;
462 }
463
464 fWritable = false;
465 #endif // _BOOT_MODE
466
467 if (block != NULL) {
468 // The node is somewhere in that block...
469 // (confusing offset calculation)
470 fNode = (bplustree_node*)(block + offset
471 - (fileOffset + (blockOffset << volume->BlockShift())));
472 } else
473 REPORT_ERROR(B_IO_ERROR);
474 }
475 return fNode;
476 }
477
478
479 #if !_BOOT_MODE
480 status_t
Free(Transaction & transaction,off_t offset)481 CachedNode::Free(Transaction& transaction, off_t offset)
482 {
483 if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
484 RETURN_ERROR(B_BAD_VALUE);
485
486 // TODO: scan the free nodes list and remove all nodes at the end
487 // of the tree - perhaps that shouldn't be done everytime that
488 // function is called, perhaps it should be done when the directory
489 // inode is closed or based on some calculation or whatever...
490
491 CachedNode cached(fTree);
492 bplustree_header* header = cached.SetToWritableHeader(transaction);
493 if (header == NULL)
494 return B_IO_ERROR;
495
496 #if 0
497 // TODO: temporarily disabled because CheckNode() doesn't like this...
498 // Also, it's such an edge case that it's almost useless, anyway.
499 // if the node is the last one in the tree, we shrink
500 // the tree and file size by one node
501 off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
502 if (offset == lastOffset) {
503 status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
504 if (status != B_OK)
505 return status;
506
507 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
508 return B_OK;
509 }
510 #endif
511
512 // add the node to the free nodes list
513 fNode->left_link = header->free_node_pointer;
514 fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
515
516 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
517 return B_OK;
518 }
519
520
521 status_t
Allocate(Transaction & transaction,bplustree_node ** _node,off_t * _offset)522 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
523 off_t* _offset)
524 {
525 if (fTree == NULL || fTree->fStream == NULL)
526 RETURN_ERROR(B_BAD_VALUE);
527
528 Unset();
529
530 bplustree_header* header;
531 status_t status;
532
533 // if there are any free nodes, recycle them
534 if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
535 CachedNode cached(fTree);
536 header = cached.SetToWritableHeader(transaction);
537 if (header == NULL)
538 return B_IO_ERROR;
539
540 *_offset = header->FreeNode();
541 *_node = fNode;
542
543 // set new free node pointer
544 header->free_node_pointer = fNode->left_link;
545 fNode->Initialize();
546 return B_OK;
547 }
548
549 // allocate space for a new node
550 Inode* stream = fTree->fStream;
551 if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
552 return status;
553
554 CachedNode cached(fTree);
555 header = cached.SetToWritableHeader(transaction);
556 if (header == NULL)
557 return B_IO_ERROR;
558
559 // the maximum_size has to be changed before the call to SetTo() - or
560 // else it will fail because the requested node is out of bounds
561 off_t offset = header->MaximumSize();
562 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
563 + fTree->fNodeSize);
564
565 cached.Unset();
566 // SetToWritable() below needs the new values in the tree's header
567
568 if (SetToWritable(transaction, offset, false) == NULL)
569 RETURN_ERROR(B_ERROR);
570
571 fNode->Initialize();
572
573 *_offset = offset;
574 *_node = fNode;
575 return B_OK;
576 }
577
578
579 // #pragma mark -
580
581
BPlusTree(Transaction & transaction,Inode * stream,int32 nodeSize)582 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
583 :
584 fStream(NULL),
585 fInTransaction(false)
586 {
587 mutex_init(&fIteratorLock, "bfs b+tree iterator");
588 SetTo(transaction, stream);
589 }
590 #endif // !_BOOT_MODE
591
592
BPlusTree(Inode * stream)593 BPlusTree::BPlusTree(Inode* stream)
594 :
595 fStream(NULL),
596 fInTransaction(false)
597 {
598 #if !_BOOT_MODE
599 mutex_init(&fIteratorLock, "bfs b+tree iterator");
600 #endif
601
602 SetTo(stream);
603 }
604
605
BPlusTree()606 BPlusTree::BPlusTree()
607 :
608 fStream(NULL),
609 fNodeSize(BPLUSTREE_NODE_SIZE),
610 fAllowDuplicates(true),
611 fInTransaction(false),
612 fStatus(B_NO_INIT)
613 {
614 #if !_BOOT_MODE
615 mutex_init(&fIteratorLock, "bfs b+tree iterator");
616 #endif
617 }
618
619
~BPlusTree()620 BPlusTree::~BPlusTree()
621 {
622 #if !_BOOT_MODE
623 // if there are any TreeIterators left, we need to stop them
624 // (can happen when the tree's inode gets deleted while
625 // traversing the tree - a TreeIterator doesn't lock the inode)
626 mutex_lock(&fIteratorLock);
627
628 SinglyLinkedList<TreeIterator>::ConstIterator iterator
629 = fIterators.GetIterator();
630 while (iterator.HasNext())
631 iterator.Next()->Stop();
632
633 mutex_destroy(&fIteratorLock);
634
635 ASSERT(!fInTransaction);
636 #endif // !_BOOT_MODE
637 }
638
639
640 #if !_BOOT_MODE
641 /*! Create a new B+Tree on the specified stream */
642 status_t
SetTo(Transaction & transaction,Inode * stream,int32 nodeSize)643 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
644 {
645 // initializes in-memory B+Tree
646
647 fStream = stream;
648
649 CachedNode cached(this);
650 bplustree_header* header = cached.SetToWritableHeader(transaction);
651 if (header == NULL) {
652 // allocate space for new header + node!
653 fStatus = stream->SetFileSize(transaction, nodeSize * 2);
654 if (fStatus != B_OK)
655 RETURN_ERROR(fStatus);
656
657 header = cached.SetToWritableHeader(transaction);
658 if (header == NULL)
659 RETURN_ERROR(fStatus = B_ERROR);
660 }
661
662 fAllowDuplicates = stream->IsIndex()
663 || (stream->Mode() & S_ALLOW_DUPS) != 0;
664
665 fNodeSize = nodeSize;
666
667 // initialize b+tree header
668 header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
669 header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
670 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
671 header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
672 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
673 header->free_node_pointer
674 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
675 header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
676
677 cached.Unset();
678
679 // initialize b+tree root node
680 cached.SetToWritable(transaction, fHeader.RootNode(), false);
681 if (cached.Node() == NULL)
682 RETURN_ERROR(B_IO_ERROR);
683
684 cached.Node()->Initialize();
685
686 return fStatus = B_OK;
687 }
688 #endif // !_BOOT_MODE
689
690
691 status_t
SetTo(Inode * stream)692 BPlusTree::SetTo(Inode* stream)
693 {
694 if (stream == NULL)
695 RETURN_ERROR(fStatus = B_BAD_VALUE);
696
697 fStream = stream;
698
699 // get on-disk B+Tree header
700
701 CachedNode cached(this);
702 const bplustree_header* header = cached.SetToHeader();
703 if (header != NULL)
704 memcpy(&fHeader, header, sizeof(bplustree_header));
705 else
706 RETURN_ERROR(fStatus = B_IO_ERROR);
707
708 // is header valid?
709
710 if (fHeader.MaximumSize() != stream->Size()) {
711 dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
712 B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
713 // we can't change the header since we don't have a transaction
714 //fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
715 }
716 if (!fHeader.IsValid()) {
717 #ifdef DEBUG
718 dump_bplustree_header(&fHeader);
719 dump_block((const char*)&fHeader, 128);
720 #endif
721 RETURN_ERROR(fStatus = B_BAD_DATA);
722 }
723
724 fNodeSize = fHeader.NodeSize();
725
726 // validity check
727 static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
728 S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
729 S_DOUBLE_INDEX};
730 uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
731 | S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
732 | S_FLOAT_INDEX | S_DOUBLE_INDEX);
733
734 if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
735 || ((stream->Mode() & S_INDEX_DIR) != 0
736 && kToMode[fHeader.DataType()] != mode)
737 || !stream->IsContainer()) {
738 D( dump_bplustree_header(&fHeader);
739 dump_inode(&stream->Node());
740 );
741 RETURN_ERROR(fStatus = B_BAD_TYPE);
742 }
743
744 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
745 // in the original BFS code - we will honour it nevertheless
746 fAllowDuplicates = stream->IsIndex()
747 || (stream->Mode() & S_ALLOW_DUPS) != 0;
748
749 cached.SetTo(fHeader.RootNode());
750 RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
751 }
752
753
754 status_t
InitCheck()755 BPlusTree::InitCheck()
756 {
757 return fStatus;
758 }
759
760
761 #if !_BOOT_MODE
762 status_t
Validate(bool repair,bool & _errorsFound)763 BPlusTree::Validate(bool repair, bool& _errorsFound)
764 {
765 TreeCheck check(this);
766 if (check.InitCheck() != B_OK)
767 return B_NO_MEMORY;
768
769 check.SetVisited(0);
770
771 // Walk the free nodes
772
773 CachedNode cached(this);
774 off_t freeOffset = fHeader.FreeNode();
775 while (freeOffset > 0) {
776 const bplustree_node* node;
777 status_t status = cached.SetTo(freeOffset, &node, false);
778 if (status != B_OK) {
779 if (status == B_IO_ERROR)
780 return B_IO_ERROR;
781
782 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
783 "not be read: %s\n", fStream->ID(), freeOffset,
784 strerror(status));
785 check.FoundError();
786 break;
787 }
788
789 if (check.Visited(freeOffset)) {
790 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
791 " circular!\n", fStream->ID(), freeOffset);
792 // TODO: if 'repair' is true, we could collect all unvisited nodes
793 // at the end, and put the into the free list
794 check.FoundError();
795 break;
796 }
797
798 check.SetVisited(freeOffset);
799
800 if (node->OverflowLink() != BPLUSTREE_FREE) {
801 dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
802 " misses free mark!\n", fStream->ID(), freeOffset);
803 }
804 freeOffset = node->LeftLink();
805 }
806
807 // Iterate over the complete tree recursively
808
809 const bplustree_node* root;
810 status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
811 if (status != B_OK)
812 return status;
813
814 status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
815
816 if (check.ErrorsFound())
817 _errorsFound = true;
818
819 if (status != B_OK)
820 return status;
821
822 if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
823 dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
824 "declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
825 fHeader.MaxNumberOfLevels());
826 }
827
828 if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
829 dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
830 " nodes.\n", fStream->ID(), check.VisitedCount(),
831 fHeader.MaximumSize() / fNodeSize);
832 }
833
834 return B_OK;
835 }
836
837
838 status_t
MakeEmpty()839 BPlusTree::MakeEmpty()
840 {
841 // Put all nodes into the free list in order
842 Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
843
844 // Reset the header, and root node
845 CachedNode cached(this);
846 bplustree_header* header = cached.SetToWritableHeader(transaction);
847 if (header == NULL)
848 return B_IO_ERROR;
849
850 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
851 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
852 if (fStream->Size() > (off_t)NodeSize() * 2)
853 header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
854 else {
855 header->free_node_pointer
856 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
857 }
858
859 bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
860 if (node == NULL)
861 return B_IO_ERROR;
862
863 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
864 node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
865 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
866 node->all_key_count = 0;
867 node->all_key_length = 0;
868
869 for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
870 offset += NodeSize()) {
871 bplustree_node* node = cached.SetToWritable(transaction, offset, false);
872 if (node == NULL) {
873 dprintf("--> could not open %" B_PRIdOFF "\n", offset);
874 return B_IO_ERROR;
875 }
876 if (offset < fStream->Size() - (off_t)NodeSize())
877 node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
878 else
879 node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
880
881 node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
882
883 // It's not important to write it out in a single transaction; just
884 // make sure it doesn't get too large
885 if (offset % (1024 * 1024) == 0) {
886 transaction.Done();
887 transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
888 }
889 }
890
891 return transaction.Done();
892 }
893
894
895 int32
TypeCodeToKeyType(type_code code)896 BPlusTree::TypeCodeToKeyType(type_code code)
897 {
898 switch (code) {
899 case B_STRING_TYPE:
900 return BPLUSTREE_STRING_TYPE;
901 case B_SSIZE_T_TYPE:
902 case B_INT32_TYPE:
903 return BPLUSTREE_INT32_TYPE;
904 case B_SIZE_T_TYPE:
905 case B_UINT32_TYPE:
906 return BPLUSTREE_UINT32_TYPE;
907 case B_OFF_T_TYPE:
908 case B_INT64_TYPE:
909 return BPLUSTREE_INT64_TYPE;
910 case B_UINT64_TYPE:
911 return BPLUSTREE_UINT64_TYPE;
912 case B_FLOAT_TYPE:
913 return BPLUSTREE_FLOAT_TYPE;
914 case B_DOUBLE_TYPE:
915 return BPLUSTREE_DOUBLE_TYPE;
916 }
917 return -1;
918 }
919
920
921 int32
ModeToKeyType(mode_t mode)922 BPlusTree::ModeToKeyType(mode_t mode)
923 {
924 switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
925 | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
926 case S_INT_INDEX:
927 return BPLUSTREE_INT32_TYPE;
928 case S_UINT_INDEX:
929 return BPLUSTREE_UINT32_TYPE;
930 case S_LONG_LONG_INDEX:
931 return BPLUSTREE_INT64_TYPE;
932 case S_ULONG_LONG_INDEX:
933 return BPLUSTREE_UINT64_TYPE;
934 case S_FLOAT_INDEX:
935 return BPLUSTREE_FLOAT_TYPE;
936 case S_DOUBLE_INDEX:
937 return BPLUSTREE_DOUBLE_TYPE;
938 case S_STR_INDEX:
939 default:
940 // default is for standard directories
941 return BPLUSTREE_STRING_TYPE;
942 }
943 }
944 #endif // !_BOOT_MODE
945
946
947 // #pragma mark - TransactionListener implementation
948
949
950 #if !_BOOT_MODE
951 void
TransactionDone(bool success)952 BPlusTree::TransactionDone(bool success)
953 {
954 if (!success) {
955 // update header from disk
956 CachedNode cached(this);
957 const bplustree_header* header = cached.SetToHeader();
958 if (header != NULL)
959 memcpy(&fHeader, header, sizeof(bplustree_header));
960 }
961 }
962
963
964 void
RemovedFromTransaction()965 BPlusTree::RemovedFromTransaction()
966 {
967 fInTransaction = false;
968
969 if (!fStream->GetVolume()->IsInitializing())
970 put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
971 }
972
973
974 // #pragma mark -
975
976
977 void
_UpdateIterators(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change)978 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
979 uint16 splitAt, int8 change)
980 {
981 // Although every iterator which is affected by this update currently
982 // waits on a semaphore, other iterators could be added/removed at
983 // any time, so we need to protect this loop
984 MutexLocker _(fIteratorLock);
985
986 SinglyLinkedList<TreeIterator>::ConstIterator iterator
987 = fIterators.GetIterator();
988 while (iterator.HasNext())
989 iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
990 }
991
992
993 void
_AddIterator(TreeIterator * iterator)994 BPlusTree::_AddIterator(TreeIterator* iterator)
995 {
996 MutexLocker _(fIteratorLock);
997 fIterators.Add(iterator);
998 }
999
1000
1001 void
_RemoveIterator(TreeIterator * iterator)1002 BPlusTree::_RemoveIterator(TreeIterator* iterator)
1003 {
1004 MutexLocker _(fIteratorLock);
1005 fIterators.Remove(iterator);
1006 }
1007 #endif // !_BOOT_MODE
1008
1009
1010 int32
_CompareKeys(const void * key1,int keyLength1,const void * key2,int keyLength2)1011 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
1012 int keyLength2)
1013 {
1014 type_code type = 0;
1015 switch (fHeader.DataType()) {
1016 case BPLUSTREE_STRING_TYPE:
1017 type = B_STRING_TYPE;
1018 break;
1019 case BPLUSTREE_INT32_TYPE:
1020 type = B_INT32_TYPE;
1021 break;
1022 case BPLUSTREE_UINT32_TYPE:
1023 type = B_UINT32_TYPE;
1024 break;
1025 case BPLUSTREE_INT64_TYPE:
1026 type = B_INT64_TYPE;
1027 break;
1028 case BPLUSTREE_UINT64_TYPE:
1029 type = B_UINT64_TYPE;
1030 break;
1031 case BPLUSTREE_FLOAT_TYPE:
1032 type = B_FLOAT_TYPE;
1033 break;
1034 case BPLUSTREE_DOUBLE_TYPE:
1035 type = B_DOUBLE_TYPE;
1036 break;
1037 }
1038 return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
1039 }
1040
1041
1042 status_t
_FindKey(const bplustree_node * node,const uint8 * key,uint16 keyLength,uint16 * _index,off_t * _next)1043 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
1044 uint16 keyLength, uint16* _index, off_t* _next)
1045 {
1046 #ifdef DEBUG
1047 NodeChecker checker(node, fNodeSize, "find");
1048 #endif
1049
1050 if (node->all_key_count == 0) {
1051 if (_index)
1052 *_index = 0;
1053 if (_next)
1054 *_next = node->OverflowLink();
1055 return B_ENTRY_NOT_FOUND;
1056 }
1057
1058 Unaligned<off_t>* values = node->Values();
1059 int16 saveIndex = -1;
1060
1061 // binary search in the key array
1062 for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1063 uint16 i = (first + last) >> 1;
1064
1065 uint16 searchLength = 0;
1066 uint8* searchKey = node->KeyAt(i, &searchLength);
1067 if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1068 > (uint8*)node + fNodeSize
1069 || searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1070 #if !_BOOT_MODE
1071 fStream->GetVolume()->Panic();
1072 #endif
1073 RETURN_ERROR(B_BAD_DATA);
1074 }
1075
1076 int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1077 if (cmp < 0) {
1078 last = i - 1;
1079 saveIndex = i;
1080 } else if (cmp > 0) {
1081 saveIndex = first = i + 1;
1082 } else {
1083 if (_index)
1084 *_index = i;
1085 if (_next)
1086 *_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1087 return B_OK;
1088 }
1089 }
1090
1091 if (_index)
1092 *_index = saveIndex;
1093 if (_next) {
1094 if (saveIndex == node->NumKeys())
1095 *_next = node->OverflowLink();
1096 else
1097 *_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1098 }
1099 return B_ENTRY_NOT_FOUND;
1100 }
1101
1102
1103 #if !_BOOT_MODE
1104 /*! Prepares the stack to contain all nodes that were passed while
1105 following the key, from the root node to the leaf node that could
1106 or should contain that key.
1107 */
1108 status_t
_SeekDown(Stack<node_and_key> & stack,const uint8 * key,uint16 keyLength)1109 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1110 uint16 keyLength)
1111 {
1112 // set the root node to begin with
1113 node_and_key nodeAndKey;
1114 nodeAndKey.nodeOffset = fHeader.RootNode();
1115
1116 CachedNode cached(this);
1117 const bplustree_node* node;
1118 while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1119 // if we are already on leaf level, we're done
1120 if (node->OverflowLink() == BPLUSTREE_NULL) {
1121 // node that the keyIndex is not properly set here (but it's not
1122 // needed in the calling functions anyway)!
1123 nodeAndKey.keyIndex = 0;
1124 stack.Push(nodeAndKey);
1125 return B_OK;
1126 }
1127
1128 off_t nextOffset;
1129 status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1130 &nextOffset);
1131
1132 if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1133 RETURN_ERROR(B_ERROR);
1134
1135 if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1136 dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1137 break;
1138 }
1139
1140 // put the node offset & the correct keyIndex on the stack
1141 stack.Push(nodeAndKey);
1142
1143 nodeAndKey.nodeOffset = nextOffset;
1144 }
1145
1146 FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1147 B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1148 return B_ERROR;
1149 }
1150
1151
1152 /*! This will find a free duplicate fragment in the given bplustree_node.
1153 The CachedNode will be set to the writable fragment on success.
1154 */
1155 status_t
_FindFreeDuplicateFragment(Transaction & transaction,const bplustree_node * node,CachedNode & cached,off_t * _offset,bplustree_node ** _fragment,uint32 * _index)1156 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1157 const bplustree_node* node, CachedNode& cached,
1158 off_t* _offset, bplustree_node** _fragment, uint32* _index)
1159 {
1160 Unaligned<off_t>* values = node->Values();
1161 for (int32 i = 0; i < node->NumKeys(); i++) {
1162 off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1163
1164 // does the value link to a duplicate fragment?
1165 if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1166 continue;
1167
1168 const bplustree_node* fragment = cached.SetTo(
1169 bplustree_node::FragmentOffset(value), false);
1170 if (fragment == NULL) {
1171 FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1172 B_PRIdOFF "\n", value, fStream->ID()));
1173 continue;
1174 }
1175
1176 // see if there is some space left for us
1177 uint32 num = bplustree_node::MaxFragments(fNodeSize);
1178 for (uint32 j = 0; j < num; j++) {
1179 duplicate_array* array = fragment->FragmentAt(j);
1180
1181 if (array->IsEmpty()) {
1182 // found an unused fragment
1183 *_fragment = cached.MakeWritable(transaction);
1184 if (*_fragment == NULL)
1185 return B_IO_ERROR;
1186
1187 *_offset = bplustree_node::FragmentOffset(value);
1188 *_index = j;
1189 return B_OK;
1190 }
1191 }
1192 }
1193 return B_ENTRY_NOT_FOUND;
1194 }
1195
1196
1197 status_t
_InsertDuplicate(Transaction & transaction,CachedNode & cached,const bplustree_node * node,uint16 index,off_t value)1198 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1199 const bplustree_node* node, uint16 index, off_t value)
1200 {
1201 CachedNode cachedDuplicate(this);
1202 Unaligned<off_t>* values = node->Values();
1203 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1204 status_t status;
1205 off_t offset;
1206
1207 if (bplustree_node::IsDuplicate(oldValue)) {
1208 // If it's a duplicate fragment, try to insert it into that, or if it
1209 // doesn't fit anymore, create a new duplicate node
1210
1211 if (bplustree_node::LinkType(oldValue)
1212 == BPLUSTREE_DUPLICATE_FRAGMENT) {
1213 bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1214 transaction, bplustree_node::FragmentOffset(oldValue), false);
1215 if (duplicate == NULL)
1216 return B_IO_ERROR;
1217
1218 duplicate_array* array = duplicate->FragmentAt(
1219 bplustree_node::FragmentIndex(oldValue));
1220 int32 arrayCount = array->Count();
1221 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1222 FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1223 "%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1224 (int)bplustree_node::FragmentIndex(oldValue),
1225 bplustree_node::FragmentOffset(oldValue), arrayCount,
1226 fStream->ID()));
1227 return B_BAD_DATA;
1228 }
1229
1230 if (arrayCount < NUM_FRAGMENT_VALUES) {
1231 array->Insert(value);
1232 } else {
1233 // Test if the fragment will be empty if we remove this key's
1234 // values
1235 if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1236 // The node will be empty without our values, so let us
1237 // reuse it as a duplicate node
1238 offset = bplustree_node::FragmentOffset(oldValue);
1239
1240 memmove(duplicate->DuplicateArray(), array,
1241 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1242 duplicate->left_link = duplicate->right_link
1243 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1244
1245 array = duplicate->DuplicateArray();
1246 array->Insert(value);
1247 } else {
1248 // Create a new duplicate node
1249
1250 cachedDuplicate.UnsetUnchanged(transaction);
1251 // The old duplicate has not been touched, so we can
1252 // reuse it
1253
1254 bplustree_node* newDuplicate;
1255 status = cachedDuplicate.Allocate(transaction,
1256 &newDuplicate, &offset);
1257 if (status != B_OK)
1258 RETURN_ERROR(status);
1259
1260 // Copy the array from the fragment node to the duplicate
1261 // node and free the old entry (by zero'ing all values)
1262 newDuplicate->overflow_link = array->count;
1263 memcpy(&newDuplicate->all_key_count, &array->values[0],
1264 array->Count() * sizeof(off_t));
1265 memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1266
1267 array = newDuplicate->DuplicateArray();
1268 array->Insert(value);
1269 }
1270
1271 // Update the main pointer to link to a duplicate node
1272 if (cached.MakeWritable(transaction) == NULL)
1273 return B_IO_ERROR;
1274
1275 values[index]
1276 = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1277 BPLUSTREE_DUPLICATE_NODE, offset));
1278 }
1279
1280 return B_OK;
1281 }
1282
1283 // Put the value into a dedicated duplicate node
1284
1285 // search for free space in the duplicate nodes of that key
1286 duplicate_array* array;
1287 int32 arrayCount;
1288 const bplustree_node* duplicate;
1289 off_t duplicateOffset;
1290 do {
1291 duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1292 duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1293 if (duplicate == NULL)
1294 return B_IO_ERROR;
1295
1296 array = duplicate->DuplicateArray();
1297 arrayCount =array->Count();
1298 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1299 FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1300 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1301 duplicateOffset, arrayCount, fStream->ID()));
1302 return B_BAD_DATA;
1303 }
1304 } while (arrayCount >= NUM_DUPLICATE_VALUES
1305 && (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1306
1307 bplustree_node* writableDuplicate
1308 = cachedDuplicate.MakeWritable(transaction);
1309 if (writableDuplicate == NULL)
1310 return B_IO_ERROR;
1311
1312 if (arrayCount < NUM_DUPLICATE_VALUES) {
1313 array = writableDuplicate->DuplicateArray();
1314 array->Insert(value);
1315 } else {
1316 // no space left - add a new duplicate node
1317
1318 bplustree_node* newDuplicate;
1319 status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1320 &offset);
1321 if (status != B_OK)
1322 RETURN_ERROR(status);
1323
1324 // link the two nodes together
1325 writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1326 newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1327
1328 array = newDuplicate->DuplicateArray();
1329 array->count = 0;
1330 array->Insert(value);
1331 }
1332 return B_OK;
1333 }
1334
1335 // Search for a free duplicate fragment or create a new one
1336 // to insert the duplicate value into
1337
1338 uint32 fragmentIndex = 0;
1339 bplustree_node* fragment;
1340 if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1341 &offset, &fragment, &fragmentIndex) != B_OK) {
1342 // allocate a new duplicate fragment node
1343 status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1344 if (status != B_OK)
1345 RETURN_ERROR(status);
1346
1347 memset(fragment, 0, fNodeSize);
1348 }
1349
1350 duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1351 array->Insert(oldValue);
1352 array->Insert(value);
1353
1354 if (cached.MakeWritable(transaction) == NULL)
1355 return B_IO_ERROR;
1356
1357 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1358 BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1359
1360 return B_OK;
1361 }
1362
1363
1364 void
_InsertKey(bplustree_node * node,uint16 index,uint8 * key,uint16 keyLength,off_t value)1365 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1366 uint16 keyLength, off_t value)
1367 {
1368 // should never happen, but who knows?
1369 if (index > node->NumKeys())
1370 return;
1371
1372 Unaligned<off_t>* values = node->Values();
1373 Unaligned<uint16>* keyLengths = node->KeyLengths();
1374 uint8* keys = node->Keys();
1375
1376 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1377 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1378 + keyLength);
1379
1380 Unaligned<off_t>* newValues = node->Values();
1381 Unaligned<uint16>* newKeyLengths = node->KeyLengths();
1382
1383 // move values and copy new value into them
1384 memmove(newValues + index + 1, values + index,
1385 sizeof(off_t) * (node->NumKeys() - 1 - index));
1386 memmove(newValues, values, sizeof(off_t) * index);
1387
1388 newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1389
1390 // move and update key length index
1391 for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1392 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1393 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1394 }
1395 memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1396
1397 int32 keyStart;
1398 newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1399 + (keyStart = index > 0
1400 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1401
1402 // move keys and copy new key into them
1403 uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1404 int32 size = node->AllKeyLength() - length;
1405 if (size > 0)
1406 memmove(keys + length, keys + length - keyLength, size);
1407
1408 memcpy(keys + keyStart, key, keyLength);
1409 }
1410
1411
1412 /*! Splits the \a node into two halves - the other half will be put into
1413 \a other. It also takes care to create a new overflow link if the node
1414 to split is an index node.
1415 */
1416 status_t
_SplitNode(bplustree_node * node,off_t nodeOffset,bplustree_node * other,off_t otherOffset,uint16 * _keyIndex,uint8 * key,uint16 * _keyLength,off_t * _value)1417 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1418 bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1419 uint16* _keyLength, off_t* _value)
1420 {
1421 if (*_keyIndex > node->NumKeys() + 1)
1422 return B_BAD_VALUE;
1423
1424 Unaligned<uint16>* inKeyLengths = node->KeyLengths();
1425 Unaligned<off_t>* inKeyValues = node->Values();
1426 uint8* inKeys = node->Keys();
1427 uint8* outKeys = other->Keys();
1428 int32 keyIndex = *_keyIndex; // can become less than zero!
1429
1430 if (keyIndex > node->NumKeys()) {
1431 FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1432 "\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1433 return B_BAD_VALUE;
1434 }
1435
1436 // How many keys will fit in one (half) page?
1437 // The following loop will find the answer to this question and
1438 // change the key lengths indices for their new home
1439
1440 // "bytes" is the number of bytes written for the new key,
1441 // "bytesBefore" are the bytes before that key
1442 // "bytesAfter" are the bytes after the new key, if any
1443 int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1444
1445 size_t size = fNodeSize >> 1;
1446 int32 out, in;
1447 size_t keyLengths = 0;
1448 for (in = out = 0; in < node->NumKeys() + 1;) {
1449 keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]);
1450
1451 if (in == keyIndex && !bytes) {
1452 bytes = *_keyLength;
1453 bytesBefore = in > 0
1454 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1455 } else {
1456 if (keyIndex < out)
1457 bytesAfter = keyLengths - bytesBefore;
1458
1459 in++;
1460 }
1461 out++;
1462
1463 if (key_align(sizeof(bplustree_node) + bytes + keyLengths)
1464 + out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1465 // we have found the number of keys in the new node!
1466 break;
1467 }
1468 }
1469
1470 // if the new key was not inserted, set the length of the keys
1471 // that can be copied directly
1472
1473 if (keyIndex >= out && in > 0)
1474 bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1475 else if (keyIndex + 1 < out)
1476 bytesAfter = keyLengths - bytesBefore;
1477
1478 if (bytesBefore < 0 || bytesAfter < 0)
1479 return B_BAD_DATA;
1480
1481 other->left_link = node->left_link;
1482 other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1483 other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1484 + bytesAfter);
1485 other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1486
1487 Unaligned<uint16>* outKeyLengths = other->KeyLengths();
1488 Unaligned<off_t>* outKeyValues = other->Values();
1489 int32 keys = out > keyIndex ? keyIndex : out;
1490
1491 if (bytesBefore) {
1492 // copy the keys
1493 memcpy(outKeys, inKeys, bytesBefore);
1494 memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1495 memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1496 }
1497 if (bytes) {
1498 // copy the newly inserted key
1499 memcpy(outKeys + bytesBefore, key, bytes);
1500 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1501 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1502
1503 if (bytesAfter) {
1504 // copy the keys after the new key
1505 memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1506 bytesAfter);
1507 keys = out - keyIndex - 1;
1508 for (int32 i = 0;i < keys;i++) {
1509 outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1510 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1511 + bytes);
1512 }
1513 memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1514 keys * sizeof(off_t));
1515 }
1516 }
1517
1518 // if the new key was already inserted, we shouldn't use it again
1519 if (in != out)
1520 keyIndex--;
1521
1522 int32 total = bytesBefore + bytesAfter;
1523
1524 // these variables are for the key that will be returned
1525 // to the parent node
1526 uint8* newKey = NULL;
1527 uint16 newLength = 0;
1528 bool newAllocated = false;
1529
1530 // If we have split an index node, we have to drop the first key
1531 // of the next node (which can also be the new key to insert).
1532 // The dropped key is also the one which has to be inserted in
1533 // the parent node, so we will set the "newKey" already here.
1534 if (node->OverflowLink() != BPLUSTREE_NULL) {
1535 if (in == keyIndex) {
1536 newKey = key;
1537 newLength = *_keyLength;
1538
1539 other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1540 keyIndex--;
1541 } else {
1542 // If a key is dropped (is not the new key), we have to copy
1543 // it, because it would be lost if not.
1544 uint8* droppedKey = node->KeyAt(in, &newLength);
1545 if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1546 > (uint8*)node + fNodeSize
1547 || newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1548 fStream->GetVolume()->Panic();
1549 RETURN_ERROR(B_BAD_DATA);
1550 }
1551 newKey = (uint8*)malloc(newLength);
1552 if (newKey == NULL)
1553 return B_NO_MEMORY;
1554
1555 newAllocated = true;
1556 memcpy(newKey, droppedKey, newLength);
1557
1558 other->overflow_link = inKeyValues[in];
1559 total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1560 }
1561 }
1562
1563 // and now the same game for the other page and the rest of the keys
1564 // (but with memmove() instead of memcpy(), because they may overlap)
1565
1566 bytesBefore = bytesAfter = bytes = 0;
1567 out = 0;
1568 int32 skip = in;
1569 while (in < node->NumKeys() + 1) {
1570 if (in == keyIndex && !bytes) {
1571 // it's enough to set bytesBefore once here, because we do
1572 // not need to know the exact length of all keys in this
1573 // loop
1574 bytesBefore = in > skip
1575 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1576 bytes = *_keyLength;
1577 out++;
1578 } else {
1579 if (in < node->NumKeys()) {
1580 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1581 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1582
1583 if (bytes) {
1584 inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1585 BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1586
1587 bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1588 - bytesBefore - bytes;
1589 }
1590 out++;
1591 }
1592 in++;
1593 }
1594 }
1595
1596 // adjust the byte counts (since we were a bit lazy in the loop)
1597 if (keyIndex < skip)
1598 bytesBefore = node->AllKeyLength() - total;
1599
1600 if (bytesBefore < 0 || bytesAfter < 0) {
1601 if (newAllocated)
1602 free(newKey);
1603 return B_BAD_DATA;
1604 }
1605
1606 node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1607 // right link, and overflow link can stay the same
1608 node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1609 + bytesAfter);
1610 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1611
1612 // array positions have changed
1613 outKeyLengths = node->KeyLengths();
1614 outKeyValues = node->Values();
1615
1616 // move the keys in the old node: the order is important here,
1617 // because we don't want to overwrite any contents
1618
1619 keys = keyIndex <= skip ? out : keyIndex - skip;
1620 keyIndex -= skip;
1621 in = out - keyIndex - 1;
1622 // Note: keyIndex and in will contain invalid values when the new key
1623 // went to the other node. But in this case bytes and bytesAfter are
1624 // 0 and subsequently we never use keyIndex and in.
1625
1626 if (bytesBefore)
1627 memmove(inKeys, inKeys + total, bytesBefore);
1628 if (bytesAfter) {
1629 memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1630 bytesAfter);
1631 }
1632
1633 if (bytesBefore)
1634 memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1635 if (bytesAfter) {
1636 // if byteAfter is > 0, keyIndex is larger than skip
1637 memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1638 in * sizeof(uint16));
1639 }
1640
1641 if (bytesBefore)
1642 memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1643 if (bytesAfter) {
1644 memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1645 in * sizeof(off_t));
1646 }
1647
1648 if (bytes) {
1649 // finally, copy the newly inserted key (don't overwrite anything)
1650 memcpy(inKeys + bytesBefore, key, bytes);
1651 outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1652 outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1653 }
1654
1655 // Prepare the key that will be inserted in the parent node which
1656 // is either the dropped key or the last of the other node.
1657 // If it's the dropped key, "newKey" was already set earlier.
1658
1659 if (newKey == NULL)
1660 newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1661
1662 memcpy(key, newKey, newLength);
1663 *_keyLength = newLength;
1664 *_value = otherOffset;
1665
1666 if (newAllocated)
1667 free(newKey);
1668
1669 return B_OK;
1670 }
1671
1672
1673 /*! This inserts a key into the tree. The changes made to the tree will
1674 all be part of the \a transaction.
1675 You need to have the inode write locked.
1676 */
1677 status_t
Insert(Transaction & transaction,const uint8 * key,uint16 keyLength,off_t value)1678 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1679 off_t value)
1680 {
1681 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1682 || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1683 RETURN_ERROR(B_BAD_VALUE);
1684 #ifdef DEBUG
1685 if (value < 0)
1686 panic("tried to insert invalid value %" B_PRId64 "!\n", value);
1687 #endif
1688
1689 ASSERT_WRITE_LOCKED_INODE(fStream);
1690
1691 Stack<node_and_key> stack;
1692 if (_SeekDown(stack, key, keyLength) != B_OK)
1693 RETURN_ERROR(B_ERROR);
1694
1695 uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1696 memcpy(keyBuffer, key, keyLength);
1697
1698 node_and_key nodeAndKey;
1699 const bplustree_node* node;
1700
1701 CachedNode cached(this);
1702 while (stack.Pop(&nodeAndKey)
1703 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1704 #ifdef DEBUG
1705 NodeChecker checker(node, fNodeSize, "insert");
1706 #endif
1707 if (node->IsLeaf()) {
1708 // first round, check for duplicate entries
1709 status_t status = _FindKey(node, key, keyLength,
1710 &nodeAndKey.keyIndex);
1711
1712 // is this a duplicate entry?
1713 if (status == B_OK) {
1714 if (fAllowDuplicates) {
1715 status = _InsertDuplicate(transaction, cached, node,
1716 nodeAndKey.keyIndex, value);
1717 if (status != B_OK)
1718 RETURN_ERROR(status);
1719 return B_OK;
1720 }
1721
1722 return B_NAME_IN_USE;
1723 }
1724 }
1725
1726 bplustree_node* writableNode = cached.MakeWritable(transaction);
1727 if (writableNode == NULL)
1728 return B_IO_ERROR;
1729
1730 // is the node big enough to hold the pair?
1731 if (int32(key_align(sizeof(bplustree_node)
1732 + writableNode->AllKeyLength() + keyLength)
1733 + (writableNode->NumKeys() + 1) * (sizeof(uint16)
1734 + sizeof(off_t))) < fNodeSize) {
1735 _InsertKey(writableNode, nodeAndKey.keyIndex,
1736 keyBuffer, keyLength, value);
1737 _UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1738 nodeAndKey.keyIndex, 0, 1);
1739
1740 return B_OK;
1741 } else {
1742 CachedNode cachedNewRoot(this);
1743 CachedNode cachedOther(this);
1744
1745 // do we need to allocate a new root node? if so, then do
1746 // it now
1747 off_t newRoot = BPLUSTREE_NULL;
1748 if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1749 bplustree_node* root;
1750 status_t status = cachedNewRoot.Allocate(transaction, &root,
1751 &newRoot);
1752 if (status != B_OK) {
1753 // The tree is most likely corrupted!
1754 // But it's still sane at leaf level - we could set
1755 // a flag in the header that forces the tree to be
1756 // rebuild next time...
1757 // But since we will have journaling, that's not a big
1758 // problem anyway.
1759 RETURN_ERROR(status);
1760 }
1761 }
1762
1763 // reserve space for the other node
1764 bplustree_node* other;
1765 off_t otherOffset;
1766 status_t status = cachedOther.Allocate(transaction, &other,
1767 &otherOffset);
1768 if (status != B_OK) {
1769 cachedNewRoot.Free(transaction, newRoot);
1770 RETURN_ERROR(status);
1771 }
1772
1773 if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1774 otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1775 &value) != B_OK) {
1776 // free root node & other node here
1777 cachedOther.Free(transaction, otherOffset);
1778 cachedNewRoot.Free(transaction, newRoot);
1779
1780 RETURN_ERROR(B_ERROR);
1781 }
1782 #ifdef DEBUG
1783 checker.Check("insert split");
1784 NodeChecker otherChecker(other, fNodeSize, "insert split other");
1785 #endif
1786
1787 _UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1788 nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1789
1790 // update the right link of the node in the left of the new node
1791 if ((other = cachedOther.SetToWritable(transaction,
1792 other->LeftLink())) != NULL) {
1793 other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1794 }
1795
1796 // create a new root if necessary
1797 if (newRoot != BPLUSTREE_NULL) {
1798 bplustree_node* root = cachedNewRoot.Node();
1799
1800 _InsertKey(root, 0, keyBuffer, keyLength,
1801 writableNode->LeftLink());
1802 root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1803 nodeAndKey.nodeOffset);
1804
1805 CachedNode cached(this);
1806 bplustree_header* header
1807 = cached.SetToWritableHeader(transaction);
1808 if (header == NULL)
1809 return B_IO_ERROR;
1810
1811 // finally, update header to point to the new root
1812 header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1813 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1814 header->MaxNumberOfLevels() + 1);
1815
1816 return B_OK;
1817 }
1818 }
1819 }
1820 RETURN_ERROR(B_ERROR);
1821 }
1822
1823
1824 /*! Removes the duplicate index/value pair from the tree.
1825 It's part of the private tree interface.
1826 */
1827 status_t
_RemoveDuplicate(Transaction & transaction,const bplustree_node * node,CachedNode & cached,uint16 index,off_t value)1828 BPlusTree::_RemoveDuplicate(Transaction& transaction,
1829 const bplustree_node* node, CachedNode& cached, uint16 index,
1830 off_t value)
1831 {
1832 Unaligned<off_t>* values = node->Values();
1833 off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1834
1835 CachedNode cachedDuplicate(this);
1836 off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1837 bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1838 duplicateOffset, false);
1839 if (duplicate == NULL)
1840 return B_IO_ERROR;
1841
1842 // if it's a duplicate fragment, remove the entry from there
1843 if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1844 duplicate_array* array = duplicate->FragmentAt(
1845 bplustree_node::FragmentIndex(oldValue));
1846 int32 arrayCount = array->Count();
1847
1848 if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1849 FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1850 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1851 (int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1852 arrayCount, fStream->ID()));
1853 return B_BAD_DATA;
1854 }
1855 if (!array->Remove(value)) {
1856 FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1857 "%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1858 duplicateOffset, fStream->ID()));
1859 return B_ENTRY_NOT_FOUND;
1860 }
1861
1862 // remove the array from the fragment node if it is empty
1863 if (--arrayCount == 1) {
1864 // set the link to the remaining value
1865 if (cached.MakeWritable(transaction) == NULL)
1866 return B_IO_ERROR;
1867
1868 values[index] = array->values[0];
1869
1870 // Remove the whole fragment node, if this was the only array,
1871 // otherwise free just the array
1872 if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1873 status_t status = cachedDuplicate.Free(transaction,
1874 duplicateOffset);
1875 if (status != B_OK)
1876 return status;
1877 } else
1878 array->count = 0;
1879 }
1880 return B_OK;
1881 }
1882
1883 // Remove value from a duplicate node!
1884
1885 duplicate_array* array = NULL;
1886 int32 arrayCount = 0;
1887
1888 if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1889 FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1890 ", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1891 return B_BAD_DATA;
1892 }
1893
1894 // Search the duplicate nodes until the entry could be found (and removed)
1895 while (duplicate != NULL) {
1896 array = duplicate->DuplicateArray();
1897 arrayCount = array->Count();
1898
1899 if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1900 FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1901 B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1902 duplicateOffset, arrayCount, fStream->ID()));
1903 return B_BAD_DATA;
1904 }
1905
1906 if (array->Remove(value)) {
1907 arrayCount--;
1908 break;
1909 }
1910
1911 if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1912 RETURN_ERROR(B_ENTRY_NOT_FOUND);
1913
1914 cachedDuplicate.UnsetUnchanged(transaction);
1915 duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1916 false);
1917 }
1918 if (duplicate == NULL)
1919 RETURN_ERROR(B_IO_ERROR);
1920
1921 // The entry got removed from the duplicate node, but we might want to free
1922 // it now in case it's empty
1923
1924 while (true) {
1925 off_t left = duplicate->LeftLink();
1926 off_t right = duplicate->RightLink();
1927 bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1928
1929 if ((isLast && arrayCount == 1) || arrayCount == 0) {
1930 // Free empty duplicate page, link their siblings together, and
1931 // update the duplicate link if needed (ie. when we either remove
1932 // the last duplicate node or have a new first one)
1933
1934 if (left == BPLUSTREE_NULL) {
1935 // the duplicate link points to us
1936 if (cached.MakeWritable(transaction) == NULL)
1937 return B_IO_ERROR;
1938
1939 if (arrayCount == 1) {
1940 // This is the last node, and there is only one value left;
1941 // replace the duplicate link with that value, it's no
1942 // duplicate anymore
1943 values[index] = array->values[0];
1944 } else {
1945 // Move the duplicate link to the next node
1946 values[index] = HOST_ENDIAN_TO_BFS_INT64(
1947 bplustree_node::MakeLink(
1948 BPLUSTREE_DUPLICATE_NODE, right));
1949 }
1950 }
1951
1952 status_t status = cachedDuplicate.Free(transaction,
1953 duplicateOffset);
1954 if (status != B_OK)
1955 return status;
1956
1957 if (left != BPLUSTREE_NULL
1958 && (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1959 false)) != NULL) {
1960 duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1961
1962 // If the next node is the last node, we need to free that node
1963 // and convert the duplicate entry back into a normal entry
1964 array = duplicate->DuplicateArray();
1965 arrayCount = array->Count();
1966 if (right == BPLUSTREE_NULL
1967 && duplicate->LeftLink() == BPLUSTREE_NULL
1968 && arrayCount <= NUM_FRAGMENT_VALUES) {
1969 duplicateOffset = left;
1970 continue;
1971 }
1972 }
1973 if (right != BPLUSTREE_NULL
1974 && (duplicate = cachedDuplicate.SetToWritable(transaction,
1975 right, false)) != NULL) {
1976 duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1977
1978 // Again, we may need to turn the duplicate entry back into a
1979 // normal entry
1980 array = duplicate->DuplicateArray();
1981 arrayCount = array->Count();
1982 if (left == BPLUSTREE_NULL
1983 && duplicate->RightLink() == BPLUSTREE_NULL
1984 && arrayCount <= NUM_FRAGMENT_VALUES) {
1985 duplicateOffset = right;
1986 continue;
1987 }
1988 }
1989 return B_OK;
1990 }
1991 if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1992 // If the number of entries fits in a duplicate fragment, then
1993 // either find a free fragment node, or convert this node to a
1994 // fragment node.
1995 CachedNode cachedOther(this);
1996
1997 bplustree_node* fragment = NULL;
1998 uint32 fragmentIndex = 0;
1999 off_t offset;
2000 if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
2001 &offset, &fragment, &fragmentIndex) == B_OK) {
2002 // move to other node
2003 duplicate_array* target = fragment->FragmentAt(fragmentIndex);
2004 memcpy(target, array,
2005 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2006
2007 cachedDuplicate.Free(transaction, duplicateOffset);
2008 duplicateOffset = offset;
2009 } else {
2010 // convert node
2011 memmove(duplicate, array,
2012 (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2013 memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
2014 fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2015 }
2016
2017 if (cached.MakeWritable(transaction) == NULL)
2018 return B_IO_ERROR;
2019
2020 values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
2021 BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
2022 }
2023 return B_OK;
2024 }
2025 }
2026
2027
2028 /*! Removes the key with the given index from the specified node.
2029 Since it has to get the key from the node anyway (to obtain it's
2030 pointer), it's not needed to pass the key & its length, although
2031 the calling method (BPlusTree::Remove()) have this data.
2032 */
2033 void
_RemoveKey(bplustree_node * node,uint16 index)2034 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
2035 {
2036 // should never happen, but who knows?
2037 if (index > node->NumKeys() && node->NumKeys() > 0) {
2038 FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
2039 "\n", index, fStream->ID()));
2040 return;
2041 }
2042
2043 Unaligned<off_t>* values = node->Values();
2044
2045 // if we would have to drop the overflow link, drop
2046 // the last key instead and update the overflow link
2047 // to the value of that one
2048 if (!node->IsLeaf() && index == node->NumKeys())
2049 node->overflow_link = values[--index];
2050
2051 uint16 length = 0;
2052 uint8* key = node->KeyAt(index, &length);
2053 if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
2054 || length > BPLUSTREE_MAX_KEY_LENGTH) {
2055 FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
2056 fStream->ID()));
2057 fStream->GetVolume()->Panic();
2058 return;
2059 }
2060
2061 Unaligned<uint16>* keyLengths = node->KeyLengths();
2062 uint8* keys = node->Keys();
2063
2064 node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
2065 node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
2066 node->AllKeyLength() - length);
2067
2068 Unaligned<off_t>* newValues = node->Values();
2069 Unaligned<uint16>* newKeyLengths = node->KeyLengths();
2070
2071 // move key data
2072 memmove(key, key + length, node->AllKeyLength() - (key - keys));
2073
2074 // move and update key lengths
2075 if (index > 0 && newKeyLengths != keyLengths)
2076 memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2077 for (uint16 i = index; i < node->NumKeys(); i++) {
2078 newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2079 BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2080 }
2081
2082 // move values
2083 if (index > 0)
2084 memmove(newValues, values, index * sizeof(off_t));
2085 if (node->NumKeys() > index) {
2086 memmove(newValues + index, values + index + 1,
2087 (node->NumKeys() - index) * sizeof(off_t));
2088 }
2089 }
2090
2091
2092 /*! Removes the specified key from the tree. The "value" parameter is only used
2093 for trees which allow duplicates, so you may safely ignore it.
2094 It's not an optional parameter, so at least you have to think about it.
2095 You need to have the inode write locked.
2096 */
2097 status_t
Remove(Transaction & transaction,const uint8 * key,uint16 keyLength,off_t value)2098 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2099 off_t value)
2100 {
2101 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2102 || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2103 RETURN_ERROR(B_BAD_VALUE);
2104
2105 ASSERT_WRITE_LOCKED_INODE(fStream);
2106
2107 Stack<node_and_key> stack;
2108 if (_SeekDown(stack, key, keyLength) != B_OK)
2109 RETURN_ERROR(B_ERROR);
2110
2111 node_and_key nodeAndKey;
2112 const bplustree_node* node;
2113
2114 CachedNode cached(this);
2115 while (stack.Pop(&nodeAndKey)
2116 && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2117 #ifdef DEBUG
2118 NodeChecker checker(node, fNodeSize, "remove");
2119 #endif
2120 if (node->IsLeaf()) {
2121 // first round, check for duplicate entries
2122 status_t status = _FindKey(node, key, keyLength,
2123 &nodeAndKey.keyIndex);
2124 if (status != B_OK)
2125 RETURN_ERROR(status);
2126
2127 // Is this a duplicate entry?
2128 if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2129 node->Values()[nodeAndKey.keyIndex]))) {
2130 if (fAllowDuplicates) {
2131 return _RemoveDuplicate(transaction, node, cached,
2132 nodeAndKey.keyIndex, value);
2133 }
2134
2135 FATAL(("dupliate node found where no duplicates are "
2136 "allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2137 RETURN_ERROR(B_ERROR);
2138 } else {
2139 if (node->Values()[nodeAndKey.keyIndex] != value)
2140 return B_ENTRY_NOT_FOUND;
2141
2142 // If we will remove the last key, the iterator will be set
2143 // to the next node after the current - if there aren't any
2144 // more nodes, we need a way to prevent the TreeIterators to
2145 // touch the old node again, we use BPLUSTREE_FREE for this
2146 off_t next = node->RightLink() == BPLUSTREE_NULL
2147 ? BPLUSTREE_FREE : node->RightLink();
2148 _UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
2149 ? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
2150 }
2151 }
2152
2153 bplustree_node* writableNode = cached.MakeWritable(transaction);
2154 if (writableNode == NULL)
2155 return B_IO_ERROR;
2156
2157 // if it's an empty root node, we have to convert it
2158 // to a leaf node by dropping the overflow link, or,
2159 // if it's already a leaf node, just empty it
2160 if (nodeAndKey.nodeOffset == fHeader.RootNode()
2161 && (node->NumKeys() == 0
2162 || (node->NumKeys() == 1 && node->IsLeaf()))) {
2163 writableNode->overflow_link
2164 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2165 writableNode->all_key_count = 0;
2166 writableNode->all_key_length = 0;
2167
2168 // if we've made a leaf node out of the root node, we need
2169 // to reset the maximum number of levels in the header
2170 if (fHeader.MaxNumberOfLevels() != 1) {
2171 CachedNode cached(this);
2172 bplustree_header* header
2173 = cached.SetToWritableHeader(transaction);
2174 if (header == NULL)
2175 return B_IO_ERROR;
2176
2177 header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2178 }
2179 return B_OK;
2180 }
2181
2182 // if there is only one key left, we don't have to remove
2183 // it, we can just dump the node (index nodes still have
2184 // the overflow link, so we have to drop the last key)
2185 if (writableNode->NumKeys() > 1
2186 || (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2187 _RemoveKey(writableNode, nodeAndKey.keyIndex);
2188 return B_OK;
2189 }
2190
2191 // when we are here, we can just free the node, but
2192 // we have to update the right/left link of the
2193 // siblings first
2194 CachedNode otherCached(this);
2195 bplustree_node* other = otherCached.SetToWritable(transaction,
2196 writableNode->LeftLink());
2197 if (other != NULL)
2198 other->right_link = writableNode->right_link;
2199
2200 if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2201 != NULL) {
2202 other->left_link = writableNode->left_link;
2203 }
2204
2205 cached.Free(transaction, nodeAndKey.nodeOffset);
2206 }
2207 RETURN_ERROR(B_ERROR);
2208 }
2209
2210
2211 /*! Replaces the value for the key in the tree.
2212 Returns B_OK if the key could be found and its value replaced,
2213 B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2214 to indicate that something went terribly wrong.
2215 Note that this doesn't work with duplicates - it will just
2216 return B_BAD_TYPE if you call this function on a tree where
2217 duplicates are allowed.
2218 You need to have the inode write locked.
2219 */
2220 status_t
Replace(Transaction & transaction,const uint8 * key,uint16 keyLength,off_t value)2221 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2222 off_t value)
2223 {
2224 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2225 || keyLength > BPLUSTREE_MAX_KEY_LENGTH
2226 || key == NULL)
2227 RETURN_ERROR(B_BAD_VALUE);
2228
2229 if (fAllowDuplicates)
2230 RETURN_ERROR(B_BAD_TYPE);
2231
2232 ASSERT_WRITE_LOCKED_INODE(fStream);
2233
2234 off_t nodeOffset = fHeader.RootNode();
2235 CachedNode cached(this);
2236 const bplustree_node* node;
2237
2238 while ((node = cached.SetTo(nodeOffset)) != NULL) {
2239 uint16 keyIndex = 0;
2240 off_t nextOffset;
2241 status_t status = _FindKey(node, key, keyLength, &keyIndex,
2242 &nextOffset);
2243
2244 if (node->OverflowLink() == BPLUSTREE_NULL) {
2245 if (status == B_OK) {
2246 bplustree_node* writableNode = cached.MakeWritable(transaction);
2247 if (writableNode != NULL) {
2248 writableNode->Values()[keyIndex]
2249 = HOST_ENDIAN_TO_BFS_INT64(value);
2250 } else
2251 status = B_IO_ERROR;
2252 }
2253
2254 return status;
2255 } else if (nextOffset == nodeOffset)
2256 RETURN_ERROR(B_ERROR);
2257
2258 nodeOffset = nextOffset;
2259 }
2260 RETURN_ERROR(B_ERROR);
2261 }
2262 #endif // !_BOOT_MODE
2263
2264
2265 /*! Searches the key in the tree, and stores the offset found in _value,
2266 if successful.
2267 It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
2268 while it descends the tree.
2269 Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2270 It can also return other errors to indicate that something went wrong.
2271 Note that this doesn't work with duplicates - it will just return
2272 B_BAD_TYPE if you call this function on a tree where duplicates are
2273 allowed.
2274 You need to have the inode read or write locked.
2275 */
2276 status_t
Find(const uint8 * key,uint16 keyLength,off_t * _value)2277 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2278 {
2279 if (key == NULL || keyLength < BPLUSTREE_MIN_KEY_LENGTH)
2280 RETURN_ERROR(B_BAD_VALUE);
2281
2282 if (keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2283 RETURN_ERROR(B_NAME_TOO_LONG);
2284
2285 if (fAllowDuplicates)
2286 RETURN_ERROR(B_BAD_TYPE);
2287
2288 #if !_BOOT_MODE
2289 ASSERT_READ_LOCKED_INODE(fStream);
2290 #endif
2291
2292 off_t nodeOffset = fHeader.RootNode();
2293 CachedNode cached(this);
2294 const bplustree_node* node;
2295
2296 #ifdef DEBUG
2297 int32 levels = 0;
2298 #endif
2299
2300 while ((node = cached.SetTo(nodeOffset)) != NULL) {
2301 uint16 keyIndex = 0;
2302 off_t nextOffset;
2303 status_t status = _FindKey(node, key, keyLength, &keyIndex,
2304 &nextOffset);
2305
2306 #ifdef DEBUG
2307 levels++;
2308 #endif
2309 if (node->OverflowLink() == BPLUSTREE_NULL) {
2310 if (status == B_OK && _value != NULL)
2311 *_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2312
2313 #ifdef DEBUG
2314 if (levels != (int32)fHeader.MaxNumberOfLevels())
2315 DEBUGGER(("levels don't match"));
2316 #endif
2317 return status;
2318 } else if (nextOffset == nodeOffset)
2319 RETURN_ERROR(B_ERROR);
2320
2321 nodeOffset = nextOffset;
2322 }
2323 FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2324 B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2325 RETURN_ERROR(B_ERROR);
2326 }
2327
2328
2329 #if !_BOOT_MODE
2330 status_t
_ValidateChildren(TreeCheck & check,uint32 level,off_t offset,const uint8 * largestKey,uint16 largestKeyLength,const bplustree_node * parent)2331 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2332 const uint8* largestKey, uint16 largestKeyLength,
2333 const bplustree_node* parent)
2334 {
2335 if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2336 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2337 "failed!\n", fStream->ID(), offset);
2338 check.FoundError();
2339 return B_OK;
2340 }
2341 if (level >= fHeader.MaxNumberOfLevels()) {
2342 dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2343 "!\n", fStream->ID(), offset);
2344 check.FoundError();
2345 return B_OK;
2346 }
2347
2348 check.SetLevel(level);
2349
2350 if (check.Visited(offset)) {
2351 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2352 fStream->ID(), offset);
2353 check.FoundError();
2354 return B_OK;
2355 }
2356
2357 check.SetVisited(offset);
2358
2359 uint32 count = parent->NumKeys();
2360 Unaligned<off_t>* values = parent->Values();
2361 off_t lastOffset = check.PreviousOffset(level);
2362 CachedNode cached(this);
2363
2364 for (uint32 i = 0; i < count; i++) {
2365 uint16 keyLength;
2366 uint8* key = parent->KeyAt(i, &keyLength);
2367 if (largestKey != NULL) {
2368 int result = _CompareKeys(key, keyLength, largestKey,
2369 largestKeyLength);
2370 if (result > 0 || (result == 0 && i != count - 1)) {
2371 dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
2372 B_PRIu32 " larger than it should!\n",
2373 fStream->ID(), offset, i);
2374 check.FoundError();
2375 }
2376 }
2377
2378 off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2379 if (bplustree_node::IsDuplicate(childOffset)) {
2380 // Walk the duplicate nodes
2381 off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2382 off_t lastDuplicateOffset = BPLUSTREE_NULL;
2383
2384 while (duplicateOffset != BPLUSTREE_NULL) {
2385 const bplustree_node* node;
2386 status_t status = cached.SetTo(duplicateOffset, &node, false);
2387 if (status != B_OK) {
2388 if (status == B_IO_ERROR)
2389 return B_IO_ERROR;
2390
2391 dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2392 B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2393 duplicateOffset, strerror(status));
2394 check.FoundError();
2395 break;
2396 }
2397
2398 bool isFragmentNode = bplustree_node::LinkType(childOffset)
2399 == BPLUSTREE_DUPLICATE_FRAGMENT;
2400 bool isKnownFragment = isFragmentNode
2401 && check.VisitedFragment(duplicateOffset);
2402
2403 if (!isKnownFragment && check.Visited(duplicateOffset)) {
2404 dprintf("inode %" B_PRIdOFF ": %s node at %"
2405 B_PRIdOFF " already visited, referenced from %"
2406 B_PRIdOFF "!\n", fStream->ID(),
2407 isFragmentNode ? "fragment" : "duplicate",
2408 duplicateOffset, offset);
2409 check.FoundError();
2410 break;
2411 }
2412
2413 // Fragment nodes may be visited more than once from different
2414 // places
2415 if (!check.Visited(duplicateOffset))
2416 check.SetVisited(duplicateOffset);
2417 if (!isKnownFragment && isFragmentNode)
2418 check.SetVisitedFragment(duplicateOffset);
2419
2420 duplicate_array* array;
2421 int32 minSize;
2422 int32 maxSize;
2423 if (isFragmentNode) {
2424 array = node->FragmentAt(
2425 bplustree_node::FragmentIndex(childOffset));
2426 minSize = 2;
2427 maxSize = NUM_FRAGMENT_VALUES;
2428 } else {
2429 array = node->DuplicateArray();
2430 minSize = 1;
2431 maxSize = NUM_DUPLICATE_VALUES;
2432 }
2433 int32 arrayCount = array->Count();
2434
2435 if (arrayCount < minSize || arrayCount > maxSize) {
2436 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2437 " has invalid array size %" B_PRId32 "!\n",
2438 fStream->ID(), duplicateOffset, arrayCount);
2439 check.FoundError();
2440 } else {
2441 // Simple check if the values in the array may be valid
2442 for (int32 j = 0; j < arrayCount; j++) {
2443 if (!fStream->GetVolume()->IsValidInodeBlock(
2444 array->ValueAt(j))) {
2445 dprintf("inode %" B_PRIdOFF ": duplicate at %"
2446 B_PRIdOFF " contains invalid block %" B_PRIdOFF
2447 " at %" B_PRId32 "!\n", fStream->ID(),
2448 duplicateOffset, array->ValueAt(j), j);
2449 check.FoundError();
2450 break;
2451 }
2452 }
2453 }
2454
2455 // A fragment node is not linked (and does not have valid links)
2456 if (isFragmentNode)
2457 break;
2458
2459 if (node->LeftLink() != lastDuplicateOffset) {
2460 dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2461 " has wrong left link %" B_PRIdOFF ", expected %"
2462 B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2463 node->LeftLink(), lastDuplicateOffset);
2464 check.FoundError();
2465 }
2466
2467 lastDuplicateOffset = duplicateOffset;
2468 duplicateOffset = node->RightLink();
2469 }
2470 } else if (!parent->IsLeaf()) {
2471 // Test a regular child node recursively
2472 off_t nextOffset = parent->OverflowLink();
2473 if (i < count - 1)
2474 nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2475
2476 if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2477 // Test right link of the previous node
2478 const bplustree_node* previous = cached.SetTo(lastOffset, true);
2479 if (previous == NULL)
2480 return B_IO_ERROR;
2481
2482 if (previous->RightLink() != childOffset) {
2483 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2484 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2485 "!\n", fStream->ID(), lastOffset, previous->RightLink(),
2486 childOffset);
2487 check.FoundError();
2488 }
2489 }
2490
2491 status_t status = _ValidateChild(check, cached, level, childOffset,
2492 lastOffset, nextOffset, key, keyLength);
2493 if (status != B_OK)
2494 return status;
2495 } else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2496 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2497 "invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2498 fStream->ID(), offset, childOffset, i);
2499 check.FoundError();
2500 }
2501
2502 lastOffset = childOffset;
2503 }
2504
2505 if (parent->OverflowLink() != BPLUSTREE_NULL) {
2506 off_t childOffset = parent->OverflowLink();
2507 status_t status = _ValidateChild(check, cached, level, childOffset,
2508 lastOffset, 0, NULL, 0);
2509 if (status != B_OK)
2510 return status;
2511
2512 lastOffset = childOffset;
2513 }
2514
2515 check.SetPreviousOffset(level, lastOffset);
2516 return B_OK;
2517 }
2518
2519
2520 status_t
_ValidateChild(TreeCheck & check,CachedNode & cached,uint32 level,off_t offset,off_t lastOffset,off_t nextOffset,const uint8 * key,uint16 keyLength)2521 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2522 off_t offset, off_t lastOffset, off_t nextOffset,
2523 const uint8* key, uint16 keyLength)
2524 {
2525 const bplustree_node* node;
2526 status_t status = cached.SetTo(offset, &node, true);
2527 if (status != B_OK) {
2528 if (status == B_IO_ERROR)
2529 return B_IO_ERROR;
2530
2531 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2532 "read: %s\n", fStream->ID(), offset, strerror(status));
2533 check.FoundError();
2534 return B_OK;
2535 }
2536
2537 if (node->LeftLink() != lastOffset) {
2538 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2539 "wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2540 "!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2541 check.FoundError();
2542 }
2543
2544 if (nextOffset != 0 && node->RightLink() != nextOffset) {
2545 dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2546 "wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2547 "!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2548 check.FoundError();
2549 }
2550
2551 return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2552 }
2553 #endif // !_BOOT_MODE
2554
2555
2556 // #pragma mark -
2557
2558
TreeIterator(BPlusTree * tree)2559 TreeIterator::TreeIterator(BPlusTree* tree)
2560 :
2561 fTree(tree),
2562 fCurrentNodeOffset(BPLUSTREE_NULL)
2563 {
2564 #if !_BOOT_MODE
2565 tree->_AddIterator(this);
2566 #endif
2567 }
2568
2569
~TreeIterator()2570 TreeIterator::~TreeIterator()
2571 {
2572 #if !_BOOT_MODE
2573 if (fTree)
2574 fTree->_RemoveIterator(this);
2575 #endif
2576 }
2577
2578
2579 status_t
Goto(int8 to)2580 TreeIterator::Goto(int8 to)
2581 {
2582 if (fTree == NULL || fTree->fStream == NULL)
2583 RETURN_ERROR(B_BAD_VALUE);
2584
2585 #if !_BOOT_MODE
2586 // lock access to stream
2587 InodeReadLocker locker(fTree->fStream);
2588 #endif
2589
2590 off_t nodeOffset = fTree->fHeader.RootNode();
2591 CachedNode cached(fTree);
2592 const bplustree_node* node;
2593
2594 while ((node = cached.SetTo(nodeOffset)) != NULL) {
2595 // is the node a leaf node?
2596 if (node->OverflowLink() == BPLUSTREE_NULL) {
2597 fCurrentNodeOffset = nodeOffset;
2598 fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2599 fDuplicateNode = BPLUSTREE_NULL;
2600
2601 return B_OK;
2602 }
2603
2604 // get the next node offset depending on the direction (and if there
2605 // are any keys in that node at all)
2606 off_t nextOffset;
2607 if (to == BPLUSTREE_END || node->all_key_count == 0)
2608 nextOffset = node->OverflowLink();
2609 else {
2610 if (node->AllKeyLength() > fTree->fNodeSize
2611 || (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2612 - 8 * node->NumKeys())
2613 RETURN_ERROR(B_ERROR);
2614
2615 nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2616 }
2617 if (nextOffset == nodeOffset)
2618 break;
2619
2620 nodeOffset = nextOffset;
2621 }
2622 FATAL(("%s fails\n", __FUNCTION__));
2623
2624 RETURN_ERROR(B_ERROR);
2625 }
2626
2627
2628 /*! Iterates through the tree in the specified direction.
2629 When it iterates through duplicates, the "key" is only updated for the
2630 first entry - if you need to know when this happens, use the "duplicate"
2631 parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2632 the other duplicates.
2633 That's not too nice, but saves the 256 bytes that would be needed to
2634 store the last key - if this will ever become an issue, it will be
2635 easy to change.
2636 The other advantage of this is, that the queries can skip all duplicates
2637 at once when they are not relevant to them.
2638 */
2639 status_t
Traverse(int8 direction,void * key,uint16 * keyLength,uint16 maxLength,off_t * value,uint16 * duplicate)2640 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2641 uint16 maxLength, off_t* value, uint16* duplicate)
2642 {
2643 if (fTree == NULL)
2644 return B_INTERRUPTED;
2645
2646 bool forward = direction == BPLUSTREE_FORWARD;
2647
2648 if (fCurrentNodeOffset == BPLUSTREE_NULL
2649 && Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2650 RETURN_ERROR(B_ERROR);
2651
2652 // if the tree was emptied since the last call
2653 if (fCurrentNodeOffset == BPLUSTREE_FREE)
2654 return B_ENTRY_NOT_FOUND;
2655
2656 #if !_BOOT_MODE
2657 // lock access to stream
2658 InodeReadLocker locker(fTree->fStream);
2659 #endif
2660
2661 CachedNode cached(fTree);
2662 const bplustree_node* node;
2663
2664 if (fDuplicateNode != BPLUSTREE_NULL) {
2665 // regardless of traverse direction the duplicates are always presented
2666 // in the same order; since they are all considered as equal, this
2667 // shouldn't cause any problems
2668
2669 if (!fIsFragment || fDuplicate < fNumDuplicates) {
2670 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2671 false);
2672 } else
2673 node = NULL;
2674
2675 if (node != NULL) {
2676 if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2677 // If the node is out of duplicates, we go directly to the next
2678 // one
2679 fDuplicateNode = node->RightLink();
2680 if (fDuplicateNode != BPLUSTREE_NULL
2681 && (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2682 fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2683 false);
2684 fDuplicate = 0;
2685 }
2686 }
2687 if (fDuplicate < fNumDuplicates) {
2688 *value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2689 fDuplicate++);
2690 if (duplicate)
2691 *duplicate = 2;
2692 return B_OK;
2693 }
2694 }
2695 fDuplicateNode = BPLUSTREE_NULL;
2696 }
2697
2698 off_t savedNodeOffset = fCurrentNodeOffset;
2699 int32 savedKey = fCurrentKey;
2700
2701 if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2702 RETURN_ERROR(B_ERROR);
2703
2704 if (duplicate)
2705 *duplicate = 0;
2706
2707 fCurrentKey += direction;
2708
2709 // is the current key in the current node?
2710 while ((forward && fCurrentKey >= node->NumKeys())
2711 || (!forward && fCurrentKey < 0)) {
2712 fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2713
2714 // are there any more nodes?
2715 if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2716 node = cached.SetTo(fCurrentNodeOffset);
2717 if (!node)
2718 RETURN_ERROR(B_ERROR);
2719
2720 // reset current key
2721 fCurrentKey = forward ? 0 : node->NumKeys() - 1;
2722 } else {
2723 // there are no nodes left, so turn back to the last key
2724 fCurrentNodeOffset = savedNodeOffset;
2725 fCurrentKey = savedKey;
2726
2727 return B_ENTRY_NOT_FOUND;
2728 }
2729 }
2730
2731 if (node->all_key_count == 0)
2732 RETURN_ERROR(B_ERROR); // B_ENTRY_NOT_FOUND ?
2733
2734 uint16 length = 0;
2735 uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2736 if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2737 > (uint8*)node + fTree->fNodeSize
2738 || length > BPLUSTREE_MAX_KEY_LENGTH) {
2739 #if !_BOOT_MODE
2740 fTree->fStream->GetVolume()->Panic();
2741 #endif
2742 RETURN_ERROR(B_BAD_DATA);
2743 }
2744
2745 // include the termination for string types
2746 bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
2747 if (length + (needsTermination ? 1 : 0) > maxLength) {
2748 if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) {
2749 // The buffer is too small, restore the last key and return
2750 // an error
2751 fCurrentNodeOffset = savedNodeOffset;
2752 fCurrentKey = savedKey;
2753 return B_BUFFER_OVERFLOW;
2754 }
2755
2756 // Always cut off strings at the maximum buffer size, and leave
2757 // room for a terminating null byte.
2758 // This allows to handle larger key sizes gracefully.
2759 length = maxLength - 1;
2760 }
2761
2762 memcpy(key, keyStart, length);
2763
2764 if (needsTermination)
2765 ((char*)key)[length] = '\0';
2766
2767 *keyLength = length;
2768
2769 off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2770
2771 // duplicate fragments?
2772 uint8 type = bplustree_node::LinkType(offset);
2773 if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2774 || type == BPLUSTREE_DUPLICATE_NODE) {
2775 fDuplicateNode = offset;
2776
2777 node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2778 false);
2779 if (node == NULL)
2780 RETURN_ERROR(B_ERROR);
2781
2782 fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2783
2784 fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2785 if (fNumDuplicates) {
2786 offset = node->DuplicateAt(offset, fIsFragment, 0);
2787 fDuplicate = 1;
2788 if (duplicate)
2789 *duplicate = 1;
2790 } else {
2791 // Shouldn't happen, but we're dealing here with potentially
2792 // corrupt disks...
2793 fDuplicateNode = BPLUSTREE_NULL;
2794 offset = 0;
2795 }
2796 }
2797 *value = offset;
2798
2799 return B_OK;
2800 }
2801
2802
2803 /*! This is more or less a copy of BPlusTree::Find() - but it just
2804 sets the current position in the iterator, regardless of if the
2805 key could be found or not.
2806 */
2807 status_t
Find(const uint8 * key,uint16 keyLength)2808 TreeIterator::Find(const uint8* key, uint16 keyLength)
2809 {
2810 if (fTree == NULL)
2811 return B_INTERRUPTED;
2812 if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2813 || keyLength > BPLUSTREE_MAX_KEY_LENGTH
2814 || key == NULL)
2815 RETURN_ERROR(B_BAD_VALUE);
2816
2817 #if !_BOOT_MODE
2818 // lock access to stream
2819 InodeReadLocker locker(fTree->fStream);
2820 #endif
2821
2822 off_t nodeOffset = fTree->fHeader.RootNode();
2823
2824 CachedNode cached(fTree);
2825 const bplustree_node* node;
2826 while ((node = cached.SetTo(nodeOffset)) != NULL) {
2827 uint16 keyIndex = 0;
2828 off_t nextOffset = 0;
2829 status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2830 &nextOffset);
2831
2832 if (node->OverflowLink() == BPLUSTREE_NULL) {
2833 fCurrentNodeOffset = nodeOffset;
2834 fCurrentKey = keyIndex - 1;
2835 fDuplicateNode = BPLUSTREE_NULL;
2836
2837 return status;
2838 } else if (nextOffset == nodeOffset)
2839 RETURN_ERROR(B_ERROR);
2840
2841 nodeOffset = nextOffset;
2842 }
2843 RETURN_ERROR(B_ERROR);
2844 }
2845
2846
2847 void
SkipDuplicates()2848 TreeIterator::SkipDuplicates()
2849 {
2850 fDuplicateNode = BPLUSTREE_NULL;
2851 }
2852
2853
2854 void
Update(off_t offset,off_t nextOffset,uint16 keyIndex,uint16 splitAt,int8 change)2855 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2856 uint16 splitAt, int8 change)
2857 {
2858 if (offset != fCurrentNodeOffset)
2859 return;
2860
2861 if (nextOffset != BPLUSTREE_NULL) {
2862 fCurrentNodeOffset = nextOffset;
2863 if (splitAt <= fCurrentKey) {
2864 fCurrentKey -= splitAt;
2865 keyIndex -= splitAt;
2866 }
2867 }
2868
2869 // Adjust fCurrentKey to point to the same key as before.
2870 // Note, that if a key is inserted at the current position
2871 // it won't be included in this tree transition.
2872 if (keyIndex <= fCurrentKey)
2873 fCurrentKey += change;
2874
2875 // TODO: duplicate handling!
2876 }
2877
2878
2879 void
Stop()2880 TreeIterator::Stop()
2881 {
2882 fTree = NULL;
2883 }
2884
2885
2886 #ifdef DEBUG
2887 void
Dump()2888 TreeIterator::Dump()
2889 {
2890 __out("TreeIterator at %p:\n", this);
2891 __out("\tfTree = %p\n", fTree);
2892 __out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset);
2893 __out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2894 __out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n",
2895 bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2896 fDuplicateNode);
2897 __out("\tfDuplicate = %u\n", fDuplicate);
2898 __out("\tfNumDuplicates = %u\n", fNumDuplicates);
2899 __out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2900 }
2901 #endif
2902
2903
2904 // #pragma mark -
2905
2906
2907 bool
IsValid() const2908 bplustree_header::IsValid() const
2909 {
2910 return Magic() == BPLUSTREE_MAGIC
2911 && (RootNode() % NodeSize()) == 0
2912 && IsValidLink(RootNode())
2913 && IsValidLink(FreeNode());
2914 }
2915
2916
2917 // #pragma mark -
2918
2919
2920 void
Initialize()2921 bplustree_node::Initialize()
2922 {
2923 left_link = right_link = overflow_link
2924 = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2925 all_key_count = 0;
2926 all_key_length = 0;
2927 }
2928
2929
2930 uint8*
KeyAt(int32 index,uint16 * keyLength) const2931 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2932 {
2933 if (index < 0 || index > NumKeys())
2934 return NULL;
2935
2936 uint8* keyStart = Keys();
2937 Unaligned<uint16>* keyLengths = KeyLengths();
2938
2939 *keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2940 - (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2941 if (index > 0)
2942 keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2943
2944 return keyStart;
2945 }
2946
2947
2948 uint8
CountDuplicates(off_t offset,bool isFragment) const2949 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2950 {
2951 // the duplicate fragment handling is currently hard-coded to a node size
2952 // of 1024 bytes - with future versions of BFS, this may be a problem
2953
2954 if (isFragment) {
2955 uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2956
2957 return ((off_t*)this)[fragment];
2958 }
2959 return OverflowLink();
2960 }
2961
2962
2963 off_t
DuplicateAt(off_t offset,bool isFragment,int8 index) const2964 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2965 {
2966 uint32 start;
2967 if (isFragment)
2968 start = 8 * ((uint64)offset & 0x3ff);
2969 else
2970 start = 2;
2971
2972 return ((off_t*)this)[start + 1 + index];
2973 }
2974
2975
2976 /*! Although the name suggests it, this function doesn't return the real
2977 used fragment count; at least, it can only count to two: it returns
2978 0, if there is no fragment used, 1 if there is only one fragment
2979 used, and 2 if there are at least 2 fragments used.
2980 */
2981 uint32
FragmentsUsed(uint32 nodeSize) const2982 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2983 {
2984 uint32 used = 0;
2985 for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2986 duplicate_array* array = FragmentAt(i);
2987 if (array->Count() > 0 && ++used > 1)
2988 return used;
2989 }
2990 return used;
2991 }
2992
2993
2994 status_t
CheckIntegrity(uint32 nodeSize) const2995 bplustree_node::CheckIntegrity(uint32 nodeSize) const
2996 {
2997 if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2998 DEBUGGER(("invalid node: key/length count"));
2999
3000 for (int32 i = 0; i < NumKeys(); i++) {
3001 uint16 length = 0;
3002 uint8* key = KeyAt(i, &length);
3003 if (key + length + sizeof(off_t) + sizeof(uint16)
3004 > (uint8*)this + nodeSize
3005 || length > BPLUSTREE_MAX_KEY_LENGTH) {
3006 dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
3007 return B_BAD_DATA;
3008 }
3009 if (Values()[i] == -1) {
3010 dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
3011 "corrupted\n", this, (int)i, Values()[i].value);
3012 return B_BAD_DATA;
3013 }
3014 }
3015 return B_OK;
3016 }
3017
3018
3019 // #pragma mark -
3020
3021
3022 #if !_BOOT_MODE
BitmapArray(size_t numBits)3023 BitmapArray::BitmapArray(size_t numBits)
3024 {
3025 fSize = (numBits + 7) / 8;
3026 fBitmap = (uint8*)calloc(fSize, 1);
3027 fCountSet = 0;
3028 }
3029
3030
~BitmapArray()3031 BitmapArray::~BitmapArray()
3032 {
3033 free(fBitmap);
3034 }
3035
3036
3037 status_t
InitCheck() const3038 BitmapArray::InitCheck() const
3039 {
3040 return fBitmap != NULL ? B_OK : B_NO_MEMORY;
3041 }
3042
3043
3044 bool
IsSet(size_t index) const3045 BitmapArray::IsSet(size_t index) const
3046 {
3047 uint32 byteIndex = index / 8;
3048 if (byteIndex >= fSize)
3049 return false;
3050
3051 return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
3052 }
3053
3054
3055 void
Set(size_t index,bool set)3056 BitmapArray::Set(size_t index, bool set)
3057 {
3058 uint32 byteIndex = index / 8;
3059 if (byteIndex >= fSize)
3060 return;
3061
3062 if (set) {
3063 fBitmap[byteIndex] |= 1UL << (index & 0x7);
3064 fCountSet++;
3065 } else {
3066 fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
3067 fCountSet--;
3068 }
3069 }
3070 #endif // !_BOOT_MODE
3071
3072
3073 // #pragma mark -
3074
3075
3076 bool
_FindInternal(off_t value,int32 & index) const3077 duplicate_array::_FindInternal(off_t value, int32& index) const
3078 {
3079 int32 min = 0, max = Count() - 1;
3080 off_t cmp;
3081 while (min <= max) {
3082 index = (min + max) / 2;
3083
3084 cmp = ValueAt(index) - value;
3085 if (cmp < 0)
3086 min = index + 1;
3087 else if (cmp > 0)
3088 max = index - 1;
3089 else
3090 return true;
3091 }
3092 return false;
3093 }
3094
3095
3096 void
Insert(off_t value)3097 duplicate_array::Insert(off_t value)
3098 {
3099 // if there are more than 8 values in this array, use a
3100 // binary search, if not, just iterate linearly to find
3101 // the insertion point
3102 int32 size = Count();
3103 int32 i = 0;
3104 if (size > 8 ) {
3105 if (!_FindInternal(value, i) && ValueAt(i) <= value)
3106 i++;
3107 } else {
3108 for (i = 0; i < size; i++) {
3109 if (ValueAt(i) > value)
3110 break;
3111 }
3112 }
3113
3114 memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3115 values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3116 count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3117 }
3118
3119
3120 bool
Remove(off_t value)3121 duplicate_array::Remove(off_t value)
3122 {
3123 int32 index = Find(value);
3124 if (index == -1)
3125 return false;
3126
3127 int32 newSize = Count() - 1;
3128 memmove(&values[index], &values[index + 1],
3129 (newSize - index) * sizeof(off_t));
3130 count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3131
3132 return true;
3133 }
3134
3135
3136 #if _BOOT_MODE
3137 } // namespace BFS
3138 #endif
3139