xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.h (revision 60f8e54f2bf405ac24d25fcf0c4ccab8314020e8)
1 /*
2  * Copyright 2001-2015, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 #ifndef B_PLUS_TREE_H
6 #define B_PLUS_TREE_H
7 
8 
9 #include "bfs.h"
10 
11 #include "Utility.h"
12 
13 #if !_BOOT_MODE
14 #include "Journal.h"
15 class Inode;
16 #else
17 #define Inode BFS::Stream
18 
19 namespace BFS {
20 
21 class Stream;
22 class Transaction;
23 class TransactionListener {};
24 #endif // _BOOT_MODE
25 
26 //	#pragma mark - on-disk structures
27 
28 struct bplustree_node;
29 
30 #define BPLUSTREE_NULL			-1LL
31 #define BPLUSTREE_FREE			-2LL
32 
33 struct bplustree_header {
34 	uint32		magic;
35 	uint32		node_size;
36 	uint32		max_number_of_levels;
37 	uint32		data_type;
38 	int64		root_node_pointer;
39 	int64		free_node_pointer;
40 	int64		maximum_size;
41 
Magicbplustree_header42 	uint32 Magic() const { return BFS_ENDIAN_TO_HOST_INT32(magic); }
NodeSizebplustree_header43 	uint32 NodeSize() const { return BFS_ENDIAN_TO_HOST_INT32(node_size); }
DataTypebplustree_header44 	uint32 DataType() const { return BFS_ENDIAN_TO_HOST_INT32(data_type); }
RootNodebplustree_header45 	off_t RootNode() const
46 		{ return BFS_ENDIAN_TO_HOST_INT64(root_node_pointer); }
FreeNodebplustree_header47 	off_t FreeNode() const
48 		{ return BFS_ENDIAN_TO_HOST_INT64(free_node_pointer); }
MaximumSizebplustree_header49 	off_t MaximumSize() const { return BFS_ENDIAN_TO_HOST_INT64(maximum_size); }
MaxNumberOfLevelsbplustree_header50 	uint32 MaxNumberOfLevels() const
51 		{ return BFS_ENDIAN_TO_HOST_INT32(max_number_of_levels); }
52 
53 	inline bool CheckNode(const bplustree_node* node) const;
54 	inline bool IsValidLink(off_t link) const;
55 	bool IsValid() const;
56 } _PACKED;
57 
58 #define BPLUSTREE_MAGIC 			0x69f6c2e8
59 #define BPLUSTREE_NODE_SIZE 		1024
60 #define BPLUSTREE_MAX_KEY_LENGTH	256
61 #define BPLUSTREE_MIN_KEY_LENGTH	1
62 
63 enum bplustree_types {
64 	BPLUSTREE_STRING_TYPE	= 0,
65 	BPLUSTREE_INT32_TYPE	= 1,
66 	BPLUSTREE_UINT32_TYPE	= 2,
67 	BPLUSTREE_INT64_TYPE	= 3,
68 	BPLUSTREE_UINT64_TYPE	= 4,
69 	BPLUSTREE_FLOAT_TYPE	= 5,
70 	BPLUSTREE_DOUBLE_TYPE	= 6
71 };
72 
73 
74 struct duplicate_array;
75 
76 
77 template <typename T>
78 struct  __attribute__((packed)) Unaligned {
79 	T value;
80 
81 	Unaligned<T>& operator=(const T& newValue)
82 	{
83 		value = newValue; return *this;
84 	}
TUnaligned85 	operator T() const { return value; }
86 };
87 
88 
89 struct bplustree_node {
90 			int64				left_link;
91 			int64				right_link;
92 			int64				overflow_link;
93 			uint16				all_key_count;
94 			uint16				all_key_length;
95 
LeftLinkbplustree_node96 			off_t				LeftLink() const
97 									{ return BFS_ENDIAN_TO_HOST_INT64(
98 										left_link); }
RightLinkbplustree_node99 			off_t				RightLink() const
100 									{ return BFS_ENDIAN_TO_HOST_INT64(
101 										right_link); }
OverflowLinkbplustree_node102 			off_t				OverflowLink() const
103 									{ return BFS_ENDIAN_TO_HOST_INT64(
104 										overflow_link); }
NumKeysbplustree_node105 			uint16				NumKeys() const
106 									{ return BFS_ENDIAN_TO_HOST_INT16(
107 										all_key_count); }
AllKeyLengthbplustree_node108 			uint16				AllKeyLength() const
109 									{ return BFS_ENDIAN_TO_HOST_INT16(
110 										all_key_length); }
111 
112 	inline	Unaligned<uint16>*	KeyLengths() const;
113 	inline	Unaligned<off_t>*   Values() const;
114 	inline	uint8*				Keys() const;
115 	inline	int32				Used() const;
116 			uint8*				KeyAt(int32 index, uint16* keyLength) const;
117 
118 	inline	bool				IsLeaf() const;
119 
120 			void				Initialize();
121 			uint8				CountDuplicates(off_t offset,
122 									bool isFragment) const;
123 			off_t				DuplicateAt(off_t offset, bool isFragment,
124 									int8 index) const;
125 			uint32				FragmentsUsed(uint32 nodeSize) const;
126 	inline	duplicate_array*	FragmentAt(int8 index) const;
127 	inline	duplicate_array*	DuplicateArray() const;
128 
129 	static inline uint8			LinkType(off_t link);
130 	static inline off_t			MakeLink(uint8 type, off_t link,
131 									uint32 fragmentIndex = 0);
132 	static inline bool			IsDuplicate(off_t link);
133 	static inline off_t			FragmentOffset(off_t link);
134 	static inline uint32		FragmentIndex(off_t link);
135 	static inline uint32		MaxFragments(uint32 nodeSize);
136 
137 			status_t			CheckIntegrity(uint32 nodeSize) const;
138 } _PACKED;
139 
140 //#define BPLUSTREE_NODE 0
141 #define BPLUSTREE_DUPLICATE_NODE 2
142 #define BPLUSTREE_DUPLICATE_FRAGMENT 3
143 
144 #define NUM_FRAGMENT_VALUES 7
145 #define NUM_DUPLICATE_VALUES 125
146 
147 //**************************************
148 
149 enum bplustree_traversing {
150 	BPLUSTREE_FORWARD = 1,
151 	BPLUSTREE_BACKWARD = -1,
152 
153 	BPLUSTREE_BEGIN = 0,
154 	BPLUSTREE_END = 1
155 };
156 
157 
158 //	#pragma mark - in-memory structures
159 
160 
161 class BPlusTree;
162 struct TreeCheck;
163 class TreeIterator;
164 
165 
166 #if !_BOOT_MODE
167 // needed for searching (utilizing a stack)
168 struct node_and_key {
169 	off_t	nodeOffset;
170 	uint16	keyIndex;
171 };
172 #endif // !_BOOT_MODE
173 
174 
175 class CachedNode {
176 public:
CachedNode(BPlusTree * tree)177 	CachedNode(BPlusTree* tree)
178 		:
179 		fTree(tree),
180 		fNode(NULL),
181 		fOffset(0),
182 		fBlockNumber(0),
183 		fWritable(false)
184 	{
185 #if _BOOT_MODE
186 		fBlock = NULL;
187 #endif
188 	}
189 
190 	CachedNode(BPlusTree* tree, off_t offset, bool check = true)
191 		:
fTree(tree)192 		fTree(tree),
193 		fNode(NULL),
194 		fOffset(0),
195 		fBlockNumber(0),
196 		fWritable(false)
197 	{
198 #if _BOOT_MODE
199 		fBlock = NULL;
200 #endif
201 		SetTo(offset, check);
202 	}
203 
~CachedNode()204 	~CachedNode()
205 	{
206 		Unset();
207 #if _BOOT_MODE
208 		free(fBlock);
209 #endif
210 	}
211 
212 			const bplustree_node* SetTo(off_t offset, bool check = true);
213 			status_t			SetTo(off_t offset,
214 									const bplustree_node** _node,
215 									bool check = true);
216 
217 			const bplustree_header* SetToHeader();
218 			void				Unset();
219 
220 #if !_BOOT_MODE
221 			bplustree_node*		SetToWritable(Transaction& transaction,
222 									off_t offset, bool check = true);
223 			bplustree_node*		MakeWritable(Transaction& transaction);
224 			bplustree_header*	SetToWritableHeader(Transaction& transaction);
225 
226 			void				UnsetUnchanged(Transaction& transaction);
227 
228 			status_t			Free(Transaction& transaction, off_t offset);
229 			status_t			Allocate(Transaction& transaction,
230 									bplustree_node** _node, off_t* _offset);
231 #endif // !_BOOT_MODE
232 
IsWritable()233 			bool				IsWritable() const { return fWritable; }
Node()234 			bplustree_node*		Node() const { return fNode; }
235 
236 protected:
237 			bplustree_node*		InternalSetTo(Transaction* transaction,
238 									off_t offset);
239 
240 			BPlusTree*			fTree;
241 			bplustree_node*		fNode;
242 			off_t				fOffset;
243 			off_t				fBlockNumber;
244 			bool				fWritable;
245 #if _BOOT_MODE
246 			uint8*				fBlock;
247 #endif
248 };
249 
250 
251 class BPlusTree : public TransactionListener {
252 public:
253 #if !_BOOT_MODE
254 								BPlusTree(Transaction& transaction,
255 									Inode* stream,
256 									int32 nodeSize = BPLUSTREE_NODE_SIZE);
257 #endif
258 								BPlusTree(Inode* stream);
259 								BPlusTree();
260 								~BPlusTree();
261 
262 #if !_BOOT_MODE
263 			status_t			SetTo(Transaction& transaction, Inode* stream,
264 									int32 nodeSize = BPLUSTREE_NODE_SIZE);
265 #endif
266 			status_t			SetTo(Inode* stream);
267 			status_t			SetStream(Inode* stream);
268 
269 			status_t			InitCheck();
270 
NodeSize()271 			size_t				NodeSize() const { return fNodeSize; }
Stream()272 			Inode*				Stream() const { return fStream; }
273 
274 #if !_BOOT_MODE
275 			status_t			Validate(bool repair, bool& _errorsFound);
276 			status_t			MakeEmpty();
277 
278 			status_t			Remove(Transaction& transaction,
279 									const uint8* key, uint16 keyLength,
280 									off_t value);
281 			status_t			Insert(Transaction& transaction,
282 									const uint8* key, uint16 keyLength,
283 									off_t value);
284 
285 			status_t			Remove(Transaction& transaction, const char* key,
286 									off_t value);
287 			status_t			Insert(Transaction& transaction, const char* key,
288 									off_t value);
289 			status_t			Insert(Transaction& transaction, int32 key,
290 									off_t value);
291 			status_t			Insert(Transaction& transaction, uint32 key,
292 									off_t value);
293 			status_t			Insert(Transaction& transaction, int64 key,
294 									off_t value);
295 			status_t			Insert(Transaction& transaction, uint64 key,
296 									off_t value);
297 			status_t			Insert(Transaction& transaction, float key,
298 									off_t value);
299 			status_t			Insert(Transaction& transaction, double key,
300 									off_t value);
301 
302 			status_t			Replace(Transaction& transaction,
303 									const uint8* key, uint16 keyLength,
304 									off_t value);
305 #endif // !_BOOT_MODE
306 
307 			status_t			Find(const uint8* key, uint16 keyLength,
308 									off_t* value);
309 
310 #if !_BOOT_MODE
311 	static	int32				TypeCodeToKeyType(type_code code);
312 	static	int32				ModeToKeyType(mode_t mode);
313 
314 protected:
315 	virtual void				TransactionDone(bool success);
316 	virtual void				RemovedFromTransaction();
317 #endif // !_BOOT_MODE
318 
319 private:
320 								BPlusTree(const BPlusTree& other);
321 								BPlusTree& operator=(const BPlusTree& other);
322 									// no implementation
323 
324 			int32				_CompareKeys(const void* key1, int keylength1,
325 									const void* key2, int keylength2);
326 			status_t			_FindKey(const bplustree_node* node,
327 									const uint8* key, uint16 keyLength,
328 									uint16* index = NULL, off_t* next = NULL);
329 #if !_BOOT_MODE
330 			status_t			_SeekDown(Stack<node_and_key>& stack,
331 									const uint8* key, uint16 keyLength);
332 
333 			status_t			_FindFreeDuplicateFragment(
334 									Transaction& transaction,
335 									const bplustree_node* node,
336 									CachedNode& cached, off_t* _offset,
337 									bplustree_node** _fragment, uint32* _index);
338 			status_t			_InsertDuplicate(Transaction& transaction,
339 									CachedNode& cached,
340 									const bplustree_node* node, uint16 index,
341 									off_t value);
342 			void				_InsertKey(bplustree_node* node, uint16 index,
343 									uint8* key, uint16 keyLength, off_t value);
344 			status_t			_SplitNode(bplustree_node* node,
345 									off_t nodeOffset, bplustree_node* other,
346 									off_t otherOffset, uint16* _keyIndex,
347 									uint8* key, uint16* _keyLength,
348 									off_t* _value);
349 
350 			status_t			_RemoveDuplicate(Transaction& transaction,
351 									const bplustree_node* node,
352 									CachedNode& cached, uint16 keyIndex,
353 									off_t value);
354 			void				_RemoveKey(bplustree_node* node, uint16 index);
355 
356 			void				_UpdateIterators(off_t offset, off_t nextOffset,
357 									uint16 keyIndex, uint16 splitAt,
358 									int8 change);
359 			void				_AddIterator(TreeIterator* iterator);
360 			void				_RemoveIterator(TreeIterator* iterator);
361 
362 			status_t			_ValidateChildren(TreeCheck& check,
363 									uint32 level, off_t offset,
364 									const uint8* largestKey, uint16 keyLength,
365 									const bplustree_node* parent);
366 			status_t			_ValidateChild(TreeCheck& check,
367 									CachedNode& cached, uint32 level,
368 									off_t offset, off_t lastOffset,
369 									off_t nextOffset, const uint8* key,
370 									uint16 keyLength);
371 #endif // !_BOOT_MODE
372 
373 private:
374 			friend class TreeIterator;
375 			friend class CachedNode;
376 			friend struct TreeCheck;
377 
378 			Inode*				fStream;
379 			bplustree_header	fHeader;
380 			int32				fNodeSize;
381 			bool				fAllowDuplicates;
382 			bool				fInTransaction;
383 			status_t			fStatus;
384 
385 #if !_BOOT_MODE
386 			mutex				fIteratorLock;
387 			SinglyLinkedList<TreeIterator> fIterators;
388 #endif
389 };
390 
391 
392 //	#pragma mark - helper classes/functions
393 
394 
395 class TreeIterator : public SinglyLinkedListLinkImpl<TreeIterator> {
396 public:
397 								TreeIterator(BPlusTree* tree);
398 								~TreeIterator();
399 
400 			status_t			Goto(int8 to);
401 			status_t			Traverse(int8 direction, void* key,
402 									uint16* keyLength, uint16 maxLength,
403 									off_t* value, uint16* duplicate = NULL);
404 			status_t			Find(const uint8* key, uint16 keyLength);
405 
406 			status_t			Rewind();
407 			status_t			GetNextEntry(void* key, uint16* keyLength,
408 									uint16 maxLength, off_t* value,
409 									uint16* duplicate = NULL);
410 			status_t			GetPreviousEntry(void* key, uint16* keyLength,
411 									uint16 maxLength, off_t* value,
412 									uint16* duplicate = NULL);
413 			void				SkipDuplicates();
414 
Tree()415 			BPlusTree*			Tree() const { return fTree; }
416 
417 #ifdef DEBUG
418 			void				Dump();
419 #endif
420 
421 private:
422 			friend class BPlusTree;
423 
424 			// called by BPlusTree
425 			void				Update(off_t offset, off_t nextOffset,
426 									uint16 keyIndex, uint16 splitAt,
427 									int8 change);
428 			void				Stop();
429 
430 private:
431 			BPlusTree*			fTree;
432 			off_t				fCurrentNodeOffset;
433 									// traverse position
434 			int32				fCurrentKey;
435 			off_t				fDuplicateNode;
436 			uint16				fDuplicate;
437 			uint16				fNumDuplicates;
438 			bool				fIsFragment;
439 };
440 
441 
442 //	#pragma mark - BPlusTree's inline functions
443 //	(most of them may not be needed)
444 
445 
446 #if !_BOOT_MODE
447 inline status_t
Remove(Transaction & transaction,const char * key,off_t value)448 BPlusTree::Remove(Transaction& transaction, const char* key, off_t value)
449 {
450 	if (fHeader.DataType() != BPLUSTREE_STRING_TYPE)
451 		return B_BAD_TYPE;
452 	return Remove(transaction, (uint8*)key, strlen(key), value);
453 }
454 
455 
456 inline status_t
Insert(Transaction & transaction,const char * key,off_t value)457 BPlusTree::Insert(Transaction& transaction, const char* key, off_t value)
458 {
459 	if (fHeader.DataType() != BPLUSTREE_STRING_TYPE)
460 		return B_BAD_TYPE;
461 	return Insert(transaction, (uint8*)key, strlen(key), value);
462 }
463 
464 
465 inline status_t
Insert(Transaction & transaction,int32 key,off_t value)466 BPlusTree::Insert(Transaction& transaction, int32 key, off_t value)
467 {
468 	if (fHeader.DataType() != BPLUSTREE_INT32_TYPE)
469 		return B_BAD_TYPE;
470 	return Insert(transaction, (uint8*)&key, sizeof(key), value);
471 }
472 
473 
474 inline status_t
Insert(Transaction & transaction,uint32 key,off_t value)475 BPlusTree::Insert(Transaction& transaction, uint32 key, off_t value)
476 {
477 	if (fHeader.DataType() != BPLUSTREE_UINT32_TYPE)
478 		return B_BAD_TYPE;
479 	return Insert(transaction, (uint8*)&key, sizeof(key), value);
480 }
481 
482 
483 inline status_t
Insert(Transaction & transaction,int64 key,off_t value)484 BPlusTree::Insert(Transaction& transaction, int64 key, off_t value)
485 {
486 	if (fHeader.DataType() != BPLUSTREE_INT64_TYPE)
487 		return B_BAD_TYPE;
488 	return Insert(transaction, (uint8*)&key, sizeof(key), value);
489 }
490 
491 
492 inline status_t
Insert(Transaction & transaction,uint64 key,off_t value)493 BPlusTree::Insert(Transaction& transaction, uint64 key, off_t value)
494 {
495 	if (fHeader.DataType() != BPLUSTREE_UINT64_TYPE)
496 		return B_BAD_TYPE;
497 	return Insert(transaction, (uint8*)&key, sizeof(key), value);
498 }
499 
500 
501 inline status_t
Insert(Transaction & transaction,float key,off_t value)502 BPlusTree::Insert(Transaction& transaction, float key, off_t value)
503 {
504 	if (fHeader.DataType() != BPLUSTREE_FLOAT_TYPE)
505 		return B_BAD_TYPE;
506 	return Insert(transaction, (uint8*)&key, sizeof(key), value);
507 }
508 
509 
510 inline status_t
Insert(Transaction & transaction,double key,off_t value)511 BPlusTree::Insert(Transaction& transaction, double key, off_t value)
512 {
513 	if (fHeader.DataType() != BPLUSTREE_DOUBLE_TYPE)
514 		return B_BAD_TYPE;
515 	return Insert(transaction, (uint8*)&key, sizeof(key), value);
516 }
517 #endif // !_BOOT_MODE
518 
519 
520 //	#pragma mark - TreeIterator inline functions
521 
522 
523 inline status_t
Rewind()524 TreeIterator::Rewind()
525 {
526 	return Goto(BPLUSTREE_BEGIN);
527 }
528 
529 
530 inline status_t
GetNextEntry(void * key,uint16 * keyLength,uint16 maxLength,off_t * value,uint16 * duplicate)531 TreeIterator::GetNextEntry(void* key, uint16* keyLength, uint16 maxLength,
532 	off_t* value, uint16* duplicate)
533 {
534 	return Traverse(BPLUSTREE_FORWARD, key, keyLength, maxLength, value,
535 		duplicate);
536 }
537 
538 
539 inline status_t
GetPreviousEntry(void * key,uint16 * keyLength,uint16 maxLength,off_t * value,uint16 * duplicate)540 TreeIterator::GetPreviousEntry(void* key, uint16* keyLength, uint16 maxLength,
541 	off_t* value, uint16* duplicate)
542 {
543 	return Traverse(BPLUSTREE_BACKWARD, key, keyLength, maxLength, value,
544 		duplicate);
545 }
546 
547 
548 //	#pragma mark - bplustree_header inline functions
549 
550 
551 inline bool
CheckNode(const bplustree_node * node)552 bplustree_header::CheckNode(const bplustree_node* node) const
553 {
554 	// sanity checks (links, all_key_count)
555 	return IsValidLink(node->LeftLink())
556 		&& IsValidLink(node->RightLink())
557 		&& IsValidLink(node->OverflowLink())
558 		&& (int8*)node->Values() + node->NumKeys() * sizeof(off_t)
559 				<= (int8*)node + NodeSize();
560 }
561 
562 
563 inline bool
IsValidLink(off_t link)564 bplustree_header::IsValidLink(off_t link) const
565 {
566 	return link == BPLUSTREE_NULL
567 		|| (link > 0 && link <= MaximumSize() - NodeSize());
568 }
569 
570 
571 //	#pragma mark - bplustree_node inline functions
572 
573 
574 inline Unaligned<uint16>*
KeyLengths()575 bplustree_node::KeyLengths() const
576 {
577 	return (Unaligned<uint16>*)(((char*)this) + key_align(sizeof(bplustree_node)
578 		+ AllKeyLength()));
579 }
580 
581 
582 inline Unaligned<off_t>*
Values()583 bplustree_node::Values() const
584 {
585 	return (Unaligned<off_t>*)(
586 		(char*)KeyLengths() + NumKeys() * sizeof(uint16));
587 }
588 
589 
590 inline uint8*
Keys()591 bplustree_node::Keys() const
592 {
593 	return (uint8*)this + sizeof(bplustree_node);
594 }
595 
596 
597 inline int32
Used()598 bplustree_node::Used() const
599 {
600 	return key_align(sizeof(bplustree_node) + AllKeyLength()) + NumKeys()
601 		* (sizeof(uint16) + sizeof(off_t));
602 }
603 
604 
605 inline bool
IsLeaf()606 bplustree_node::IsLeaf() const
607 {
608 	return OverflowLink() == BPLUSTREE_NULL;
609 }
610 
611 
612 inline duplicate_array*
FragmentAt(int8 index)613 bplustree_node::FragmentAt(int8 index) const
614 {
615 	return (duplicate_array*)((off_t*)this + index * (NUM_FRAGMENT_VALUES + 1));
616 }
617 
618 
619 inline duplicate_array*
DuplicateArray()620 bplustree_node::DuplicateArray() const
621 {
622 	return (duplicate_array*)&overflow_link;
623 }
624 
625 
626 inline uint8
LinkType(off_t link)627 bplustree_node::LinkType(off_t link)
628 {
629 	return *(uint64*)&link >> 62;
630 }
631 
632 
633 inline off_t
MakeLink(uint8 type,off_t link,uint32 fragmentIndex)634 bplustree_node::MakeLink(uint8 type, off_t link, uint32 fragmentIndex)
635 {
636 	return ((off_t)type << 62) | (link & 0x3ffffffffffffc00LL)
637 		| (fragmentIndex & 0x3ff);
638 }
639 
640 
641 inline bool
IsDuplicate(off_t link)642 bplustree_node::IsDuplicate(off_t link)
643 {
644 	return (LinkType(link)
645 		& (BPLUSTREE_DUPLICATE_NODE | BPLUSTREE_DUPLICATE_FRAGMENT)) > 0;
646 }
647 
648 
649 inline off_t
FragmentOffset(off_t link)650 bplustree_node::FragmentOffset(off_t link)
651 {
652 	return link & 0x3ffffffffffffc00LL;
653 }
654 
655 
656 inline uint32
FragmentIndex(off_t link)657 bplustree_node::FragmentIndex(off_t link)
658 {
659 	return (uint32)(link & 0x3ff);
660 }
661 
662 
663 inline uint32
MaxFragments(uint32 nodeSize)664 bplustree_node::MaxFragments(uint32 nodeSize)
665 {
666 	return nodeSize / ((NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
667 }
668 
669 
670 #if _BOOT_MODE
671 } // namespace BFS
672 
673 #undef Inode
674 #endif
675 
676 #endif	// B_PLUS_TREE_H
677