xref: /haiku/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp (revision 60b39cd7416028e61e3d30bb3ba28bd3526e6001)
1 /*
2  * Copyright 2001-2013, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Block bitmap handling and allocation policies
8 
9 
10 #include "BlockAllocator.h"
11 
12 #include "bfs_control.h"
13 #include "BPlusTree.h"
14 #include "Debug.h"
15 #include "Inode.h"
16 #include "Volume.h"
17 
18 
19 // Things the BlockAllocator should do:
20 
21 // - find a range of blocks of a certain size near a specific position
22 // - allocate a fuzzy range of blocks for pre-allocation
23 // - free blocks
24 // - know how to deal with each allocation, special handling for directories,
25 //   files, symlinks, etc. (type-sensitive allocation policies)
26 
27 // What makes the code complicated is the fact that we are not just reading
28 // in the whole bitmap and operating on it in memory - e.g. a 13 GB partition
29 // with a block size of 2048 bytes already has an 800 kB bitmap, and the
30 // size of partitions will only keep growing - so that's not an option.
31 // Instead we are reading in every block when it's used - since an allocation
32 // group can span several blocks in the block bitmap, the AllocationBlock
33 // class is there to make handling those easier.
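// (For a feel of the numbers: one bitmap bit covers one block, so a volume
// with N blocks needs N / 8 bytes of bitmap - the 13 GB example above comes
// out at about 6.3 million blocks and thus roughly 790 kB of bitmap.)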
34 
35 // The current implementation is only slightly optimized and could probably
36 // be improved a lot. Furthermore, the allocation policies used here should
37 // be validated against real-world workloads.
38 
39 #if BFS_TRACING && !defined(BFS_SHELL)
40 namespace BFSBlockTracing {
41 
42 
43 class Allocate : public AbstractTraceEntry {
44 public:
45 	Allocate(block_run run)
46 		:
47 		fRun(run)
48 	{
49 		Initialized();
50 	}
51 
52 	virtual void AddDump(TraceOutput& out)
53 	{
54 		out.Print("bfs:alloc %lu.%u.%u", fRun.AllocationGroup(),
55 			fRun.Start(), fRun.Length());
56 	}
57 
58 	const block_run& Run() const { return fRun; }
59 
60 private:
61 			block_run			fRun;
62 };
63 
64 
65 class Free : public AbstractTraceEntry {
66 public:
67 	Free(block_run run)
68 		:
69 		fRun(run)
70 	{
71 		Initialized();
72 	}
73 
74 	virtual void AddDump(TraceOutput& out)
75 	{
76 		out.Print("bfs:free %lu.%u.%u", fRun.AllocationGroup(),
77 			fRun.Start(), fRun.Length());
78 	}
79 
80 	const block_run& Run() const { return fRun; }
81 
82 private:
83 			block_run			fRun;
84 };
85 
86 
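// Computes a simple additive checksum over the block's 32-bit words; any
// trailing size % 4 bytes are ignored, which is good enough for the tracing
// purposes it serves here.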
87 static uint32
88 checksum(const uint8* data, size_t size)
89 {
90 	const uint32* data4 = (const uint32*)data;
91 	uint32 sum = 0;
92 	while (size >= 4) {
93 		sum += *data4;
94 		data4++;
95 		size -= 4;
96 	}
97 	return sum;
98 }
99 
100 
101 class Block : public AbstractTraceEntry {
102 public:
103 	Block(const char* label, off_t blockNumber, const uint8* data,
104 			size_t size, uint32 start = 0, uint32 length = 0)
105 		:
106 		fBlock(blockNumber),
107 		fData(data),
108 		fStart(start),
109 		fLength(length),
110 		fLabel(label)
111 	{
112 		fSum = checksum(data, size);
113 		Initialized();
114 	}
115 
116 	virtual void AddDump(TraceOutput& out)
117 	{
118 		out.Print("bfs:%s: block %Ld (%p), sum %lu, s/l %lu/%lu", fLabel,
119 			fBlock, fData, fSum, fStart, fLength);
120 	}
121 
122 private:
123 			off_t				fBlock;
124 			const uint8*		fData;
125 			uint32				fStart;
126 			uint32				fLength;
127 			uint32				fSum;
128 			const char*			fLabel;
129 };
130 
131 
132 class BlockChange : public AbstractTraceEntry {
133 public:
134 	BlockChange(const char* label, int32 block, uint32 oldData, uint32 newData)
135 		:
136 		fBlock(block),
137 		fOldData(oldData),
138 		fNewData(newData),
139 		fLabel(label)
140 	{
141 		Initialized();
142 	}
143 
144 	virtual void AddDump(TraceOutput& out)
145 	{
146 		out.Print("bfs:%s: block %ld, %08lx -> %08lx", fLabel,
147 			fBlock, fOldData, fNewData);
148 	}
149 
150 private:
151 			int32				fBlock;
152 			uint32				fOldData;
153 			uint32				fNewData;
154 			const char*			fLabel;
155 };
156 
157 
158 }	// namespace BFSBlockTracing
159 
160 #	define T(x) new(std::nothrow) BFSBlockTracing::x;
161 #else
162 #	define T(x) ;
163 #endif
164 
165 #ifdef DEBUG_ALLOCATION_GROUPS
166 #	define CHECK_ALLOCATION_GROUP(group) _CheckGroup(group)
167 #else
168 #	define CHECK_ALLOCATION_GROUP(group) ;
169 #endif
170 
171 
172 struct check_index {
173 	check_index()
174 		:
175 		inode(NULL)
176 	{
177 	}
178 
179 	char				name[B_FILE_NAME_LENGTH];
180 	block_run			run;
181 	Inode*				inode;
182 };
183 
184 
185 struct check_cookie {
186 	check_cookie()
187 	{
188 	}
189 
190 	uint32				pass;
191 	block_run			current;
192 	Inode*				parent;
193 	mode_t				parent_mode;
194 	Stack<block_run>	stack;
195 	TreeIterator*		iterator;
196 	check_control		control;
197 	Stack<check_index*>	indices;
198 };
199 
200 
201 class AllocationBlock : public CachedBlock {
202 public:
203 	AllocationBlock(Volume* volume);
204 
205 	inline void Allocate(uint16 start, uint16 numBlocks);
206 	inline void Free(uint16 start, uint16 numBlocks);
207 	inline bool IsUsed(uint16 block);
208 
209 	status_t SetTo(AllocationGroup& group, uint16 block);
210 	status_t SetToWritable(Transaction& transaction, AllocationGroup& group,
211 		uint16 block);
212 
213 	uint32 NumBlockBits() const { return fNumBits; }
214 	uint32& Block(int32 index) { return ((uint32*)fBlock)[index]; }
215 	uint8* Block() const { return (uint8*)fBlock; }
216 
217 private:
218 	uint32	fNumBits;
219 #ifdef DEBUG
220 	bool	fWritable;
221 #endif
222 };
223 
224 
225 class AllocationGroup {
226 public:
227 	AllocationGroup();
228 
229 	void AddFreeRange(int32 start, int32 blocks);
230 	bool IsFull() const { return fFreeBits == 0; }
231 
232 	status_t Allocate(Transaction& transaction, uint16 start, int32 length);
233 	status_t Free(Transaction& transaction, uint16 start, int32 length);
234 
235 	uint32 NumBits() const { return fNumBits; }
236 	uint32 NumBlocks() const { return fNumBlocks; }
237 	int32 Start() const { return fStart; }
238 
239 private:
240 	friend class BlockAllocator;
241 
242 	uint32	fNumBits;
243 	uint32	fNumBlocks;
244 	int32	fStart;
245 	int32	fFirstFree;
246 	int32	fFreeBits;
247 
248 	int32	fLargestStart;
249 	int32	fLargestLength;
250 	bool	fLargestValid;
251 };
252 
253 
254 AllocationBlock::AllocationBlock(Volume* volume)
255 	: CachedBlock(volume)
256 {
257 }
258 
259 
260 status_t
261 AllocationBlock::SetTo(AllocationGroup& group, uint16 block)
262 {
263 	// 8 blocks per byte
264 	fNumBits = fVolume->BlockSize() << 3;
265 	// the last group may have fewer bits than the others
266 	if ((block + 1) * fNumBits > group.NumBits())
267 		fNumBits = group.NumBits() % fNumBits;
268 
269 #ifdef DEBUG
270 	fWritable = false;
271 #endif
272 	return CachedBlock::SetTo(group.Start() + block) != NULL ? B_OK : B_ERROR;
273 }
274 
275 
276 status_t
277 AllocationBlock::SetToWritable(Transaction& transaction, AllocationGroup& group,
278 	uint16 block)
279 {
280 	// 8 blocks per byte
281 	fNumBits = fVolume->BlockSize() << 3;
282 	// the last group may have fewer bits in the last block
283 	if ((block + 1) * fNumBits > group.NumBits())
284 		fNumBits = group.NumBits() % fNumBits;
285 
286 #ifdef DEBUG
287 	fWritable = true;
288 #endif
289 	return CachedBlock::SetToWritable(transaction, group.Start() + block)
290 		!= NULL ? B_OK : B_ERROR;
291 }
292 
293 
294 bool
295 AllocationBlock::IsUsed(uint16 block)
296 {
297 	if (block >= fNumBits)
298 		return true;
299 
300 	// the block bitmap is accessed in 32-bit blocks
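	// e.g. bit 37 lives in 32-bit word 37 >> 5 == 1, under mask
	// 1UL << (37 % 32) == 1 << 5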
301 	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
302 }
303 
304 
305 void
306 AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
307 {
308 	ASSERT(start < fNumBits);
309 	ASSERT(uint32(start + numBlocks) <= fNumBits);
310 #ifdef DEBUG
311 	ASSERT(fWritable);
312 #endif
313 
314 	T(Block("b-alloc-in", fBlockNumber, fBlock, fVolume->BlockSize(),
315 		start, numBlocks));
316 
317 	int32 block = start >> 5;
318 
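	// Walk the run word by word. A sketch of the wrap-around: with start = 30
	// and numBlocks = 4, the first pass sets bits 30-31 of the first word;
	// start is then reset to 0, so the second pass sets bits 0-1 of the next.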
319 	while (numBlocks > 0) {
320 		uint32 mask = 0;
321 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
322 			mask |= 1UL << i;
323 
324 		T(BlockChange("b-alloc", block, Block(block),
325 			Block(block) | HOST_ENDIAN_TO_BFS_INT32(mask)));
326 
327 #if KDEBUG
328 		// check for already set blocks
329 		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32*)fBlock)[block]) {
330 			FATAL(("AllocationBlock::Allocate(): some blocks are already "
331 				"allocated, start = %u, numBlocks = %u\n", start, numBlocks));
332 			panic("blocks already set!");
333 		}
334 #endif
335 
336 		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
337 		start = 0;
338 	}
339 	T(Block("b-alloc-out", fBlockNumber, fBlock, fVolume->BlockSize(),
340 		start, numBlocks));
341 }
342 
343 
344 void
345 AllocationBlock::Free(uint16 start, uint16 numBlocks)
346 {
347 	ASSERT(start < fNumBits);
348 	ASSERT(uint32(start + numBlocks) <= fNumBits);
349 #ifdef DEBUG
350 	ASSERT(fWritable);
351 #endif
352 
353 	int32 block = start >> 5;
354 
355 	while (numBlocks > 0) {
356 		uint32 mask = 0;
357 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
358 			mask |= 1UL << i;
359 
360 		T(BlockChange("b-free", block, Block(block),
361 			Block(block) & HOST_ENDIAN_TO_BFS_INT32(~mask)));
362 
363 		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
364 		start = 0;
365 	}
366 }
367 
368 
369 //	#pragma mark -
370 
371 
372 /*!	The allocation groups are created and initialized in
373 	BlockAllocator::Initialize() and BlockAllocator::InitializeAndClearBitmap()
374 	respectively.
375 */
376 AllocationGroup::AllocationGroup()
377 	:
378 	fFirstFree(-1),
379 	fFreeBits(0),
380 	fLargestValid(false)
381 {
382 }
383 
384 
385 void
386 AllocationGroup::AddFreeRange(int32 start, int32 blocks)
387 {
388 	//D(if (blocks > 512)
389 	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));
390 
391 	if (fFirstFree == -1)
392 		fFirstFree = start;
393 
394 	if (!fLargestValid || fLargestLength < blocks) {
395 		fLargestStart = start;
396 		fLargestLength = blocks;
397 		fLargestValid = true;
398 	}
399 
400 	fFreeBits += blocks;
401 }
402 
403 
404 /*!	Allocates the specified run in the allocation group.
405 	Doesn't check if the run is valid or partially allocated already, nor
406 	does it maintain the free range hints or the volume's used blocks count.
407 	It only does the low-level work of allocating some bits in the block bitmap.
408 	Assumes that the block bitmap lock is held.
409 */
410 status_t
411 AllocationGroup::Allocate(Transaction& transaction, uint16 start, int32 length)
412 {
413 	ASSERT(start + length <= (int32)fNumBits);
414 
415 	// Update the allocation group info
416 	// TODO: this info will be incorrect if something goes wrong later
417 	// Note, the fFirstFree block doesn't have to be really free
418 	if (start == fFirstFree)
419 		fFirstFree = start + length;
420 	fFreeBits -= length;
421 
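	// Maintain the cached largest-free-range hint. For example, if the
	// largest known range spans bits 100..199 and we allocate 100..120, it
	// shrinks to 121..199; allocating 150..160 instead cuts it down to
	// 100..149. Either way the remainder may no longer be the group's
	// largest free range, so the hint is invalidated whenever the area
	// before or after it could hold a bigger one.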
422 	if (fLargestValid) {
423 		bool cut = false;
424 		if (fLargestStart == start) {
425 			// cut from start
426 			fLargestStart += length;
427 			fLargestLength -= length;
428 			cut = true;
429 		} else if (start > fLargestStart
430 			&& start < fLargestStart + fLargestLength) {
431 			// cut from end
432 			fLargestLength = start - fLargestStart;
433 			cut = true;
434 		}
435 		if (cut && (fLargestLength < fLargestStart
436 				|| fLargestLength
437 						< (int32)fNumBits - (fLargestStart + fLargestLength))) {
438 			// might not be the largest block anymore
439 			fLargestValid = false;
440 		}
441 	}
442 
443 	Volume* volume = transaction.GetVolume();
444 
445 	// calculate block in the block bitmap and position within
446 	uint32 bitsPerBlock = volume->BlockSize() << 3;
447 	uint32 block = start / bitsPerBlock;
448 	start = start % bitsPerBlock;
449 
450 	AllocationBlock cached(volume);
451 
452 	while (length > 0) {
453 		if (cached.SetToWritable(transaction, *this, block) < B_OK) {
454 			fLargestValid = false;
455 			RETURN_ERROR(B_IO_ERROR);
456 		}
457 
458 		uint32 numBlocks = length;
459 		if (start + numBlocks > cached.NumBlockBits())
460 			numBlocks = cached.NumBlockBits() - start;
461 
462 		cached.Allocate(start, numBlocks);
463 
464 		length -= numBlocks;
465 		start = 0;
466 		block++;
467 	}
468 
469 	return B_OK;
470 }
471 
472 
473 /*!	Frees the specified run in the allocation group.
474 	Doesn't check if the run is valid or was even completely allocated, nor
475 	does it maintain the free range hints or the volume's used blocks count.
476 	It only does the low-level work of freeing some bits in the block bitmap.
477 	Assumes that the block bitmap lock is held.
478 */
479 status_t
480 AllocationGroup::Free(Transaction& transaction, uint16 start, int32 length)
481 {
482 	ASSERT(start + length <= (int32)fNumBits);
483 
484 	// Update the allocation group info
485 	// TODO: this info will be incorrect if something goes wrong later
486 	if (fFirstFree > start)
487 		fFirstFree = start;
488 	fFreeBits += length;
489 
490 	// The range to be freed cannot be part of the valid largest range
491 	ASSERT(!fLargestValid || start + length <= fLargestStart
492 		|| start > fLargestStart);
493 
494 	if (fLargestValid
495 		&& (start + length == fLargestStart
496 			|| fLargestStart + fLargestLength == start
497 			|| (start < fLargestStart && fLargestStart > fLargestLength)
498 			|| (start > fLargestStart
499 				&& (int32)fNumBits - (fLargestStart + fLargestLength)
500 						> fLargestLength))) {
501 		fLargestValid = false;
502 	}
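	// (The hint is only ever dropped here; AllocateBlocks() lazily
	// recomputes the group's largest free range the next time it has to
	// scan the whole group.)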
503 
504 	Volume* volume = transaction.GetVolume();
505 
506 	// calculate block in the block bitmap and position within
507 	uint32 bitsPerBlock = volume->BlockSize() << 3;
508 	uint32 block = start / bitsPerBlock;
509 	start = start % bitsPerBlock;
510 
511 	AllocationBlock cached(volume);
512 
513 	while (length > 0) {
514 		if (cached.SetToWritable(transaction, *this, block) < B_OK)
515 			RETURN_ERROR(B_IO_ERROR);
516 
517 		T(Block("free-1", block, cached.Block(), volume->BlockSize()));
518 		uint16 freeLength = length;
519 		if (uint32(start + length) > cached.NumBlockBits())
520 			freeLength = cached.NumBlockBits() - start;
521 
522 		cached.Free(start, freeLength);
523 
524 		length -= freeLength;
525 		start = 0;
526 		T(Block("free-2", block, cached.Block(), volume->BlockSize()));
527 		block++;
528 	}
529 	return B_OK;
530 }
531 
532 
533 //	#pragma mark -
534 
535 
536 BlockAllocator::BlockAllocator(Volume* volume)
537 	:
538 	fVolume(volume),
539 	fGroups(NULL),
540 	fCheckBitmap(NULL),
541 	fCheckCookie(NULL)
542 {
543 	recursive_lock_init(&fLock, "bfs allocator");
544 }
545 
546 
547 BlockAllocator::~BlockAllocator()
548 {
549 	recursive_lock_destroy(&fLock);
550 	delete[] fGroups;
551 }
552 
553 
554 status_t
555 BlockAllocator::Initialize(bool full)
556 {
557 	fNumGroups = fVolume->AllocationGroups();
558 	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
559 	fNumBlocks = (fVolume->NumBlocks() + fVolume->BlockSize() * 8 - 1)
560 		/ (fVolume->BlockSize() * 8);
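	// (fNumBlocks is the size of the block bitmap in blocks, rounded up -
	// e.g. a volume of 6.3 million blocks with a block size of 2048 bytes,
	// i.e. 16384 bits per bitmap block, needs about 385 bitmap blocks.)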
561 
562 	fGroups = new(std::nothrow) AllocationGroup[fNumGroups];
563 	if (fGroups == NULL)
564 		return B_NO_MEMORY;
565 
566 	if (!full)
567 		return B_OK;
568 
569 	recursive_lock_lock(&fLock);
570 		// the lock will be released by the _Initialize() method
571 
572 	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::_Initialize,
573 		"bfs block allocator", B_LOW_PRIORITY, this);
574 	if (id < B_OK)
575 		return _Initialize(this);
576 
577 	recursive_lock_transfer_lock(&fLock, id);
578 
579 	return resume_thread(id);
580 }
581 
582 
583 status_t
584 BlockAllocator::InitializeAndClearBitmap(Transaction& transaction)
585 {
586 	status_t status = Initialize(false);
587 	if (status != B_OK)
588 		return status;
589 
590 	uint32 numBits = 8 * fBlocksPerGroup * fVolume->BlockSize();
591 	uint32 blockShift = fVolume->BlockShift();
592 
593 	uint32* buffer = (uint32*)malloc(numBits >> 3);
594 	if (buffer == NULL)
595 		RETURN_ERROR(B_NO_MEMORY);
596 
597 	memset(buffer, 0, numBits >> 3);
598 
599 	off_t offset = 1;
600 		// the bitmap starts directly after the superblock
601 
602 	// initialize the AllocationGroup objects and clear the on-disk bitmap
603 
604 	for (int32 i = 0; i < fNumGroups; i++) {
605 		if (write_pos(fVolume->Device(), offset << blockShift, buffer,
606 				fBlocksPerGroup << blockShift) < B_OK) {
607 			free(buffer);
608 			return B_ERROR;
609 		}
610 
611 		// the last allocation group may contain fewer blocks than the others
612 		if (i == fNumGroups - 1) {
613 			fGroups[i].fNumBits = fVolume->NumBlocks() - i * numBits;
614 			fGroups[i].fNumBlocks = 1 + ((fGroups[i].NumBits() - 1)
615 				>> (blockShift + 3));
616 		} else {
617 			fGroups[i].fNumBits = numBits;
618 			fGroups[i].fNumBlocks = fBlocksPerGroup;
619 		}
620 		fGroups[i].fStart = offset;
621 		fGroups[i].fFirstFree = fGroups[i].fLargestStart = 0;
622 		fGroups[i].fFreeBits = fGroups[i].fLargestLength = fGroups[i].fNumBits;
623 		fGroups[i].fLargestValid = true;
624 
625 		offset += fBlocksPerGroup;
626 	}
627 	free(buffer);
628 
629 	// reserve the boot block, the log area, and the block bitmap itself
630 	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();
631 
632 	if (fGroups[0].Allocate(transaction, 0, reservedBlocks) < B_OK) {
633 		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
634 		return B_ERROR;
635 	}
636 	fVolume->SuperBlock().used_blocks
637 		= HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);
638 
639 	return B_OK;
640 }
641 
642 
643 status_t
644 BlockAllocator::_Initialize(BlockAllocator* allocator)
645 {
646 	// The lock must already be held at this point
647 	RecursiveLocker locker(allocator->fLock, true);
648 
649 	Volume* volume = allocator->fVolume;
650 	uint32 blocks = allocator->fBlocksPerGroup;
651 	uint32 blockShift = volume->BlockShift();
652 	off_t freeBlocks = 0;
653 
654 	uint32* buffer = (uint32*)malloc(blocks << blockShift);
655 	if (buffer == NULL)
656 		RETURN_ERROR(B_NO_MEMORY);
657 
658 	AllocationGroup* groups = allocator->fGroups;
659 	off_t offset = 1;
660 	uint32 bitsPerGroup = 8 * (blocks << blockShift);
661 	int32 numGroups = allocator->fNumGroups;
662 
663 	for (int32 i = 0; i < numGroups; i++) {
664 		if (read_pos(volume->Device(), offset << blockShift, buffer,
665 				blocks << blockShift) < B_OK)
666 			break;
667 
668 		// the last allocation group may contain fewer blocks than the others
669 		if (i == numGroups - 1) {
670 			groups[i].fNumBits = volume->NumBlocks() - i * bitsPerGroup;
671 			groups[i].fNumBlocks = 1 + ((groups[i].NumBits() - 1)
672 				>> (blockShift + 3));
673 		} else {
674 			groups[i].fNumBits = bitsPerGroup;
675 			groups[i].fNumBlocks = blocks;
676 		}
677 		groups[i].fStart = offset;
678 
679 		// finds all free ranges in this allocation group
680 		int32 start = -1, range = 0;
681 		int32 numBits = groups[i].fNumBits, bit = 0;
682 		int32 count = (numBits + 31) / 32;
683 
684 		for (int32 k = 0; k < count; k++) {
685 			for (int32 j = 0; j < 32 && bit < numBits; j++, bit++) {
686 				if (buffer[k] & (1UL << j)) {
687 					// block is in use
688 					if (range > 0) {
689 						groups[i].AddFreeRange(start, range);
690 						range = 0;
691 					}
692 				} else if (range++ == 0) {
693 					// block is free, start new free range
694 					start = bit;
695 				}
696 			}
697 		}
698 		if (range)
699 			groups[i].AddFreeRange(start, range);
700 
701 		freeBlocks += groups[i].fFreeBits;
702 
703 		offset += blocks;
704 	}
705 	free(buffer);
706 
707 	// check if block bitmap and log area are reserved
708 	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
709 
710 	if (allocator->CheckBlocks(0, reservedBlocks) != B_OK) {
711 		if (volume->IsReadOnly()) {
712 			FATAL(("Space for block bitmap or log area is not reserved "
713 				"(volume is mounted read-only)!\n"));
714 		} else {
715 			Transaction transaction(volume, 0);
716 			if (groups[0].Allocate(transaction, 0, reservedBlocks) != B_OK) {
717 				FATAL(("Could not allocate reserved space for block "
718 					"bitmap/log!\n"));
719 				volume->Panic();
720 			} else {
721 				transaction.Done();
722 				FATAL(("Space for block bitmap or log area was not "
723 					"reserved!\n"));
724 			}
725 		}
726 	}
727 
728 	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
729 	if (volume->UsedBlocks() != usedBlocks) {
730 		// If the disk was in a dirty state at mount time, it's
731 		// normal that the values don't match
732 		INFORM(("volume reports %" B_PRIdOFF " used blocks, correct is %"
733 			B_PRIdOFF "\n", volume->UsedBlocks(), usedBlocks));
734 		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
735 	}
736 
737 	return B_OK;
738 }
739 
740 
741 void
742 BlockAllocator::Uninitialize()
743 {
744 	// We only have to make sure that the initializer thread isn't running
745 	// anymore.
746 	recursive_lock_lock(&fLock);
747 }
748 
749 
750 /*!	Tries to allocate between \a minimum and \a maximum blocks starting
751 	at group \a groupIndex with offset \a start. The resulting allocation
752 	is put into \a run.
753 
754 	The number of allocated blocks is always a multiple of \a minimum, which
755 	has to be a power of two.
756 */
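// A sketch with hypothetical values: AllocateBlocks(transaction, 5, 0, 64, 8,
// run) starts scanning at group 5 and, on success, returns a run of at most
// 64 blocks whose length is a multiple of 8.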
757 status_t
758 BlockAllocator::AllocateBlocks(Transaction& transaction, int32 groupIndex,
759 	uint16 start, uint16 maximum, uint16 minimum, block_run& run)
760 {
761 	if (maximum == 0)
762 		return B_BAD_VALUE;
763 
764 	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n",
765 		groupIndex, start, maximum, minimum));
766 
767 	AllocationBlock cached(fVolume);
768 	RecursiveLocker lock(fLock);
769 
770 	uint32 bitsPerFullBlock = fVolume->BlockSize() << 3;
771 
772 	// Find the block_run that can fulfill the request best
773 	int32 bestGroup = -1;
774 	int32 bestStart = -1;
775 	int32 bestLength = -1;
776 
777 	for (int32 i = 0; i < fNumGroups + 1; i++, groupIndex++, start = 0) {
778 		groupIndex = groupIndex % fNumGroups;
779 		AllocationGroup& group = fGroups[groupIndex];
780 
781 		CHECK_ALLOCATION_GROUP(groupIndex);
782 
783 		if (start >= group.NumBits() || group.IsFull())
784 			continue;
785 
786 		// Quickly narrow the search using the group's hints: its first
787 		// free bit and, if it is known, its largest free range
788 
789 		if (start < group.fFirstFree)
790 			start = group.fFirstFree;
791 
792 		if (group.fLargestValid) {
793 			if (group.fLargestLength < bestLength)
794 				continue;
795 
796 			if (group.fLargestStart >= start) {
797 				if (group.fLargestLength >= bestLength) {
798 					bestGroup = groupIndex;
799 					bestStart = group.fLargestStart;
800 					bestLength = group.fLargestLength;
801 
802 					if (bestLength >= maximum)
803 						break;
804 				}
805 
806 				// We know everything we need to know about this group,
807 				// let's skip to the next
808 				continue;
809 			}
810 		}
811 		// There may be more than one bitmap block per allocation group - and
812 		// we iterate through them to find a place for the allocation.
813 		// (one allocation can't exceed a single allocation group)
814 		// (one allocation can't exceed one allocation group)
815 
816 		uint32 block = start / (fVolume->BlockSize() << 3);
817 		int32 currentStart = 0, currentLength = 0;
818 		int32 groupLargestStart = -1;
819 		int32 groupLargestLength = -1;
820 		int32 currentBit = start;
821 		bool canFindGroupLargest = start == 0;
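		// We may only cache a new largest-range hint for this group if the
		// scan started at bit 0 - a partial scan could miss a larger range
		// lying before "start".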
822 
823 		for (; block < group.NumBlocks(); block++) {
824 			if (cached.SetTo(group, block) < B_OK)
825 				RETURN_ERROR(B_ERROR);
826 
827 			T(Block("alloc-in", group.Start() + block, cached.Block(),
828 				fVolume->BlockSize(), groupIndex, currentStart));
829 
830 			// find a block large enough to hold the allocation
831 			for (uint32 bit = start % bitsPerFullBlock;
832 					bit < cached.NumBlockBits(); bit++) {
833 				if (!cached.IsUsed(bit)) {
834 					if (currentLength == 0) {
835 						// start new range
836 						currentStart = currentBit;
837 					}
838 
839 					// have we found a range large enough to hold the maximum?
840 					if (++currentLength >= maximum) {
841 						bestGroup = groupIndex;
842 						bestStart = currentStart;
843 						bestLength = currentLength;
844 						break;
845 					}
846 				} else {
847 					if (currentLength) {
848 						// end of a range
849 						if (currentLength > bestLength) {
850 							bestGroup = groupIndex;
851 							bestStart = currentStart;
852 							bestLength = currentLength;
853 						}
854 						if (currentLength > groupLargestLength) {
855 							groupLargestStart = currentStart;
856 							groupLargestLength = currentLength;
857 						}
858 						currentLength = 0;
859 					}
860 					if ((int32)group.NumBits() - currentBit
861 							<= groupLargestLength) {
862 						// We can't find a bigger block in this group anymore,
863 						// let's skip the rest.
864 						block = group.NumBlocks();
865 						break;
866 					}
867 				}
868 				currentBit++;
869 			}
870 
871 			T(Block("alloc-out", block, cached.Block(),
872 				fVolume->BlockSize(), groupIndex, currentStart));
873 
874 			if (bestLength >= maximum) {
875 				canFindGroupLargest = false;
876 				break;
877 			}
878 
879 			// start from the beginning of the next block
880 			start = 0;
881 		}
882 
883 		if (currentBit == (int32)group.NumBits()) {
884 			if (currentLength > bestLength) {
885 				bestGroup = groupIndex;
886 				bestStart = currentStart;
887 				bestLength = currentLength;
888 			}
889 			if (canFindGroupLargest && currentLength > groupLargestLength) {
890 				groupLargestStart = currentStart;
891 				groupLargestLength = currentLength;
892 			}
893 		}
894 
895 		if (canFindGroupLargest && !group.fLargestValid
896 			&& groupLargestLength >= 0) {
897 			group.fLargestStart = groupLargestStart;
898 			group.fLargestLength = groupLargestLength;
899 			group.fLargestValid = true;
900 		}
901 
902 		if (bestLength >= maximum)
903 			break;
904 	}
905 
906 	// If we found a suitable range, mark the blocks as in use, and
907 	// write the updated block bitmap back to disk
908 	if (bestLength < minimum)
909 		return B_DEVICE_FULL;
910 
911 	if (bestLength > maximum)
912 		bestLength = maximum;
913 	else if (minimum > 1) {
914 		// make sure bestLength is a multiple of minimum
915 		bestLength = round_down(bestLength, minimum);
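		// e.g. a best range of 21 blocks with a minimum of 8 becomes 16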
916 	}
917 
918 	if (fGroups[bestGroup].Allocate(transaction, bestStart, bestLength) != B_OK)
919 		RETURN_ERROR(B_IO_ERROR);
920 
921 	CHECK_ALLOCATION_GROUP(bestGroup);
922 
923 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(bestGroup);
924 	run.start = HOST_ENDIAN_TO_BFS_INT16(bestStart);
925 	run.length = HOST_ENDIAN_TO_BFS_INT16(bestLength);
926 
927 	fVolume->SuperBlock().used_blocks
928 		= HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + bestLength);
929 		// We are not writing back the disk's superblock - it's
930 		// either done by the journaling code, or when the disk
931 		// is unmounted.
932 		// If the value is not correct at mount time, it will be
933 		// fixed anyway.
934 
935 	// We need to flush any remaining blocks in the new allocation to make sure
936 	// they won't interfere with the file cache.
937 	block_cache_discard(fVolume->BlockCache(), fVolume->ToBlock(run),
938 		run.Length());
939 
940 	T(Allocate(run));
941 	return B_OK;
942 }
943 
944 
945 status_t
946 BlockAllocator::AllocateForInode(Transaction& transaction,
947 	const block_run* parent, mode_t type, block_run& run)
948 {
949 	// Apply some allocation policies here (AllocateBlocks() will break them
950 	// if necessary) - we will start with those described in Dominic Giampaolo's
951 	// "Practical File System Design", and see how well they work
952 
953 	// Files go into the same allocation group as their parent,
954 	// sub-directories will be inserted 8 allocation groups after
955 	// their parent's group
956 	uint16 group = parent->AllocationGroup();
957 	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
958 		group += 8;
959 
960 	return AllocateBlocks(transaction, group, 0, 1, 1, run);
961 }
962 
963 
964 status_t
965 BlockAllocator::Allocate(Transaction& transaction, Inode* inode,
966 	off_t numBlocks, block_run& run, uint16 minimum)
967 {
968 	if (numBlocks <= 0)
969 		return B_ERROR;
970 
971 	// one block_run can't hold more data than there is in one allocation group
972 	if (numBlocks > fGroups[0].NumBits())
973 		numBlocks = fGroups[0].NumBits();
974 
975 	// since block_run.length is uint16, the largest number of blocks that
976 	// can be covered by a block_run is 65535
977 	// TODO: if we drop compatibility, couldn't we do this any better?
978 	// There are basically three possibilities:
979 	// a) since a length of zero doesn't make any sense, take that for 65536 -
980 	//    but that could cause many problems (bugs) in other areas
981 	// b) reduce the maximum amount of blocks per block_run, so that the
982 	//    remaining number of free blocks can be used in a useful manner
983 	//    (like 4 blocks) - but that would also reduce the maximum file size
984 	// c) have BlockRun::Length() return (length + 1).
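	//    For illustration only, option c) would amount to something like
	//        uint16 Length() const { return BFS_ENDIAN_TO_HOST_INT16(length) + 1; }
	//    in block_run - but it would silently change the meaning of every
	//    on-disk run, so it remains a thought experiment here.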
985 	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
986 		numBlocks = MAX_BLOCK_RUN_LENGTH;
987 
988 	// Apply some allocation policies here (AllocateBlocks() will break them
989 	// if necessary)
990 	uint16 group = inode->BlockRun().AllocationGroup();
991 	uint16 start = 0;
992 
993 	// Are there already allocated blocks? (then just try to allocate near the
994 	// last one)
995 	if (inode->Size() > 0) {
996 		const data_stream& data = inode->Node().data;
997 		// TODO: we currently ignore the case where the data stream
998 		// has already grown into the indirect ranges
999 		if (data.max_double_indirect_range == 0
1000 			&& data.max_indirect_range == 0) {
1001 			// Since size > 0, there must be a valid block run in this stream
1002 			int32 last = 0;
1003 			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
1004 				if (data.direct[last + 1].IsZero())
1005 					break;
1006 
1007 			group = data.direct[last].AllocationGroup();
1008 			start = data.direct[last].Start() + data.direct[last].Length();
1009 		}
1010 	} else if (inode->IsContainer() || inode->IsSymLink()) {
1011 		// directory and symbolic link data will go into the same allocation
1012 		// group as the inode itself, but after the inode's data
1013 		start = inode->BlockRun().Start();
1014 	} else {
1015 		// file data will start in the next allocation group
1016 		group = inode->BlockRun().AllocationGroup() + 1;
1017 	}
1018 
1019 	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
1020 }
1021 
1022 
1023 status_t
1024 BlockAllocator::Free(Transaction& transaction, block_run run)
1025 {
1026 	RecursiveLocker lock(fLock);
1027 
1028 	int32 group = run.AllocationGroup();
1029 	uint16 start = run.Start();
1030 	uint16 length = run.Length();
1031 
1032 	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start,
1033 		length));
1034 	T(Free(run));
1035 
1036 	// We don't use Volume::IsValidBlockRun() here, because we can check better
1037 	// against the group size (the last group may have a different length)
1038 	if (group < 0 || group >= fNumGroups
1039 		|| start > fGroups[group].NumBits()
1040 		|| uint32(start + length) > fGroups[group].NumBits()
1041 		|| length == 0) {
1042 		FATAL(("tried to free an invalid block_run (%d, %u, %u)\n", (int)group,
1043 			start, length));
1044 		DEBUGGER(("tried to free invalid block_run"));
1045 		return B_BAD_VALUE;
1046 	}
1047 	// check if someone tries to free reserved areas at the beginning of the
1048 	// drive
1049 	if (group == 0
1050 		&& start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
1051 		FATAL(("tried to free a reserved block_run (%d, %u, %u)\n", (int)group,
1052 			start, length));
1053 		DEBUGGER(("tried to free reserved block"));
1054 		return B_BAD_VALUE;
1055 	}
1056 #ifdef DEBUG
1057 	if (CheckBlockRun(run) != B_OK)
1058 		return B_BAD_DATA;
1059 #endif
1060 
1061 	CHECK_ALLOCATION_GROUP(group);
1062 
1063 	if (fGroups[group].Free(transaction, start, length) != B_OK)
1064 		RETURN_ERROR(B_IO_ERROR);
1065 
1066 	CHECK_ALLOCATION_GROUP(group);
1067 
1068 #ifdef DEBUG
1069 	if (CheckBlockRun(run, NULL, false) != B_OK) {
1070 		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just "
1071 			"freed)\n"));
1072 	}
1073 #endif
1074 
1075 	fVolume->SuperBlock().used_blocks =
1076 		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
1077 	return B_OK;
1078 }
1079 
1080 
1081 size_t
1082 BlockAllocator::BitmapSize() const
1083 {
1084 	return fVolume->BlockSize() * fNumBlocks;
1085 }
1086 
1087 
1088 #ifdef DEBUG_FRAGMENTER
1089 void
1090 BlockAllocator::Fragment()
1091 {
1092 	AllocationBlock cached(fVolume);
1093 	RecursiveLocker lock(fLock);
1094 
1095 	// only leave 4 block holes
1096 	static const uint32 kMask = 0x0f0f0f0f;
1097 	uint32 valuesPerBlock = fVolume->BlockSize() / 4;
1098 
1099 	for (int32 i = 0; i < fNumGroups; i++) {
1100 		AllocationGroup& group = fGroups[i];
1101 
1102 		for (uint32 block = 0; block < group.NumBlocks(); block++) {
1103 			Transaction transaction(fVolume, 0);
1104 
1105 			if (cached.SetToWritable(transaction, group, block) != B_OK)
1106 				return;
1107 
1108 			for (int32 index = 0; index < valuesPerBlock; index++) {
1109 				cached.Block(index) |= HOST_ENDIAN_TO_BFS_INT32(kMask);
1110 			}
1111 
1112 			transaction.Done();
1113 		}
1114 	}
1115 }
1116 #endif	// DEBUG_FRAGMENTER
1117 
1118 
1119 #ifdef DEBUG_ALLOCATION_GROUPS
1120 void
1121 BlockAllocator::_CheckGroup(int32 groupIndex) const
1122 {
1123 	AllocationBlock cached(fVolume);
1124 	ASSERT_LOCKED_RECURSIVE(&fLock);
1125 
1126 	AllocationGroup& group = fGroups[groupIndex];
1127 
1128 	int32 currentStart = 0, currentLength = 0;
1129 	int32 firstFree = -1;
1130 	int32 largestStart = -1;
1131 	int32 largestLength = 0;
1132 	int32 currentBit = 0;
1133 
1134 	for (uint32 block = 0; block < group.NumBlocks(); block++) {
1135 		if (cached.SetTo(group, block) < B_OK) {
1136 			panic("setting group block %d failed\n", (int)block);
1137 			return;
1138 		}
1139 
1140 		for (uint32 bit = 0; bit < cached.NumBlockBits(); bit++) {
1141 			if (!cached.IsUsed(bit)) {
1142 				if (firstFree < 0) {
1143 					firstFree = currentBit;
1144 					if (!group.fLargestValid) {
1145 						if (firstFree >= 0 && firstFree < group.fFirstFree) {
1146 							// mostly harmless but noteworthy
1147 							dprintf("group %d first free too late: should be "
1148 								"%d, is %d\n", (int)groupIndex, (int)firstFree,
1149 								(int)group.fFirstFree);
1150 						}
1151 						return;
1152 					}
1153 				}
1154 
1155 				if (currentLength == 0) {
1156 					// start new range
1157 					currentStart = currentBit;
1158 				}
1159 				currentLength++;
1160 			} else if (currentLength) {
1161 				// end of a range
1162 				if (currentLength > largestLength) {
1163 					largestStart = currentStart;
1164 					largestLength = currentLength;
1165 				}
1166 				currentLength = 0;
1167 			}
1168 			currentBit++;
1169 		}
1170 	}
1171 
1172 	if (currentLength > largestLength) {
1173 		largestStart = currentStart;
1174 		largestLength = currentLength;
1175 	}
1176 
1177 	if (firstFree >= 0 && firstFree < group.fFirstFree) {
1178 		// mostly harmless but noteworthy
1179 		dprintf("group %d first free too late: should be %d, is %d\n",
1180 			(int)groupIndex, (int)firstFree, (int)group.fFirstFree);
1181 	}
1182 	if (group.fLargestValid
1183 		&& (largestStart != group.fLargestStart
1184 			|| largestLength != group.fLargestLength)) {
1185 		panic("bfs %p: group %d largest differs: %d.%d, checked %d.%d.\n",
1186 			fVolume, (int)groupIndex, (int)group.fLargestStart,
1187 			(int)group.fLargestLength, (int)largestStart, (int)largestLength);
1188 	}
1189 }
1190 #endif	// DEBUG_ALLOCATION_GROUPS
1191 
1192 
1193 status_t
1194 BlockAllocator::Trim(uint64 offset, uint64 size, uint64& trimmedSize)
1195 {
1196 	const uint32 kTrimRanges = 128;
1197 	fs_trim_data* trimData = (fs_trim_data*)malloc(sizeof(fs_trim_data)
1198 		+ sizeof(uint64) * kTrimRanges);
1199 	if (trimData == NULL)
1200 		return B_NO_MEMORY;
1201 
1202 	MemoryDeleter deleter(trimData);
1203 	RecursiveLocker locker(fLock);
1204 
1205 	// TODO: take given offset and size into account!
1206 	int32 lastGroup = fNumGroups - 1;
1207 	uint32 firstBlock = 0;
1208 	uint32 firstBit = 0;
1209 	uint64 currentBlock = 0;
1210 	uint32 blockShift = fVolume->BlockShift();
1211 
1212 	uint64 firstFree = 0;
1213 	size_t freeLength = 0;
1214 
1215 	trimData->range_count = 0;
1216 	trimmedSize = 0;
1217 
1218 	AllocationBlock cached(fVolume);
1219 	for (int32 groupIndex = 0; groupIndex <= lastGroup; groupIndex++) {
1220 		AllocationGroup& group = fGroups[groupIndex];
1221 
1222 		for (uint32 block = firstBlock; block < group.NumBlocks(); block++) {
1223 			cached.SetTo(group, block);
1224 
1225 			for (uint32 i = firstBit; i < cached.NumBlockBits(); i++) {
1226 				if (cached.IsUsed(i)) {
1227 					// Block is in use
1228 					if (freeLength > 0) {
1229 						status_t status = _TrimNext(*trimData, kTrimRanges,
1230 							firstFree << blockShift, freeLength << blockShift,
1231 							false, trimmedSize);
1232 						if (status != B_OK)
1233 							return status;
1234 
1235 						freeLength = 0;
1236 					}
1237 				} else if (freeLength++ == 0) {
1238 					// Block is free, start new free range
1239 					firstFree = currentBlock;
1240 				}
1241 
1242 				currentBlock++;
1243 			}
1244 		}
1245 
1246 		firstBlock = 0;
1247 		firstBit = 0;
1248 	}
1249 
1250 	return _TrimNext(*trimData, kTrimRanges, firstFree << blockShift,
1251 		freeLength << blockShift, true, trimmedSize);
1252 }
1253 
1254 
1255 //	#pragma mark - Bitmap validity checking
1256 
1257 // TODO: implement new FS checking API
1258 // Functions to check the validity of the bitmap - they are used from
1259 // the "checkfs" command (since this does quite a bit more than just that,
1260 // maybe we should move it some place else?)
1261 
1262 
1263 bool
1264 BlockAllocator::_IsValidCheckControl(const check_control* control)
1265 {
1266 	if (control == NULL
1267 		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
1268 		FATAL(("invalid check_control (%p)!\n", control));
1269 		return false;
1270 	}
1271 
1272 	return true;
1273 }
1274 
1275 
1276 status_t
1277 BlockAllocator::StartChecking(const check_control* control)
1278 {
1279 	if (!_IsValidCheckControl(control))
1280 		return B_BAD_VALUE;
1281 
1282 	fVolume->GetJournal(0)->Lock(NULL, true);
1283 		// Lock the volume's journal
1284 
1285 	recursive_lock_lock(&fLock);
1286 
1287 	size_t size = BitmapSize();
1288 	fCheckBitmap = (uint32*)malloc(size);
1289 	if (fCheckBitmap == NULL) {
1290 		recursive_lock_unlock(&fLock);
1291 		fVolume->GetJournal(0)->Unlock(NULL, true);
1292 		return B_NO_MEMORY;
1293 	}
1294 
1295 	fCheckCookie = new(std::nothrow) check_cookie();
1296 	if (fCheckCookie == NULL) {
1297 		free(fCheckBitmap);
1298 		fCheckBitmap = NULL;
1299 		recursive_lock_unlock(&fLock);
1300 		fVolume->GetJournal(0)->Unlock(NULL, true);
1301 
1302 		return B_NO_MEMORY;
1303 	}
1304 
1305 	memcpy(&fCheckCookie->control, control, sizeof(check_control));
1306 	memset(&fCheckCookie->control.stats, 0, sizeof(control->stats));
1307 
1308 	// initialize bitmap
1309 	memset(fCheckBitmap, 0, size);
1310 	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length();
1311 			block-- > 0;) {
1312 		_SetCheckBitmapAt(block);
1313 	}
1314 
1315 	fCheckCookie->pass = BFS_CHECK_PASS_BITMAP;
1316 	fCheckCookie->stack.Push(fVolume->Root());
1317 	fCheckCookie->stack.Push(fVolume->Indices());
1318 	fCheckCookie->iterator = NULL;
1319 	fCheckCookie->control.stats.block_size = fVolume->BlockSize();
1320 
1321 	// Push removed vnodes onto the stack -- they are no longer reachable by
1322 	// traversing the file system.
1323 	InodeList::Iterator iterator = fVolume->RemovedInodes().GetIterator();
1324 	while (Inode* inode = iterator.Next()) {
1325 		fCheckCookie->stack.Push(inode->BlockRun());
1326 	}
1327 
1328 	// TODO: check reserved area in bitmap!
1329 
1330 	return B_OK;
1331 }
1332 
1333 
1334 status_t
1335 BlockAllocator::StopChecking(check_control* control)
1336 {
1337 	if (fCheckCookie == NULL)
1338 		return B_NO_INIT;
1339 
1340 	if (fCheckCookie->iterator != NULL) {
1341 		delete fCheckCookie->iterator;
1342 		fCheckCookie->iterator = NULL;
1343 
1344 		// the current directory inode is still locked in memory
1345 		put_vnode(fVolume->FSVolume(), fVolume->ToVnode(fCheckCookie->current));
1346 	}
1347 
1348 	if (fVolume->IsReadOnly()) {
1349 		// We can't fix errors on this volume
1350 		fCheckCookie->control.flags = 0;
1351 	}
1352 
1353 	if (fCheckCookie->control.status != B_ENTRY_NOT_FOUND)
1354 		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));
1355 
1356 	switch (fCheckCookie->pass) {
1357 		case BFS_CHECK_PASS_BITMAP:
1358 			// if CheckNextNode() could completely work through, we can
1359 			// fix any damages of the bitmap
1360 			if (fCheckCookie->control.status == B_ENTRY_NOT_FOUND)
1361 				_WriteBackCheckBitmap();
1362 			break;
1363 
1364 		case BFS_CHECK_PASS_INDEX:
1365 			_FreeIndices();
1366 			break;
1367 	}
1368 
1369 	fVolume->SetCheckingThread(-1);
1370 
1371 	if (control != NULL)
1372 		user_memcpy(control, &fCheckCookie->control, sizeof(check_control));
1373 
1374 	free(fCheckBitmap);
1375 	fCheckBitmap = NULL;
1376 	delete fCheckCookie;
1377 	fCheckCookie = NULL;
1378 	recursive_lock_unlock(&fLock);
1379 	fVolume->GetJournal(0)->Unlock(NULL, true);
1380 
1381 	return B_OK;
1382 }
1383 
1384 
1385 status_t
1386 BlockAllocator::CheckNextNode(check_control* control)
1387 {
1388 	if (fCheckCookie == NULL)
1389 		return B_NO_INIT;
1390 
1391 	fVolume->SetCheckingThread(find_thread(NULL));
1392 
1393 	// Make sure the user control is copied on exit
1394 	class CopyControlOnExit {
1395 	public:
1396 		CopyControlOnExit(check_control* source, check_control* userTarget)
1397 			:
1398 			fSource(source),
1399 			fTarget(userTarget)
1400 		{
1401 		}
1402 
1403 		~CopyControlOnExit()
1404 		{
1405 			if (fTarget != NULL)
1406 				user_memcpy(fTarget, fSource, sizeof(check_control));
1407 		}
1408 
1409 	private:
1410 		check_control*	fSource;
1411 		check_control*	fTarget;
1412 	} copyControl(&fCheckCookie->control, control);
1413 
1414 	while (true) {
1415 		if (fCheckCookie->iterator == NULL) {
1416 			if (!fCheckCookie->stack.Pop(&fCheckCookie->current)) {
1417 				// No more runs on the stack, we might be finished!
1418 				if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1419 					&& !fCheckCookie->indices.IsEmpty()) {
1420 					// Start second pass to repair indices
1421 					_WriteBackCheckBitmap();
1422 
1423 					fCheckCookie->pass = BFS_CHECK_PASS_INDEX;
1424 					fCheckCookie->control.pass = BFS_CHECK_PASS_INDEX;
1425 
1426 					status_t status = _PrepareIndices();
1427 					if (status != B_OK) {
1428 						fCheckCookie->control.status = status;
1429 						return status;
1430 					}
1431 
1432 					fCheckCookie->stack.Push(fVolume->Root());
1433 					continue;
1434 				}
1435 
1436 				fCheckCookie->control.status = B_ENTRY_NOT_FOUND;
1437 				return B_ENTRY_NOT_FOUND;
1438 			}
1439 
1440 			Vnode vnode(fVolume, fCheckCookie->current);
1441 			Inode* inode;
1442 			if (vnode.Get(&inode) != B_OK) {
1443 				FATAL(("check: Could not open inode at %" B_PRIdOFF "\n",
1444 					fVolume->ToBlock(fCheckCookie->current)));
1445 				continue;
1446 			}
1447 
1448 			fCheckCookie->control.inode = inode->ID();
1449 			fCheckCookie->control.mode = inode->Mode();
1450 
1451 			if (!inode->IsContainer()) {
1452 				// Check file
1453 				fCheckCookie->control.errors = 0;
1454 				fCheckCookie->control.status = CheckInode(inode, NULL);
1455 
1456 				if (inode->GetName(fCheckCookie->control.name) < B_OK)
1457 					strcpy(fCheckCookie->control.name, "(node has no name)");
1458 
1459 				return B_OK;
1460 			}
1461 
1462 			// Check directory
1463 			BPlusTree* tree = inode->Tree();
1464 			if (tree == NULL) {
1465 				FATAL(("check: could not open b+tree from inode at %" B_PRIdOFF
1466 					"\n", fVolume->ToBlock(fCheckCookie->current)));
1467 				continue;
1468 			}
1469 
1470 			fCheckCookie->parent = inode;
1471 			fCheckCookie->parent_mode = inode->Mode();
1472 
1473 			// get iterator for the next directory
1474 			fCheckCookie->iterator = new(std::nothrow) TreeIterator(tree);
1475 			if (fCheckCookie->iterator == NULL)
1476 				RETURN_ERROR(B_NO_MEMORY);
1477 
1478 			// the inode must stay locked in memory until the iterator is freed
1479 			vnode.Keep();
1480 
1481 			// check the inode of the directory
1482 			fCheckCookie->control.errors = 0;
1483 			fCheckCookie->control.status = CheckInode(inode, NULL);
1484 
1485 			if (inode->GetName(fCheckCookie->control.name) != B_OK)
1486 				strcpy(fCheckCookie->control.name, "(dir has no name)");
1487 
1488 			return B_OK;
1489 		}
1490 
1491 		char name[B_FILE_NAME_LENGTH];
1492 		uint16 length;
1493 		ino_t id;
1494 
1495 		status_t status = fCheckCookie->iterator->GetNextEntry(name, &length,
1496 			B_FILE_NAME_LENGTH, &id);
1497 		if (status != B_OK) {
1498 			// we no longer need this iterator
1499 			delete fCheckCookie->iterator;
1500 			fCheckCookie->iterator = NULL;
1501 
1502 			// unlock the directory's inode from memory
1503 			put_vnode(fVolume->FSVolume(),
1504 				fVolume->ToVnode(fCheckCookie->current));
1505 
1506 			if (status == B_ENTRY_NOT_FOUND) {
1507 				// We iterated over all entries already, just go on to the next
1508 				continue;
1509 			}
1510 
1511 			// Iterating over the B+tree failed - we let the checkfs run
1512 			// fail completely, as we would otherwise delete all files we
1513 			// cannot access.
1514 			// TODO: maybe have a force parameter that actually does that.
1515 			// TODO: we also need to be able to repair broken B+trees!
1516 			return status;
1517 		}
1518 
1519 		// ignore "." and ".." entries
1520 		if (!strcmp(name, ".") || !strcmp(name, ".."))
1521 			continue;
1522 
1523 		// fill in the control data as soon as we have them
1524 		strlcpy(fCheckCookie->control.name, name, B_FILE_NAME_LENGTH);
1525 		fCheckCookie->control.inode = id;
1526 		fCheckCookie->control.errors = 0;
1527 
1528 		Vnode vnode(fVolume, id);
1529 		Inode* inode;
1530 		if (vnode.Get(&inode) != B_OK) {
1531 			FATAL(("Could not open inode ID %" B_PRIdINO "!\n", id));
1532 			fCheckCookie->control.errors |= BFS_COULD_NOT_OPEN;
1533 
1534 			if ((fCheckCookie->control.flags & BFS_REMOVE_INVALID) != 0) {
1535 				status = _RemoveInvalidNode(fCheckCookie->parent,
1536 					fCheckCookie->iterator->Tree(), NULL, name);
1537 			} else
1538 				status = B_ERROR;
1539 
1540 			fCheckCookie->control.status = status;
1541 			return B_OK;
1542 		}
1543 
1544 		// check if the inode's name is the same as in the b+tree
1545 		if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1546 			&& inode->IsRegularNode()) {
1547 			RecursiveLocker locker(inode->SmallDataLock());
1548 			NodeGetter node(fVolume, inode);
1549 
1550 			const char* localName = inode->Name(node.Node());
1551 			if (localName == NULL || strcmp(localName, name)) {
1552 				fCheckCookie->control.errors |= BFS_NAMES_DONT_MATCH;
1553 				FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name,
1554 					localName));
1555 
1556 				if ((fCheckCookie->control.flags & BFS_FIX_NAME_MISMATCHES)
1557 						!= 0) {
1558 					// Rename the inode
1559 					Transaction transaction(fVolume, inode->BlockNumber());
1560 
1561 					// Note, this may need extra blocks, but the inode will
1562 					// only be checked afterwards, so that it won't be lost
1563 					status = inode->SetName(transaction, name);
1564 					if (status == B_OK)
1565 						status = inode->WriteBack(transaction);
1566 					if (status == B_OK)
1567 						status = transaction.Done();
1568 					if (status != B_OK) {
1569 						fCheckCookie->control.status = status;
1570 						return B_OK;
1571 					}
1572 				}
1573 			}
1574 		}
1575 
1576 		fCheckCookie->control.mode = inode->Mode();
1577 
1578 		// Check for the correct mode of the node (if the mode of the
1579 		// file doesn't fit its parent, there is a serious problem)
1580 		if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1581 			&& (((fCheckCookie->parent_mode & S_ATTR_DIR) != 0
1582 					&& !inode->IsAttribute())
1583 				|| ((fCheckCookie->parent_mode & S_INDEX_DIR) != 0
1584 					&& !inode->IsIndex())
1585 				|| (is_directory(fCheckCookie->parent_mode)
1586 					&& !inode->IsRegularNode()))) {
1587 			FATAL(("inode at %" B_PRIdOFF " is of wrong type: %o (parent "
1588 				"%o at %" B_PRIdOFF ")!\n", inode->BlockNumber(),
1589 				inode->Mode(), fCheckCookie->parent_mode,
1590 				fCheckCookie->parent->BlockNumber()));
1591 
1592 			// if we are allowed to fix errors, we should remove the file
1593 			if ((fCheckCookie->control.flags & BFS_REMOVE_WRONG_TYPES) != 0
1594 				&& (fCheckCookie->control.flags & BFS_FIX_BITMAP_ERRORS)
1595 						!= 0) {
1596 				status = _RemoveInvalidNode(fCheckCookie->parent, NULL,
1597 					inode, name);
1598 			} else
1599 				status = B_ERROR;
1600 
1601 			fCheckCookie->control.errors |= BFS_WRONG_TYPE;
1602 			fCheckCookie->control.status = status;
1603 			return B_OK;
1604 		}
1605 
1606 		// push the directory on the stack so that it will be scanned later
1607 		if (inode->IsContainer() && !inode->IsIndex())
1608 			fCheckCookie->stack.Push(inode->BlockRun());
1609 		else {
1610 			// check it now
1611 			fCheckCookie->control.status = CheckInode(inode, name);
1612 			return B_OK;
1613 		}
1614 	}
1615 	// is never reached
1616 }
1617 
1618 
1619 status_t
1620 BlockAllocator::_RemoveInvalidNode(Inode* parent, BPlusTree* tree, Inode* inode,
1621 	const char* name)
1622 {
1623 	// It's safe to start a transaction, because Inode::Remove()
1624 	// won't touch the block bitmap (which we hold the lock for)
1625 	// if we set the INODE_DONT_FREE_SPACE flag - since we fix
1626 	// the bitmap anyway.
1627 	Transaction transaction(fVolume, parent->BlockNumber());
1628 	status_t status;
1629 
1630 	if (inode != NULL) {
1631 		inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DONT_FREE_SPACE);
1632 
1633 		status = parent->Remove(transaction, name, NULL, false, true);
1634 	} else {
1635 		parent->WriteLockInTransaction(transaction);
1636 
1637 		// does the file even exist?
1638 		off_t id;
1639 		status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1640 		if (status == B_OK)
1641 			status = tree->Remove(transaction, name, id);
1642 	}
1643 
1644 	if (status == B_OK) {
1645 		entry_cache_remove(fVolume->ID(), parent->ID(), name);
1646 		transaction.Done();
1647 	}
1648 
1649 	return status;
1650 }
1651 
1652 
1653 bool
1654 BlockAllocator::_CheckBitmapIsUsedAt(off_t block) const
1655 {
1656 	size_t size = BitmapSize();
1657 	uint32 index = block / 32;	// 32bit resolution
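	// e.g. block 70 maps to word 70 / 32 == 2, bit 70 & 0x1f == 6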
1658 	if (index >= size / 4)
1659 		return false;
1660 
1661 	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index])
1662 		& (1UL << (block & 0x1f));
1663 }
1664 
1665 
1666 void
1667 BlockAllocator::_SetCheckBitmapAt(off_t block)
1668 {
1669 	size_t size = BitmapSize();
1670 	uint32 index = block / 32;	// 32bit resolution
1671 	if (index >= size / 4)
1672 		return;
1673 
1674 	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
1675 }
1676 
1677 
1678 status_t
1679 BlockAllocator::_WriteBackCheckBitmap()
1680 {
1681 	if (fVolume->IsReadOnly())
1682 		return B_OK;
1683 
1684 	// calculate the number of used blocks in the check bitmap
1685 	size_t size = BitmapSize();
1686 	off_t usedBlocks = 0LL;
1687 
1688 	// TODO: update the allocation groups used blocks info
1689 	for (uint32 i = size >> 2; i-- > 0;) {
1690 		uint32 compare = 1;
1691 		// Count the number of bits set
1692 		for (int16 j = 0; j < 32; j++, compare <<= 1) {
1693 			if ((compare & fCheckBitmap[i]) != 0)
1694 				usedBlocks++;
1695 		}
1696 	}
1697 
1698 	fCheckCookie->control.stats.freed = fVolume->UsedBlocks() - usedBlocks
1699 		+ fCheckCookie->control.stats.missing;
1700 	if (fCheckCookie->control.stats.freed < 0)
1701 		fCheckCookie->control.stats.freed = 0;
1702 
1703 	// Should we fix errors? Were there any errors we can fix?
1704 	if ((fCheckCookie->control.flags & BFS_FIX_BITMAP_ERRORS) != 0
1705 		&& (fCheckCookie->control.stats.freed != 0
1706 			|| fCheckCookie->control.stats.missing != 0)) {
1707 		// If so, write the check bitmap back over the original one,
1708 		// and use transactions here to play safe - we even use several
1709 		// transactions, so that we don't blow the maximum log size
1710 		// on large disks, since we don't need to make this atomic.
1711 #if 0
1712 		// prints the blocks that differ
1713 		off_t block = 0;
1714 		for (int32 i = 0; i < fNumGroups; i++) {
1715 			AllocationBlock cached(fVolume);
1716 			for (uint32 j = 0; j < fGroups[i].NumBlocks(); j++) {
1717 				cached.SetTo(fGroups[i], j);
1718 				for (uint32 k = 0; k < cached.NumBlockBits(); k++) {
1719 					if (cached.IsUsed(k) != _CheckBitmapIsUsedAt(block)) {
1720 						dprintf("differ block %lld (should be %d)\n", block,
1721 							_CheckBitmapIsUsedAt(block));
1722 					}
1723 					block++;
1724 				}
1725 			}
1726 		}
1727 #endif
1728 
1729 		fVolume->SuperBlock().used_blocks
1730 			= HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
1731 
1732 		size_t blockSize = fVolume->BlockSize();
1733 
1734 		for (uint32 i = 0; i < fNumBlocks; i += 512) {
1735 			Transaction transaction(fVolume, 1 + i);
1736 
1737 			uint32 blocksToWrite = 512;
1738 			if (blocksToWrite + i > fNumBlocks)
1739 				blocksToWrite = fNumBlocks - i;
1740 
1741 			status_t status = transaction.WriteBlocks(1 + i,
1742 				(uint8*)fCheckBitmap + i * blockSize, blocksToWrite);
1743 			if (status < B_OK) {
1744 				FATAL(("error writing bitmap: %s\n", strerror(status)));
1745 				return status;
1746 			}
1747 			transaction.Done();
1748 		}
1749 	}
1750 
1751 	return B_OK;
1752 }
1753 
1754 
1755 /*!	Checks whether the specified block range is allocated or not,
1756 	depending on the \a allocated argument.
1757 */
1758 status_t
1759 BlockAllocator::CheckBlocks(off_t start, off_t length, bool allocated)
1760 {
1761 	if (start < 0 || start + length > fVolume->NumBlocks())
1762 		return B_BAD_VALUE;
1763 
1764 	int32 group = start >> fVolume->AllocationGroupShift();
1765 	uint32 bitmapBlock = start / (fVolume->BlockSize() << 3);
1766 	uint32 blockOffset = start % (fVolume->BlockSize() << 3);
1767 
1768 	uint32 groupBlock = bitmapBlock % fBlocksPerGroup;
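	// (Assuming 2048-byte blocks - 16384 bits per bitmap block - and 4
	// bitmap blocks per group, block 100000 would land in group 1, bitmap
	// block 6, i.e. group-relative block 2, at bit offset 1696.)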
1769 
1770 	AllocationBlock cached(fVolume);
1771 
1772 	while (groupBlock < fGroups[group].NumBlocks() && length > 0) {
1773 		if (cached.SetTo(fGroups[group], groupBlock) != B_OK)
1774 			RETURN_ERROR(B_IO_ERROR);
1775 
1776 		for (; blockOffset < cached.NumBlockBits() && length > 0;
1777 				blockOffset++, length--) {
1778 			if (cached.IsUsed(blockOffset) != allocated) {
1779 				RETURN_ERROR(B_BAD_DATA);
1780 			}
1781 		}
1782 
1783 		blockOffset = 0;
1784 
1785 		if (++groupBlock >= fGroups[group].NumBlocks()) {
1786 			groupBlock = 0;
1787 			group++;
1788 		}
1789 	}
1790 
1791 	return B_OK;
1792 }
1793 
1794 
1795 status_t
1796 BlockAllocator::CheckBlockRun(block_run run, const char* type, bool allocated)
1797 {
1798 	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
1799 		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
1800 		|| uint32(run.Start() + run.Length())
1801 				> fGroups[run.AllocationGroup()].fNumBits
1802 		|| run.length == 0) {
1803 		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type,
1804 			run.AllocationGroup(), run.Start(), run.Length()));
1805 		if (fCheckCookie == NULL)
1806 			return B_BAD_DATA;
1807 
1808 		fCheckCookie->control.errors |= BFS_INVALID_BLOCK_RUN;
1809 		return B_OK;
1810 	}
1811 
1812 	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
1813 	uint32 block = run.Start() / bitsPerBlock;
1814 	uint32 pos = run.Start() % bitsPerBlock;
1815 	int32 length = 0;
1816 	off_t firstMissing = -1, firstSet = -1;
1817 	off_t firstGroupBlock
1818 		= (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();
1819 
1820 	AllocationBlock cached(fVolume);
1821 
1822 	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
1823 		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
1824 			RETURN_ERROR(B_IO_ERROR);
1825 
1826 		if (pos >= cached.NumBlockBits()) {
1827 			// something very strange has happened...
1828 			RETURN_ERROR(B_ERROR);
1829 		}
1830 
1831 		while (length < run.Length() && pos < cached.NumBlockBits()) {
1832 			if (cached.IsUsed(pos) != allocated) {
1833 				if (fCheckCookie == NULL) {
1834 					PRINT(("%s: block_run(%ld, %u, %u) is only partially "
1835 						"allocated (pos = %ld, length = %ld)!\n", type,
1836 						run.AllocationGroup(), run.Start(), run.Length(),
1837 						pos, length));
1838 					return B_BAD_DATA;
1839 				}
1840 				if (firstMissing == -1) {
1841 					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
1842 					fCheckCookie->control.errors |= BFS_MISSING_BLOCKS;
1843 				}
1844 				fCheckCookie->control.stats.missing++;
1845 			} else if (firstMissing != -1) {
1846 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are "
1847 					"%sallocated!\n", type, run.AllocationGroup(), run.Start(),
1848 					run.Length(), firstMissing,
1849 					firstGroupBlock + pos + block * bitsPerBlock - 1,
1850 					allocated ? "not " : ""));
1851 				firstMissing = -1;
1852 			}
1853 
1854 			if (fCheckBitmap != NULL) {
1855 				// Set the block in the check bitmap as well, but first check
1856 				// whether it is already marked as allocated there
1857 				uint32 offset = pos + block * bitsPerBlock;
1858 				if (_CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
1859 					if (firstSet == -1) {
1860 						firstSet = firstGroupBlock + offset;
1861 						fCheckCookie->control.errors |= BFS_BLOCKS_ALREADY_SET;
1862 						dprintf("block %" B_PRIdOFF " is already set!!!\n",
1863 							firstGroupBlock + offset);
1864 					}
1865 					fCheckCookie->control.stats.already_set++;
1866 				} else {
1867 					if (firstSet != -1) {
1868 						FATAL(("%s: block_run(%d, %u, %u): blocks %" B_PRIdOFF
1869 							" - %" B_PRIdOFF " are already set!\n", type,
1870 							(int)run.AllocationGroup(), run.Start(),
1871 							run.Length(), firstSet,
1872 							firstGroupBlock + offset - 1));
1873 						firstSet = -1;
1874 					}
1875 					_SetCheckBitmapAt(firstGroupBlock + offset);
1876 				}
1877 			}
1878 			length++;
1879 			pos++;
1880 		}
1881 
1882 		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
1883 			if (firstMissing != -1) {
1884 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not "
1885 					"allocated!\n", type, run.AllocationGroup(), run.Start(),
1886 					run.Length(), firstMissing,
1887 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1888 			}
1889 			if (firstSet != -1) {
1890 				FATAL(("%s: block_run(%d, %u, %u): blocks %" B_PRIdOFF " - %"
1891 					B_PRIdOFF " are already set!\n", type,
1892 					(int)run.AllocationGroup(), run.Start(), run.Length(),
1893 					firstSet,
1894 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1895 			}
1896 		}
1897 	}
1898 
1899 	return B_OK;
1900 }
1901 
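// Minimal usage sketch for CheckBlockRun() (this mirrors what
// _CheckInodeBlocks() below does for the inode itself; error handling
// elided). Without a check cookie, B_BAD_DATA reports that the bitmap
// disagrees with the on-disk structures; with a cookie, errors are
// recorded in the cookie and B_OK is returned instead:
//
//   status_t status = CheckBlockRun(inode->BlockRun(), "inode");
//   if (status == B_BAD_DATA)
//       ;// the run is not (fully) marked as allocated in the bitmap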
1902 
1903 status_t
1904 BlockAllocator::CheckInode(Inode* inode, const char* name)
1905 {
1906 	if (fCheckCookie != NULL && fCheckBitmap == NULL)
1907 		return B_NO_INIT;
1908 	if (inode == NULL)
1909 		return B_BAD_VALUE;
1910 
1911 	switch (fCheckCookie->pass) {
1912 		case BFS_CHECK_PASS_BITMAP:
1913 		{
1914 			status_t status = _CheckInodeBlocks(inode, name);
1915 			if (status != B_OK)
1916 				return status;
1917 
1918 			// Check the B+tree as well
1919 			if (inode->IsContainer()) {
1920 				bool repairErrors
1921 					= (fCheckCookie->control.flags & BFS_FIX_BPLUSTREES) != 0;
1922 				bool errorsFound = false;
1923 				status = inode->Tree()->Validate(repairErrors, errorsFound);
1924 				if (errorsFound) {
1925 					fCheckCookie->control.errors |= BFS_INVALID_BPLUSTREE;
1926 					if (inode->IsIndex() && name != NULL && repairErrors) {
1927 						// We completely rebuild corrupt indices
1928 						check_index* index = new(std::nothrow) check_index;
1929 						if (index == NULL)
1930 							return B_NO_MEMORY;
1931 
1932 						strlcpy(index->name, name, sizeof(index->name));
1933 						index->run = inode->BlockRun();
1934 						fCheckCookie->indices.Push(index);
1935 					}
1936 				}
1937 			}
1938 
1939 			return status;
1940 		}
1941 
1942 		case BFS_CHECK_PASS_INDEX:
1943 			return _AddInodeToIndex(inode);
1944 	}
1945 
1946 	return B_OK;
1947 }
1948 
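// CheckInode() is invoked once per inode and per pass: BFS_CHECK_PASS_BITMAP
// validates the inode's block runs (and, for containers, their B+trees),
// while BFS_CHECK_PASS_INDEX re-inserts the inode into the indices queued
// for rebuilding. A hedged sketch of the two-pass flow (the actual
// traversal lives in the checking code; this is illustrative only):
//
//   fCheckCookie->pass = BFS_CHECK_PASS_BITMAP;
//   // ... CheckInode(inode, name) for every reachable inode ...
//   fCheckCookie->pass = BFS_CHECK_PASS_INDEX;
//   // ... CheckInode(inode, name) again to rebuild corrupt indices ...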
1949 
1950 status_t
1951 BlockAllocator::_CheckInodeBlocks(Inode* inode, const char* name)
1952 {
1953 	status_t status = CheckBlockRun(inode->BlockRun(), "inode");
1954 	if (status != B_OK)
1955 		return status;
1956 
1957 	// If the inode has an attribute directory, push it on the stack
1958 	if (!inode->Attributes().IsZero())
1959 		fCheckCookie->stack.Push(inode->Attributes());
1960 
1961 	if (inode->IsSymLink() && (inode->Flags() & INODE_LONG_SYMLINK) == 0) {
1962 		// symlinks may not have a valid data stream
1963 		if (strlen(inode->Node().short_symlink) >= SHORT_SYMLINK_NAME_LENGTH)
1964 			return B_BAD_DATA;
1965 
1966 		return B_OK;
1967 	}
1968 
1969 	data_stream* data = &inode->Node().data;
1970 
1971 	// check the direct range
1972 
1973 	if (data->max_direct_range) {
1974 		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1975 			if (data->direct[i].IsZero())
1976 				break;
1977 
1978 			status = CheckBlockRun(data->direct[i], "direct");
1979 			if (status < B_OK)
1980 				return status;
1981 
1982 			fCheckCookie->control.stats.direct_block_runs++;
1983 			fCheckCookie->control.stats.blocks_in_direct
1984 				+= data->direct[i].Length();
1985 		}
1986 	}
1987 
1988 	CachedBlock cached(fVolume);
1989 
1990 	// check the indirect range
1991 
1992 	if (data->max_indirect_range) {
1993 		status = CheckBlockRun(data->indirect, "indirect");
1994 		if (status < B_OK)
1995 			return status;
1996 
1997 		off_t block = fVolume->ToBlock(data->indirect);
1998 
1999 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2000 			block_run* runs = (block_run*)cached.SetTo(block + i);
2001 			if (runs == NULL)
2002 				RETURN_ERROR(B_IO_ERROR);
2003 
2004 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2005 			int32 index = 0;
2006 			for (; index < runsPerBlock; index++) {
2007 				if (runs[index].IsZero())
2008 					break;
2009 
2010 				status = CheckBlockRun(runs[index], "indirect->run");
2011 				if (status < B_OK)
2012 					return status;
2013 
2014 				fCheckCookie->control.stats.indirect_block_runs++;
2015 				fCheckCookie->control.stats.blocks_in_indirect
2016 					+= runs[index].Length();
2017 			}
2018 			fCheckCookie->control.stats.indirect_array_blocks++;
2019 
2020 			if (index < runsPerBlock)
2021 				break;
2022 		}
2023 	}
2024 
2025 	// check the double indirect range
2026 
2027 	if (data->max_double_indirect_range) {
2028 		status = CheckBlockRun(data->double_indirect, "double indirect");
2029 		if (status != B_OK)
2030 			return status;
2031 
2032 		int32 runsPerBlock = runs_per_block(fVolume->BlockSize());
2033 		int32 runsPerArray = runsPerBlock * data->double_indirect.Length();
2034 
2035 		CachedBlock cachedDirect(fVolume);
2036 
2037 		for (int32 indirectIndex = 0; indirectIndex < runsPerArray;
2038 				indirectIndex++) {
2039 			// get the indirect array block
2040 			block_run* array = (block_run*)cached.SetTo(
2041 				fVolume->ToBlock(data->double_indirect)
2042 					+ indirectIndex / runsPerBlock);
2043 			if (array == NULL)
2044 				return B_IO_ERROR;
2045 
2046 			block_run indirect = array[indirectIndex % runsPerBlock];
2047 			// are we finished yet?
2048 			if (indirect.IsZero())
2049 				return B_OK;
2050 
2051 			status = CheckBlockRun(indirect, "double indirect->runs");
2052 			if (status != B_OK)
2053 				return status;
2054 
2055 			int32 maxIndex
2056 				= ((uint32)indirect.Length() << fVolume->BlockShift())
2057 					/ sizeof(block_run);
2058 
2059 			for (int32 index = 0; index < maxIndex; ) {
2060 				block_run* runs = (block_run*)cachedDirect.SetTo(
2061 					fVolume->ToBlock(indirect) + index / runsPerBlock);
2062 				if (runs == NULL)
2063 					return B_IO_ERROR;
2064 
2065 				do {
2066 					// are we finished yet?
2067 					if (runs[index % runsPerBlock].IsZero())
2068 						return B_OK;
2069 
2070 					status = CheckBlockRun(runs[index % runsPerBlock],
2071 						"double indirect->runs->run");
2072 					if (status != B_OK)
2073 						return status;
2074 
2075 					fCheckCookie->control.stats.double_indirect_block_runs++;
2076 					fCheckCookie->control.stats.blocks_in_double_indirect
2077 						+= runs[index % runsPerBlock].Length();
2078 				} while ((++index % runsPerBlock) != 0);
2079 			}
2080 
2081 			fCheckCookie->control.stats.double_indirect_array_blocks++;
2082 		}
2083 	}
2084 
2085 	return B_OK;
2086 }
2087 
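// Sizing note for the loops above: a block_run occupies 8 bytes, so with a
// 2048-byte block size one indirect block holds 2048 / 8 == 256 runs. In
// the double indirect range, indirectIndex / runsPerBlock selects the
// array block and indirectIndex % runsPerBlock the slot within it; e.g.
// index 600 maps to array block 2, slot 88 (600 == 2 * 256 + 88).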
2088 
2089 status_t
2090 BlockAllocator::_PrepareIndices()
2091 {
2092 	int32 count = 0;
2093 
2094 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2095 		check_index* index = fCheckCookie->indices.Array()[i];
2096 		Vnode vnode(fVolume, index->run);
2097 		Inode* inode;
2098 		status_t status = vnode.Get(&inode);
2099 		if (status != B_OK) {
2100 			FATAL(("check: Could not open index at %" B_PRIdOFF "\n",
2101 				fVolume->ToBlock(index->run)));
2102 			return status;
2103 		}
2104 
2105 		BPlusTree* tree = inode->Tree();
2106 		if (tree == NULL) {
2107 			// TODO: We can't yet repair those
2108 			continue;
2109 		}
2110 
2111 		status = tree->MakeEmpty();
2112 		if (status != B_OK)
2113 			return status;
2114 
2115 		index->inode = inode;
2116 		vnode.Keep();
2117 		count++;
2118 	}
2119 
2120 	return count == 0 ? B_ENTRY_NOT_FOUND : B_OK;
2121 }
2122 
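// _PrepareIndices() empties every corrupt index queued during the bitmap
// pass so that the index pass can repopulate it from scratch;
// B_ENTRY_NOT_FOUND tells the (internal) caller that nothing was queued.
// Sketch of the hand-off (illustrative):
//
//   if (_PrepareIndices() == B_ENTRY_NOT_FOUND)
//       ;// no corrupt indices were found - skip BFS_CHECK_PASS_INDEX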
2123 
2124 void
2125 BlockAllocator::_FreeIndices()
2126 {
2127 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2128 		check_index* index = fCheckCookie->indices.Array()[i];
2129 		if (index->inode != NULL) {
2130 			put_vnode(fVolume->FSVolume(),
2131 				fVolume->ToVnode(index->inode->BlockRun()));
2132 		}
2133 	}
2134 	fCheckCookie->indices.MakeEmpty();
2135 }
2136 
2137 
2138 status_t
2139 BlockAllocator::_AddInodeToIndex(Inode* inode)
2140 {
2141 	Transaction transaction(fVolume, inode->BlockNumber());
2142 
2143 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2144 		check_index* index = fCheckCookie->indices.Array()[i];
2145 		if (index->inode == NULL)
2146 			continue;
2147 
2148 		index->inode->WriteLockInTransaction(transaction);
2149 
2150 		BPlusTree* tree = index->inode->Tree();
2151 		if (tree == NULL)
2152 			return B_ERROR;
2153 
2154 		status_t status = B_OK;
2155 
2156 		if (!strcmp(index->name, "name")) {
2157 			if (inode->InNameIndex()) {
2158 				char name[B_FILE_NAME_LENGTH];
2159 				if (inode->GetName(name, B_FILE_NAME_LENGTH) != B_OK)
2160 					return B_ERROR;
2161 
2162 				status = tree->Insert(transaction, name, inode->ID());
2163 			}
2164 		} else if (!strcmp(index->name, "last_modified")) {
2165 			if (inode->InLastModifiedIndex()) {
2166 				status = tree->Insert(transaction, inode->OldLastModified(),
2167 					inode->ID());
2168 			}
2169 		} else if (!strcmp(index->name, "size")) {
2170 			if (inode->InSizeIndex())
2171 				status = tree->Insert(transaction, inode->Size(), inode->ID());
2172 		} else {
2173 			uint8 key[BPLUSTREE_MAX_KEY_LENGTH];
2174 			size_t keyLength = BPLUSTREE_MAX_KEY_LENGTH;
2175 			if (inode->ReadAttribute(index->name, B_ANY_TYPE, 0, key,
2176 					&keyLength) == B_OK) {
2177 				status = tree->Insert(transaction, key, keyLength, inode->ID());
2178 			}
2179 		}
2180 
2181 		if (status != B_OK)
2182 			return status;
2183 	}
2184 
2185 	return transaction.Done();
2186 }
2187 
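// Example of what _AddInodeToIndex() effectively performs for a file in the
// "name" index (sketch distilled from the loop above; locking and error
// handling elided):
//
//   char name[B_FILE_NAME_LENGTH];
//   if (inode->GetName(name, B_FILE_NAME_LENGTH) == B_OK)
//       tree->Insert(transaction, name, inode->ID());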
2188 
2189 bool
2190 BlockAllocator::_AddTrim(fs_trim_data& trimData, uint32 maxRanges,
2191 	uint64 offset, uint64 size)
2192 {
2193 	if (trimData.range_count < maxRanges && size > 0) {
2194 		trimData.ranges[trimData.range_count].offset = offset;
2195 		trimData.ranges[trimData.range_count].size = size;
2196 		trimData.range_count++;
2197 		return true;
2198 	}
2199 
2200 	return false;
2201 }
2202 
2203 
2204 status_t
2205 BlockAllocator::_TrimNext(fs_trim_data& trimData, uint32 maxRanges,
2206 	uint64 offset, uint64 size, bool force, uint64& trimmedSize)
2207 {
2208 	PRINT(("_TrimNext(index %" B_PRIu32 ", offset %" B_PRIu64 ", size %"
2209 		B_PRIu64 ")\n", trimData.range_count, offset, size));
2210 
2211 	bool pushed = _AddTrim(trimData, maxRanges, offset, size);
2212 
2213 	if (!pushed || force) {
2214 		// Trim now
2215 		trimData.trimmed_size = 0;
2216 		if (ioctl(fVolume->Device(), B_TRIM_DEVICE, &trimData,
2217 				sizeof(fs_trim_data)) != 0) {
2218 			return errno;
2219 		}
2220 
2221 		trimmedSize += trimData.trimmed_size;
2222 		trimData.range_count = 0;
2223 	}
2224 
2225 	if (!pushed)
2226 		_AddTrim(trimData, maxRanges, offset, size);
2227 
2228 	return B_OK;
2229 }
2230 
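// _AddTrim() and _TrimNext() batch discard requests: ranges accumulate in
// the fs_trim_data array until it is full (or \a force is set), then one
// B_TRIM_DEVICE ioctl flushes them and the array is reused. Resulting call
// pattern (offsets and sizes are illustrative):
//
//   uint64 trimmed = 0;
//   _TrimNext(trimData, max, offset1, size1, false, trimmed); // queued
//   _TrimNext(trimData, max, offset2, size2, false, trimmed); // queued
//   _TrimNext(trimData, max, offset3, size3, true, trimmed);  // flush all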
2231 
2232 //	#pragma mark - debugger commands
2233 
2234 
2235 #ifdef BFS_DEBUGGER_COMMANDS
2236 
2237 void
2238 BlockAllocator::Dump(int32 index)
2239 {
2240 	kprintf("allocation groups: %" B_PRId32 " (base %p)\n", fNumGroups, fGroups);
2241 	kprintf("blocks per group: %" B_PRId32 "\n", fBlocksPerGroup);
2242 
2243 	for (int32 i = 0; i < fNumGroups; i++) {
2244 		if (index != -1 && i != index)
2245 			continue;
2246 
2247 		AllocationGroup& group = fGroups[i];
2248 
2249 		kprintf("[%3" B_PRId32 "] num bits:       %" B_PRIu32 "  (%p)\n", i,
2250 			group.NumBits(), &group);
2251 		kprintf("      num blocks:     %" B_PRIu32 "\n", group.NumBlocks());
2252 		kprintf("      start:          %" B_PRId32 "\n", group.Start());
2253 		kprintf("      first free:     %" B_PRId32 "\n", group.fFirstFree);
2254 		kprintf("      largest start:  %" B_PRId32 "%s\n", group.fLargestStart,
2255 			group.fLargestValid ? "" : "  (invalid)");
2256 		kprintf("      largest length: %" B_PRId32 "\n", group.fLargestLength);
2257 		kprintf("      free bits:      %" B_PRId32 "\n", group.fFreeBits);
2258 	}
2259 }
2260 
2261 
2262 #if BFS_TRACING
2263 static char kTraceBuffer[256];
2264 
2265 
2266 int
2267 dump_block_allocator_blocks(int argc, char** argv)
2268 {
2269 	if (argc != 3 || !strcmp(argv[1], "--help")) {
2270 		kprintf("usage: %s <ptr-to-volume> <block>\n", argv[0]);
2271 		return 0;
2272 	}
2273 
2274 	Volume* volume = (Volume*)parse_expression(argv[1]);
2275 	off_t block = parse_expression(argv[2]);
2276 
2277 	// iterate over all tracing entries to find overlapping actions
2278 
2279 	using namespace BFSBlockTracing;
2280 
2281 	LazyTraceOutput out(kTraceBuffer, sizeof(kTraceBuffer), 0);
2282 	TraceEntryIterator iterator;
2283 	while (TraceEntry* _entry = iterator.Next()) {
2284 		if (Allocate* entry = dynamic_cast<Allocate*>(_entry)) {
2285 			off_t first = volume->ToBlock(entry->Run());
2286 			off_t last = first - 1 + entry->Run().Length();
2287 			if (block >= first && block <= last) {
2288 				out.Clear();
2289 				const char* dump = out.DumpEntry(entry);
2290 				kprintf("%5ld. %s\n", iterator.Index(), dump);
2291 			}
2292 		} else if (Free* entry = dynamic_cast<Free*>(_entry)) {
2293 			off_t first = volume->ToBlock(entry->Run());
2294 			off_t last = first - 1 + entry->Run().Length();
2295 			if (block >= first && block <= last) {
2296 				out.Clear();
2297 				const char* dump = out.DumpEntry(entry);
2298 				kprintf("%5ld. %s\n", iterator.Index(), dump);
2299 			}
2300 		}
2301 	}
2302 
2303 	return 0;
2304 }
2305 #endif
2306 
2307 
2308 int
2309 dump_block_allocator(int argc, char** argv)
2310 {
2311 	int32 group = -1;
2312 	if (argc == 3) {
2313 		group = parse_expression(argv[2]);
2314 		argc--;
2315 	}
2316 
2317 	if (argc != 2 || !strcmp(argv[1], "--help")) {
2318 		kprintf("usage: %s <ptr-to-volume> [group]\n", argv[0]);
2319 		return 0;
2320 	}
2321 
2322 	Volume* volume = (Volume*)parse_expression(argv[1]);
2323 	BlockAllocator& allocator = volume->Allocator();
2324 
2325 	allocator.Dump(group);
2326 	return 0;
2327 }
2328 
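// Usage sketch from the kernel debugger (the names these functions are
// registered under are defined elsewhere; "bfs_allocator" and the address
// below are assumptions for illustration):
//
//   kdebug> bfs_allocator 0x8034f000        dump all allocation groups
//   kdebug> bfs_allocator 0x8034f000 3      dump only group 3
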
2329 #endif	// BFS_DEBUGGER_COMMANDS
2330 
2331