xref: /haiku/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp (revision 6889394848e2dc9f41ff53b12141d572822ca0c6)
1 /*
2  * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Block bitmap handling and allocation policies
8 
9 
10 #include "BlockAllocator.h"
11 
12 #include "bfs_control.h"
13 #include "BPlusTree.h"
14 #include "Debug.h"
15 #include "Inode.h"
16 #include "Volume.h"
17 
18 
19 // Things the BlockAllocator should do:
20 
// - find a range of blocks of a certain size near a specific position
// - allocate an unsharp (fuzzy) range of blocks for pre-allocation
// - free blocks
// - know how to deal with each allocation; special handling for directories,
//   files, symlinks, etc. (type sensitive allocation policies)
26 
// What makes the code complicated is the fact that we are not just reading
// in the whole bitmap and operating on it in memory - e.g. a 13 GB partition
// with a block size of 2048 bytes already has an 800kB bitmap, and the size
// of partitions will only grow - so that's not an option.
// Instead, we read in each bitmap block only when it is needed - and since an
// allocation group can span several blocks in the block bitmap, the
// AllocationBlock class is there to make handling them easier.
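// (As a sanity check of the numbers above: one bitmap block of BlockSize()
// bytes maps BlockSize() * 8 disk blocks, so with 2048 byte blocks a single
// bitmap block covers 16384 blocks, and the ~6.8 million blocks of a 13 GB
// partition indeed need roughly 830kB of bitmap.)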
34 
// The current implementation is only slightly optimized and could probably
// be improved a lot. Furthermore, the allocation policies used here should
// be validated against real world workloads.
38 
39 #if BFS_TRACING && !defined(FS_SHELL)
40 namespace BFSBlockTracing {
41 
42 
43 class Allocate : public AbstractTraceEntry {
44 public:
45 	Allocate(block_run run)
46 		:
47 		fRun(run)
48 	{
49 		Initialized();
50 	}
51 
52 	virtual void AddDump(TraceOutput& out)
53 	{
54 		out.Print("bfs:alloc %lu.%u.%u", fRun.AllocationGroup(),
55 			fRun.Start(), fRun.Length());
56 	}
57 
58 	const block_run& Run() const { return fRun; }
59 
60 private:
61 			block_run			fRun;
62 };
63 
64 
65 class Free : public AbstractTraceEntry {
66 public:
67 	Free(block_run run)
68 		:
69 		fRun(run)
70 	{
71 		Initialized();
72 	}
73 
74 	virtual void AddDump(TraceOutput& out)
75 	{
76 		out.Print("bfs:free %lu.%u.%u", fRun.AllocationGroup(),
77 			fRun.Start(), fRun.Length());
78 	}
79 
80 	const block_run& Run() const { return fRun; }
81 
82 private:
83 			block_run			fRun;
84 };
85 
86 
87 static uint32
88 checksum(const uint8* data, size_t size)
89 {
90 	const uint32* data4 = (const uint32*)data;
91 	uint32 sum = 0;
92 	while (size >= 4) {
93 		sum += *data4;
94 		data4++;
95 		size -= 4;
96 	}
97 	return sum;
98 }
99 
100 
101 class Block : public AbstractTraceEntry {
102 public:
103 	Block(const char* label, off_t blockNumber, const uint8* data,
104 			size_t size, uint32 start = 0, uint32 length = 0)
105 		:
106 		fBlock(blockNumber),
107 		fData(data),
108 		fStart(start),
109 		fLength(length),
110 		fLabel(label)
111 	{
112 		fSum = checksum(data, size);
113 		Initialized();
114 	}
115 
116 	virtual void AddDump(TraceOutput& out)
117 	{
118 		out.Print("bfs:%s: block %Ld (%p), sum %lu, s/l %lu/%lu", fLabel,
119 			fBlock, fData, fSum, fStart, fLength);
120 	}
121 
122 private:
123 			off_t				fBlock;
124 			const uint8*		fData;
125 			uint32				fStart;
126 			uint32				fLength;
127 			uint32				fSum;
128 			const char*			fLabel;
129 };
130 
131 
132 class BlockChange : public AbstractTraceEntry {
133 public:
134 	BlockChange(const char* label, int32 block, uint32 oldData, uint32 newData)
135 		:
136 		fBlock(block),
137 		fOldData(oldData),
138 		fNewData(newData),
139 		fLabel(label)
140 	{
141 		Initialized();
142 	}
143 
144 	virtual void AddDump(TraceOutput& out)
145 	{
146 		out.Print("bfs:%s: block %ld, %08lx -> %08lx", fLabel,
147 			fBlock, fOldData, fNewData);
148 	}
149 
150 private:
151 			int32				fBlock;
152 			uint32				fOldData;
153 			uint32				fNewData;
154 			const char*			fLabel;
155 };
156 
157 
158 }	// namespace BFSBlockTracing
159 
160 #	define T(x) new(std::nothrow) BFSBlockTracing::x;
161 #else
162 #	define T(x) ;
163 #endif
164 
165 #ifdef DEBUG_ALLOCATION_GROUPS
166 #	define CHECK_ALLOCATION_GROUP(group) _CheckGroup(group)
167 #else
168 #	define CHECK_ALLOCATION_GROUP(group) ;
169 #endif
170 
171 
172 struct check_index {
173 	check_index()
174 		:
175 		inode(NULL)
176 	{
177 	}
178 
179 	char				name[B_FILE_NAME_LENGTH];
180 	block_run			run;
181 	Inode*				inode;
182 };
183 
184 
185 struct check_cookie {
186 	check_cookie()
187 	{
188 	}
189 
190 	uint32				pass;
191 	block_run			current;
192 	Inode*				parent;
193 	mode_t				parent_mode;
194 	Stack<block_run>	stack;
195 	TreeIterator*		iterator;
196 	check_control		control;
197 	Stack<check_index*>	indices;
198 };
199 
200 
201 class AllocationBlock : public CachedBlock {
202 public:
203 	AllocationBlock(Volume* volume);
204 
205 	inline void Allocate(uint16 start, uint16 numBlocks);
206 	inline void Free(uint16 start, uint16 numBlocks);
207 	inline bool IsUsed(uint16 block);
208 
209 	status_t SetTo(AllocationGroup& group, uint16 block);
210 	status_t SetToWritable(Transaction& transaction, AllocationGroup& group,
211 		uint16 block);
212 
213 	uint32 NumBlockBits() const { return fNumBits; }
214 	uint32& Block(int32 index) { return ((uint32*)fBlock)[index]; }
215 	uint8* Block() const { return (uint8*)fBlock; }
216 
217 private:
218 	uint32	fNumBits;
219 #ifdef DEBUG
220 	bool	fWritable;
221 #endif
222 };
223 
224 
225 class AllocationGroup {
226 public:
227 	AllocationGroup();
228 
229 	void AddFreeRange(int32 start, int32 blocks);
230 	bool IsFull() const { return fFreeBits == 0; }
231 
232 	status_t Allocate(Transaction& transaction, uint16 start, int32 length);
233 	status_t Free(Transaction& transaction, uint16 start, int32 length);
234 
235 	uint32 NumBits() const { return fNumBits; }
236 	uint32 NumBlocks() const { return fNumBlocks; }
237 	int32 Start() const { return fStart; }
238 
239 private:
240 	friend class BlockAllocator;
241 
242 	uint32	fNumBits;
243 	uint32	fNumBlocks;
244 	int32	fStart;
245 	int32	fFirstFree;
246 	int32	fFreeBits;
247 
248 	int32	fLargestStart;
249 	int32	fLargestLength;
250 	bool	fLargestValid;
251 };
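
// Note: fFirstFree and the fLargest* members are in-memory hints only; they
// are rebuilt from the on-disk bitmap in BlockAllocator::_Initialize() and
// are allowed to be conservative (see AllocationGroup::Allocate()/Free()).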
252 
253 
254 AllocationBlock::AllocationBlock(Volume* volume)
255 	: CachedBlock(volume)
256 {
257 }
258 
259 
260 status_t
261 AllocationBlock::SetTo(AllocationGroup& group, uint16 block)
262 {
263 	// 8 blocks per byte
264 	fNumBits = fVolume->BlockSize() << 3;
	// the last group may have fewer bits than the others
266 	if ((block + 1) * fNumBits > group.NumBits())
267 		fNumBits = group.NumBits() % fNumBits;
268 
269 #ifdef DEBUG
270 	fWritable = false;
271 #endif
272 	return CachedBlock::SetTo(group.Start() + block) != NULL ? B_OK : B_ERROR;
273 }
274 
275 
276 status_t
277 AllocationBlock::SetToWritable(Transaction& transaction, AllocationGroup& group,
278 	uint16 block)
279 {
280 	// 8 blocks per byte
281 	fNumBits = fVolume->BlockSize() << 3;
	// the last group may have fewer bits in the last block
283 	if ((block + 1) * fNumBits > group.NumBits())
284 		fNumBits = group.NumBits() % fNumBits;
285 
286 #ifdef DEBUG
287 	fWritable = true;
288 #endif
289 	return CachedBlock::SetToWritable(transaction, group.Start() + block)
290 		!= NULL ? B_OK : B_ERROR;
291 }
292 
293 
294 bool
295 AllocationBlock::IsUsed(uint16 block)
296 {
	// blocks beyond the end of this bitmap block are reported as in use
	if (block >= fNumBits)
		return true;
299 
300 	// the block bitmap is accessed in 32-bit blocks
301 	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
302 }
303 
304 
305 void
306 AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
307 {
308 	ASSERT(start < fNumBits);
309 	ASSERT(uint32(start + numBlocks) <= fNumBits);
310 #ifdef DEBUG
311 	ASSERT(fWritable);
312 #endif
313 
314 	T(Block("b-alloc-in", fBlockNumber, fBlock, fVolume->BlockSize(),
315 		start, numBlocks));
316 
317 	int32 block = start >> 5;
318 
319 	while (numBlocks > 0) {
320 		uint32 mask = 0;
321 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
322 			mask |= 1UL << i;
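		// e.g. with start % 32 == 4 and numBlocks >= 28, bits 4 through 31
		// are set, i.e. mask == 0xfffffff0 for this 32-bit word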
323 
324 		T(BlockChange("b-alloc", block, Block(block),
325 			Block(block) | HOST_ENDIAN_TO_BFS_INT32(mask)));
326 
327 #if KDEBUG
328 		// check for already set blocks
329 		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32*)fBlock)[block]) {
330 			FATAL(("AllocationBlock::Allocate(): some blocks are already "
331 				"allocated, start = %u, numBlocks = %u\n", start, numBlocks));
332 			panic("blocks already set!");
333 		}
334 #endif
335 
336 		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
337 		start = 0;
338 	}
339 	T(Block("b-alloc-out", fBlockNumber, fBlock, fVolume->BlockSize(),
340 		start, numBlocks));
341 }
342 
343 
344 void
345 AllocationBlock::Free(uint16 start, uint16 numBlocks)
346 {
347 	ASSERT(start < fNumBits);
348 	ASSERT(uint32(start + numBlocks) <= fNumBits);
349 #ifdef DEBUG
350 	ASSERT(fWritable);
351 #endif
352 
353 	int32 block = start >> 5;
354 
355 	while (numBlocks > 0) {
356 		uint32 mask = 0;
357 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
			mask |= 1UL << i;
359 
360 		T(BlockChange("b-free", block, Block(block),
361 			Block(block) & HOST_ENDIAN_TO_BFS_INT32(~mask)));
362 
363 		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
364 		start = 0;
365 	}
366 }
367 
368 
369 //	#pragma mark -
370 
371 
372 /*!	The allocation groups are created and initialized in
373 	BlockAllocator::Initialize() and BlockAllocator::InitializeAndClearBitmap()
374 	respectively.
375 */
376 AllocationGroup::AllocationGroup()
377 	:
378 	fFirstFree(-1),
379 	fFreeBits(0),
380 	fLargestValid(false)
381 {
382 }
383 
384 
385 void
386 AllocationGroup::AddFreeRange(int32 start, int32 blocks)
387 {
388 	//D(if (blocks > 512)
389 	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));
390 
391 	if (fFirstFree == -1)
392 		fFirstFree = start;
393 
394 	if (!fLargestValid || fLargestLength < blocks) {
395 		fLargestStart = start;
396 		fLargestLength = blocks;
397 		fLargestValid = true;
398 	}
399 
400 	fFreeBits += blocks;
401 }
402 
403 
/*!	Allocates the specified run in the allocation group.
	Doesn't check if the run is valid or already partially allocated, nor
	does it maintain the free range hints or the volume's used blocks count.
	It only does the low-level work of setting some bits in the block bitmap.
	Assumes that the block bitmap lock is held.
*/
410 status_t
411 AllocationGroup::Allocate(Transaction& transaction, uint16 start, int32 length)
412 {
413 	ASSERT(start + length <= (int32)fNumBits);
414 
415 	// Update the allocation group info
416 	// TODO: this info will be incorrect if something goes wrong later
	// Note that the fFirstFree block doesn't actually have to be free
418 	if (start == fFirstFree)
419 		fFirstFree = start + length;
420 	fFreeBits -= length;
421 
422 	if (fLargestValid) {
423 		bool cut = false;
424 		if (fLargestStart == start) {
425 			// cut from start
426 			fLargestStart += length;
427 			fLargestLength -= length;
428 			cut = true;
429 		} else if (start > fLargestStart
430 			&& start < fLargestStart + fLargestLength) {
431 			// cut from end
432 			fLargestLength = start - fLargestStart;
433 			cut = true;
434 		}
435 		if (cut && (fLargestLength < fLargestStart
436 				|| fLargestLength
437 						< (int32)fNumBits - (fLargestStart + fLargestLength))) {
			// The remaining part of the former largest range may now be
			// smaller than the untracked areas before or after it, so it
			// might not be the largest range anymore
439 			fLargestValid = false;
440 		}
441 	}
442 
443 	Volume* volume = transaction.GetVolume();
444 
445 	// calculate block in the block bitmap and position within
446 	uint32 bitsPerBlock = volume->BlockSize() << 3;
447 	uint32 block = start / bitsPerBlock;
448 	start = start % bitsPerBlock;
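	// e.g. with 2048 byte blocks (16384 bits per bitmap block), bit 20000 of
	// the group lives in bitmap block 1, at bit offset 3616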
449 
450 	AllocationBlock cached(volume);
451 
452 	while (length > 0) {
453 		if (cached.SetToWritable(transaction, *this, block) < B_OK) {
454 			fLargestValid = false;
455 			RETURN_ERROR(B_IO_ERROR);
456 		}
457 
458 		uint32 numBlocks = length;
459 		if (start + numBlocks > cached.NumBlockBits())
460 			numBlocks = cached.NumBlockBits() - start;
461 
462 		cached.Allocate(start, numBlocks);
463 
464 		length -= numBlocks;
465 		start = 0;
466 		block++;
467 	}
468 
469 	return B_OK;
470 }
471 
472 
/*!	Frees the specified run in the allocation group.
	Doesn't check if the run is valid or was not completely allocated, nor
	does it maintain the free range hints or the volume's used blocks count.
	It only does the low-level work of clearing some bits in the block bitmap.
	Assumes that the block bitmap lock is held.
*/
479 status_t
480 AllocationGroup::Free(Transaction& transaction, uint16 start, int32 length)
481 {
482 	ASSERT(start + length <= (int32)fNumBits);
483 
484 	// Update the allocation group info
485 	// TODO: this info will be incorrect if something goes wrong later
486 	if (fFirstFree > start)
487 		fFirstFree = start;
488 	fFreeBits += length;
489 
490 	// The range to be freed cannot be part of the valid largest range
491 	ASSERT(!fLargestValid || start + length <= fLargestStart
492 		|| start > fLargestStart);
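
	// Invalidate the largest range hint if the freed run is adjacent to it
	// (the range could now be extended), or if an untracked area of the
	// group might already be larger than the recorded range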
493 
494 	if (fLargestValid
495 		&& (start + length == fLargestStart
496 			|| fLargestStart + fLargestLength == start
497 			|| (start < fLargestStart && fLargestStart > fLargestLength)
498 			|| (start > fLargestStart
499 				&& (int32)fNumBits - (fLargestStart + fLargestLength)
500 						> fLargestLength))) {
501 		fLargestValid = false;
502 	}
503 
504 	Volume* volume = transaction.GetVolume();
505 
506 	// calculate block in the block bitmap and position within
507 	uint32 bitsPerBlock = volume->BlockSize() << 3;
508 	uint32 block = start / bitsPerBlock;
509 	start = start % bitsPerBlock;
510 
511 	AllocationBlock cached(volume);
512 
513 	while (length > 0) {
514 		if (cached.SetToWritable(transaction, *this, block) < B_OK)
515 			RETURN_ERROR(B_IO_ERROR);
516 
517 		T(Block("free-1", block, cached.Block(), volume->BlockSize()));
518 		uint16 freeLength = length;
519 		if (uint32(start + length) > cached.NumBlockBits())
520 			freeLength = cached.NumBlockBits() - start;
521 
522 		cached.Free(start, freeLength);
523 
524 		length -= freeLength;
525 		start = 0;
526 		T(Block("free-2", block, cached.Block(), volume->BlockSize()));
527 		block++;
528 	}
529 	return B_OK;
530 }
531 
532 
533 //	#pragma mark -
534 
535 
536 BlockAllocator::BlockAllocator(Volume* volume)
537 	:
538 	fVolume(volume),
539 	fGroups(NULL),
540 	fCheckBitmap(NULL),
541 	fCheckCookie(NULL)
542 {
543 	recursive_lock_init(&fLock, "bfs allocator");
544 }
545 
546 
547 BlockAllocator::~BlockAllocator()
548 {
549 	recursive_lock_destroy(&fLock);
550 	delete[] fGroups;
551 }
552 
553 
554 status_t
555 BlockAllocator::Initialize(bool full)
556 {
557 	fNumGroups = fVolume->AllocationGroups();
558 	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
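	// the number of bitmap blocks needed to cover all blocks of the volume,
	// rounded up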
559 	fNumBlocks = (fVolume->NumBlocks() + fVolume->BlockSize() * 8 - 1)
560 		/ (fVolume->BlockSize() * 8);
561 
562 	fGroups = new(std::nothrow) AllocationGroup[fNumGroups];
563 	if (fGroups == NULL)
564 		return B_NO_MEMORY;
565 
566 	if (!full)
567 		return B_OK;
568 
569 	recursive_lock_lock(&fLock);
570 		// the lock will be released by the _Initialize() method
571 
572 	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::_Initialize,
573 		"bfs block allocator", B_LOW_PRIORITY, this);
574 	if (id < B_OK)
575 		return _Initialize(this);
576 
577 	recursive_lock_transfer_lock(&fLock, id);
578 
579 	return resume_thread(id);
580 }
581 
582 
583 status_t
584 BlockAllocator::InitializeAndClearBitmap(Transaction& transaction)
585 {
586 	status_t status = Initialize(false);
587 	if (status != B_OK)
588 		return status;
589 
590 	uint32 numBits = 8 * fBlocksPerGroup * fVolume->BlockSize();
591 	uint32 blockShift = fVolume->BlockShift();
592 
593 	uint32* buffer = (uint32*)malloc(numBits >> 3);
594 	if (buffer == NULL)
595 		RETURN_ERROR(B_NO_MEMORY);
596 
597 	memset(buffer, 0, numBits >> 3);
598 
599 	off_t offset = 1;
600 		// the bitmap starts directly after the superblock
601 
602 	// initialize the AllocationGroup objects and clear the on-disk bitmap
603 
604 	for (int32 i = 0; i < fNumGroups; i++) {
605 		if (write_pos(fVolume->Device(), offset << blockShift, buffer,
606 				fBlocksPerGroup << blockShift) < B_OK) {
607 			free(buffer);
608 			return B_ERROR;
609 		}
610 
		// the last allocation group may contain fewer blocks than the others
612 		if (i == fNumGroups - 1) {
613 			fGroups[i].fNumBits = fVolume->NumBlocks() - i * numBits;
614 			fGroups[i].fNumBlocks = 1 + ((fGroups[i].NumBits() - 1)
615 				>> (blockShift + 3));
616 		} else {
617 			fGroups[i].fNumBits = numBits;
618 			fGroups[i].fNumBlocks = fBlocksPerGroup;
619 		}
620 		fGroups[i].fStart = offset;
621 		fGroups[i].fFirstFree = fGroups[i].fLargestStart = 0;
622 		fGroups[i].fFreeBits = fGroups[i].fLargestLength = fGroups[i].fNumBits;
623 		fGroups[i].fLargestValid = true;
624 
625 		offset += fBlocksPerGroup;
626 	}
627 	free(buffer);
628 
629 	// reserve the boot block, the log area, and the block bitmap itself
630 	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();
631 
632 	if (fGroups[0].Allocate(transaction, 0, reservedBlocks) < B_OK) {
633 		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
634 		return B_ERROR;
635 	}
636 	fVolume->SuperBlock().used_blocks
637 		= HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);
638 
639 	return B_OK;
640 }
641 
642 
643 status_t
644 BlockAllocator::_Initialize(BlockAllocator* allocator)
645 {
646 	// The lock must already be held at this point
647 	RecursiveLocker locker(allocator->fLock, true);
648 
649 	Volume* volume = allocator->fVolume;
650 	uint32 blocks = allocator->fBlocksPerGroup;
651 	uint32 blockShift = volume->BlockShift();
652 	off_t freeBlocks = 0;
653 
654 	uint32* buffer = (uint32*)malloc(blocks << blockShift);
655 	if (buffer == NULL)
656 		RETURN_ERROR(B_NO_MEMORY);
657 
658 	AllocationGroup* groups = allocator->fGroups;
659 	off_t offset = 1;
660 	uint32 bitsPerGroup = 8 * (blocks << blockShift);
661 	int32 numGroups = allocator->fNumGroups;
662 
663 	for (int32 i = 0; i < numGroups; i++) {
664 		if (read_pos(volume->Device(), offset << blockShift, buffer,
665 				blocks << blockShift) < B_OK)
666 			break;
667 
		// the last allocation group may contain fewer blocks than the others
669 		if (i == numGroups - 1) {
670 			groups[i].fNumBits = volume->NumBlocks() - i * bitsPerGroup;
671 			groups[i].fNumBlocks = 1 + ((groups[i].NumBits() - 1)
672 				>> (blockShift + 3));
673 		} else {
674 			groups[i].fNumBits = bitsPerGroup;
675 			groups[i].fNumBlocks = blocks;
676 		}
677 		groups[i].fStart = offset;
678 
679 		// finds all free ranges in this allocation group
680 		int32 start = -1, range = 0;
681 		int32 numBits = groups[i].fNumBits, bit = 0;
682 		int32 count = (numBits + 31) / 32;
683 
684 		for (int32 k = 0; k < count; k++) {
685 			for (int32 j = 0; j < 32 && bit < numBits; j++, bit++) {
686 				if (buffer[k] & (1UL << j)) {
687 					// block is in use
688 					if (range > 0) {
689 						groups[i].AddFreeRange(start, range);
690 						range = 0;
691 					}
692 				} else if (range++ == 0) {
693 					// block is free, start new free range
694 					start = bit;
695 				}
696 			}
697 		}
698 		if (range)
699 			groups[i].AddFreeRange(start, range);
700 
701 		freeBlocks += groups[i].fFreeBits;
702 
703 		offset += blocks;
704 	}
705 	free(buffer);
706 
707 	// check if block bitmap and log area are reserved
708 	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
709 
710 	if (allocator->CheckBlocks(0, reservedBlocks) != B_OK) {
711 		if (volume->IsReadOnly()) {
712 			FATAL(("Space for block bitmap or log area is not reserved "
713 				"(volume is mounted read-only)!\n"));
714 		} else {
715 			Transaction transaction(volume, 0);
716 			if (groups[0].Allocate(transaction, 0, reservedBlocks) != B_OK) {
717 				FATAL(("Could not allocate reserved space for block "
718 					"bitmap/log!\n"));
719 				volume->Panic();
720 			} else {
721 				transaction.Done();
722 				FATAL(("Space for block bitmap or log area was not "
723 					"reserved!\n"));
724 			}
725 		}
726 	}
727 
728 	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
729 	if (volume->UsedBlocks() != usedBlocks) {
		// If the disk was in a dirty state at mount time, it's normal
		// that these values don't match
732 		INFORM(("volume reports %" B_PRIdOFF " used blocks, correct is %"
733 			B_PRIdOFF "\n", volume->UsedBlocks(), usedBlocks));
734 		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
735 	}
736 
737 	return B_OK;
738 }
739 
740 
741 void
742 BlockAllocator::Uninitialize()
743 {
744 	// We only have to make sure that the initializer thread isn't running
745 	// anymore.
746 	recursive_lock_lock(&fLock);
747 }
748 
749 
/*!	Tries to allocate between \a minimum and \a maximum blocks starting
	at group \a groupIndex with offset \a start. The resulting allocation
	is put into \a run.

	The number of allocated blocks is always a multiple of \a minimum,
	which has to be a power of two.
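
	For example (with hypothetical values), to allocate at most 16 blocks,
	but at least 4 and in multiples of 4, near the start of group 5:

		block_run run;
		status_t status = AllocateBlocks(transaction, 5, 0, 16, 4, run);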
756 */
757 status_t
758 BlockAllocator::AllocateBlocks(Transaction& transaction, int32 groupIndex,
759 	uint16 start, uint16 maximum, uint16 minimum, block_run& run)
760 {
761 	if (maximum == 0)
762 		return B_BAD_VALUE;
763 
764 	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n",
765 		groupIndex, start, maximum, minimum));
766 
767 	AllocationBlock cached(fVolume);
768 	RecursiveLocker lock(fLock);
769 
770 	uint32 bitsPerFullBlock = fVolume->BlockSize() << 3;
771 
772 	// Find the block_run that can fulfill the request best
773 	int32 bestGroup = -1;
774 	int32 bestStart = -1;
775 	int32 bestLength = -1;
776 
777 	for (int32 i = 0; i < fNumGroups + 1; i++, groupIndex++, start = 0) {
778 		groupIndex = groupIndex % fNumGroups;
779 		AllocationGroup& group = fGroups[groupIndex];
780 
781 		CHECK_ALLOCATION_GROUP(groupIndex);
782 
783 		if (start >= group.NumBits() || group.IsFull())
784 			continue;
785 
		// Skip ahead to the first potentially free bit, and use the group's
		// largest known free range, if valid, to either take it directly or
		// skip the group without scanning its bitmap
788 
789 		if (start < group.fFirstFree)
790 			start = group.fFirstFree;
791 
792 		if (group.fLargestValid) {
793 			if (group.fLargestLength < bestLength)
794 				continue;
795 
796 			if (group.fLargestStart >= start) {
797 				if (group.fLargestLength >= bestLength) {
798 					bestGroup = groupIndex;
799 					bestStart = group.fLargestStart;
800 					bestLength = group.fLargestLength;
801 
802 					if (bestLength >= maximum)
803 						break;
804 				}
805 
				// We know everything we need to about this group, so let's
				// skip to the next one
808 				continue;
809 			}
810 		}
811 
		// There may be more than one block per allocation group - we
		// iterate through them to find a place for the allocation.
		// (a single allocation can never exceed one allocation group)
815 
816 		uint32 block = start / (fVolume->BlockSize() << 3);
817 		int32 currentStart = 0, currentLength = 0;
818 		int32 groupLargestStart = -1;
819 		int32 groupLargestLength = -1;
820 		int32 currentBit = start;
821 		bool canFindGroupLargest = start == 0;
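		// the largest free range of the whole group can only be determined
		// if we scan it from its very first bit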
822 
823 		for (; block < group.NumBlocks(); block++) {
824 			if (cached.SetTo(group, block) < B_OK)
825 				RETURN_ERROR(B_ERROR);
826 
827 			T(Block("alloc-in", group.Start() + block, cached.Block(),
828 				fVolume->BlockSize(), groupIndex, currentStart));
829 
830 			// find a block large enough to hold the allocation
831 			for (uint32 bit = start % bitsPerFullBlock;
832 					bit < cached.NumBlockBits(); bit++) {
833 				if (!cached.IsUsed(bit)) {
834 					if (currentLength == 0) {
835 						// start new range
836 						currentStart = currentBit;
837 					}
838 
					// have we found a range large enough for the requested
					// maximum?
840 					if (++currentLength >= maximum) {
841 						bestGroup = groupIndex;
842 						bestStart = currentStart;
843 						bestLength = currentLength;
844 						break;
845 					}
846 				} else {
847 					if (currentLength) {
848 						// end of a range
849 						if (currentLength > bestLength) {
850 							bestGroup = groupIndex;
851 							bestStart = currentStart;
852 							bestLength = currentLength;
853 						}
854 						if (currentLength > groupLargestLength) {
855 							groupLargestStart = currentStart;
856 							groupLargestLength = currentLength;
857 						}
858 						currentLength = 0;
859 					}
860 					if ((int32)group.NumBits() - currentBit
861 							<= groupLargestLength) {
862 						// We can't find a bigger block in this group anymore,
863 						// let's skip the rest.
864 						block = group.NumBlocks();
865 						break;
866 					}
867 				}
868 				currentBit++;
869 			}
870 
871 			T(Block("alloc-out", block, cached.Block(),
872 				fVolume->BlockSize(), groupIndex, currentStart));
873 
874 			if (bestLength >= maximum) {
875 				canFindGroupLargest = false;
876 				break;
877 			}
878 
879 			// start from the beginning of the next block
880 			start = 0;
881 		}
882 
883 		if (currentBit == (int32)group.NumBits()) {
884 			if (currentLength > bestLength) {
885 				bestGroup = groupIndex;
886 				bestStart = currentStart;
887 				bestLength = currentLength;
888 			}
889 			if (canFindGroupLargest && currentLength > groupLargestLength) {
890 				groupLargestStart = currentStart;
891 				groupLargestLength = currentLength;
892 			}
893 		}
894 
895 		if (canFindGroupLargest && !group.fLargestValid
896 			&& groupLargestLength >= 0) {
897 			group.fLargestStart = groupLargestStart;
898 			group.fLargestLength = groupLargestLength;
899 			group.fLargestValid = true;
900 		}
901 
902 		if (bestLength >= maximum)
903 			break;
904 	}
905 
906 	// If we found a suitable range, mark the blocks as in use, and
907 	// write the updated block bitmap back to disk
908 	if (bestLength < minimum)
909 		return B_DEVICE_FULL;
910 
911 	if (bestLength > maximum)
912 		bestLength = maximum;
913 	else if (minimum > 1) {
914 		// make sure bestLength is a multiple of minimum
915 		bestLength = round_down(bestLength, minimum);
916 	}
917 
918 	if (fGroups[bestGroup].Allocate(transaction, bestStart, bestLength) != B_OK)
919 		RETURN_ERROR(B_IO_ERROR);
920 
921 	CHECK_ALLOCATION_GROUP(bestGroup);
922 
923 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(bestGroup);
924 	run.start = HOST_ENDIAN_TO_BFS_INT16(bestStart);
925 	run.length = HOST_ENDIAN_TO_BFS_INT16(bestLength);
926 
927 	fVolume->SuperBlock().used_blocks
928 		= HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + bestLength);
929 		// We are not writing back the disk's superblock - it's
930 		// either done by the journaling code, or when the disk
931 		// is unmounted.
932 		// If the value is not correct at mount time, it will be
933 		// fixed anyway.
934 
935 	// We need to flush any remaining blocks in the new allocation to make sure
936 	// they won't interfere with the file cache.
937 	block_cache_discard(fVolume->BlockCache(), fVolume->ToBlock(run),
938 		run.Length());
939 
940 	T(Allocate(run));
941 	return B_OK;
942 }
943 
944 
945 status_t
946 BlockAllocator::AllocateForInode(Transaction& transaction,
947 	const block_run* parent, mode_t type, block_run& run)
948 {
	// Apply some allocation policies here (AllocateBlocks() will break them
	// if necessary) - we start with those described in Dominic Giampaolo's
	// "Practical File System Design", and see how well they work

	// Files go into the same allocation group as their parent,
	// sub-directories are placed 8 allocation groups after
	// their parent's group
956 	uint16 group = parent->AllocationGroup();
957 	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
958 		group += 8;
959 
960 	return AllocateBlocks(transaction, group, 0, 1, 1, run);
961 }
962 
963 
964 status_t
965 BlockAllocator::Allocate(Transaction& transaction, Inode* inode,
966 	off_t numBlocks, block_run& run, uint16 minimum)
967 {
968 	if (numBlocks <= 0)
969 		return B_ERROR;
970 
971 	// one block_run can't hold more data than there is in one allocation group
972 	if (numBlocks > fGroups[0].NumBits())
973 		numBlocks = fGroups[0].NumBits();
974 
	// since block_run.length is uint16, the largest number of blocks that
	// can be covered by a block_run is 65535
	// TODO: if we drop compatibility, couldn't we do this any better?
	// There are basically three possibilities:
	// a) since a length of zero doesn't make any sense, take that to mean
	//    65536 - but that could cause many problems (bugs) in other areas
	// b) reduce the maximum number of blocks per block_run, so that the
	//    remaining number of free blocks can be used in a useful manner
	//    (like 4 blocks) - but that would also reduce the maximum file size
	// c) have BlockRun::Length() return (length + 1)
985 	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
986 		numBlocks = MAX_BLOCK_RUN_LENGTH;
987 
988 	// Apply some allocation policies here (AllocateBlocks() will break them
989 	// if necessary)
990 	uint16 group = inode->BlockRun().AllocationGroup();
991 	uint16 start = 0;
992 
993 	// Are there already allocated blocks? (then just try to allocate near the
994 	// last one)
995 	if (inode->Size() > 0) {
996 		const data_stream& data = inode->Node().data;
		// TODO: we currently don't handle the case where the data stream
		// has already grown into the indirect ranges
999 		if (data.max_double_indirect_range == 0
1000 			&& data.max_indirect_range == 0) {
1001 			// Since size > 0, there must be a valid block run in this stream
1002 			int32 last = 0;
1003 			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
1004 				if (data.direct[last + 1].IsZero())
1005 					break;
1006 
1007 			group = data.direct[last].AllocationGroup();
1008 			start = data.direct[last].Start() + data.direct[last].Length();
1009 		}
1010 	} else if (inode->IsContainer() || inode->IsSymLink()) {
1011 		// directory and symbolic link data will go in the same allocation
1012 		// group as the inode is in but after the inode data
1013 		start = inode->BlockRun().Start();
1014 	} else {
1015 		// file data will start in the next allocation group
1016 		group = inode->BlockRun().AllocationGroup() + 1;
1017 	}
1018 
1019 	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
1020 }
1021 
1022 
1023 status_t
1024 BlockAllocator::Free(Transaction& transaction, block_run run)
1025 {
1026 	RecursiveLocker lock(fLock);
1027 
1028 	int32 group = run.AllocationGroup();
1029 	uint16 start = run.Start();
1030 	uint16 length = run.Length();
1031 
1032 	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start,
1033 		length));
1034 	T(Free(run));
1035 
	// We don't use Volume::IsValidBlockRun() here, because this check can
	// take the actual group size into account (the last group may have a
	// different length)
1038 	if (group < 0 || group >= fNumGroups
1039 		|| start > fGroups[group].NumBits()
1040 		|| uint32(start + length) > fGroups[group].NumBits()
1041 		|| length == 0) {
1042 		FATAL(("tried to free an invalid block_run (%d, %u, %u)\n", (int)group,
1043 			start, length));
1044 		DEBUGGER(("tried to free invalid block_run"));
1045 		return B_BAD_VALUE;
1046 	}
1047 	// check if someone tries to free reserved areas at the beginning of the
1048 	// drive
1049 	if (group == 0
1050 		&& start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
1051 		FATAL(("tried to free a reserved block_run (%d, %u, %u)\n", (int)group,
1052 			start, length));
1053 		DEBUGGER(("tried to free reserved block"));
1054 		return B_BAD_VALUE;
1055 	}
1056 #ifdef DEBUG
1057 	if (CheckBlockRun(run) != B_OK)
1058 		return B_BAD_DATA;
1059 #endif
1060 
1061 	CHECK_ALLOCATION_GROUP(group);
1062 
1063 	if (fGroups[group].Free(transaction, start, length) != B_OK)
1064 		RETURN_ERROR(B_IO_ERROR);
1065 
1066 	CHECK_ALLOCATION_GROUP(group);
1067 
1068 #ifdef DEBUG
1069 	if (CheckBlockRun(run, NULL, false) != B_OK) {
1070 		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just "
1071 			"freed)\n"));
1072 	}
1073 #endif
1074 
1075 	fVolume->SuperBlock().used_blocks =
1076 		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
1077 	return B_OK;
1078 }
1079 
1080 
1081 size_t
1082 BlockAllocator::BitmapSize() const
1083 {
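	// fNumBlocks is the number of bitmap blocks, so this is the total
	// on-disk size of the block bitmap in bytes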
1084 	return fVolume->BlockSize() * fNumBlocks;
1085 }
1086 
1087 
1088 #ifdef DEBUG_FRAGMENTER
1089 void
1090 BlockAllocator::Fragment()
1091 {
1092 	AllocationBlock cached(fVolume);
1093 	RecursiveLocker lock(fLock);
1094 
1095 	// only leave 4 block holes
1096 	static const uint32 kMask = 0x0f0f0f0f;
1097 	uint32 valuesPerBlock = fVolume->BlockSize() / 4;
1098 
1099 	for (int32 i = 0; i < fNumGroups; i++) {
1100 		AllocationGroup& group = fGroups[i];
1101 
1102 		for (uint32 block = 0; block < group.NumBlocks(); block++) {
1103 			Transaction transaction(fVolume, 0);
1104 
1105 			if (cached.SetToWritable(transaction, group, block) != B_OK)
1106 				return;
1107 
1108 			for (int32 index = 0; index < valuesPerBlock; index++) {
1109 				cached.Block(index) |= HOST_ENDIAN_TO_BFS_INT32(kMask);
1110 			}
1111 
1112 			transaction.Done();
1113 		}
1114 	}
1115 }
1116 #endif	// DEBUG_FRAGMENTER
1117 
1118 
1119 #ifdef DEBUG_ALLOCATION_GROUPS
1120 void
1121 BlockAllocator::_CheckGroup(int32 groupIndex) const
1122 {
1123 	AllocationBlock cached(fVolume);
1124 	ASSERT_LOCKED_RECURSIVE(&fLock);
1125 
1126 	AllocationGroup& group = fGroups[groupIndex];
1127 
1128 	int32 currentStart = 0, currentLength = 0;
1129 	int32 firstFree = -1;
1130 	int32 largestStart = -1;
1131 	int32 largestLength = 0;
1132 	int32 currentBit = 0;
1133 
1134 	for (uint32 block = 0; block < group.NumBlocks(); block++) {
1135 		if (cached.SetTo(group, block) < B_OK) {
1136 			panic("setting group block %d failed\n", (int)block);
1137 			return;
1138 		}
1139 
1140 		for (uint32 bit = 0; bit < cached.NumBlockBits(); bit++) {
1141 			if (!cached.IsUsed(bit)) {
1142 				if (firstFree < 0) {
1143 					firstFree = currentBit;
1144 					if (!group.fLargestValid) {
1145 						if (firstFree >= 0 && firstFree < group.fFirstFree) {
1146 							// mostly harmless but noteworthy
1147 							dprintf("group %d first free too late: should be "
1148 								"%d, is %d\n", (int)groupIndex, (int)firstFree,
1149 								(int)group.fFirstFree);
1150 						}
1151 						return;
1152 					}
1153 				}
1154 
1155 				if (currentLength == 0) {
1156 					// start new range
1157 					currentStart = currentBit;
1158 				}
1159 				currentLength++;
1160 			} else if (currentLength) {
1161 				// end of a range
1162 				if (currentLength > largestLength) {
1163 					largestStart = currentStart;
1164 					largestLength = currentLength;
1165 				}
1166 				currentLength = 0;
1167 			}
1168 			currentBit++;
1169 		}
1170 	}
1171 
1172 	if (currentLength > largestLength) {
1173 		largestStart = currentStart;
1174 		largestLength = currentLength;
1175 	}
1176 
1177 	if (firstFree >= 0 && firstFree < group.fFirstFree) {
1178 		// mostly harmless but noteworthy
1179 		dprintf("group %d first free too late: should be %d, is %d\n",
1180 			(int)groupIndex, (int)firstFree, (int)group.fFirstFree);
1181 	}
1182 	if (group.fLargestValid
1183 		&& (largestStart != group.fLargestStart
1184 			|| largestLength != group.fLargestLength)) {
1185 		panic("bfs %p: group %d largest differs: %d.%d, checked %d.%d.\n",
1186 			fVolume, (int)groupIndex, (int)group.fLargestStart,
1187 			(int)group.fLargestLength, (int)largestStart, (int)largestLength);
1188 	}
1189 }
1190 #endif	// DEBUG_ALLOCATION_GROUPS
1191 
1192 
1193 status_t
1194 BlockAllocator::Trim(uint64 offset, uint64 size, uint64& trimmedSize)
1195 {
1196 	const uint32 kTrimRanges = 128;
1197 	fs_trim_data* trimData = (fs_trim_data*)malloc(sizeof(fs_trim_data)
1198 		+ sizeof(uint64) * kTrimRanges);
1199 	if (trimData == NULL)
1200 		return B_NO_MEMORY;
1201 
1202 	MemoryDeleter deleter(trimData);
1203 	RecursiveLocker locker(fLock);
1204 
1205 	// TODO: take given offset and size into account!
1206 	int32 lastGroup = fNumGroups - 1;
1207 	uint32 firstBlock = 0;
1208 	uint32 firstBit = 0;
1209 	uint64 currentBlock = 0;
1210 	uint32 blockShift = fVolume->BlockShift();
1211 
1212 	uint64 firstFree = 0;
1213 	size_t freeLength = 0;
1214 
1215 	trimData->range_count = 0;
1216 	trimmedSize = 0;
1217 
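	// Walk the whole bitmap, coalescing runs of consecutive free bits and
	// handing them to _TrimNext(), which collects up to kTrimRanges ranges
	// per trim request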
1218 	AllocationBlock cached(fVolume);
1219 	for (int32 groupIndex = 0; groupIndex <= lastGroup; groupIndex++) {
1220 		AllocationGroup& group = fGroups[groupIndex];
1221 
1222 		for (uint32 block = firstBlock; block < group.NumBlocks(); block++) {
1223 			cached.SetTo(group, block);
1224 
1225 			for (uint32 i = firstBit; i < cached.NumBlockBits(); i++) {
1226 				if (cached.IsUsed(i)) {
1227 					// Block is in use
1228 					if (freeLength > 0) {
1229 						status_t status = _TrimNext(*trimData, kTrimRanges,
1230 							firstFree << blockShift, freeLength << blockShift,
1231 							false, trimmedSize);
1232 						if (status != B_OK)
1233 							return status;
1234 
1235 						freeLength = 0;
1236 					}
1237 				} else if (freeLength++ == 0) {
1238 					// Block is free, start new free range
1239 					firstFree = currentBlock;
1240 				}
1241 
1242 				currentBlock++;
1243 			}
1244 		}
1245 
1246 		firstBlock = 0;
1247 		firstBit = 0;
1248 	}
1249 
1250 	return _TrimNext(*trimData, kTrimRanges, firstFree << blockShift,
1251 		freeLength << blockShift, true, trimmedSize);
1252 }
1253 
1254 
1255 //	#pragma mark - Bitmap validity checking
1256 
1257 // TODO: implement new FS checking API
// Functions to check the validity of the bitmap - they are used by
// the "checkfs" command (and since they do even a bit more than that,
// maybe we should move them some place else?)
1261 
1262 
1263 bool
1264 BlockAllocator::_IsValidCheckControl(const check_control* control)
1265 {
1266 	if (control == NULL
1267 		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
1268 		FATAL(("invalid check_control (%p)!\n", control));
1269 		return false;
1270 	}
1271 
1272 	return true;
1273 }
1274 
1275 
1276 status_t
1277 BlockAllocator::StartChecking(const check_control* control)
1278 {
1279 	if (!_IsValidCheckControl(control))
1280 		return B_BAD_VALUE;
1281 
1282 	fVolume->GetJournal(0)->Lock(NULL, true);
1283 		// Lock the volume's journal
1284 
1285 	recursive_lock_lock(&fLock);
1286 
1287 	size_t size = BitmapSize();
1288 	fCheckBitmap = (uint32*)malloc(size);
1289 	if (fCheckBitmap == NULL) {
1290 		recursive_lock_unlock(&fLock);
1291 		fVolume->GetJournal(0)->Unlock(NULL, true);
1292 		return B_NO_MEMORY;
1293 	}
1294 
1295 	fCheckCookie = new(std::nothrow) check_cookie();
1296 	if (fCheckCookie == NULL) {
1297 		free(fCheckBitmap);
1298 		fCheckBitmap = NULL;
1299 		recursive_lock_unlock(&fLock);
1300 		fVolume->GetJournal(0)->Unlock(NULL, true);
1301 
1302 		return B_NO_MEMORY;
1303 	}
1304 
1305 	memcpy(&fCheckCookie->control, control, sizeof(check_control));
1306 	memset(&fCheckCookie->control.stats, 0, sizeof(control->stats));
1307 
1308 	// initialize bitmap
1309 	memset(fCheckBitmap, 0, size);
1310 	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length();
1311 			block-- > 0;) {
1312 		_SetCheckBitmapAt(block);
1313 	}
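	// (everything up to the end of the log - superblock, block bitmap, and
	// log area - is unconditionally marked as used here)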
1314 
1315 	fCheckCookie->pass = BFS_CHECK_PASS_BITMAP;
1316 	fCheckCookie->stack.Push(fVolume->Root());
1317 	fCheckCookie->stack.Push(fVolume->Indices());
1318 	fCheckCookie->iterator = NULL;
1319 	fCheckCookie->control.stats.block_size = fVolume->BlockSize();
1320 
	// Put removed vnodes on the stack -- they are not reachable by traversing
1322 	// the file system anymore.
1323 	InodeList::Iterator iterator = fVolume->RemovedInodes().GetIterator();
1324 	while (Inode* inode = iterator.Next()) {
1325 		fCheckCookie->stack.Push(inode->BlockRun());
1326 	}
1327 
1328 	// TODO: check reserved area in bitmap!
1329 
1330 	return B_OK;
1331 }
1332 
1333 
1334 status_t
1335 BlockAllocator::StopChecking(check_control* control)
1336 {
1337 	if (fCheckCookie == NULL)
1338 		return B_NO_INIT;
1339 
1340 	if (fCheckCookie->iterator != NULL) {
1341 		delete fCheckCookie->iterator;
1342 		fCheckCookie->iterator = NULL;
1343 
1344 		// the current directory inode is still locked in memory
1345 		put_vnode(fVolume->FSVolume(), fVolume->ToVnode(fCheckCookie->current));
1346 	}
1347 
1348 	if (fVolume->IsReadOnly()) {
1349 		// We can't fix errors on this volume
1350 		fCheckCookie->control.flags = 0;
1351 	}
1352 
1353 	if (fCheckCookie->control.status != B_ENTRY_NOT_FOUND)
1354 		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));
1355 
1356 	switch (fCheckCookie->pass) {
1357 		case BFS_CHECK_PASS_BITMAP:
			// if CheckNextNode() was able to run through completely, we
			// can fix any damage to the bitmap
1360 			if (fCheckCookie->control.status == B_ENTRY_NOT_FOUND)
1361 				_WriteBackCheckBitmap();
1362 			break;
1363 
1364 		case BFS_CHECK_PASS_INDEX:
1365 			_FreeIndices();
1366 			break;
1367 	}
1368 
1369 	fVolume->SetCheckingThread(-1);
1370 
1371 	if (control != NULL)
1372 		user_memcpy(control, &fCheckCookie->control, sizeof(check_control));
1373 
1374 	free(fCheckBitmap);
1375 	fCheckBitmap = NULL;
1376 	delete fCheckCookie;
1377 	fCheckCookie = NULL;
1378 	recursive_lock_unlock(&fLock);
1379 	fVolume->GetJournal(0)->Unlock(NULL, true);
1380 
1381 	return B_OK;
1382 }
1383 
1384 
1385 status_t
1386 BlockAllocator::CheckNextNode(check_control* control)
1387 {
1388 	if (fCheckCookie == NULL)
1389 		return B_NO_INIT;
1390 
1391 	fVolume->SetCheckingThread(find_thread(NULL));
1392 
1393 	// Make sure the user control is copied on exit
1394 	class CopyControlOnExit {
1395 	public:
1396 		CopyControlOnExit(check_control* source, check_control* userTarget)
1397 			:
1398 			fSource(source),
1399 			fTarget(userTarget)
1400 		{
1401 		}
1402 
1403 		~CopyControlOnExit()
1404 		{
1405 			if (fTarget != NULL)
1406 				user_memcpy(fTarget, fSource, sizeof(check_control));
1407 		}
1408 
1409 	private:
1410 		check_control*	fSource;
1411 		check_control*	fTarget;
1412 	} copyControl(&fCheckCookie->control, control);
1413 
1414 	while (true) {
1415 		if (fCheckCookie->iterator == NULL) {
1416 			if (!fCheckCookie->stack.Pop(&fCheckCookie->current)) {
1417 				// No more runs on the stack, we might be finished!
1418 				if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1419 					&& !fCheckCookie->indices.IsEmpty()) {
1420 					// Start second pass to repair indices
1421 					_WriteBackCheckBitmap();
1422 
1423 					fCheckCookie->pass = BFS_CHECK_PASS_INDEX;
1424 					fCheckCookie->control.pass = BFS_CHECK_PASS_INDEX;
1425 
1426 					status_t status = _PrepareIndices();
1427 					if (status != B_OK) {
1428 						fCheckCookie->control.status = status;
1429 						return status;
1430 					}
1431 
1432 					fCheckCookie->stack.Push(fVolume->Root());
1433 					continue;
1434 				}
1435 
1436 				fCheckCookie->control.status = B_ENTRY_NOT_FOUND;
1437 				return B_ENTRY_NOT_FOUND;
1438 			}
1439 
1440 			Vnode vnode(fVolume, fCheckCookie->current);
1441 			Inode* inode;
1442 			if (vnode.Get(&inode) != B_OK) {
1443 				FATAL(("check: Could not open inode at %" B_PRIdOFF "\n",
1444 					fVolume->ToBlock(fCheckCookie->current)));
1445 				continue;
1446 			}
1447 
1448 			fCheckCookie->control.inode = inode->ID();
1449 			fCheckCookie->control.mode = inode->Mode();
1450 
1451 			if (!inode->IsContainer()) {
1452 				// Check file
1453 				fCheckCookie->control.errors = 0;
1454 				fCheckCookie->control.status = CheckInode(inode, NULL);
1455 
1456 				if (inode->GetName(fCheckCookie->control.name) < B_OK)
1457 					strcpy(fCheckCookie->control.name, "(node has no name)");
1458 
1459 				return B_OK;
1460 			}
1461 
1462 			// Check directory
1463 			BPlusTree* tree = inode->Tree();
1464 			if (tree == NULL) {
1465 				FATAL(("check: could not open b+tree from inode at %" B_PRIdOFF
1466 					"\n", fVolume->ToBlock(fCheckCookie->current)));
1467 				continue;
1468 			}
1469 
1470 			fCheckCookie->parent = inode;
1471 			fCheckCookie->parent_mode = inode->Mode();
1472 
1473 			// get iterator for the next directory
1474 			fCheckCookie->iterator = new(std::nothrow) TreeIterator(tree);
1475 			if (fCheckCookie->iterator == NULL)
1476 				RETURN_ERROR(B_NO_MEMORY);
1477 
1478 			// the inode must stay locked in memory until the iterator is freed
1479 			vnode.Keep();
1480 
1481 			// check the inode of the directory
1482 			fCheckCookie->control.errors = 0;
1483 			fCheckCookie->control.status = CheckInode(inode, NULL);
1484 
1485 			if (inode->GetName(fCheckCookie->control.name) != B_OK)
1486 				strcpy(fCheckCookie->control.name, "(dir has no name)");
1487 
1488 			return B_OK;
1489 		}
1490 
1491 		char name[B_FILE_NAME_LENGTH];
1492 		uint16 length;
1493 		ino_t id;
1494 
1495 		status_t status = fCheckCookie->iterator->GetNextEntry(name, &length,
1496 			B_FILE_NAME_LENGTH, &id);
1497 		if (status != B_OK) {
1498 			// we no longer need this iterator
1499 			delete fCheckCookie->iterator;
1500 			fCheckCookie->iterator = NULL;
1501 
1502 			// unlock the directory's inode from memory
1503 			put_vnode(fVolume->FSVolume(),
1504 				fVolume->ToVnode(fCheckCookie->current));
1505 
1506 			if (status == B_ENTRY_NOT_FOUND) {
1507 				// We iterated over all entries already, just go on to the next
1508 				continue;
1509 			}
1510 
1511 			// Iterating over the B+tree failed - we let the checkfs run
1512 			// fail completely, as we would delete all files we cannot
1513 			// access.
1514 			// TODO: maybe have a force parameter that actually does that.
1515 			// TODO: we also need to be able to repair broken B+trees!
1516 			return status;
1517 		}
1518 
1519 		// ignore "." and ".." entries
1520 		if (!strcmp(name, ".") || !strcmp(name, ".."))
1521 			continue;
1522 
1523 		// fill in the control data as soon as we have them
1524 		strlcpy(fCheckCookie->control.name, name, B_FILE_NAME_LENGTH);
1525 		fCheckCookie->control.inode = id;
1526 		fCheckCookie->control.errors = 0;
1527 
1528 		Vnode vnode(fVolume, id);
1529 		Inode* inode;
1530 		if (vnode.Get(&inode) != B_OK) {
1531 			FATAL(("Could not open inode ID %" B_PRIdINO "!\n", id));
1532 			fCheckCookie->control.errors |= BFS_COULD_NOT_OPEN;
1533 
1534 			if ((fCheckCookie->control.flags & BFS_REMOVE_INVALID) != 0) {
1535 				status = _RemoveInvalidNode(fCheckCookie->parent,
1536 					fCheckCookie->iterator->Tree(), NULL, name);
1537 			} else
1538 				status = B_ERROR;
1539 
1540 			fCheckCookie->control.status = status;
1541 			return B_OK;
1542 		}
1543 
1544 		// check if the inode's name is the same as in the b+tree
1545 		if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1546 			&& inode->IsRegularNode()) {
1547 			RecursiveLocker locker(inode->SmallDataLock());
1548 			NodeGetter node(fVolume, inode);
1549 			if (node.Node() == NULL) {
1550 				fCheckCookie->control.errors |= BFS_COULD_NOT_OPEN;
1551 				fCheckCookie->control.status = B_IO_ERROR;
1552 				return B_OK;
1553 			}
1554 
1555 			const char* localName = inode->Name(node.Node());
1556 			if (localName == NULL || strcmp(localName, name)) {
1557 				fCheckCookie->control.errors |= BFS_NAMES_DONT_MATCH;
1558 				FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name,
1559 					localName));
1560 
1561 				if ((fCheckCookie->control.flags & BFS_FIX_NAME_MISMATCHES)
1562 						!= 0) {
1563 					// Rename the inode
1564 					Transaction transaction(fVolume, inode->BlockNumber());
1565 
1566 					// Note, this may need extra blocks, but the inode will
1567 					// only be checked afterwards, so that it won't be lost
1568 					status = inode->SetName(transaction, name);
1569 					if (status == B_OK)
1570 						status = inode->WriteBack(transaction);
1571 					if (status == B_OK)
1572 						status = transaction.Done();
1573 					if (status != B_OK) {
1574 						fCheckCookie->control.status = status;
1575 						return B_OK;
1576 					}
1577 				}
1578 			}
1579 		}
1580 
1581 		fCheckCookie->control.mode = inode->Mode();
1582 
1583 		// Check for the correct mode of the node (if the mode of the
		// file doesn't fit its parent, there is a serious problem)
1585 		if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1586 			&& (((fCheckCookie->parent_mode & S_ATTR_DIR) != 0
1587 					&& !inode->IsAttribute())
1588 				|| ((fCheckCookie->parent_mode & S_INDEX_DIR) != 0
1589 					&& !inode->IsIndex())
1590 				|| (is_directory(fCheckCookie->parent_mode)
1591 					&& !inode->IsRegularNode()))) {
1592 			FATAL(("inode at %" B_PRIdOFF " is of wrong type: %o (parent "
1593 				"%o at %" B_PRIdOFF ")!\n", inode->BlockNumber(),
1594 				inode->Mode(), fCheckCookie->parent_mode,
1595 				fCheckCookie->parent->BlockNumber()));
1596 
1597 			// if we are allowed to fix errors, we should remove the file
1598 			if ((fCheckCookie->control.flags & BFS_REMOVE_WRONG_TYPES) != 0
1599 				&& (fCheckCookie->control.flags & BFS_FIX_BITMAP_ERRORS)
1600 						!= 0) {
1601 				status = _RemoveInvalidNode(fCheckCookie->parent, NULL,
1602 					inode, name);
1603 			} else
1604 				status = B_ERROR;
1605 
1606 			fCheckCookie->control.errors |= BFS_WRONG_TYPE;
1607 			fCheckCookie->control.status = status;
1608 			return B_OK;
1609 		}
1610 
1611 		// push the directory on the stack so that it will be scanned later
1612 		if (inode->IsContainer() && !inode->IsIndex())
1613 			fCheckCookie->stack.Push(inode->BlockRun());
1614 		else {
1615 			// check it now
1616 			fCheckCookie->control.status = CheckInode(inode, name);
1617 			return B_OK;
1618 		}
1619 	}
1620 	// is never reached
1621 }
1622 
1623 
1624 status_t
1625 BlockAllocator::_RemoveInvalidNode(Inode* parent, BPlusTree* tree, Inode* inode,
1626 	const char* name)
1627 {
1628 	// It's safe to start a transaction, because Inode::Remove()
1629 	// won't touch the block bitmap (which we hold the lock for)
1630 	// if we set the INODE_DONT_FREE_SPACE flag - since we fix
1631 	// the bitmap anyway.
1632 	Transaction transaction(fVolume, parent->BlockNumber());
1633 	status_t status;
1634 
1635 	if (inode != NULL) {
1636 		inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DONT_FREE_SPACE);
1637 
1638 		status = parent->Remove(transaction, name, NULL, false, true);
1639 	} else {
1640 		parent->WriteLockInTransaction(transaction);
1641 
1642 		// does the file even exist?
1643 		off_t id;
1644 		status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1645 		if (status == B_OK)
1646 			status = tree->Remove(transaction, name, id);
1647 	}
1648 
1649 	if (status == B_OK) {
1650 		entry_cache_remove(fVolume->ID(), parent->ID(), name);
1651 		transaction.Done();
1652 	}
1653 
1654 	return status;
1655 }
1656 
1657 
1658 bool
1659 BlockAllocator::_CheckBitmapIsUsedAt(off_t block) const
1660 {
1661 	size_t size = BitmapSize();
1662 	uint32 index = block / 32;	// 32bit resolution
	if (index >= size / 4)
1664 		return false;
1665 
1666 	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index])
1667 		& (1UL << (block & 0x1f));
1668 }
1669 
1670 
1671 void
1672 BlockAllocator::_SetCheckBitmapAt(off_t block)
1673 {
1674 	size_t size = BitmapSize();
1675 	uint32 index = block / 32;	// 32bit resolution
	if (index >= size / 4)
1677 		return;
1678 
1679 	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
1680 }
1681 
1682 
1683 status_t
1684 BlockAllocator::_WriteBackCheckBitmap()
1685 {
1686 	if (fVolume->IsReadOnly())
1687 		return B_OK;
1688 
1689 	// calculate the number of used blocks in the check bitmap
1690 	size_t size = BitmapSize();
1691 	off_t usedBlocks = 0LL;
1692 
1693 	// TODO: update the allocation groups used blocks info
1694 	for (uint32 i = size >> 2; i-- > 0;) {
1695 		uint32 compare = 1;
1696 		// Count the number of bits set
1697 		for (int16 j = 0; j < 32; j++, compare <<= 1) {
1698 			if ((compare & fCheckBitmap[i]) != 0)
1699 				usedBlocks++;
1700 		}
1701 	}
1702 
1703 	fCheckCookie->control.stats.freed = fVolume->UsedBlocks() - usedBlocks
1704 		+ fCheckCookie->control.stats.missing;
1705 	if (fCheckCookie->control.stats.freed < 0)
1706 		fCheckCookie->control.stats.freed = 0;
1707 
1708 	// Should we fix errors? Were there any errors we can fix?
1709 	if ((fCheckCookie->control.flags & BFS_FIX_BITMAP_ERRORS) != 0
1710 		&& (fCheckCookie->control.stats.freed != 0
1711 			|| fCheckCookie->control.stats.missing != 0)) {
1712 		// If so, write the check bitmap back over the original one,
1713 		// and use transactions here to play safe - we even use several
1714 		// transactions, so that we don't blow the maximum log size
1715 		// on large disks, since we don't need to make this atomic.
1716 #if 0
1717 		// prints the blocks that differ
1718 		off_t block = 0;
1719 		for (int32 i = 0; i < fNumGroups; i++) {
1720 			AllocationBlock cached(fVolume);
1721 			for (uint32 j = 0; j < fGroups[i].NumBlocks(); j++) {
1722 				cached.SetTo(fGroups[i], j);
1723 				for (uint32 k = 0; k < cached.NumBlockBits(); k++) {
1724 					if (cached.IsUsed(k) != _CheckBitmapIsUsedAt(block)) {
1725 						dprintf("differ block %lld (should be %d)\n", block,
1726 							_CheckBitmapIsUsedAt(block));
1727 					}
1728 					block++;
1729 				}
1730 			}
1731 		}
1732 #endif
1733 
1734 		fVolume->SuperBlock().used_blocks
1735 			= HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
1736 
1737 		size_t blockSize = fVolume->BlockSize();
1738 
1739 		for (uint32 i = 0; i < fNumBlocks; i += 512) {
1740 			Transaction transaction(fVolume, 1 + i);
1741 
1742 			uint32 blocksToWrite = 512;
1743 			if (blocksToWrite + i > fNumBlocks)
1744 				blocksToWrite = fNumBlocks - i;
1745 
1746 			status_t status = transaction.WriteBlocks(1 + i,
1747 				(uint8*)fCheckBitmap + i * blockSize, blocksToWrite);
1748 			if (status < B_OK) {
1749 				FATAL(("error writing bitmap: %s\n", strerror(status)));
1750 				return status;
1751 			}
1752 			transaction.Done();
1753 		}
1754 	}
1755 
1756 	return B_OK;
1757 }
1758 
1759 
/*!	Checks that the specified block range has the expected allocation state,
	as given by the \a allocated argument.
1762 */
1763 status_t
1764 BlockAllocator::CheckBlocks(off_t start, off_t length, bool allocated)
1765 {
1766 	if (start < 0 || start + length > fVolume->NumBlocks())
1767 		return B_BAD_VALUE;
1768 
1769 	int32 group = start >> fVolume->AllocationGroupShift();
1770 	uint32 bitmapBlock = start / (fVolume->BlockSize() << 3);
1771 	uint32 blockOffset = start % (fVolume->BlockSize() << 3);
1772 
1773 	uint32 groupBlock = bitmapBlock % fBlocksPerGroup;
1774 
1775 	AllocationBlock cached(fVolume);
1776 
1777 	while (groupBlock < fGroups[group].NumBlocks() && length > 0) {
1778 		if (cached.SetTo(fGroups[group], groupBlock) != B_OK)
1779 			RETURN_ERROR(B_IO_ERROR);
1780 
1781 		for (; blockOffset < cached.NumBlockBits() && length > 0;
1782 				blockOffset++, length--) {
1783 			if (cached.IsUsed(blockOffset) != allocated) {
1784 				RETURN_ERROR(B_BAD_DATA);
1785 			}
1786 		}
1787 
1788 		blockOffset = 0;
1789 
1790 		if (++groupBlock >= fGroups[group].NumBlocks()) {
1791 			groupBlock = 0;
1792 			group++;
1793 		}
1794 	}
1795 
1796 	return B_OK;
1797 }
1798 
1799 
1800 status_t
1801 BlockAllocator::CheckBlockRun(block_run run, const char* type, bool allocated)
1802 {
1803 	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
1804 		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
1805 		|| uint32(run.Start() + run.Length())
1806 				> fGroups[run.AllocationGroup()].fNumBits
1807 		|| run.length == 0) {
1808 		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type,
1809 			run.AllocationGroup(), run.Start(), run.Length()));
1810 		if (fCheckCookie == NULL)
1811 			return B_BAD_DATA;
1812 
1813 		fCheckCookie->control.errors |= BFS_INVALID_BLOCK_RUN;
1814 		return B_OK;
1815 	}
1816 
1817 	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
1818 	uint32 block = run.Start() / bitsPerBlock;
1819 	uint32 pos = run.Start() % bitsPerBlock;
1820 	int32 length = 0;
1821 	off_t firstMissing = -1, firstSet = -1;
1822 	off_t firstGroupBlock
1823 		= (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();
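	// the absolute number of the first disk block that is mapped by this
	// allocation group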
1824 
1825 	AllocationBlock cached(fVolume);
1826 
1827 	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
1828 		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
1829 			RETURN_ERROR(B_IO_ERROR);
1830 
1831 		if (pos >= cached.NumBlockBits()) {
1832 			// something very strange has happened...
1833 			RETURN_ERROR(B_ERROR);
1834 		}
1835 
1836 		while (length < run.Length() && pos < cached.NumBlockBits()) {
1837 			if (cached.IsUsed(pos) != allocated) {
1838 				if (fCheckCookie == NULL) {
1839 					PRINT(("%s: block_run(%ld, %u, %u) is only partially "
1840 						"allocated (pos = %ld, length = %ld)!\n", type,
1841 						run.AllocationGroup(), run.Start(), run.Length(),
1842 						pos, length));
1843 					return B_BAD_DATA;
1844 				}
1845 				if (firstMissing == -1) {
1846 					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
1847 					fCheckCookie->control.errors |= BFS_MISSING_BLOCKS;
1848 				}
1849 				fCheckCookie->control.stats.missing++;
1850 			} else if (firstMissing != -1) {
1851 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are "
1852 					"%sallocated!\n", type, run.AllocationGroup(), run.Start(),
1853 					run.Length(), firstMissing,
1854 					firstGroupBlock + pos + block * bitsPerBlock - 1,
1855 					allocated ? "not " : ""));
1856 				firstMissing = -1;
1857 			}
1858 
1859 			if (fCheckBitmap != NULL) {
1860 				// Also set the block in the check bitmap, but first check
1861 				// whether it is already marked as in use there
1862 				uint32 offset = pos + block * bitsPerBlock;
1863 				if (_CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
1864 					if (firstSet == -1) {
1865 						firstSet = firstGroupBlock + offset;
1866 						fCheckCookie->control.errors |= BFS_BLOCKS_ALREADY_SET;
1867 						dprintf("block %" B_PRIdOFF " is already set!!!\n",
1868 							firstGroupBlock + offset);
1869 					}
1870 					fCheckCookie->control.stats.already_set++;
1871 				} else {
1872 					if (firstSet != -1) {
1873 						FATAL(("%s: block_run(%d, %u, %u): blocks %" B_PRIdOFF
1874 							" - %" B_PRIdOFF " are already set!\n", type,
1875 							(int)run.AllocationGroup(), run.Start(),
1876 							run.Length(), firstSet,
1877 							firstGroupBlock + offset - 1));
1878 						firstSet = -1;
1879 					}
1880 					_SetCheckBitmapAt(firstGroupBlock + offset);
1881 				}
1882 			}
1883 			length++;
1884 			pos++;
1885 		}
1886 
1887 		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
1888 			if (firstMissing != -1) {
1889 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are "
1890 					"%sallocated!\n", type, run.AllocationGroup(), run.Start(),
1891 					run.Length(), firstMissing, firstGroupBlock + pos
1892 						+ block * bitsPerBlock - 1, allocated ? "not " : ""));
1893 			}
1894 			if (firstSet != -1) {
1895 				FATAL(("%s: block_run(%d, %u, %u): blocks %" B_PRIdOFF " - %"
1896 					B_PRIdOFF " are already set!\n", type,
1897 					(int)run.AllocationGroup(), run.Start(), run.Length(),
1898 					firstSet,
1899 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1900 			}
1901 		}
1902 	}
1903 
1904 	return B_OK;
1905 }
1906 
1907 
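/*!	In the bitmap pass, checks all blocks referenced by the given inode,
	including the B+tree of containers; corrupt indices are queued to be
	rebuilt. In the index pass, re-adds the inode to the emptied indices.
*/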
1908 status_t
1909 BlockAllocator::CheckInode(Inode* inode, const char* name)
1910 {
1911 	if (fCheckCookie == NULL || fCheckBitmap == NULL)
1912 		return B_NO_INIT;
1913 	if (inode == NULL)
1914 		return B_BAD_VALUE;
1915 
1916 	switch (fCheckCookie->pass) {
1917 		case BFS_CHECK_PASS_BITMAP:
1918 		{
1919 			status_t status = _CheckInodeBlocks(inode, name);
1920 			if (status != B_OK)
1921 				return status;
1922 
1923 			// Check the B+tree as well
1924 			if (inode->IsContainer()) {
1925 				bool repairErrors
1926 					= (fCheckCookie->control.flags & BFS_FIX_BPLUSTREES) != 0;
1927 				bool errorsFound = false;
1928 				status = inode->Tree()->Validate(repairErrors, errorsFound);
1929 				if (errorsFound) {
1930 					fCheckCookie->control.errors |= BFS_INVALID_BPLUSTREE;
1931 					if (inode->IsIndex() && name != NULL && repairErrors) {
1932 						// We completely rebuild corrupt indices
1933 						check_index* index = new(std::nothrow) check_index;
1934 						if (index == NULL)
1935 							return B_NO_MEMORY;
1936 
1937 						strlcpy(index->name, name, sizeof(index->name));
1938 						index->run = inode->BlockRun();
1939 						fCheckCookie->indices.Push(index);
1940 					}
1941 				}
1942 			}
1943 
1944 			return status;
1945 		}
1946 
1947 		case BFS_CHECK_PASS_INDEX:
1948 			return _AddInodeToIndex(inode);
1949 	}
1950 
1951 	return B_OK;
1952 }
1953 
1954 
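/*!	Checks the inode's own block run and every block run of its data stream
	(direct, indirect, and double indirect ranges) against the block bitmap.
	Its attribute directory, if any, is pushed onto the check stack to be
	visited later.
*/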
1955 status_t
1956 BlockAllocator::_CheckInodeBlocks(Inode* inode, const char* name)
1957 {
1958 	status_t status = CheckBlockRun(inode->BlockRun(), "inode");
1959 	if (status != B_OK)
1960 		return status;
1961 
1962 	// If the inode has an attribute directory, push it on the stack
1963 	if (!inode->Attributes().IsZero())
1964 		fCheckCookie->stack.Push(inode->Attributes());
1965 
1966 	if (inode->IsSymLink() && (inode->Flags() & INODE_LONG_SYMLINK) == 0) {
1967 		// short symlinks store their target inline and have no data stream
1968 		if (strlen(inode->Node().short_symlink) >= SHORT_SYMLINK_NAME_LENGTH)
1969 			return B_BAD_DATA;
1970 
1971 		return B_OK;
1972 	}
1973 
1974 	data_stream* data = &inode->Node().data;
1975 
1976 	// check the direct range
1977 
1978 	if (data->max_direct_range) {
1979 		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1980 			if (data->direct[i].IsZero())
1981 				break;
1982 
1983 			status = CheckBlockRun(data->direct[i], "direct");
1984 			if (status < B_OK)
1985 				return status;
1986 
1987 			fCheckCookie->control.stats.direct_block_runs++;
1988 			fCheckCookie->control.stats.blocks_in_direct
1989 				+= data->direct[i].Length();
1990 		}
1991 	}
1992 
1993 	CachedBlock cached(fVolume);
1994 
1995 	// check the indirect range
1996 
1997 	if (data->max_indirect_range) {
1998 		status = CheckBlockRun(data->indirect, "indirect");
1999 		if (status < B_OK)
2000 			return status;
2001 
2002 		off_t block = fVolume->ToBlock(data->indirect);
2003 
2004 		for (int32 i = 0; i < data->indirect.Length(); i++) {
2005 			block_run* runs = (block_run*)cached.SetTo(block + i);
2006 			if (runs == NULL)
2007 				RETURN_ERROR(B_IO_ERROR);
2008 
2009 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
2010 			int32 index = 0;
2011 			for (; index < runsPerBlock; index++) {
2012 				if (runs[index].IsZero())
2013 					break;
2014 
2015 				status = CheckBlockRun(runs[index], "indirect->run");
2016 				if (status < B_OK)
2017 					return status;
2018 
2019 				fCheckCookie->control.stats.indirect_block_runs++;
2020 				fCheckCookie->control.stats.blocks_in_indirect
2021 					+= runs[index].Length();
2022 			}
2023 			fCheckCookie->control.stats.indirect_array_blocks++;
2024 
2025 			if (index < runsPerBlock)
2026 				break;
2027 		}
2028 	}
2029 
2030 	// check the double indirect range
2031 
2032 	if (data->max_double_indirect_range) {
2033 		status = CheckBlockRun(data->double_indirect, "double indirect");
2034 		if (status != B_OK)
2035 			return status;
2036 
2037 		int32 runsPerBlock = runs_per_block(fVolume->BlockSize());
2038 		int32 runsPerArray = runsPerBlock * data->double_indirect.Length();
2039 
2040 		CachedBlock cachedDirect(fVolume);
2041 
2042 		for (int32 indirectIndex = 0; indirectIndex < runsPerArray;
2043 				indirectIndex++) {
2044 			// get the indirect array block
2045 			block_run* array = (block_run*)cached.SetTo(
2046 				fVolume->ToBlock(data->double_indirect)
2047 					+ indirectIndex / runsPerBlock);
2048 			if (array == NULL)
2049 				return B_IO_ERROR;
2050 
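			// Each block of the indirect array holds runsPerBlock entries;
			// the division above selects the array block, the modulo below
			// selects the entry within it.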
2051 			block_run indirect = array[indirectIndex % runsPerBlock];
2052 			// are we finished yet?
2053 			if (indirect.IsZero())
2054 				return B_OK;
2055 
2056 			status = CheckBlockRun(indirect, "double indirect->runs");
2057 			if (status != B_OK)
2058 				return status;
2059 
2060 			int32 maxIndex
2061 				= ((uint32)indirect.Length() << fVolume->BlockShift())
2062 					/ sizeof(block_run);
2063 
2064 			for (int32 index = 0; index < maxIndex; ) {
2065 				block_run* runs = (block_run*)cachedDirect.SetTo(
2066 					fVolume->ToBlock(indirect) + index / runsPerBlock);
2067 				if (runs == NULL)
2068 					return B_IO_ERROR;
2069 
2070 				do {
2071 					// are we finished yet?
2072 					if (runs[index % runsPerBlock].IsZero())
2073 						return B_OK;
2074 
2075 					status = CheckBlockRun(runs[index % runsPerBlock],
2076 						"double indirect->runs->run");
2077 					if (status != B_OK)
2078 						return status;
2079 
2080 					fCheckCookie->control.stats.double_indirect_block_runs++;
2081 					fCheckCookie->control.stats.blocks_in_double_indirect
2082 						+= runs[index % runsPerBlock].Length();
2083 				} while ((++index % runsPerBlock) != 0);
2084 			}
2085 
2086 			fCheckCookie->control.stats.double_indirect_array_blocks++;
2087 		}
2088 	}
2089 
2090 	return B_OK;
2091 }
2092 
2093 
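/*!	Opens all corrupt indices that were found during the bitmap pass and
	empties their B+trees, so that the index pass can repopulate them from
	scratch. Returns B_ENTRY_NOT_FOUND if there are no indices to rebuild.
*/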
2094 status_t
2095 BlockAllocator::_PrepareIndices()
2096 {
2097 	int32 count = 0;
2098 
2099 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2100 		check_index* index = fCheckCookie->indices.Array()[i];
2101 		Vnode vnode(fVolume, index->run);
2102 		Inode* inode;
2103 		status_t status = vnode.Get(&inode);
2104 		if (status != B_OK) {
2105 			FATAL(("check: Could not open index at %" B_PRIdOFF "\n",
2106 				fVolume->ToBlock(index->run)));
2107 			return status;
2108 		}
2109 
2110 		BPlusTree* tree = inode->Tree();
2111 		if (tree == NULL) {
2112 			// TODO: We can't yet repair those
2113 			continue;
2114 		}
2115 
2116 		status = tree->MakeEmpty();
2117 		if (status != B_OK)
2118 			return status;
2119 
2120 		index->inode = inode;
2121 		vnode.Keep();
2122 		count++;
2123 	}
2124 
2125 	return count == 0 ? B_ENTRY_NOT_FOUND : B_OK;
2126 }
2127 
2128 
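/*!	Releases the vnode references acquired in _PrepareIndices(), and empties
	the list of indices to rebuild.
*/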
2129 void
2130 BlockAllocator::_FreeIndices()
2131 {
2132 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2133 		check_index* index = fCheckCookie->indices.Array()[i];
2134 		if (index->inode != NULL) {
2135 			put_vnode(fVolume->FSVolume(),
2136 				fVolume->ToVnode(index->inode->BlockRun()));
2137 		}
2138 	}
2139 	fCheckCookie->indices.MakeEmpty();
2140 }
2141 
2142 
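/*!	Re-inserts the given inode into all indices being rebuilt, deriving the
	key from the inode's name, last modified time, size, or the attribute
	matching the index's name.
*/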
2143 status_t
2144 BlockAllocator::_AddInodeToIndex(Inode* inode)
2145 {
2146 	Transaction transaction(fVolume, inode->BlockNumber());
2147 
2148 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2149 		check_index* index = fCheckCookie->indices.Array()[i];
2150 		if (index->inode == NULL)
2151 			continue;
2152 
2153 		index->inode->WriteLockInTransaction(transaction);
2154 
2155 		BPlusTree* tree = index->inode->Tree();
2156 		if (tree == NULL)
2157 			return B_ERROR;
2158 
2159 		status_t status = B_OK;
2160 
2161 		if (!strcmp(index->name, "name")) {
2162 			if (inode->InNameIndex()) {
2163 				char name[B_FILE_NAME_LENGTH];
2164 				if (inode->GetName(name, B_FILE_NAME_LENGTH) != B_OK)
2165 					return B_ERROR;
2166 
2167 				status = tree->Insert(transaction, name, inode->ID());
2168 			}
2169 		} else if (!strcmp(index->name, "last_modified")) {
2170 			if (inode->InLastModifiedIndex()) {
2171 				status = tree->Insert(transaction, inode->OldLastModified(),
2172 					inode->ID());
2173 			}
2174 		} else if (!strcmp(index->name, "size")) {
2175 			if (inode->InSizeIndex())
2176 				status = tree->Insert(transaction, inode->Size(), inode->ID());
2177 		} else {
2178 			uint8 key[MAX_INDEX_KEY_LENGTH];
2179 			size_t keyLength = MAX_INDEX_KEY_LENGTH;
2180 			if (inode->ReadAttribute(index->name, B_ANY_TYPE, 0, key,
2181 					&keyLength) == B_OK) {
2182 				status = tree->Insert(transaction, key, keyLength, inode->ID());
2183 			}
2184 		}
2185 
2186 		if (status != B_OK)
2187 			return status;
2188 	}
2189 
2190 	return transaction.Done();
2191 }
2192 
2193 
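/*!	Appends the given range to \a trimData if there is still room, and
	returns whether or not the range could be queued.
*/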
2194 bool
2195 BlockAllocator::_AddTrim(fs_trim_data& trimData, uint32 maxRanges,
2196 	uint64 offset, uint64 size)
2197 {
2198 	if (trimData.range_count < maxRanges && size > 0) {
2199 		trimData.ranges[trimData.range_count].offset = offset;
2200 		trimData.ranges[trimData.range_count].size = size;
2201 		trimData.range_count++;
2202 		return true;
2203 	}
2204 
2205 	return false;
2206 }
2207 
2208 
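/*!	Queues the given range for trimming. When the queue is full or \a force
	is set, all queued ranges are passed to the device in a single
	B_TRIM_DEVICE ioctl, and the size actually trimmed is added to
	\a trimmedSize. Callers are expected to issue a final call with \a force
	set, as queued ranges are otherwise not flushed.
*/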
2209 status_t
2210 BlockAllocator::_TrimNext(fs_trim_data& trimData, uint32 maxRanges,
2211 	uint64 offset, uint64 size, bool force, uint64& trimmedSize)
2212 {
2213 	PRINT(("_TrimNext(index %" B_PRIu32 ", offset %" B_PRIu64 ", size %"
2214 		B_PRIu64 ")\n", trimData.range_count, offset, size));
2215 
2216 	bool pushed = _AddTrim(trimData, maxRanges, offset, size);
2217 
2218 	if (!pushed || force) {
2219 		// Trim now
2220 		trimData.trimmed_size = 0;
2221 dprintf("TRIM FS:\n");
2222 for (uint32 i = 0; i < trimData.range_count; i++) {
2223 	dprintf("[%3" B_PRIu32 "] %" B_PRIu64 " : %" B_PRIu64 "\n", i,
2224 		trimData.ranges[i].offset, trimData.ranges[i].size);
2225 }
2226 		if (ioctl(fVolume->Device(), B_TRIM_DEVICE, &trimData,
2227 				sizeof(fs_trim_data)) != 0) {
2228 			return errno;
2229 		}
2230 
2231 		trimmedSize += trimData.trimmed_size;
2232 		trimData.range_count = 0;
2233 	}
2234 
2235 	if (!pushed)
2236 		_AddTrim(trimData, maxRanges, offset, size);
2237 
2238 	return B_OK;
2239 }
2240 
2241 
2242 //	#pragma mark - debugger commands
2243 
2244 
2245 #ifdef BFS_DEBUGGER_COMMANDS
2246 
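//!	Dumps the state of all allocation groups to the kernel debugger, or only
//!	the group specified by \a index if it is not -1.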
2247 void
2248 BlockAllocator::Dump(int32 index)
2249 {
2250 	kprintf("allocation groups: %" B_PRId32 " (base %p)\n", fNumGroups, fGroups);
2251 	kprintf("blocks per group: %" B_PRId32 "\n", fBlocksPerGroup);
2252 
2253 	for (int32 i = 0; i < fNumGroups; i++) {
2254 		if (index != -1 && i != index)
2255 			continue;
2256 
2257 		AllocationGroup& group = fGroups[i];
2258 
2259 		kprintf("[%3" B_PRId32 "] num bits:       %" B_PRIu32 "  (%p)\n", i,
2260 			group.NumBits(), &group);
2261 		kprintf("      num blocks:     %" B_PRIu32 "\n", group.NumBlocks());
2262 		kprintf("      start:          %" B_PRId32 "\n", group.Start());
2263 		kprintf("      first free:     %" B_PRId32 "\n", group.fFirstFree);
2264 		kprintf("      largest start:  %" B_PRId32 "%s\n", group.fLargestStart,
2265 			group.fLargestValid ? "" : "  (invalid)");
2266 		kprintf("      largest length: %" B_PRId32 "\n", group.fLargestLength);
2267 		kprintf("      free bits:      %" B_PRId32 "\n", group.fFreeBits);
2268 	}
2269 }
2270 
2271 
2272 #if BFS_TRACING
2273 static char kTraceBuffer[256];
2274 
2275 
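//!	Prints all allocation and free trace entries that cover the given block.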
2276 int
2277 dump_block_allocator_blocks(int argc, char** argv)
2278 {
2279 	if (argc != 3 || !strcmp(argv[1], "--help")) {
2280 		kprintf("usage: %s <ptr-to-volume> <block>\n", argv[0]);
2281 		return 0;
2282 	}
2283 
2284 	Volume* volume = (Volume*)parse_expression(argv[1]);
2285 	off_t block = parse_expression(argv[2]);
2286 
2287 	// iterate over all tracing entries to find overlapping actions
2288 
2289 	using namespace BFSBlockTracing;
2290 
2291 	LazyTraceOutput out(kTraceBuffer, sizeof(kTraceBuffer), 0);
2292 	TraceEntryIterator iterator;
2293 	while (TraceEntry* _entry = iterator.Next()) {
2294 		if (Allocate* entry = dynamic_cast<Allocate*>(_entry)) {
2295 			off_t first = volume->ToBlock(entry->Run());
2296 			off_t last = first - 1 + entry->Run().Length();
2297 			if (block >= first && block <= last) {
2298 				out.Clear();
2299 				const char* dump = out.DumpEntry(entry);
2300 				kprintf("%5ld. %s\n", iterator.Index(), dump);
2301 			}
2302 		} else if (Free* entry = dynamic_cast<Free*>(_entry)) {
2303 			off_t first = volume->ToBlock(entry->Run());
2304 			off_t last = first - 1 + entry->Run().Length();
2305 			if (block >= first && block <= last) {
2306 				out.Clear();
2307 				const char* dump = out.DumpEntry(entry);
2308 				kprintf("%5ld. %s\n", iterator.Index(), dump);
2309 			}
2310 		}
2311 	}
2312 
2313 	return 0;
2314 }
2315 #endif
2316 
2317 
2318 int
2319 dump_block_allocator(int argc, char** argv)
2320 {
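	// an optional allocation group index may be given as second argument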
2321 	int32 group = -1;
2322 	if (argc == 3) {
2323 		group = parse_expression(argv[2]);
2324 		argc--;
2325 	}
2326 
2327 	if (argc != 2 || !strcmp(argv[1], "--help")) {
2328 		kprintf("usage: %s <ptr-to-volume> [group]\n", argv[0]);
2329 		return 0;
2330 	}
2331 
2332 	Volume* volume = (Volume*)parse_expression(argv[1]);
2333 	BlockAllocator& allocator = volume->Allocator();
2334 
2335 	allocator.Dump(group);
2336 	return 0;
2337 }
2338 
2339 #endif	// BFS_DEBUGGER_COMMANDS
2340 
2341