xref: /haiku/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp (revision 7a74a5df454197933bc6e80a542102362ee98703)
1 /*
2  * Copyright 2001-2012, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Block bitmap handling and allocation policies
8 
9 
10 #include "BlockAllocator.h"
11 
12 #include "bfs_control.h"
13 #include "BPlusTree.h"
14 #include "Debug.h"
15 #include "Inode.h"
16 #include "Volume.h"
17 
18 
19 // Things the BlockAllocator should do:
20 
21 // - find a range of blocks of a certain size near a specific position
22 // - allocate an imprecise ("unsharp") range of blocks for pre-allocation
23 // - free blocks
24 // - know how to deal with each allocation, special handling for directories,
25 //   files, symlinks, etc. (type-sensitive allocation policies)
26 
27 // What makes the code complicated is the fact that we are not just reading
28 // in the whole bitmap and operating on it in memory - e.g. a 13 GB partition
29 // with a block size of 2048 bytes already has an 800kB bitmap, and the size
30 // of partitions will only keep growing - so that's not an option.
31 // Instead we are reading in every block when it's used - since an allocation
32 // group can span several blocks in the block bitmap, the AllocationBlock
33 // class is there to make handling those easier.
34 
35 // The current implementation is only slightly optimized and could probably
36 // be improved a lot. Furthermore, the allocation policies used here should
37 // be verified against some real-world workloads.
38 
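// As a rough back-of-the-envelope check of the numbers above (a hedged
// sketch with hypothetical values, not code used by the allocator): the
// bitmap needs one bit per block, so its size in bytes is the block count
// divided by eight, rounded up.
#if 0
static off_t
bitmap_size_in_bytes(off_t partitionSize, uint32 blockSize)
{
	off_t numBlocks = partitionSize / blockSize;
	return (numBlocks + 7) / 8;
		// 13 GB / 2048 bytes = 6815744 blocks -> 851968 bytes (~830 kB)
}
#endif
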
39 #if BFS_TRACING && !defined(BFS_SHELL)
40 namespace BFSBlockTracing {
41 
42 
43 class Allocate : public AbstractTraceEntry {
44 public:
45 	Allocate(block_run run)
46 		:
47 		fRun(run)
48 	{
49 		Initialized();
50 	}
51 
52 	virtual void AddDump(TraceOutput& out)
53 	{
54 		out.Print("bfs:alloc %lu.%u.%u", fRun.AllocationGroup(),
55 			fRun.Start(), fRun.Length());
56 	}
57 
58 	const block_run& Run() const { return fRun; }
59 
60 private:
61 			block_run			fRun;
62 };
63 
64 
65 class Free : public AbstractTraceEntry {
66 public:
67 	Free(block_run run)
68 		:
69 		fRun(run)
70 	{
71 		Initialized();
72 	}
73 
74 	virtual void AddDump(TraceOutput& out)
75 	{
76 		out.Print("bfs:free %lu.%u.%u", fRun.AllocationGroup(),
77 			fRun.Start(), fRun.Length());
78 	}
79 
80 	const block_run& Run() const { return fRun; }
81 
82 private:
83 			block_run			fRun;
84 };
85 
86 
87 static uint32
88 checksum(const uint8* data, size_t size)
89 {
90 	const uint32* data4 = (const uint32*)data;
91 	uint32 sum = 0;
92 	while (size >= 4) {
93 		sum += *data4;
94 		data4++;
95 		size -= 4;
96 	}
97 	return sum;
98 }
99 
100 
101 class Block : public AbstractTraceEntry {
102 public:
103 	Block(const char* label, off_t blockNumber, const uint8* data,
104 			size_t size, uint32 start = 0, uint32 length = 0)
105 		:
106 		fBlock(blockNumber),
107 		fData(data),
108 		fStart(start),
109 		fLength(length),
110 		fLabel(label)
111 	{
112 		fSum = checksum(data, size);
113 		Initialized();
114 	}
115 
116 	virtual void AddDump(TraceOutput& out)
117 	{
118 		out.Print("bfs:%s: block %Ld (%p), sum %lu, s/l %lu/%lu", fLabel,
119 			fBlock, fData, fSum, fStart, fLength);
120 	}
121 
122 private:
123 			off_t				fBlock;
124 			const uint8*		fData;
125 			uint32				fStart;
126 			uint32				fLength;
127 			uint32				fSum;
128 			const char*			fLabel;
129 };
130 
131 
132 class BlockChange : public AbstractTraceEntry {
133 public:
134 	BlockChange(const char* label, int32 block, uint32 oldData, uint32 newData)
135 		:
136 		fBlock(block),
137 		fOldData(oldData),
138 		fNewData(newData),
139 		fLabel(label)
140 	{
141 		Initialized();
142 	}
143 
144 	virtual void AddDump(TraceOutput& out)
145 	{
146 		out.Print("bfs:%s: block %ld, %08lx -> %08lx", fLabel,
147 			fBlock, fOldData, fNewData);
148 	}
149 
150 private:
151 			int32				fBlock;
152 			uint32				fOldData;
153 			uint32				fNewData;
154 			const char*			fLabel;
155 };
156 
157 
158 }	// namespace BFSBlockTracing
159 
160 #	define T(x) new(std::nothrow) BFSBlockTracing::x;
161 #else
162 #	define T(x) ;
163 #endif
164 
165 #ifdef DEBUG_ALLOCATION_GROUPS
166 #	define CHECK_ALLOCATION_GROUP(group) _CheckGroup(group)
167 #else
168 #	define CHECK_ALLOCATION_GROUP(group) ;
169 #endif
170 
171 
172 struct check_index {
173 	check_index()
174 		:
175 		inode(NULL)
176 	{
177 	}
178 
179 	char				name[B_FILE_NAME_LENGTH];
180 	block_run			run;
181 	Inode*				inode;
182 };
183 
184 
185 struct check_cookie {
186 	check_cookie()
187 	{
188 	}
189 
190 	uint32				pass;
191 	block_run			current;
192 	Inode*				parent;
193 	mode_t				parent_mode;
194 	Stack<block_run>	stack;
195 	TreeIterator*		iterator;
196 	check_control		control;
197 	Stack<check_index*>	indices;
198 };
199 
200 
201 class AllocationBlock : public CachedBlock {
202 public:
203 	AllocationBlock(Volume* volume);
204 
205 	inline void Allocate(uint16 start, uint16 numBlocks);
206 	inline void Free(uint16 start, uint16 numBlocks);
207 	inline bool IsUsed(uint16 block);
208 
209 	status_t SetTo(AllocationGroup& group, uint16 block);
210 	status_t SetToWritable(Transaction& transaction, AllocationGroup& group,
211 		uint16 block);
212 
213 	uint32 NumBlockBits() const { return fNumBits; }
214 	uint32& Block(int32 index) { return ((uint32*)fBlock)[index]; }
215 	uint8* Block() const { return (uint8*)fBlock; }
216 
217 private:
218 	uint32	fNumBits;
219 #ifdef DEBUG
220 	bool	fWritable;
221 #endif
222 };
223 
224 
225 class AllocationGroup {
226 public:
227 	AllocationGroup();
228 
229 	void AddFreeRange(int32 start, int32 blocks);
230 	bool IsFull() const { return fFreeBits == 0; }
231 
232 	status_t Allocate(Transaction& transaction, uint16 start, int32 length);
233 	status_t Free(Transaction& transaction, uint16 start, int32 length);
234 
235 	uint32 NumBits() const { return fNumBits; }
236 	uint32 NumBlocks() const { return fNumBlocks; }
237 	int32 Start() const { return fStart; }
238 
239 private:
240 	friend class BlockAllocator;
241 
242 	uint32	fNumBits;
243 	uint32	fNumBlocks;
244 	int32	fStart;
245 	int32	fFirstFree;
246 	int32	fFreeBits;
247 
248 	int32	fLargestStart;
249 	int32	fLargestLength;
250 	bool	fLargestValid;
251 };
252 
253 
254 AllocationBlock::AllocationBlock(Volume* volume)
255 	: CachedBlock(volume)
256 {
257 }
258 
259 
260 status_t
261 AllocationBlock::SetTo(AllocationGroup& group, uint16 block)
262 {
263 	// 8 blocks per byte
264 	fNumBits = fVolume->BlockSize() << 3;
265 	// the last group may have fewer bits than the others
266 	if ((block + 1) * fNumBits > group.NumBits())
267 		fNumBits = group.NumBits() % fNumBits;
268 
269 #ifdef DEBUG
270 	fWritable = false;
271 #endif
272 	return CachedBlock::SetTo(group.Start() + block) != NULL ? B_OK : B_ERROR;
273 }
274 
275 
276 status_t
277 AllocationBlock::SetToWritable(Transaction& transaction, AllocationGroup& group,
278 	uint16 block)
279 {
280 	// 8 blocks per byte
281 	fNumBits = fVolume->BlockSize() << 3;
282 	// the last group may have fewer bits in its last block
283 	if ((block + 1) * fNumBits > group.NumBits())
284 		fNumBits = group.NumBits() % fNumBits;
285 
286 #ifdef DEBUG
287 	fWritable = true;
288 #endif
289 	return CachedBlock::SetToWritable(transaction, group.Start() + block)
290 		!= NULL ? B_OK : B_ERROR;
291 }
292 
293 
294 bool
295 AllocationBlock::IsUsed(uint16 block)
296 {
297 	if (block >= fNumBits)
298 		return true;
299 
300 	// the block bitmap is accessed in 32-bit blocks
301 	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
302 }
303 
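// The bitmap within one cached block is addressed in 32-bit words: bit i
// lives in word (i >> 5) at position (i % 32). A minimal host-endian sketch
// of that mapping (illustrative only; the real code above additionally
// converts with HOST_ENDIAN_TO_BFS_INT32()):
#if 0
static inline bool
test_bitmap_bit(const uint32* words, uint32 bit)
{
	return (words[bit >> 5] & (1UL << (bit % 32))) != 0;
}
#endif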
304 
305 void
306 AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
307 {
308 	ASSERT(start < fNumBits);
309 	ASSERT(uint32(start + numBlocks) <= fNumBits);
310 #ifdef DEBUG
311 	ASSERT(fWritable);
312 #endif
313 
314 	T(Block("b-alloc-in", fBlockNumber, fBlock, fVolume->BlockSize(),
315 		start, numBlocks));
316 
317 	int32 block = start >> 5;
318 
319 	while (numBlocks > 0) {
320 		uint32 mask = 0;
321 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
322 			mask |= 1UL << i;
323 
324 		T(BlockChange("b-alloc", block, Block(block),
325 			Block(block) | HOST_ENDIAN_TO_BFS_INT32(mask)));
326 
327 #if KDEBUG
328 		// check for already set blocks
329 		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32*)fBlock)[block]) {
330 			FATAL(("AllocationBlock::Allocate(): some blocks are already "
331 				"allocated, start = %u, numBlocks = %u\n", start, numBlocks));
332 			panic("blocks already set!");
333 		}
334 #endif
335 
336 		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
337 		start = 0;
338 	}
339 	T(Block("b-alloc-out", fBlockNumber, fBlock, fVolume->BlockSize(),
340 		start, numBlocks));
341 }
342 
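// A worked example of the per-word mask construction used by Allocate()
// above and Free() below (a standalone sketch, never called): allocating
// four blocks starting at bit 30 spans two words.
#if 0
static void
build_masks_example()
{
	uint16 start = 30, numBlocks = 4;
	int32 block = start >> 5;
	while (numBlocks > 0) {
		uint32 mask = 0;
		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
			mask |= 1UL << i;
		// first pass:  block = 0, mask = 0xc0000000 (bits 30-31)
		// second pass: block = 1, mask = 0x00000003 (bits 0-1)
		block++;
		start = 0;
	}
}
#endif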
343 
344 void
345 AllocationBlock::Free(uint16 start, uint16 numBlocks)
346 {
347 	ASSERT(start < fNumBits);
348 	ASSERT(uint32(start + numBlocks) <= fNumBits);
349 #ifdef DEBUG
350 	ASSERT(fWritable);
351 #endif
352 
353 	int32 block = start >> 5;
354 
355 	while (numBlocks > 0) {
356 		uint32 mask = 0;
357 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
358 			mask |= 1UL << i;
359 
360 		T(BlockChange("b-free", block, Block(block),
361 			Block(block) & HOST_ENDIAN_TO_BFS_INT32(~mask)));
362 
363 		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
364 		start = 0;
365 	}
366 }
367 
368 
369 //	#pragma mark -
370 
371 
372 /*!	The allocation groups are created and initialized in
373 	BlockAllocator::Initialize() and BlockAllocator::InitializeAndClearBitmap()
374 	respectively.
375 */
376 AllocationGroup::AllocationGroup()
377 	:
378 	fFirstFree(-1),
379 	fFreeBits(0),
380 	fLargestValid(false)
381 {
382 }
383 
384 
385 void
386 AllocationGroup::AddFreeRange(int32 start, int32 blocks)
387 {
388 	//D(if (blocks > 512)
389 	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));
390 
391 	if (fFirstFree == -1)
392 		fFirstFree = start;
393 
394 	if (!fLargestValid || fLargestLength < blocks) {
395 		fLargestStart = start;
396 		fLargestLength = blocks;
397 		fLargestValid = true;
398 	}
399 
400 	fFreeBits += blocks;
401 }
402 
403 
404 /*!	Allocates the specified run in the allocation group.
405 	Doesn't check if the run is valid or partially allocated already, nor
406 	does it maintain the free range hints or the volume's used blocks count.
407 	It only does the low-level work of setting some bits in the block bitmap.
408 	Assumes that the block bitmap lock is held.
409 */
410 status_t
411 AllocationGroup::Allocate(Transaction& transaction, uint16 start, int32 length)
412 {
413 	ASSERT(start + length <= (int32)fNumBits);
414 
415 	// Update the allocation group info
416 	// TODO: this info will be incorrect if something goes wrong later
417 	// Note, the fFirstFree block doesn't have to actually be free
418 	if (start == fFirstFree)
419 		fFirstFree = start + length;
420 	fFreeBits -= length;
421 
422 	if (fLargestValid) {
423 		bool cut = false;
424 		if (fLargestStart == start) {
425 			// cut from start
426 			fLargestStart += length;
427 			fLargestLength -= length;
428 			cut = true;
429 		} else if (start > fLargestStart
430 			&& start < fLargestStart + fLargestLength) {
431 			// cut from end
432 			fLargestLength = start - fLargestStart;
433 			cut = true;
434 		}
435 		if (cut && (fLargestLength < fLargestStart
436 				|| fLargestLength
437 						< (int32)fNumBits - (fLargestStart + fLargestLength))) {
438 			// might not be the largest block anymore
439 			fLargestValid = false;
440 		}
441 	}
442 
443 	Volume* volume = transaction.GetVolume();
444 
445 	// calculate block in the block bitmap and position within
446 	uint32 bitsPerBlock = volume->BlockSize() << 3;
447 	uint32 block = start / bitsPerBlock;
448 	start = start % bitsPerBlock;
449 
450 	AllocationBlock cached(volume);
451 
452 	while (length > 0) {
453 		if (cached.SetToWritable(transaction, *this, block) < B_OK) {
454 			fLargestValid = false;
455 			RETURN_ERROR(B_IO_ERROR);
456 		}
457 
458 		uint32 numBlocks = length;
459 		if (start + numBlocks > cached.NumBlockBits())
460 			numBlocks = cached.NumBlockBits() - start;
461 
462 		cached.Allocate(start, numBlocks);
463 
464 		length -= numBlocks;
465 		start = 0;
466 		block++;
467 	}
468 
469 	return B_OK;
470 }
471 
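// The div/mod arithmetic above maps a group-relative bit number to a
// (bitmap block, bit offset) pair. Example values (hypothetical, assuming a
// 2048 byte block size):
#if 0
	uint32 bitsPerBlock = 2048 << 3;		// 16384 bits per bitmap block
	uint16 start = 20000;
	uint32 block = start / bitsPerBlock;	// bitmap block 1
	start = start % bitsPerBlock;			// bit 3616 within that block
#endif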
472 
473 /*!	Frees the specified run in the allocation group.
474 	Doesn't check if the run is valid or was even fully allocated, nor
475 	does it maintain the free range hints or the volume's used blocks count.
476 	It only does the low-level work of clearing some bits in the block bitmap.
477 	Assumes that the block bitmap lock is held.
478 */
479 status_t
480 AllocationGroup::Free(Transaction& transaction, uint16 start, int32 length)
481 {
482 	ASSERT(start + length <= (int32)fNumBits);
483 
484 	// Update the allocation group info
485 	// TODO: this info will be incorrect if something goes wrong later
486 	if (fFirstFree > start)
487 		fFirstFree = start;
488 	fFreeBits += length;
489 
490 	// The range to be freed cannot be part of the valid largest range
491 	ASSERT(!fLargestValid || start + length <= fLargestStart
492 		|| start > fLargestStart);
493 
494 	if (fLargestValid
495 		&& (start + length == fLargestStart
496 			|| fLargestStart + fLargestLength == start
497 			|| (start < fLargestStart && fLargestStart > fLargestLength)
498 			|| (start > fLargestStart
499 				&& (int32)fNumBits - (fLargestStart + fLargestLength)
500 						> fLargestLength))) {
501 		fLargestValid = false;
502 	}
503 
504 	Volume* volume = transaction.GetVolume();
505 
506 	// calculate block in the block bitmap and position within
507 	uint32 bitsPerBlock = volume->BlockSize() << 3;
508 	uint32 block = start / bitsPerBlock;
509 	start = start % bitsPerBlock;
510 
511 	AllocationBlock cached(volume);
512 
513 	while (length > 0) {
514 		if (cached.SetToWritable(transaction, *this, block) < B_OK)
515 			RETURN_ERROR(B_IO_ERROR);
516 
517 		T(Block("free-1", block, cached.Block(), volume->BlockSize()));
518 		uint16 freeLength = length;
519 		if (uint32(start + length) > cached.NumBlockBits())
520 			freeLength = cached.NumBlockBits() - start;
521 
522 		cached.Free(start, freeLength);
523 
524 		length -= freeLength;
525 		start = 0;
526 		T(Block("free-2", block, cached.Block(), volume->BlockSize()));
527 		block++;
528 	}
529 	return B_OK;
530 }
531 
532 
533 //	#pragma mark -
534 
535 
536 BlockAllocator::BlockAllocator(Volume* volume)
537 	:
538 	fVolume(volume),
539 	fGroups(NULL),
540 	fCheckBitmap(NULL),
541 	fCheckCookie(NULL)
542 {
543 	recursive_lock_init(&fLock, "bfs allocator");
544 }
545 
546 
547 BlockAllocator::~BlockAllocator()
548 {
549 	recursive_lock_destroy(&fLock);
550 	delete[] fGroups;
551 }
552 
553 
554 status_t
555 BlockAllocator::Initialize(bool full)
556 {
557 	fNumGroups = fVolume->AllocationGroups();
558 	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
559 	fNumBlocks = (fVolume->NumBlocks() + fVolume->BlockSize() * 8 - 1)
560 		/ (fVolume->BlockSize() * 8);
561 
562 	fGroups = new(std::nothrow) AllocationGroup[fNumGroups];
563 	if (fGroups == NULL)
564 		return B_NO_MEMORY;
565 
566 	if (!full)
567 		return B_OK;
568 
569 	recursive_lock_lock(&fLock);
570 		// the lock will be released by the _Initialize() method
571 
572 	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::_Initialize,
573 		"bfs block allocator", B_LOW_PRIORITY, this);
574 	if (id < B_OK)
575 		return _Initialize(this);
576 
577 	recursive_lock_transfer_lock(&fLock, id);
578 
579 	return resume_thread(id);
580 }
581 
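// fNumBlocks above is the size of the bitmap in blocks, i.e. the volume's
// block count divided by the bits per bitmap block, rounded up. A numeric
// check of that ceiling division (hypothetical values):
#if 0
	// BlockSize() = 2048   =>  bits per bitmap block = 2048 * 8 = 16384
	// NumBlocks() = 6815744 (a 13 GB volume)
	// fNumBlocks  = (6815744 + 16384 - 1) / 16384 = 416 bitmap blocks
#endif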
582 
583 status_t
584 BlockAllocator::InitializeAndClearBitmap(Transaction& transaction)
585 {
586 	status_t status = Initialize(false);
587 	if (status != B_OK)
588 		return status;
589 
590 	uint32 numBits = 8 * fBlocksPerGroup * fVolume->BlockSize();
591 	uint32 blockShift = fVolume->BlockShift();
592 
593 	uint32* buffer = (uint32*)malloc(numBits >> 3);
594 	if (buffer == NULL)
595 		RETURN_ERROR(B_NO_MEMORY);
596 
597 	memset(buffer, 0, numBits >> 3);
598 
599 	off_t offset = 1;
600 		// the bitmap starts directly after the super block
601 
602 	// initialize the AllocationGroup objects and clear the on-disk bitmap
603 
604 	for (int32 i = 0; i < fNumGroups; i++) {
605 		if (write_pos(fVolume->Device(), offset << blockShift, buffer,
606 				fBlocksPerGroup << blockShift) < B_OK) {
607 			free(buffer);
608 			return B_ERROR;
609 		}
610 
611 		// the last allocation group may contain fewer blocks than the others
612 		if (i == fNumGroups - 1) {
613 			fGroups[i].fNumBits = fVolume->NumBlocks() - i * numBits;
614 			fGroups[i].fNumBlocks = 1 + ((fGroups[i].NumBits() - 1)
615 				>> (blockShift + 3));
616 		} else {
617 			fGroups[i].fNumBits = numBits;
618 			fGroups[i].fNumBlocks = fBlocksPerGroup;
619 		}
620 		fGroups[i].fStart = offset;
621 		fGroups[i].fFirstFree = fGroups[i].fLargestStart = 0;
622 		fGroups[i].fFreeBits = fGroups[i].fLargestLength = fGroups[i].fNumBits;
623 		fGroups[i].fLargestValid = true;
624 
625 		offset += fBlocksPerGroup;
626 	}
627 	free(buffer);
628 
629 	// reserve the boot block, the log area, and the block bitmap itself
630 	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();
631 
632 	if (fGroups[0].Allocate(transaction, 0, reservedBlocks) < B_OK) {
633 		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
634 		return B_ERROR;
635 	}
636 	fVolume->SuperBlock().used_blocks
637 		= HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);
638 
639 	return B_OK;
640 }
641 
642 
643 status_t
644 BlockAllocator::_Initialize(BlockAllocator* allocator)
645 {
646 	// The lock must already be held at this point
647 	RecursiveLocker locker(allocator->fLock, true);
648 
649 	Volume* volume = allocator->fVolume;
650 	uint32 blocks = allocator->fBlocksPerGroup;
651 	uint32 blockShift = volume->BlockShift();
652 	off_t freeBlocks = 0;
653 
654 	uint32* buffer = (uint32*)malloc(blocks << blockShift);
655 	if (buffer == NULL)
656 		RETURN_ERROR(B_NO_MEMORY);
657 
658 	AllocationGroup* groups = allocator->fGroups;
659 	off_t offset = 1;
660 	uint32 bitsPerGroup = 8 * (blocks << blockShift);
661 	int32 numGroups = allocator->fNumGroups;
662 
663 	for (int32 i = 0; i < numGroups; i++) {
664 		if (read_pos(volume->Device(), offset << blockShift, buffer,
665 				blocks << blockShift) < B_OK)
666 			break;
667 
668 		// the last allocation group may contain fewer blocks than the others
669 		if (i == numGroups - 1) {
670 			groups[i].fNumBits = volume->NumBlocks() - i * bitsPerGroup;
671 			groups[i].fNumBlocks = 1 + ((groups[i].NumBits() - 1)
672 				>> (blockShift + 3));
673 		} else {
674 			groups[i].fNumBits = bitsPerGroup;
675 			groups[i].fNumBlocks = blocks;
676 		}
677 		groups[i].fStart = offset;
678 
679 		// finds all free ranges in this allocation group
680 		int32 start = -1, range = 0;
681 		int32 numBits = groups[i].fNumBits, bit = 0;
682 		int32 count = (numBits + 31) / 32;
683 
684 		for (int32 k = 0; k < count; k++) {
685 			for (int32 j = 0; j < 32 && bit < numBits; j++, bit++) {
686 				if (buffer[k] & (1UL << j)) {
687 					// block is in use
688 					if (range > 0) {
689 						groups[i].AddFreeRange(start, range);
690 						range = 0;
691 					}
692 				} else if (range++ == 0) {
693 					// block is free, start new free range
694 					start = bit;
695 				}
696 			}
697 		}
698 		if (range)
699 			groups[i].AddFreeRange(start, range);
700 
701 		freeBlocks += groups[i].fFreeBits;
702 
703 		offset += blocks;
704 	}
705 	free(buffer);
706 
707 	// check if block bitmap and log area are reserved
708 	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
709 
710 	if (allocator->CheckBlocks(0, reservedBlocks) != B_OK) {
711 		if (volume->IsReadOnly()) {
712 			FATAL(("Space for block bitmap or log area is not reserved "
713 				"(volume is mounted read-only)!\n"));
714 		} else {
715 			Transaction transaction(volume, 0);
716 			if (groups[0].Allocate(transaction, 0, reservedBlocks) != B_OK) {
717 				FATAL(("Could not allocate reserved space for block "
718 					"bitmap/log!\n"));
719 				volume->Panic();
720 			} else {
721 				transaction.Done();
722 				FATAL(("Space for block bitmap or log area was not "
723 					"reserved!\n"));
724 			}
725 		}
726 	}
727 
728 	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
729 	if (volume->UsedBlocks() != usedBlocks) {
730 		// If the disk was in a dirty state at mount time, it's
731 		// normal that the values don't match
732 		INFORM(("volume reports %" B_PRIdOFF " used blocks, correct is %"
733 			B_PRIdOFF "\n", volume->UsedBlocks(), usedBlocks));
734 		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
735 	}
736 
737 	return B_OK;
738 }
739 
740 
741 void
742 BlockAllocator::Uninitialize()
743 {
744 	// We only have to make sure that the initializer thread isn't running
745 	// anymore.
746 	recursive_lock_lock(&fLock);
747 }
748 
749 
750 /*!	Tries to allocate between \a minimum and \a maximum blocks starting
751 	at group \a groupIndex with offset \a start. The resulting allocation
752 	is put into \a run.
753 
754 	The number of allocated blocks is always a multiple of \a minimum,
755 	which has to be a power of two.
756 */
757 status_t
758 BlockAllocator::AllocateBlocks(Transaction& transaction, int32 groupIndex,
759 	uint16 start, uint16 maximum, uint16 minimum, block_run& run)
760 {
761 	if (maximum == 0)
762 		return B_BAD_VALUE;
763 
764 	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n",
765 		groupIndex, start, maximum, minimum));
766 
767 	AllocationBlock cached(fVolume);
768 	RecursiveLocker lock(fLock);
769 
770 	uint32 bitsPerFullBlock = fVolume->BlockSize() << 3;
771 
772 	// Find the block_run that can fulfill the request best
773 	int32 bestGroup = -1;
774 	int32 bestStart = -1;
775 	int32 bestLength = -1;
776 
777 	for (int32 i = 0; i < fNumGroups + 1; i++, groupIndex++, start = 0) {
778 		groupIndex = groupIndex % fNumGroups;
779 		AllocationGroup& group = fGroups[groupIndex];
780 
781 		CHECK_ALLOCATION_GROUP(groupIndex);
782 
783 		if (start >= group.NumBits() || group.IsFull())
784 			continue;
785 
786 		// The wanted maximum is smaller than the largest free block in the
787 		// group or already smaller than the minimum
788 
789 		if (start < group.fFirstFree)
790 			start = group.fFirstFree;
791 
792 		if (group.fLargestValid) {
793 			if (group.fLargestLength < bestLength)
794 				continue;
795 
796 			if (group.fLargestStart >= start) {
797 				if (group.fLargestLength >= bestLength) {
798 					bestGroup = groupIndex;
799 					bestStart = group.fLargestStart;
800 					bestLength = group.fLargestLength;
801 
802 					if (bestLength >= maximum)
803 						break;
804 				}
805 
806 			// We know everything we need to about this group, let's skip
807 			// to the next
808 				continue;
809 			}
810 		}
811 
812 		// There may be more than one block per allocation group - and
813 		// we iterate through them to find a place for the allocation.
814 		// (one allocation can't exceed one allocation group)
815 
816 		uint32 block = start / (fVolume->BlockSize() << 3);
817 		int32 currentStart = 0, currentLength = 0;
818 		int32 groupLargestStart = -1;
819 		int32 groupLargestLength = -1;
820 		int32 currentBit = start;
821 		bool canFindGroupLargest = start == 0;
822 
823 		for (; block < group.NumBlocks(); block++) {
824 			if (cached.SetTo(group, block) < B_OK)
825 				RETURN_ERROR(B_ERROR);
826 
827 			T(Block("alloc-in", group.Start() + block, cached.Block(),
828 				fVolume->BlockSize(), groupIndex, currentStart));
829 
830 			// find a block large enough to hold the allocation
831 			for (uint32 bit = start % bitsPerFullBlock;
832 					bit < cached.NumBlockBits(); bit++) {
833 				if (!cached.IsUsed(bit)) {
834 					if (currentLength == 0) {
835 						// start new range
836 						currentStart = currentBit;
837 					}
838 
839 					// have we found a range large enough to hold the maximum?
840 					if (++currentLength >= maximum) {
841 						bestGroup = groupIndex;
842 						bestStart = currentStart;
843 						bestLength = currentLength;
844 						break;
845 					}
846 				} else {
847 					if (currentLength) {
848 						// end of a range
849 						if (currentLength > bestLength) {
850 							bestGroup = groupIndex;
851 							bestStart = currentStart;
852 							bestLength = currentLength;
853 						}
854 						if (currentLength > groupLargestLength) {
855 							groupLargestStart = currentStart;
856 							groupLargestLength = currentLength;
857 						}
858 						currentLength = 0;
859 					}
860 					if ((int32)group.NumBits() - currentBit
861 							<= groupLargestLength) {
862 						// We can't find a bigger block in this group anymore,
863 						// let's skip the rest.
864 						block = group.NumBlocks();
865 						break;
866 					}
867 				}
868 				currentBit++;
869 			}
870 
871 			T(Block("alloc-out", block, cached.Block(),
872 				fVolume->BlockSize(), groupIndex, currentStart));
873 
874 			if (bestLength >= maximum) {
875 				canFindGroupLargest = false;
876 				break;
877 			}
878 
879 			// start from the beginning of the next block
880 			start = 0;
881 		}
882 
883 		if (currentBit == (int32)group.NumBits()) {
884 			if (currentLength > bestLength) {
885 				bestGroup = groupIndex;
886 				bestStart = currentStart;
887 				bestLength = currentLength;
888 			}
889 			if (canFindGroupLargest && currentLength > groupLargestLength) {
890 				groupLargestStart = currentStart;
891 				groupLargestLength = currentLength;
892 			}
893 		}
894 
895 		if (canFindGroupLargest && !group.fLargestValid
896 			&& groupLargestLength >= 0) {
897 			group.fLargestStart = groupLargestStart;
898 			group.fLargestLength = groupLargestLength;
899 			group.fLargestValid = true;
900 		}
901 
902 		if (bestLength >= maximum)
903 			break;
904 	}
905 
906 	// If we found a suitable range, mark the blocks as in use, and
907 	// write the updated block bitmap back to disk
908 	if (bestLength < minimum)
909 		return B_DEVICE_FULL;
910 
911 	if (bestLength > maximum)
912 		bestLength = maximum;
913 	else if (minimum > 1) {
914 		// make sure bestLength is a multiple of minimum
915 		bestLength = round_down(bestLength, minimum);
916 	}
917 
918 	if (fGroups[bestGroup].Allocate(transaction, bestStart, bestLength) != B_OK)
919 		RETURN_ERROR(B_IO_ERROR);
920 
921 	CHECK_ALLOCATION_GROUP(bestGroup);
922 
923 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(bestGroup);
924 	run.start = HOST_ENDIAN_TO_BFS_INT16(bestStart);
925 	run.length = HOST_ENDIAN_TO_BFS_INT16(bestLength);
926 
927 	fVolume->SuperBlock().used_blocks
928 		= HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + bestLength);
929 		// We are not writing back the disk's super block - it's
930 		// either done by the journaling code, or when the disk
931 		// is unmounted.
932 		// If the value is not correct at mount time, it will be
933 		// fixed anyway.
934 
935 	// We need to flush any remaining blocks in the new allocation to make sure
936 	// they won't interfere with the file cache.
937 	block_cache_discard(fVolume->BlockCache(), fVolume->ToBlock(run),
938 		run.Length());
939 
940 	T(Allocate(run));
941 	return B_OK;
942 }
943 
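// Note how bestLength is post-processed above: it is clamped to \a maximum
// and rounded down to a multiple of \a minimum. Example values
// (hypothetical):
#if 0
	// minimum = 4, maximum = 16, best found run length = 11
	// => round_down(11, 4) = 8 blocks are actually allocated
#endif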
944 
945 status_t
946 BlockAllocator::AllocateForInode(Transaction& transaction,
947 	const block_run* parent, mode_t type, block_run& run)
948 {
949 	// Apply some allocation policies here (AllocateBlocks() will break them
950 	// if necessary) - we will start with those described in Dominic Giampaolo's
951 	// "Practical File System Design", and see how well they work
952 
953 	// Files go into the same allocation group as their parent,
954 	// sub-directories will be inserted 8 allocation groups after
955 	// their parent's group
956 	uint16 group = parent->AllocationGroup();
957 	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
958 		group += 8;
959 
960 	return AllocateBlocks(transaction, group, 0, 1, 1, run);
961 }
962 
963 
964 status_t
965 BlockAllocator::Allocate(Transaction& transaction, Inode* inode,
966 	off_t numBlocks, block_run& run, uint16 minimum)
967 {
968 	if (numBlocks <= 0)
969 		return B_ERROR;
970 
971 	// one block_run can't hold more data than there is in one allocation group
972 	if (numBlocks > fGroups[0].NumBits())
973 		numBlocks = fGroups[0].NumBits();
974 
975 	// since block_run.length is uint16, the largest number of blocks that
976 	// can be covered by a block_run is 65535
977 	// TODO: if we drop compatibility, couldn't we do this any better?
978 	// There are basically three possibilities:
979 	// a) since a length of zero doesn't make any sense, let that mean 65536 -
980 	//    but that could cause many problems (bugs) in other areas
981 	// b) reduce the maximum amount of blocks per block_run, so that the
982 	//    remaining number of free blocks can be used in a useful manner
983 	//    (like 4 blocks) - but that would also reduce the maximum file size
984 	// c) have BlockRun::Length() return (length + 1).
985 	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
986 		numBlocks = MAX_BLOCK_RUN_LENGTH;
987 
988 	// Apply some allocation policies here (AllocateBlocks() will break them
989 	// if necessary)
990 	uint16 group = inode->BlockRun().AllocationGroup();
991 	uint16 start = 0;
992 
993 	// Are there already allocated blocks? (then just try to allocate near the
994 	// last one)
995 	if (inode->Size() > 0) {
996 		const data_stream& data = inode->Node().data;
997 		// TODO: we currently don't handle the case where the data stream
998 		// has already grown into the indirect ranges
999 		if (data.max_double_indirect_range == 0
1000 			&& data.max_indirect_range == 0) {
1001 			// Since size > 0, there must be a valid block run in this stream
1002 			int32 last = 0;
1003 			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
1004 				if (data.direct[last + 1].IsZero())
1005 					break;
1006 
1007 			group = data.direct[last].AllocationGroup();
1008 			start = data.direct[last].Start() + data.direct[last].Length();
1009 		}
1010 	} else if (inode->IsContainer() || inode->IsSymLink()) {
1011 		// directory and symbolic link data will go in the same allocation
1012 		// group as the inode itself, but after the inode data
1013 		start = inode->BlockRun().Start();
1014 	} else {
1015 		// file data will start in the next allocation group
1016 		group = inode->BlockRun().AllocationGroup() + 1;
1017 	}
1018 
1019 	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
1020 }
1021 
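// The placement policy implemented by Allocate() above, condensed (this
// summary only restates the cases in the code):
#if 0
	// appending to a file still in its direct range:  same group, right
	//                                 after the last direct block run
	// directory or symlink data:      same group as the inode, right after
	//                                 the inode itself
	// first data of a plain file:     one allocation group past the inode's
#endif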
1022 
1023 status_t
1024 BlockAllocator::Free(Transaction& transaction, block_run run)
1025 {
1026 	RecursiveLocker lock(fLock);
1027 
1028 	int32 group = run.AllocationGroup();
1029 	uint16 start = run.Start();
1030 	uint16 length = run.Length();
1031 
1032 	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start,
1033 		length));
1034 	T(Free(run));
1035 
1036 	// We don't use Volume::IsValidBlockRun() here because we can check better
1037 	// against the group size (the last group may have a different length)
1038 	if (group < 0 || group >= fNumGroups
1039 		|| start > fGroups[group].NumBits()
1040 		|| uint32(start + length) > fGroups[group].NumBits()
1041 		|| length == 0) {
1042 		FATAL(("tried to free an invalid block_run (%d, %u, %u)\n", (int)group,
1043 			start, length));
1044 		DEBUGGER(("tried to free invalid block_run"));
1045 		return B_BAD_VALUE;
1046 	}
1047 	// check if someone tries to free reserved areas at the beginning of the
1048 	// drive
1049 	if (group == 0
1050 		&& start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
1051 		FATAL(("tried to free a reserved block_run (%d, %u, %u)\n", (int)group,
1052 			start, length));
1053 		DEBUGGER(("tried to free reserved block"));
1054 		return B_BAD_VALUE;
1055 	}
1056 #ifdef DEBUG
1057 	if (CheckBlockRun(run) != B_OK)
1058 		return B_BAD_DATA;
1059 #endif
1060 
1061 	CHECK_ALLOCATION_GROUP(group);
1062 
1063 	if (fGroups[group].Free(transaction, start, length) != B_OK)
1064 		RETURN_ERROR(B_IO_ERROR);
1065 
1066 	CHECK_ALLOCATION_GROUP(group);
1067 
1068 #ifdef DEBUG
1069 	if (CheckBlockRun(run, NULL, false) != B_OK) {
1070 		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just "
1071 			"freed)\n"));
1072 	}
1073 #endif
1074 
1075 	fVolume->SuperBlock().used_blocks =
1076 		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
1077 	return B_OK;
1078 }
1079 
1080 
1081 size_t
1082 BlockAllocator::BitmapSize() const
1083 {
1084 	return fVolume->BlockSize() * fNumBlocks;
1085 }
1086 
1087 
1088 #ifdef DEBUG_FRAGMENTER
1089 void
1090 BlockAllocator::Fragment()
1091 {
1092 	AllocationBlock cached(fVolume);
1093 	RecursiveLocker lock(fLock);
1094 
1095 	// only leave 4 block holes
1096 	static const uint32 kMask = 0x0f0f0f0f;
1097 	uint32 valuesPerBlock = fVolume->BlockSize() / 4;
1098 
1099 	for (int32 i = 0; i < fNumGroups; i++) {
1100 		AllocationGroup& group = fGroups[i];
1101 
1102 		for (uint32 block = 0; block < group.NumBlocks(); block++) {
1103 			Transaction transaction(fVolume, 0);
1104 
1105 			if (cached.SetToWritable(transaction, group, block) != B_OK)
1106 				return;
1107 
1108 			for (int32 index = 0; index < valuesPerBlock; index++) {
1109 				cached.Block(index) |= HOST_ENDIAN_TO_BFS_INT32(kMask);
1110 			}
1111 
1112 			transaction.Done();
1113 		}
1114 	}
1115 }
1116 #endif	// DEBUG_FRAGMENTER
1117 
1118 
1119 #ifdef DEBUG_ALLOCATION_GROUPS
1120 void
1121 BlockAllocator::_CheckGroup(int32 groupIndex) const
1122 {
1123 	AllocationBlock cached(fVolume);
1124 	ASSERT_LOCKED_RECURSIVE(&fLock);
1125 
1126 	AllocationGroup& group = fGroups[groupIndex];
1127 
1128 	int32 currentStart = 0, currentLength = 0;
1129 	int32 firstFree = -1;
1130 	int32 largestStart = -1;
1131 	int32 largestLength = 0;
1132 	int32 currentBit = 0;
1133 
1134 	for (uint32 block = 0; block < group.NumBlocks(); block++) {
1135 		if (cached.SetTo(group, block) < B_OK) {
1136 			panic("setting group block %d failed\n", (int)block);
1137 			return;
1138 		}
1139 
1140 		for (uint32 bit = 0; bit < cached.NumBlockBits(); bit++) {
1141 			if (!cached.IsUsed(bit)) {
1142 				if (firstFree < 0) {
1143 					firstFree = currentBit;
1144 					if (!group.fLargestValid) {
1145 						if (firstFree >= 0 && firstFree < group.fFirstFree) {
1146 							// mostly harmless but noteworthy
1147 							dprintf("group %d first free too late: should be "
1148 								"%d, is %d\n", (int)groupIndex, (int)firstFree,
1149 								(int)group.fFirstFree);
1150 						}
1151 						return;
1152 					}
1153 				}
1154 
1155 				if (currentLength == 0) {
1156 					// start new range
1157 					currentStart = currentBit;
1158 				}
1159 				currentLength++;
1160 			} else if (currentLength) {
1161 				// end of a range
1162 				if (currentLength > largestLength) {
1163 					largestStart = currentStart;
1164 					largestLength = currentLength;
1165 				}
1166 				currentLength = 0;
1167 			}
1168 			currentBit++;
1169 		}
1170 	}
1171 
1172 	if (currentLength > largestLength) {
1173 		largestStart = currentStart;
1174 		largestLength = currentLength;
1175 	}
1176 
1177 	if (firstFree >= 0 && firstFree < group.fFirstFree) {
1178 		// mostly harmless but noteworthy
1179 		dprintf("group %d first free too late: should be %d, is %d\n",
1180 			(int)groupIndex, (int)firstFree, (int)group.fFirstFree);
1181 	}
1182 	if (group.fLargestValid
1183 		&& (largestStart != group.fLargestStart
1184 			|| largestLength != group.fLargestLength)) {
1185 		panic("bfs %p: group %d largest differs: %d.%d, checked %d.%d.\n",
1186 			fVolume, (int)groupIndex, (int)group.fLargestStart,
1187 			(int)group.fLargestLength, (int)largestStart, (int)largestLength);
1188 	}
1189 }
1190 #endif	// DEBUG_ALLOCATION_GROUPS
1191 
1192 
1193 //	#pragma mark - Bitmap validity checking
1194 
1195 // TODO: implement new FS checking API
1196 // Functions to check the validity of the bitmap - they are used by
1197 // the "checkfs" command (since this does even a bit more than that,
1198 // maybe we should move it some place else?)
1199 
1200 
1201 bool
1202 BlockAllocator::_IsValidCheckControl(const check_control* control)
1203 {
1204 	if (control == NULL
1205 		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
1206 		FATAL(("invalid check_control (%p)!\n", control));
1207 		return false;
1208 	}
1209 
1210 	return true;
1211 }
1212 
1213 
1214 status_t
1215 BlockAllocator::StartChecking(const check_control* control)
1216 {
1217 	if (!_IsValidCheckControl(control))
1218 		return B_BAD_VALUE;
1219 
1220 	fVolume->GetJournal(0)->Lock(NULL, true);
1221 		// Lock the volume's journal
1222 
1223 	recursive_lock_lock(&fLock);
1224 
1225 	size_t size = BitmapSize();
1226 	fCheckBitmap = (uint32*)malloc(size);
1227 	if (fCheckBitmap == NULL) {
1228 		recursive_lock_unlock(&fLock);
1229 		fVolume->GetJournal(0)->Unlock(NULL, true);
1230 		return B_NO_MEMORY;
1231 	}
1232 
1233 	fCheckCookie = new(std::nothrow) check_cookie();
1234 	if (fCheckCookie == NULL) {
1235 		free(fCheckBitmap);
1236 		fCheckBitmap = NULL;
1237 		recursive_lock_unlock(&fLock);
1238 		fVolume->GetJournal(0)->Unlock(NULL, true);
1239 
1240 		return B_NO_MEMORY;
1241 	}
1242 
1243 	memcpy(&fCheckCookie->control, control, sizeof(check_control));
1244 	memset(&fCheckCookie->control.stats, 0, sizeof(control->stats));
1245 
1246 	// initialize bitmap
1247 	memset(fCheckBitmap, 0, size);
1248 	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length();
1249 			block-- > 0;) {
1250 		_SetCheckBitmapAt(block);
1251 	}
1252 
1253 	fCheckCookie->pass = BFS_CHECK_PASS_BITMAP;
1254 	fCheckCookie->stack.Push(fVolume->Root());
1255 	fCheckCookie->stack.Push(fVolume->Indices());
1256 	fCheckCookie->iterator = NULL;
1257 	fCheckCookie->control.stats.block_size = fVolume->BlockSize();
1258 
1259 	// Push removed vnodes onto the stack -- they are not reachable by
1260 	// traversing the file system anymore.
1261 	InodeList::Iterator iterator = fVolume->RemovedInodes().GetIterator();
1262 	while (Inode* inode = iterator.Next()) {
1263 		fCheckCookie->stack.Push(inode->BlockRun());
1264 	}
1265 
1266 	// TODO: check reserved area in bitmap!
1267 
1268 	return B_OK;
1269 }
1270 
1271 
1272 status_t
1273 BlockAllocator::StopChecking(check_control* control)
1274 {
1275 	if (fCheckCookie == NULL)
1276 		return B_NO_INIT;
1277 
1278 	if (fCheckCookie->iterator != NULL) {
1279 		delete fCheckCookie->iterator;
1280 		fCheckCookie->iterator = NULL;
1281 
1282 		// the current directory inode is still locked in memory
1283 		put_vnode(fVolume->FSVolume(), fVolume->ToVnode(fCheckCookie->current));
1284 	}
1285 
1286 	if (fVolume->IsReadOnly()) {
1287 		// We can't fix errors on this volume
1288 		fCheckCookie->control.flags = 0;
1289 	}
1290 
1291 	if (fCheckCookie->control.status != B_ENTRY_NOT_FOUND)
1292 		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));
1293 
1294 	switch (fCheckCookie->pass) {
1295 		case BFS_CHECK_PASS_BITMAP:
1296 			// if CheckNextNode() managed to run all the way through, we
1297 			// can fix any damage to the bitmap
1298 			if (fCheckCookie->control.status == B_ENTRY_NOT_FOUND)
1299 				_WriteBackCheckBitmap();
1300 			break;
1301 
1302 		case BFS_CHECK_PASS_INDEX:
1303 			_FreeIndices();
1304 			break;
1305 	}
1306 
1307 	fVolume->SetCheckingThread(-1);
1308 
1309 	if (control != NULL)
1310 		user_memcpy(control, &fCheckCookie->control, sizeof(check_control));
1311 
1312 	free(fCheckBitmap);
1313 	fCheckBitmap = NULL;
1314 	delete fCheckCookie;
1315 	fCheckCookie = NULL;
1316 	recursive_lock_unlock(&fLock);
1317 	fVolume->GetJournal(0)->Unlock(NULL, true);
1318 
1319 	return B_OK;
1320 }
1321 
1322 
1323 status_t
1324 BlockAllocator::CheckNextNode(check_control* control)
1325 {
1326 	if (fCheckCookie == NULL)
1327 		return B_NO_INIT;
1328 
1329 	fVolume->SetCheckingThread(find_thread(NULL));
1330 
1331 	// Make sure the user control is copied on exit
1332 	class CopyControlOnExit {
1333 	public:
1334 		CopyControlOnExit(check_control* source, check_control* userTarget)
1335 			:
1336 			fSource(source),
1337 			fTarget(userTarget)
1338 		{
1339 		}
1340 
1341 		~CopyControlOnExit()
1342 		{
1343 			if (fTarget != NULL)
1344 				user_memcpy(fTarget, fSource, sizeof(check_control));
1345 		}
1346 
1347 	private:
1348 		check_control*	fSource;
1349 		check_control*	fTarget;
1350 	} copyControl(&fCheckCookie->control, control);
1351 
1352 	while (true) {
1353 		if (fCheckCookie->iterator == NULL) {
1354 			if (!fCheckCookie->stack.Pop(&fCheckCookie->current)) {
1355 				// No more runs on the stack, we might be finished!
1356 				if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1357 					&& !fCheckCookie->indices.IsEmpty()) {
1358 					// Start second pass to repair indices
1359 					_WriteBackCheckBitmap();
1360 
1361 					fCheckCookie->pass = BFS_CHECK_PASS_INDEX;
1362 					fCheckCookie->control.pass = BFS_CHECK_PASS_INDEX;
1363 
1364 					status_t status = _PrepareIndices();
1365 					if (status != B_OK) {
1366 						fCheckCookie->control.status = status;
1367 						return status;
1368 					}
1369 
1370 					fCheckCookie->stack.Push(fVolume->Root());
1371 					continue;
1372 				}
1373 
1374 				fCheckCookie->control.status = B_ENTRY_NOT_FOUND;
1375 				return B_ENTRY_NOT_FOUND;
1376 			}
1377 
1378 			Vnode vnode(fVolume, fCheckCookie->current);
1379 			Inode* inode;
1380 			if (vnode.Get(&inode) != B_OK) {
1381 				FATAL(("check: Could not open inode at %" B_PRIdOFF "\n",
1382 					fVolume->ToBlock(fCheckCookie->current)));
1383 				continue;
1384 			}
1385 
1386 			fCheckCookie->control.inode = inode->ID();
1387 			fCheckCookie->control.mode = inode->Mode();
1388 
1389 			if (!inode->IsContainer()) {
1390 				// Check file
1391 				fCheckCookie->control.errors = 0;
1392 				fCheckCookie->control.status = CheckInode(inode, NULL);
1393 
1394 				if (inode->GetName(fCheckCookie->control.name) < B_OK)
1395 					strcpy(fCheckCookie->control.name, "(node has no name)");
1396 
1397 				return B_OK;
1398 			}
1399 
1400 			// Check directory
1401 			BPlusTree* tree = inode->Tree();
1402 			if (tree == NULL) {
1403 				FATAL(("check: could not open b+tree from inode at %" B_PRIdOFF
1404 					"\n", fVolume->ToBlock(fCheckCookie->current)));
1405 				continue;
1406 			}
1407 
1408 			fCheckCookie->parent = inode;
1409 			fCheckCookie->parent_mode = inode->Mode();
1410 
1411 			// get iterator for the next directory
1412 			fCheckCookie->iterator = new(std::nothrow) TreeIterator(tree);
1413 			if (fCheckCookie->iterator == NULL)
1414 				RETURN_ERROR(B_NO_MEMORY);
1415 
1416 			// the inode must stay locked in memory until the iterator is freed
1417 			vnode.Keep();
1418 
1419 			// check the inode of the directory
1420 			fCheckCookie->control.errors = 0;
1421 			fCheckCookie->control.status = CheckInode(inode, NULL);
1422 
1423 			if (inode->GetName(fCheckCookie->control.name) != B_OK)
1424 				strcpy(fCheckCookie->control.name, "(dir has no name)");
1425 
1426 			return B_OK;
1427 		}
1428 
1429 		char name[B_FILE_NAME_LENGTH];
1430 		uint16 length;
1431 		ino_t id;
1432 
1433 		status_t status = fCheckCookie->iterator->GetNextEntry(name, &length,
1434 			B_FILE_NAME_LENGTH, &id);
1435 		if (status != B_OK) {
1436 			// we no longer need this iterator
1437 			delete fCheckCookie->iterator;
1438 			fCheckCookie->iterator = NULL;
1439 
1440 			// unlock the directory's inode from memory
1441 			put_vnode(fVolume->FSVolume(),
1442 				fVolume->ToVnode(fCheckCookie->current));
1443 
1444 			if (status == B_ENTRY_NOT_FOUND) {
1445 				// We iterated over all entries already, just go on to the next
1446 				continue;
1447 			}
1448 
1449 			// Iterating over the B+tree failed - we let the checkfs run
1450 			// fail completely, as we would otherwise delete all files we
1451 			// cannot access.
1452 			// TODO: maybe have a force parameter that actually does that.
1453 			// TODO: we also need to be able to repair broken B+trees!
1454 			return status;
1455 		}
1456 
1457 		// ignore "." and ".." entries
1458 		if (!strcmp(name, ".") || !strcmp(name, ".."))
1459 			continue;
1460 
1461 		// fill in the control data as soon as we have them
1462 		strlcpy(fCheckCookie->control.name, name, B_FILE_NAME_LENGTH);
1463 		fCheckCookie->control.inode = id;
1464 		fCheckCookie->control.errors = 0;
1465 
1466 		Vnode vnode(fVolume, id);
1467 		Inode* inode;
1468 		if (vnode.Get(&inode) != B_OK) {
1469 			FATAL(("Could not open inode ID %" B_PRIdINO "!\n", id));
1470 			fCheckCookie->control.errors |= BFS_COULD_NOT_OPEN;
1471 
1472 			if ((fCheckCookie->control.flags & BFS_REMOVE_INVALID) != 0) {
1473 				status = _RemoveInvalidNode(fCheckCookie->parent,
1474 					fCheckCookie->iterator->Tree(), NULL, name);
1475 			} else
1476 				status = B_ERROR;
1477 
1478 			fCheckCookie->control.status = status;
1479 			return B_OK;
1480 		}
1481 
1482 		// check if the inode's name is the same as in the b+tree
1483 		if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1484 			&& inode->IsRegularNode()) {
1485 			RecursiveLocker locker(inode->SmallDataLock());
1486 			NodeGetter node(fVolume, inode);
1487 
1488 			const char* localName = inode->Name(node.Node());
1489 			if (localName == NULL || strcmp(localName, name)) {
1490 				fCheckCookie->control.errors |= BFS_NAMES_DONT_MATCH;
1491 				FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name,
1492 					localName));
1493 
1494 				if ((fCheckCookie->control.flags & BFS_FIX_NAME_MISMATCHES)
1495 						!= 0) {
1496 					// Rename the inode
1497 					Transaction transaction(fVolume, inode->BlockNumber());
1498 
1499 					// Note, this may need extra blocks, but the inode will
1500 					// only be checked afterwards, so that it won't be lost
1501 					status = inode->SetName(transaction, name);
1502 					if (status == B_OK)
1503 						status = inode->WriteBack(transaction);
1504 					if (status == B_OK)
1505 						status = transaction.Done();
1506 					if (status != B_OK) {
1507 						fCheckCookie->control.status = status;
1508 						return B_OK;
1509 					}
1510 				}
1511 			}
1512 		}
1513 
1514 		fCheckCookie->control.mode = inode->Mode();
1515 
1516 		// Check for the correct mode of the node (if the mode of the
1517 		// file doesn't fit its parent, there is a serious problem)
1518 		if (fCheckCookie->pass == BFS_CHECK_PASS_BITMAP
1519 			&& (((fCheckCookie->parent_mode & S_ATTR_DIR) != 0
1520 					&& !inode->IsAttribute())
1521 				|| ((fCheckCookie->parent_mode & S_INDEX_DIR) != 0
1522 					&& !inode->IsIndex())
1523 				|| (is_directory(fCheckCookie->parent_mode)
1524 					&& !inode->IsRegularNode()))) {
1525 			FATAL(("inode at %" B_PRIdOFF " is of wrong type: %o (parent "
1526 				"%o at %" B_PRIdOFF ")!\n", inode->BlockNumber(),
1527 				inode->Mode(), fCheckCookie->parent_mode,
1528 				fCheckCookie->parent->BlockNumber()));
1529 
1530 			// if we are allowed to fix errors, we should remove the file
1531 			if ((fCheckCookie->control.flags & BFS_REMOVE_WRONG_TYPES) != 0
1532 				&& (fCheckCookie->control.flags & BFS_FIX_BITMAP_ERRORS)
1533 						!= 0) {
1534 				status = _RemoveInvalidNode(fCheckCookie->parent, NULL,
1535 					inode, name);
1536 			} else
1537 				status = B_ERROR;
1538 
1539 			fCheckCookie->control.errors |= BFS_WRONG_TYPE;
1540 			fCheckCookie->control.status = status;
1541 			return B_OK;
1542 		}
1543 
1544 		// push the directory on the stack so that it will be scanned later
1545 		if (inode->IsContainer() && !inode->IsIndex())
1546 			fCheckCookie->stack.Push(inode->BlockRun());
1547 		else {
1548 			// check it now
1549 			fCheckCookie->control.status = CheckInode(inode, name);
1550 			return B_OK;
1551 		}
1552 	}
1553 	// is never reached
1554 }
1555 
1556 
1557 status_t
1558 BlockAllocator::_RemoveInvalidNode(Inode* parent, BPlusTree* tree, Inode* inode,
1559 	const char* name)
1560 {
1561 	// It's safe to start a transaction, because Inode::Remove()
1562 	// won't touch the block bitmap (which we hold the lock for)
1563 	// if we set the INODE_DONT_FREE_SPACE flag - since we fix
1564 	// the bitmap anyway.
1565 	Transaction transaction(fVolume, parent->BlockNumber());
1566 	status_t status;
1567 
1568 	if (inode != NULL) {
1569 		inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DONT_FREE_SPACE);
1570 
1571 		status = parent->Remove(transaction, name, NULL, false, true);
1572 	} else {
1573 		parent->WriteLockInTransaction(transaction);
1574 
1575 		// does the file even exist?
1576 		off_t id;
1577 		status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1578 		if (status == B_OK)
1579 			status = tree->Remove(transaction, name, id);
1580 	}
1581 
1582 	if (status == B_OK) {
1583 		entry_cache_remove(fVolume->ID(), parent->ID(), name);
1584 		transaction.Done();
1585 	}
1586 
1587 	return status;
1588 }
1589 
1590 
1591 bool
1592 BlockAllocator::_CheckBitmapIsUsedAt(off_t block) const
1593 {
1594 	size_t size = BitmapSize();
1595 	uint32 index = block / 32;	// 32bit resolution
1596 	if (index >= size / 4)
1597 		return false;
1598 
1599 	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index])
1600 		& (1UL << (block & 0x1f));
1601 }
1602 
1603 
1604 void
1605 BlockAllocator::_SetCheckBitmapAt(off_t block)
1606 {
1607 	size_t size = BitmapSize();
1608 	uint32 index = block / 32;	// 32bit resolution
1609 	if (index >= size / 4)
1610 		return;
1611 
1612 	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
1613 }
1614 
1615 
1616 status_t
1617 BlockAllocator::_WriteBackCheckBitmap()
1618 {
1619 	if (fVolume->IsReadOnly())
1620 		return B_OK;
1621 
1622 	// calculate the number of used blocks in the check bitmap
1623 	size_t size = BitmapSize();
1624 	off_t usedBlocks = 0LL;
1625 
1626 	// TODO: update the allocation groups used blocks info
1627 	for (uint32 i = size >> 2; i-- > 0;) {
1628 		uint32 compare = 1;
1629 		// Count the number of bits set
1630 		for (int16 j = 0; j < 32; j++, compare <<= 1) {
1631 			if ((compare & fCheckBitmap[i]) != 0)
1632 				usedBlocks++;
1633 		}
1634 	}
1635 
1636 	fCheckCookie->control.stats.freed = fVolume->UsedBlocks() - usedBlocks
1637 		+ fCheckCookie->control.stats.missing;
1638 	if (fCheckCookie->control.stats.freed < 0)
1639 		fCheckCookie->control.stats.freed = 0;
1640 
1641 	// Should we fix errors? Were there any errors we can fix?
1642 	if ((fCheckCookie->control.flags & BFS_FIX_BITMAP_ERRORS) != 0
1643 		&& (fCheckCookie->control.stats.freed != 0
1644 			|| fCheckCookie->control.stats.missing != 0)) {
1645 		// If so, write the check bitmap back over the original one,
1646 		// and use transactions here to play safe - we even use several
1647 		// transactions, so that we don't blow the maximum log size
1648 		// on large disks, since we don't need to make this atomic.
1649 #if 0
1650 		// prints the blocks that differ
1651 		off_t block = 0;
1652 		for (int32 i = 0; i < fNumGroups; i++) {
1653 			AllocationBlock cached(fVolume);
1654 			for (uint32 j = 0; j < fGroups[i].NumBlocks(); j++) {
1655 				cached.SetTo(fGroups[i], j);
1656 				for (uint32 k = 0; k < cached.NumBlockBits(); k++) {
1657 					if (cached.IsUsed(k) != _CheckBitmapIsUsedAt(block)) {
1658 						dprintf("differ block %lld (should be %d)\n", block,
1659 							_CheckBitmapIsUsedAt(block));
1660 					}
1661 					block++;
1662 				}
1663 			}
1664 		}
1665 #endif
1666 
1667 		fVolume->SuperBlock().used_blocks
1668 			= HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
1669 
1670 		size_t blockSize = fVolume->BlockSize();
1671 
1672 		for (uint32 i = 0; i < fNumBlocks; i += 512) {
1673 			Transaction transaction(fVolume, 1 + i);
1674 
1675 			uint32 blocksToWrite = 512;
1676 			if (blocksToWrite + i > fNumBlocks)
1677 				blocksToWrite = fNumBlocks - i;
1678 
1679 			status_t status = transaction.WriteBlocks(1 + i,
1680 				(uint8*)fCheckBitmap + i * blockSize, blocksToWrite);
1681 			if (status < B_OK) {
1682 				FATAL(("error writing bitmap: %s\n", strerror(status)));
1683 				return status;
1684 			}
1685 			transaction.Done();
1686 		}
1687 	}
1688 
1689 	return B_OK;
1690 }
1691 
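// The bit counting in _WriteBackCheckBitmap() above is a straightforward
// per-word popcount. An equivalent standalone sketch (illustrative; a
// compiler builtin could be used instead):
#if 0
static uint32
count_set_bits(uint32 word)
{
	uint32 count = 0;
	for (uint32 compare = 1; compare != 0; compare <<= 1) {
		if ((word & compare) != 0)
			count++;
	}
	return count;
}
#endif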
1692 
1693 /*!	Checks whether the specified block range is allocated or free,
1694 	depending on the \a allocated argument.
1695 */
1696 status_t
1697 BlockAllocator::CheckBlocks(off_t start, off_t length, bool allocated)
1698 {
1699 	if (start < 0 || start + length > fVolume->NumBlocks())
1700 		return B_BAD_VALUE;
1701 
1702 	int32 group = start >> fVolume->AllocationGroupShift();
1703 	uint32 bitmapBlock = start / (fVolume->BlockSize() << 3);
1704 	uint32 blockOffset = start % (fVolume->BlockSize() << 3);
1705 
1706 	uint32 groupBlock = bitmapBlock % fBlocksPerGroup;
1707 
1708 	AllocationBlock cached(fVolume);
1709 
1710 	while (groupBlock < fGroups[group].NumBlocks() && length > 0) {
1711 		if (cached.SetTo(fGroups[group], groupBlock) != B_OK)
1712 			RETURN_ERROR(B_IO_ERROR);
1713 
1714 		for (; blockOffset < cached.NumBlockBits() && length > 0;
1715 				blockOffset++, length--) {
1716 			if (cached.IsUsed(blockOffset) != allocated) {
1717 				RETURN_ERROR(B_BAD_DATA);
1718 			}
1719 		}
1720 
1721 		blockOffset = 0;
1722 
1723 		if (++groupBlock >= fGroups[group].NumBlocks()) {
1724 			groupBlock = 0;
1725 			group++;
1726 		}
1727 	}
1728 
1729 	return B_OK;
1730 }
1731 
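// The decomposition at the top of CheckBlocks() maps an absolute block
// number to an allocation group, a bitmap block, and a bit offset. Example
// values (hypothetical, assuming 2048 byte blocks and an allocation group
// shift of 14, i.e. 16384 blocks per group):
#if 0
	// start        = 50000
	// group        = 50000 >> 14    = 3
	// bitmapBlock  = 50000 / 16384  = 3
	// blockOffset  = 50000 % 16384  = 848
	// (with one bitmap block per group here, groupBlock = 3 % 1 = 0)
#endif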
1732 
1733 status_t
1734 BlockAllocator::CheckBlockRun(block_run run, const char* type, bool allocated)
1735 {
1736 	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
1737 		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
1738 		|| uint32(run.Start() + run.Length())
1739 				> fGroups[run.AllocationGroup()].fNumBits
1740 		|| run.length == 0) {
1741 		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type,
1742 			run.AllocationGroup(), run.Start(), run.Length()));
1743 		if (fCheckCookie == NULL)
1744 			return B_BAD_DATA;
1745 
1746 		fCheckCookie->control.errors |= BFS_INVALID_BLOCK_RUN;
1747 		return B_OK;
1748 	}
1749 
1750 	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
1751 	uint32 block = run.Start() / bitsPerBlock;
1752 	uint32 pos = run.Start() % bitsPerBlock;
1753 	int32 length = 0;
1754 	off_t firstMissing = -1, firstSet = -1;
1755 	off_t firstGroupBlock
1756 		= (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();
1757 
1758 	AllocationBlock cached(fVolume);
1759 
1760 	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
1761 		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
1762 			RETURN_ERROR(B_IO_ERROR);
1763 
1764 		if (pos >= cached.NumBlockBits()) {
1765 			// something very strange has happened...
1766 			RETURN_ERROR(B_ERROR);
1767 		}
1768 
1769 		while (length < run.Length() && pos < cached.NumBlockBits()) {
1770 			if (cached.IsUsed(pos) != allocated) {
1771 				if (fCheckCookie == NULL) {
1772 					PRINT(("%s: block_run(%ld, %u, %u) is only partially "
1773 						"allocated (pos = %ld, length = %ld)!\n", type,
1774 						run.AllocationGroup(), run.Start(), run.Length(),
1775 						pos, length));
1776 					return B_BAD_DATA;
1777 				}
1778 				if (firstMissing == -1) {
1779 					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
1780 					fCheckCookie->control.errors |= BFS_MISSING_BLOCKS;
1781 				}
1782 				fCheckCookie->control.stats.missing++;
1783 			} else if (firstMissing != -1) {
1784 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are "
1785 					"%sallocated!\n", type, run.AllocationGroup(), run.Start(),
1786 					run.Length(), firstMissing,
1787 					firstGroupBlock + pos + block * bitsPerBlock - 1,
1788 					allocated ? "not " : ""));
1789 				firstMissing = -1;
1790 			}
1791 
1792 			if (fCheckBitmap != NULL) {
1793 				// Set the block in the check bitmap as well, but have a look
1794 				// if it is already allocated first
1795 				uint32 offset = pos + block * bitsPerBlock;
1796 				if (_CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
1797 					if (firstSet == -1) {
1798 						firstSet = firstGroupBlock + offset;
1799 						fCheckCookie->control.errors |= BFS_BLOCKS_ALREADY_SET;
1800 						dprintf("block %" B_PRIdOFF " is already set!!!\n",
1801 							firstGroupBlock + offset);
1802 					}
1803 					fCheckCookie->control.stats.already_set++;
1804 				} else {
1805 					if (firstSet != -1) {
1806 						FATAL(("%s: block_run(%d, %u, %u): blocks %" B_PRIdOFF
1807 							" - %" B_PRIdOFF " are already set!\n", type,
1808 							(int)run.AllocationGroup(), run.Start(),
1809 							run.Length(), firstSet,
1810 							firstGroupBlock + offset - 1));
1811 						firstSet = -1;
1812 					}
1813 					_SetCheckBitmapAt(firstGroupBlock + offset);
1814 				}
1815 			}
1816 			length++;
1817 			pos++;
1818 		}
1819 
1820 		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
1821 			if (firstMissing != -1) {
1822 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not "
1823 					"allocated!\n", type, run.AllocationGroup(), run.Start(),
1824 					run.Length(), firstMissing,
1825 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1826 			}
1827 			if (firstSet != -1) {
1828 				FATAL(("%s: block_run(%d, %u, %u): blocks %" B_PRIdOFF " - %"
1829 					B_PRIdOFF " are already set!\n", type,
1830 					(int)run.AllocationGroup(), run.Start(), run.Length(),
1831 					firstSet,
1832 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1833 			}
1834 		}
1835 	}
1836 
1837 	return B_OK;
1838 }
1839 
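// A worked example of the bitmap arithmetic in CheckBlockRun() above, with
// illustrative values only (assuming a 2048-byte block size; nothing here
// is used by the actual code):
//
//	uint32 bitsPerBlock = 2048 << 3;			// 16384 bits per bitmap block
//	block_run run = block_run::Run(1, 20000, 3);
//	uint32 block = run.Start() / bitsPerBlock;	// bitmap block 1 of the group
//	uint32 pos = run.Start() % bitsPerBlock;	// bit 3616 within that block
//
// The absolute disk block a bit refers to is then
// firstGroupBlock + block * bitsPerBlock + pos, with firstGroupBlock being
// the allocation group index shifted left by the volume's allocation group
// shift - exactly the offsets the PRINT()/FATAL() messages above report.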
1840 
1841 status_t
1842 BlockAllocator::CheckInode(Inode* inode, const char* name)
1843 {
1844 	if (fCheckCookie != NULL && fCheckBitmap == NULL)
1845 		return B_NO_INIT;
1846 	if (inode == NULL)
1847 		return B_BAD_VALUE;
1848 
1849 	switch (fCheckCookie->pass) {
1850 		case BFS_CHECK_PASS_BITMAP:
1851 		{
1852 			status_t status = _CheckInodeBlocks(inode, name);
1853 			if (status != B_OK)
1854 				return status;
1855 
1856 			// Check the B+tree as well
1857 			if (inode->IsContainer()) {
1858 				bool repairErrors
1859 					= (fCheckCookie->control.flags & BFS_FIX_BPLUSTREES) != 0;
1860 				bool errorsFound = false;
1861 				status = inode->Tree()->Validate(repairErrors, errorsFound);
1862 				if (errorsFound) {
1863 					fCheckCookie->control.errors |= BFS_INVALID_BPLUSTREE;
1864 					if (inode->IsIndex() && name != NULL && repairErrors) {
1865 						// We completely rebuild corrupt indices
1866 						check_index* index = new(std::nothrow) check_index;
1867 						if (index == NULL)
1868 							return B_NO_MEMORY;
1869 
1870 						strlcpy(index->name, name, sizeof(index->name));
1871 						index->run = inode->BlockRun();
1872 						fCheckCookie->indices.Push(index);
1873 					}
1874 				}
1875 			}
1876 
1877 			return status;
1878 		}
1879 
1880 		case BFS_CHECK_PASS_INDEX:
1881 			return _AddInodeToIndex(inode);
1882 	}
1883 
1884 	return B_OK;
1885 }
1886 
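// How the two passes of CheckInode() fit together, as a hedged sketch; the
// real iteration over the inodes is driven by the checkfs control calls,
// and NextInode() below merely stands in for that traversal:
//
//	// pass 1: verify bitmap usage; corrupt indices are queued as a side
//	// effect in fCheckCookie->indices
//	fCheckCookie->pass = BFS_CHECK_PASS_BITMAP;
//	while ((inode = NextInode()) != NULL)
//		CheckInode(inode, name);
//
//	// pass 2: after _PrepareIndices() has emptied the queued indices,
//	// every inode only has to be re-inserted via _AddInodeToIndex()
//	fCheckCookie->pass = BFS_CHECK_PASS_INDEX;
//	while ((inode = NextInode()) != NULL)
//		CheckInode(inode, name);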
1887 
1888 status_t
1889 BlockAllocator::_CheckInodeBlocks(Inode* inode, const char* name)
1890 {
1891 	status_t status = CheckBlockRun(inode->BlockRun(), "inode");
1892 	if (status != B_OK)
1893 		return status;
1894 
1895 	// If the inode has an attribute directory, push it on the stack
1896 	if (!inode->Attributes().IsZero())
1897 		fCheckCookie->stack.Push(inode->Attributes());
1898 
1899 	if (inode->IsSymLink() && (inode->Flags() & INODE_LONG_SYMLINK) == 0) {
1900 		// short symlinks have no data stream; the target is stored inline
1901 		if (strlen(inode->Node().short_symlink) >= SHORT_SYMLINK_NAME_LENGTH)
1902 			return B_BAD_DATA;
1903 
1904 		return B_OK;
1905 	}
1906 
1907 	data_stream* data = &inode->Node().data;
1908 
1909 	// check the direct range
1910 
1911 	if (data->max_direct_range) {
1912 		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1913 			if (data->direct[i].IsZero())
1914 				break;
1915 
1916 			status = CheckBlockRun(data->direct[i], "direct");
1917 			if (status < B_OK)
1918 				return status;
1919 
1920 			fCheckCookie->control.stats.direct_block_runs++;
1921 			fCheckCookie->control.stats.blocks_in_direct
1922 				+= data->direct[i].Length();
1923 		}
1924 	}
1925 
1926 	CachedBlock cached(fVolume);
1927 
1928 	// check the indirect range
1929 
1930 	if (data->max_indirect_range) {
1931 		status = CheckBlockRun(data->indirect, "indirect");
1932 		if (status < B_OK)
1933 			return status;
1934 
1935 		off_t block = fVolume->ToBlock(data->indirect);
1936 
1937 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1938 			block_run* runs = (block_run*)cached.SetTo(block + i);
1939 			if (runs == NULL)
1940 				RETURN_ERROR(B_IO_ERROR);
1941 
1942 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1943 			int32 index = 0;
1944 			for (; index < runsPerBlock; index++) {
1945 				if (runs[index].IsZero())
1946 					break;
1947 
1948 				status = CheckBlockRun(runs[index], "indirect->run");
1949 				if (status < B_OK)
1950 					return status;
1951 
1952 				fCheckCookie->control.stats.indirect_block_runs++;
1953 				fCheckCookie->control.stats.blocks_in_indirect
1954 					+= runs[index].Length();
1955 			}
1956 			fCheckCookie->control.stats.indirect_array_blocks++;
1957 
1958 			if (index < runsPerBlock)
1959 				break;
1960 		}
1961 	}
1962 
1963 	// check the double indirect range
1964 
1965 	if (data->max_double_indirect_range) {
1966 		status = CheckBlockRun(data->double_indirect, "double indirect");
1967 		if (status != B_OK)
1968 			return status;
1969 
1970 		int32 runsPerBlock = runs_per_block(fVolume->BlockSize());
1971 		int32 runsPerArray = runsPerBlock * data->double_indirect.Length();
1972 
1973 		CachedBlock cachedDirect(fVolume);
1974 
1975 		for (int32 indirectIndex = 0; indirectIndex < runsPerArray;
1976 				indirectIndex++) {
1977 			// get the indirect array block
1978 			block_run* array = (block_run*)cached.SetTo(
1979 				fVolume->ToBlock(data->double_indirect)
1980 					+ indirectIndex / runsPerBlock);
1981 			if (array == NULL)
1982 				return B_IO_ERROR;
1983 
1984 			block_run indirect = array[indirectIndex % runsPerBlock];
1985 			// are we finished yet?
1986 			if (indirect.IsZero())
1987 				return B_OK;
1988 
1989 			status = CheckBlockRun(indirect, "double indirect->runs");
1990 			if (status != B_OK)
1991 				return status;
1992 
1993 			int32 maxIndex
1994 				= ((uint32)indirect.Length() << fVolume->BlockShift())
1995 					/ sizeof(block_run);
1996 
1997 			for (int32 index = 0; index < maxIndex; ) {
1998 				block_run* runs = (block_run*)cachedDirect.SetTo(
1999 					fVolume->ToBlock(indirect) + index / runsPerBlock);
2000 				if (runs == NULL)
2001 					return B_IO_ERROR;
2002 
2003 				do {
2004 					// are we finished yet?
2005 					if (runs[index % runsPerBlock].IsZero())
2006 						return B_OK;
2007 
2008 					status = CheckBlockRun(runs[index % runsPerBlock],
2009 						"double indirect->runs->run");
2010 					if (status != B_OK)
2011 						return status;
2012 
2013 					fCheckCookie->control.stats.double_indirect_block_runs++;
2014 					fCheckCookie->control.stats.blocks_in_double_indirect
2015 						+= runs[index % runsPerBlock].Length();
2016 				} while ((++index % runsPerBlock) != 0);
2017 			}
2018 
2019 			fCheckCookie->control.stats.double_indirect_array_blocks++;
2020 		}
2021 	}
2022 
2023 	return B_OK;
2024 }
2025 
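// The data stream geometry that _CheckInodeBlocks() walks, as a worked
// example (assuming a 2048-byte block size and the usual 4-block indirect
// runs; the numbers scale with the actual values on disk):
//
//	int32 runsPerBlock = 2048 / sizeof(block_run);	// 256 runs per block
//	// an indirect run of length 4 holds 4 * 256 = 1024 block_runs
//	int32 runsPerArray = runsPerBlock * 4;			// double indirect: 1024
//	// ... each of which names an indirect block of up to 256 more runs
//
// This is why the double indirect loop advances indirectIndex up to
// runsPerArray, but re-reads a new array block every runsPerBlock entries.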
2026 
2027 status_t
2028 BlockAllocator::_PrepareIndices()
2029 {
2030 	int32 count = 0;
2031 
2032 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2033 		check_index* index = fCheckCookie->indices.Array()[i];
2034 		Vnode vnode(fVolume, index->run);
2035 		Inode* inode;
2036 		status_t status = vnode.Get(&inode);
2037 		if (status != B_OK) {
2038 			FATAL(("check: Could not open index at %" B_PRIdOFF "\n",
2039 				fVolume->ToBlock(index->run)));
2040 			return status;
2041 		}
2042 
2043 		BPlusTree* tree = inode->Tree();
2044 		if (tree == NULL) {
2045 			// TODO: We can't yet repair those
2046 			continue;
2047 		}
2048 
2049 		status = tree->MakeEmpty();
2050 		if (status != B_OK)
2051 			return status;
2052 
2053 		index->inode = inode;
2054 		vnode.Keep();
2055 		count++;
2056 	}
2057 
2058 	return count == 0 ? B_ENTRY_NOT_FOUND : B_OK;
2059 }
2060 
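// _PrepareIndices() holds on to each successfully opened index vnode via
// vnode.Keep(), so the Inode pointers stay valid across the whole index
// pass; _FreeIndices() below is the matching release. A hedged sketch of
// that pairing (variables hypothetical):
//
//	Vnode vnode(fVolume, run);		// acquires the vnode
//	Inode* inode;
//	if (vnode.Get(&inode) == B_OK)
//		vnode.Keep();				// suppress the release in ~Vnode()
//	...
//	put_vnode(fVolume->FSVolume(), fVolume->ToVnode(run));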
2061 
2062 void
2063 BlockAllocator::_FreeIndices()
2064 {
2065 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2066 		check_index* index = fCheckCookie->indices.Array()[i];
2067 		if (index->inode != NULL) {
2068 			put_vnode(fVolume->FSVolume(),
2069 				fVolume->ToVnode(index->inode->BlockRun()));
2070 		}
2071 	}
2072 	fCheckCookie->indices.MakeEmpty();
2073 }
2074 
2075 
2076 status_t
2077 BlockAllocator::_AddInodeToIndex(Inode* inode)
2078 {
2079 	Transaction transaction(fVolume, inode->BlockNumber());
2080 
2081 	for (int32 i = 0; i < fCheckCookie->indices.CountItems(); i++) {
2082 		check_index* index = fCheckCookie->indices.Array()[i];
2083 		if (index->inode == NULL)
2084 			continue;
2085 
2086 		index->inode->WriteLockInTransaction(transaction);
2087 
2088 		BPlusTree* tree = index->inode->Tree();
2089 		if (tree == NULL)
2090 			return B_ERROR;
2091 
2092 		status_t status = B_OK;
2093 
2094 		if (!strcmp(index->name, "name")) {
2095 			if (inode->InNameIndex()) {
2096 				char name[B_FILE_NAME_LENGTH];
2097 				if (inode->GetName(name, B_FILE_NAME_LENGTH) != B_OK)
2098 					return B_ERROR;
2099 
2100 				status = tree->Insert(transaction, name, inode->ID());
2101 			}
2102 		} else if (!strcmp(index->name, "last_modified")) {
2103 			if (inode->InLastModifiedIndex()) {
2104 				status = tree->Insert(transaction, inode->OldLastModified(),
2105 					inode->ID());
2106 			}
2107 		} else if (!strcmp(index->name, "size")) {
2108 			if (inode->InSizeIndex())
2109 				status = tree->Insert(transaction, inode->Size(), inode->ID());
2110 		} else {
2111 			uint8 key[BPLUSTREE_MAX_KEY_LENGTH];
2112 			size_t keyLength = BPLUSTREE_MAX_KEY_LENGTH;
2113 			if (inode->ReadAttribute(index->name, B_ANY_TYPE, 0, key,
2114 					&keyLength) == B_OK) {
2115 				status = tree->Insert(transaction, key, keyLength, inode->ID());
2116 			}
2117 		}
2118 
2119 		if (status != B_OK)
2120 			return status;
2121 	}
2122 
2123 	return transaction.Done();
2124 }
2125 
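// BPlusTree::Insert() is overloaded per key type, which is what the
// strcmp() chain in _AddInodeToIndex() above relies on; the three flavors
// used there are:
//
//	tree->Insert(transaction, name, inode->ID());			// string key
//	tree->Insert(transaction, inode->Size(), inode->ID());	// off_t key
//	tree->Insert(transaction, key, keyLength, inode->ID());	// raw key/length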
2126 
2127 //	#pragma mark - debugger commands
2128 
2129 
2130 #ifdef BFS_DEBUGGER_COMMANDS
2131 
2132 void
2133 BlockAllocator::Dump(int32 index)
2134 {
2135 	kprintf("allocation groups: %ld (base %p)\n", fNumGroups, fGroups);
2136 	kprintf("blocks per group: %ld\n", fBlocksPerGroup);
2137 
2138 	for (int32 i = 0; i < fNumGroups; i++) {
2139 		if (index != -1 && i != index)
2140 			continue;
2141 
2142 		AllocationGroup& group = fGroups[i];
2143 
2144 		kprintf("[%3ld] num bits:       %lu  (%p)\n", i, group.NumBits(),
2145 			&group);
2146 		kprintf("      num blocks:     %lu\n", group.NumBlocks());
2147 		kprintf("      start:          %ld\n", group.Start());
2148 		kprintf("      first free:     %ld\n", group.fFirstFree);
2149 		kprintf("      largest start:  %ld%s\n", group.fLargestStart,
2150 			group.fLargestValid ? "" : "  (invalid)");
2151 		kprintf("      largest length: %ld\n", group.fLargestLength);
2152 		kprintf("      free bits:      %ld\n", group.fFreeBits);
2153 	}
2154 }
2155 
2156 
2157 #if BFS_TRACING
2158 static char kTraceBuffer[256];
2159 
2160 
2161 int
2162 dump_block_allocator_blocks(int argc, char** argv)
2163 {
2164 	if (argc != 3 || !strcmp(argv[1], "--help")) {
2165 		kprintf("usage: %s <ptr-to-volume> <block>\n", argv[0]);
2166 		return 0;
2167 	}
2168 
2169 	Volume* volume = (Volume*)parse_expression(argv[1]);
2170 	off_t block = parse_expression(argv[2]);
2171 
2172 	// iterate over all tracing entries to find overlapping actions
2173 
2174 	using namespace BFSBlockTracing;
2175 
2176 	LazyTraceOutput out(kTraceBuffer, sizeof(kTraceBuffer), 0);
2177 	TraceEntryIterator iterator;
2178 	while (TraceEntry* _entry = iterator.Next()) {
2179 		if (Allocate* entry = dynamic_cast<Allocate*>(_entry)) {
2180 			off_t first = volume->ToBlock(entry->Run());
2181 			off_t last = first - 1 + entry->Run().Length();
2182 			if (block >= first && block <= last) {
2183 				out.Clear();
2184 				const char* dump = out.DumpEntry(entry);
2185 				kprintf("%5ld. %s\n", iterator.Index(), dump);
2186 			}
2187 		} else if (Free* entry = dynamic_cast<Free*>(_entry)) {
2188 			off_t first = volume->ToBlock(entry->Run());
2189 			off_t last = first - 1 + entry->Run().Length();
2190 			if (block >= first && block <= last) {
2191 				out.Clear();
2192 				const char* dump = out.DumpEntry(entry);
2193 				kprintf("%5ld. %s\n", iterator.Index(), dump);
2194 			}
2195 		}
2196 	}
2197 
2198 	return 0;
2199 }
2200 #endif
2201 
2202 
2203 int
2204 dump_block_allocator(int argc, char** argv)
2205 {
2206 	int32 group = -1;
2207 	if (argc == 3) {
2208 		group = parse_expression(argv[2]);
2209 		argc--;
2210 	}
2211 
2212 	if (argc != 2 || !strcmp(argv[1], "--help")) {
2213 		kprintf("usage: %s <ptr-to-volume> [group]\n", argv[0]);
2214 		return 0;
2215 	}
2216 
2217 	Volume* volume = (Volume*)parse_expression(argv[1]);
2218 	BlockAllocator& allocator = volume->Allocator();
2219 
2220 	allocator.Dump(group);
2221 	return 0;
2222 }
2223 
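// Example use from the kernel debugger, assuming the command is registered
// under the name "bfsallocator" (the name is an assumption here; check the
// add_debugger_command() call for the actual one - the argument syntax
// follows the usage string printed above):
//
//	kdebug> bfsallocator 0x90abcdef			// dump all allocation groups
//	kdebug> bfsallocator 0x90abcdef 5		// dump only group 5
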
2224 #endif	// BFS_DEBUGGER_COMMANDS
2225 
2226