xref: /haiku/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp (revision adb0d19d561947362090081e81d90dde59142026)
1 /*
2  * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! block bitmap handling and allocation policies
7 
8 
9 #include "BlockAllocator.h"
10 
11 #include "bfs_control.h"
12 #include "BPlusTree.h"
13 #include "Debug.h"
14 #include "Inode.h"
15 #include "Volume.h"
16 
17 
18 // Things the BlockAllocator should do:
19 
20 // - find a range of blocks of a certain size near a specific position
21 // - allocate an imprecise ("unsharp") range of blocks for pre-allocation
22 // - free blocks
23 // - know how to deal with each allocation, special handling for directories,
24 //   files, symlinks, etc. (type sensitive allocation policies)
25 
26 // What makes the code complicated is the fact that we are not just reading
27 // in the whole bitmap and operating on that in memory - e.g. a 13 GB
28 // partition with a block size of 2048 bytes already has an 800kB bitmap, and
29 // partition sizes will only keep growing - so that's not an option.
30 // Instead, we read in each bitmap block only when it is used - since an
31 // allocation group can span several blocks in the block bitmap, the
32 // AllocationBlock class is there to make handling those easier.
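// To put numbers to the example above: 13 GB / 2048 bytes = ~6.8 million
// blocks; at one bit per block, that is ~6.8 million bits / 8 = ~830 KiB of
// bitmap for nothing but block accounting.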
33 
34 // The current implementation is only slightly optimized and could probably
35 // be improved a lot. Furthermore, the allocation policies used here should
36 // be validated against real-world workloads.
37 
38 #if BFS_TRACING && !defined(BFS_SHELL)
39 namespace BFSBlockTracing {
40 
41 class Allocate : public AbstractTraceEntry {
42 public:
43 	Allocate(block_run run)
44 		:
45 		fRun(run)
46 	{
47 		Initialized();
48 	}
49 
50 	virtual void AddDump(TraceOutput& out)
51 	{
52 		out.Print("bfs:alloc %lu.%u.%u", fRun.AllocationGroup(),
53 			fRun.Start(), fRun.Length());
54 	}
55 
56 	const block_run& Run() const { return fRun; }
57 
58 private:
59 	block_run	fRun;
60 };
61 
62 class Free : public AbstractTraceEntry {
63 public:
64 	Free(block_run run)
65 		:
66 		fRun(run)
67 	{
68 		Initialized();
69 	}
70 
71 	virtual void AddDump(TraceOutput& out)
72 	{
73 		out.Print("bfs:free %lu.%u.%u", fRun.AllocationGroup(),
74 			fRun.Start(), fRun.Length());
75 	}
76 
77 	const block_run& Run() const { return fRun; }
78 
79 private:
80 	block_run	fRun;
81 };
82 
83 
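// Sums a buffer in 32-bit words to get a cheap fingerprint of a bitmap block
// for the tracing entries below; trailing bytes (size % 4) are ignored, which
// is fine here since block sizes are multiples of four bytes.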
84 static uint32
85 checksum(const uint8* data, size_t size)
86 {
87 	const uint32* data4 = (const uint32*)data;
88 	uint32 sum = 0;
89 	while (size >= 4) {
90 		sum += *data4;
91 		data4++;
92 		size -= 4;
93 	}
94 	return sum;
95 }
96 
97 
98 class Block : public AbstractTraceEntry {
99 public:
100 	Block(const char* label, off_t blockNumber, const uint8* data,
101 			size_t size, uint32 start = 0, uint32 length = 0)
102 		:
103 		fBlock(blockNumber),
104 		fData(data),
105 		fStart(start),
106 		fLength(length),
107 		fLabel(label)
108 	{
109 		fSum = checksum(data, size);
110 		Initialized();
111 	}
112 
113 	virtual void AddDump(TraceOutput& out)
114 	{
115 		out.Print("bfs:%s: block %Ld (%p), sum %lu, s/l %lu/%lu", fLabel,
116 			fBlock, fData, fSum, fStart, fLength);
117 	}
118 
119 private:
120 	off_t		fBlock;
121 	const uint8	*fData;
122 	uint32		fStart;
123 	uint32		fLength;
124 	uint32		fSum;
125 	const char*	fLabel;
126 };
127 
128 
129 class BlockChange : public AbstractTraceEntry {
130 public:
131 	BlockChange(const char* label, int32 block, uint32 oldData, uint32 newData)
132 		:
133 		fBlock(block),
134 		fOldData(oldData),
135 		fNewData(newData),
136 		fLabel(label)
137 	{
138 		Initialized();
139 	}
140 
141 	virtual void AddDump(TraceOutput& out)
142 	{
143 		out.Print("bfs:%s: block %ld, %08lx -> %08lx", fLabel,
144 			fBlock, fOldData, fNewData);
145 	}
146 
147 private:
148 	int32		fBlock;
149 	uint32		fOldData;
150 	uint32		fNewData;
151 	const char*	fLabel;
152 };
153 
154 }	// namespace BFSBlockTracing
155 
156 #	define T(x) new(std::nothrow) BFSBlockTracing::x;
157 #else
158 #	define T(x) ;
159 #endif
160 
161 #ifdef DEBUG_ALLOCATION_GROUPS
162 #	define CHECK_ALLOCATION_GROUP(group) _CheckGroup(group)
163 #else
164 #	define CHECK_ALLOCATION_GROUP(group) ;
165 #endif
166 
167 
168 struct check_cookie {
169 	check_cookie() {}
170 
171 	block_run			current;
172 	Inode				*parent;
173 	mode_t				parent_mode;
174 	Stack<block_run>	stack;
175 	TreeIterator		*iterator;
176 };
177 
178 
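/*!	Wraps a single block of the on-disk block bitmap. Allocate(), Free(), and
	IsUsed() operate on bit positions relative to this block; NumBlockBits()
	returns how many bits the block actually covers (the last block of a
	group may cover fewer). A typical use, as in AllocationGroup::Allocate():

		AllocationBlock cached(volume);
		if (cached.SetToWritable(transaction, group, block) == B_OK)
			cached.Allocate(start, numBlocks);
*/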
179 class AllocationBlock : public CachedBlock {
180 public:
181 	AllocationBlock(Volume* volume);
182 
183 	inline void Allocate(uint16 start, uint16 numBlocks);
184 	inline void Free(uint16 start, uint16 numBlocks);
185 	inline bool IsUsed(uint16 block);
186 
187 	status_t SetTo(AllocationGroup& group, uint16 block);
188 	status_t SetToWritable(Transaction& transaction, AllocationGroup& group,
189 		uint16 block);
190 
191 	uint32 NumBlockBits() const { return fNumBits; }
192 	uint32& Block(int32 index) { return ((uint32*)fBlock)[index]; }
193 	uint8* Block() const { return (uint8*)fBlock; }
194 
195 private:
196 	uint32	fNumBits;
197 #ifdef DEBUG
198 	bool	fWritable;
199 #endif
200 };
201 
202 
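/*!	Describes one allocation group: its start block within the bitmap, the
	number of bitmap bits and blocks it covers, plus bookkeeping hints - the
	first (potentially) free bit, the number of free bits, and the largest
	known free run, which is only trusted while fLargestValid is set.
*/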
203 class AllocationGroup {
204 public:
205 	AllocationGroup();
206 
207 	void AddFreeRange(int32 start, int32 blocks);
208 	bool IsFull() const { return fFreeBits == 0; }
209 
210 	status_t Allocate(Transaction& transaction, uint16 start, int32 length);
211 	status_t Free(Transaction& transaction, uint16 start, int32 length);
212 
213 	uint32 NumBits() const { return fNumBits; }
214 	uint32 NumBlocks() const { return fNumBlocks; }
215 	int32 Start() const { return fStart; }
216 
217 private:
218 	friend class BlockAllocator;
219 
220 	uint32	fNumBits;
221 	uint32	fNumBlocks;
222 	int32	fStart;
223 	int32	fFirstFree;
224 	int32	fFreeBits;
225 
226 	int32	fLargestStart;
227 	int32	fLargestLength;
228 	bool	fLargestValid;
229 };
230 
231 
232 AllocationBlock::AllocationBlock(Volume* volume)
233 	: CachedBlock(volume)
234 {
235 }
236 
237 
238 status_t
239 AllocationBlock::SetTo(AllocationGroup& group, uint16 block)
240 {
241 	// 8 blocks per byte
242 	fNumBits = fVolume->BlockSize() << 3;
243 	// the last group may have fewer bits than the others
244 	if ((block + 1) * fNumBits > group.NumBits())
245 		fNumBits = group.NumBits() % fNumBits;
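	// e.g. with 2048-byte blocks each bitmap block covers 16384 bits; a group
	// of, say, 20000 bits would thus leave 20000 % 16384 = 3616 bits for its
	// second (and last) bitmap block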
246 
247 #ifdef DEBUG
248 	fWritable = false;
249 #endif
250 	return CachedBlock::SetTo(group.Start() + block) != NULL ? B_OK : B_ERROR;
251 }
252 
253 
254 status_t
255 AllocationBlock::SetToWritable(Transaction& transaction, AllocationGroup& group,
256 	uint16 block)
257 {
258 	// 8 blocks per byte
259 	fNumBits = fVolume->BlockSize() << 3;
260 	// the last group may have fewer bits in its last block
261 	if ((block + 1) * fNumBits > group.NumBits())
262 		fNumBits = group.NumBits() % fNumBits;
263 
264 #ifdef DEBUG
265 	fWritable = true;
266 #endif
267 	return CachedBlock::SetToWritable(transaction, group.Start() + block)
268 		!= NULL ? B_OK : B_ERROR;
269 }
270 
271 
272 bool
273 AllocationBlock::IsUsed(uint16 block)
274 {
275 	if (block >= fNumBits)
276 		return true;
277 
278 	// the block bitmap is accessed in 32-bit blocks
279 	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
280 }
281 
282 
283 void
284 AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
285 {
286 	ASSERT(start < fNumBits);
287 	ASSERT(uint32(start + numBlocks) <= fNumBits);
288 #ifdef DEBUG
289 	ASSERT(fWritable);
290 #endif
291 
292 	T(Block("b-alloc-in", fBlockNumber, fBlock, fVolume->BlockSize(),
293 		start, numBlocks));
294 
295 	int32 block = start >> 5;
296 
297 	while (numBlocks > 0) {
298 		uint32 mask = 0;
299 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
300 			mask |= 1UL << i;
301 
302 		T(BlockChange("b-alloc", block, Block(block),
303 			Block(block) | HOST_ENDIAN_TO_BFS_INT32(mask)));
304 
305 #if KDEBUG
306 		// check for already set blocks
307 		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32*)fBlock)[block]) {
308 			FATAL(("AllocationBlock::Allocate(): some blocks are already "
309 				"allocated, start = %u, numBlocks = %u\n", start, numBlocks));
310 			panic("blocks already set!");
311 		}
312 #endif
313 
314 		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
315 		start = 0;
316 	}
317 	T(Block("b-alloc-out", fBlockNumber, fBlock, fVolume->BlockSize(),
318 		start, numBlocks));
319 }
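// To illustrate the mask construction in Allocate() and Free(): for start = 3
// and numBlocks = 2, the first loop iteration builds mask = (1 << 3) | (1 << 4)
// = 0x18, i.e. bits 3 and 4 of the first 32-bit word; a run crossing a word
// boundary simply continues with start = 0 in the next word.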
320 
321 
322 void
323 AllocationBlock::Free(uint16 start, uint16 numBlocks)
324 {
325 	ASSERT(start < fNumBits);
326 	ASSERT(uint32(start + numBlocks) <= fNumBits);
327 #ifdef DEBUG
328 	ASSERT(fWritable);
329 #endif
330 
331 	int32 block = start >> 5;
332 
333 	while (numBlocks > 0) {
334 		uint32 mask = 0;
335 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
336 			mask |= 1UL << i;
337 
338 		T(BlockChange("b-free", block, Block(block),
339 			Block(block) & HOST_ENDIAN_TO_BFS_INT32(~mask)));
340 
341 		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
342 		start = 0;
343 	}
344 }
345 
346 
347 //	#pragma mark -
348 
349 
350 /*!	The allocation groups are created in BlockAllocator::Initialize() and
351 	filled in either from the on-disk bitmap by _Initialize(), or from
352 	scratch by InitializeAndClearBitmap() when a volume is newly initialized.
353 */
354 AllocationGroup::AllocationGroup()
355 	:
356 	fFirstFree(-1),
357 	fFreeBits(0),
358 	fLargestValid(false)
359 {
360 }
361 
362 
363 void
364 AllocationGroup::AddFreeRange(int32 start, int32 blocks)
365 {
366 	//D(if (blocks > 512)
367 	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));
368 
369 	if (fFirstFree == -1)
370 		fFirstFree = start;
371 
372 	if (!fLargestValid || fLargestLength < blocks) {
373 		fLargestStart = start;
374 		fLargestLength = blocks;
375 		fLargestValid = true;
376 	}
377 
378 	fFreeBits += blocks;
379 }
380 
381 
382 /*!	Allocates the specified run in the allocation group.
383 	Doesn't check if the run is valid or already partially allocated. It
384 	updates the group's own hints and free bit count, but not the volume's
385 	used blocks count - otherwise it only sets some bits in the block bitmap.
386 	Assumes that the block bitmap lock is held.
387 */
388 status_t
389 AllocationGroup::Allocate(Transaction& transaction, uint16 start, int32 length)
390 {
391 	ASSERT(start + length <= (int32)fNumBits);
392 
393 	// Update the allocation group info
394 	// TODO: this info will be incorrect if something goes wrong later
395 	// Note, the fFirstFree block doesn't have to be really free
396 	if (start == fFirstFree)
397 		fFirstFree = start + length;
398 	fFreeBits -= length;
399 
400 	if (fLargestValid) {
401 		bool cut = false;
402 		if (fLargestStart == start) {
403 			// cut from start
404 			fLargestStart += length;
405 			fLargestLength -= length;
406 			cut = true;
407 		} else if (start > fLargestStart
408 			&& start < fLargestStart + fLargestLength) {
409 			// cut from end
410 			fLargestLength = start - fLargestStart;
411 			cut = true;
412 		}
413 		if (cut && (fLargestLength < fLargestStart
414 				|| fLargestLength
415 						< (int32)fNumBits - (fLargestStart + fLargestLength))) {
416 			// the remaining part might not be the largest run anymore
417 			fLargestValid = false;
418 		}
419 	}
420 
421 	Volume* volume = transaction.GetVolume();
422 
423 	// calculate block in the block bitmap and position within
424 	uint32 bitsPerBlock = volume->BlockSize() << 3;
425 	uint32 block = start / bitsPerBlock;
426 	start = start % bitsPerBlock;
427 
428 	AllocationBlock cached(volume);
429 
430 	while (length > 0) {
431 		if (cached.SetToWritable(transaction, *this, block) < B_OK) {
432 			fLargestValid = false;
433 			RETURN_ERROR(B_IO_ERROR);
434 		}
435 
436 		uint32 numBlocks = length;
437 		if (start + numBlocks > cached.NumBlockBits())
438 			numBlocks = cached.NumBlockBits() - start;
439 
440 		cached.Allocate(start, numBlocks);
441 
442 		length -= numBlocks;
443 		start = 0;
444 		block++;
445 	}
446 
447 	return B_OK;
448 }
449 
450 
451 /*!	Frees the specified run in the allocation group.
452 	Doesn't check if the run is valid or even actually allocated. It
453 	updates the group's own hints and free bit count, but not the volume's
454 	used blocks count - otherwise it only clears some bits in the block bitmap.
455 	Assumes that the block bitmap lock is held.
456 */
457 status_t
458 AllocationGroup::Free(Transaction& transaction, uint16 start, int32 length)
459 {
460 	ASSERT(start + length <= (int32)fNumBits);
461 
462 	// Update the allocation group info
463 	// TODO: this info will be incorrect if something goes wrong later
464 	if (fFirstFree > start)
465 		fFirstFree = start;
466 	fFreeBits += length;
467 
468 	// The range to be freed cannot be part of the valid largest range
469 	ASSERT(!fLargestValid || start + length <= fLargestStart
470 		|| start > fLargestStart);
471 
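	// Invalidate the largest-run hint if the freed run adjoins it (the known
	// largest range just grew), or if it lies in the untracked region before
	// or after the hint that is already bigger than the hint itself.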
472 	if (fLargestValid
473 		&& (start + length == fLargestStart
474 			|| fLargestStart + fLargestLength == start
475 			|| (start < fLargestStart && fLargestStart > fLargestLength)
476 			|| (start > fLargestStart
477 				&& (int32)fNumBits - (fLargestStart + fLargestLength)
478 						> fLargestLength))) {
479 		fLargestValid = false;
480 	}
481 
482 	Volume* volume = transaction.GetVolume();
483 
484 	// calculate block in the block bitmap and position within
485 	uint32 bitsPerBlock = volume->BlockSize() << 3;
486 	uint32 block = start / bitsPerBlock;
487 	start = start % bitsPerBlock;
488 
489 	AllocationBlock cached(volume);
490 
491 	while (length > 0) {
492 		if (cached.SetToWritable(transaction, *this, block) < B_OK)
493 			RETURN_ERROR(B_IO_ERROR);
494 
495 		T(Block("free-1", block, cached.Block(), volume->BlockSize()));
496 		uint16 freeLength = length;
497 		if (uint32(start + length) > cached.NumBlockBits())
498 			freeLength = cached.NumBlockBits() - start;
499 
500 		cached.Free(start, freeLength);
501 
502 		length -= freeLength;
503 		start = 0;
504 		T(Block("free-2", block, cached.Block(), volume->BlockSize()));
505 		block++;
506 	}
507 	return B_OK;
508 }
509 
510 
511 //	#pragma mark -
512 
513 
514 BlockAllocator::BlockAllocator(Volume* volume)
515 	:
516 	fVolume(volume),
517 	fGroups(NULL),
518 	fCheckBitmap(NULL),
	fCheckCookie(NULL)
519 {
520 	mutex_init(&fLock, "bfs allocator");
521 }
522 
523 
524 BlockAllocator::~BlockAllocator()
525 {
526 	mutex_destroy(&fLock);
527 	delete[] fGroups;
528 }
529 
530 
531 status_t
532 BlockAllocator::Initialize(bool full)
533 {
534 	fNumGroups = fVolume->AllocationGroups();
535 	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
536 	fGroups = new AllocationGroup[fNumGroups];
537 	if (fGroups == NULL)
538 		return B_NO_MEMORY;
539 
540 	if (!full)
541 		return B_OK;
542 
543 	mutex_lock(&fLock);
544 		// the lock will be released by the _Initialize() method
545 
546 	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::_Initialize,
547 		"bfs block allocator", B_LOW_PRIORITY, this);
548 	if (id < B_OK)
549 		return _Initialize(this);
550 
551 	mutex_transfer_lock(&fLock, id);
552 
553 	return resume_thread(id);
554 }
555 
556 
557 status_t
558 BlockAllocator::InitializeAndClearBitmap(Transaction& transaction)
559 {
560 	status_t status = Initialize(false);
561 	if (status < B_OK)
562 		return status;
563 
564 	uint32 blocks = fBlocksPerGroup;
565 	uint32 numBits = 8 * blocks * fVolume->BlockSize();
566 	uint32 blockShift = fVolume->BlockShift();
567 
568 	uint32* buffer = (uint32*)malloc(numBits >> 3);
569 	if (buffer == NULL)
570 		RETURN_ERROR(B_NO_MEMORY);
571 
572 	memset(buffer, 0, numBits >> 3);
573 
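	// the block bitmap starts directly behind the super block, i.e. at block 1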
574 	off_t offset = 1;
575 
576 	// initialize the AllocationGroup objects and clear the on-disk bitmap
577 
578 	for (int32 i = 0; i < fNumGroups; i++) {
579 		if (write_pos(fVolume->Device(), offset << blockShift, buffer,
580 				blocks << blockShift) < B_OK)
581 			return B_ERROR;
582 
583 		// the last allocation group may contain fewer blocks than the others
584 		if (i == fNumGroups - 1) {
585 			fGroups[i].fNumBits = fVolume->NumBlocks() - i * numBits;
586 			fGroups[i].fNumBlocks = 1 + ((fGroups[i].NumBits() - 1)
587 				>> (blockShift + 3));
588 		} else {
589 			fGroups[i].fNumBits = numBits;
590 			fGroups[i].fNumBlocks = blocks;
591 		}
592 		fGroups[i].fStart = offset;
593 		fGroups[i].fFirstFree = fGroups[i].fLargestStart = 0;
594 		fGroups[i].fFreeBits = fGroups[i].fLargestLength = fGroups[i].fNumBits;
595 		fGroups[i].fLargestValid = true;
596 
597 		offset += blocks;
598 	}
599 	free(buffer);
600 
601 	// reserve the boot block, the log area, and the block bitmap itself
602 	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();
603 
604 	if (fGroups[0].Allocate(transaction, 0, reservedBlocks) < B_OK) {
605 		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
606 		return B_ERROR;
607 	}
608 	fVolume->SuperBlock().used_blocks
609 		= HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);
610 
611 	return B_OK;
612 }
613 
614 
615 status_t
616 BlockAllocator::_Initialize(BlockAllocator* allocator)
617 {
618 	// The lock must already be held at this point
619 
620 	Volume* volume = allocator->fVolume;
621 	uint32 blocks = allocator->fBlocksPerGroup;
622 	uint32 blockShift = volume->BlockShift();
623 	off_t freeBlocks = 0;
624 
625 	uint32* buffer = (uint32*)malloc(blocks << blockShift);
626 	if (buffer == NULL) {
627 		mutex_unlock(&allocator->fLock);
628 		RETURN_ERROR(B_NO_MEMORY);
629 	}
630 
631 	AllocationGroup* groups = allocator->fGroups;
632 	off_t offset = 1;
633 	uint32 bitsPerGroup = 8 * (blocks << blockShift);
634 	int32 numGroups = allocator->fNumGroups;
635 
636 	for (int32 i = 0; i < numGroups; i++) {
637 		if (read_pos(volume->Device(), offset << blockShift, buffer,
638 				blocks << blockShift) < B_OK)
639 			break;
640 
641 		// the last allocation group may contain fewer blocks than the others
642 		if (i == numGroups - 1) {
643 			groups[i].fNumBits = volume->NumBlocks() - i * bitsPerGroup;
644 			groups[i].fNumBlocks = 1 + ((groups[i].NumBits() - 1)
645 				>> (blockShift + 3));
646 		} else {
647 			groups[i].fNumBits = bitsPerGroup;
648 			groups[i].fNumBlocks = blocks;
649 		}
650 		groups[i].fStart = offset;
651 
652 		// finds all free ranges in this allocation group
653 		int32 start = -1, range = 0;
654 		int32 numBits = groups[i].fNumBits, bit = 0;
655 		int32 count = (numBits + 31) / 32;
656 
657 		for (int32 k = 0; k < count; k++) {
658 			for (int32 j = 0; j < 32 && bit < numBits; j++, bit++) {
659 				if (buffer[k] & (1UL << j)) {
660 					// block is in use
661 					if (range > 0) {
662 						groups[i].AddFreeRange(start, range);
663 						range = 0;
664 					}
665 				} else if (range++ == 0) {
666 					// block is free, start new free range
667 					start = bit;
668 				}
669 			}
670 		}
671 		if (range)
672 			groups[i].AddFreeRange(start, range);
673 
674 		freeBlocks += groups[i].fFreeBits;
675 
676 		offset += blocks;
677 	}
678 	free(buffer);
679 
680 	// check if block bitmap and log area are reserved
681 	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
682 	if (allocator->CheckBlockRun(block_run::Run(0, 0, reservedBlocks)) < B_OK) {
683 		Transaction transaction(volume, 0);
684 		if (groups[0].Allocate(transaction, 0, reservedBlocks) < B_OK) {
685 			FATAL(("could not allocate reserved space for block "
686 				"bitmap/log!\n"));
687 			volume->Panic();
688 		} else {
689 			FATAL(("space for block bitmap or log area was not reserved!\n"));
690 
691 			transaction.Done();
692 		}
693 	}
694 
695 	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
696 	if (volume->UsedBlocks() != usedBlocks) {
697 		// If the disk was in a dirty state at mount time, it's
698 		// normal that these values don't match
699 		INFORM(("volume reports %Ld used blocks, correct is %Ld\n",
700 			volume->UsedBlocks(), usedBlocks));
701 		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
702 	}
703 
704 	mutex_unlock(&allocator->fLock);
705 	return B_OK;
706 }
707 
708 
709 void
710 BlockAllocator::Uninitialize()
711 {
712 	// We only have to make sure that the initializer thread isn't running
713 	// anymore.
714 	mutex_lock(&fLock);
715 }
716 
717 
718 status_t
719 BlockAllocator::AllocateBlocks(Transaction& transaction, int32 groupIndex,
720 	uint16 start, uint16 maximum, uint16 minimum, block_run& run)
721 {
722 	if (maximum == 0)
723 		return B_BAD_VALUE;
724 
725 	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n",
726 		groupIndex, start, maximum, minimum));
727 
728 	AllocationBlock cached(fVolume);
729 	MutexLocker lock(fLock);
730 
731 	uint32 bitsPerFullBlock = fVolume->BlockSize() << 3;
732 
733 	// Find the block_run that can fulfill the request best
734 	int32 bestGroup = -1;
735 	int32 bestStart = -1;
736 	int32 bestLength = -1;
737 
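	// We loop fNumGroups + 1 times since the first pass over the starting
	// group may begin somewhere in its middle (at "start"); the extra
	// iteration revisits that group from its beginning.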
738 	for (int32 i = 0; i < fNumGroups + 1; i++, groupIndex++, start = 0) {
739 		groupIndex = groupIndex % fNumGroups;
740 		AllocationGroup& group = fGroups[groupIndex];
741 
742 		CHECK_ALLOCATION_GROUP(groupIndex);
743 
744 		if (start >= group.NumBits() || group.IsFull())
745 			continue;
746 
747 		// Skip the part of the group that is known to be in use, and use the
748 		// group's largest-run hint to rule the group out early if possible
749 
750 		if (start < group.fFirstFree)
751 			start = group.fFirstFree;
752 
753 		if (group.fLargestValid) {
754 			if (group.fLargestLength < bestLength)
755 				continue;
756 
757 			if (group.fLargestStart >= start) {
758 				if (group.fLargestLength >= bestLength) {
759 					bestGroup = groupIndex;
760 					bestStart = group.fLargestStart;
761 					bestLength = group.fLargestLength;
762 
763 					if (bestLength >= maximum)
764 						break;
765 				}
766 
767 				// We know everything about this group we have to, let's skip
768 				// to the next
769 				continue;
770 			}
771 		}
772 
773 		// There may be more than one block per allocation group - and
774 		// we iterate through them to find a place for the allocation.
775 		// (one allocation can't exceed one allocation group)
776 
777 		uint32 block = start / (fVolume->BlockSize() << 3);
778 		int32 currentStart = 0, currentLength = 0;
779 		int32 groupLargestStart = -1;
780 		int32 groupLargestLength = -1;
781 		int32 currentBit = start;
782 		bool canFindGroupLargest = start == 0;
783 
784 		for (; block < group.NumBlocks(); block++) {
785 			if (cached.SetTo(group, block) < B_OK)
786 				RETURN_ERROR(B_ERROR);
787 
788 			T(Block("alloc-in", group.Start() + block, cached.Block(),
789 				fVolume->BlockSize(), groupIndex, currentStart));
790 
791 			// find a block large enough to hold the allocation
792 			for (uint32 bit = start % bitsPerFullBlock;
793 					bit < cached.NumBlockBits(); bit++) {
794 				if (!cached.IsUsed(bit)) {
795 					if (currentLength == 0) {
796 						// start new range
797 						currentStart = currentBit;
798 					}
799 
800 					// have we found a range large enough for the wanted maximum?
801 					if (++currentLength >= maximum) {
802 						bestGroup = groupIndex;
803 						bestStart = currentStart;
804 						bestLength = currentLength;
805 						break;
806 					}
807 				} else {
808 					if (currentLength) {
809 						// end of a range
810 						if (currentLength > bestLength) {
811 							bestGroup = groupIndex;
812 							bestStart = currentStart;
813 							bestLength = currentLength;
814 						}
815 						if (currentLength > groupLargestLength) {
816 							groupLargestStart = currentStart;
817 							groupLargestLength = currentLength;
818 						}
819 						currentLength = 0;
820 					}
821 					if ((int32)group.NumBits() - currentBit
822 							<= groupLargestLength) {
823 						// We can't find a bigger block in this group anymore,
824 						// let's skip the rest.
825 						block = group.NumBlocks();
826 						break;
827 					}
828 				}
829 				currentBit++;
830 			}
831 
832 			T(Block("alloc-out", block, cached.Block(),
833 				fVolume->BlockSize(), groupIndex, currentStart));
834 
835 			if (bestLength >= maximum) {
836 				canFindGroupLargest = false;
837 				break;
838 			}
839 
840 			// start from the beginning of the next block
841 			start = 0;
842 		}
843 
844 		if (currentBit == (int32)group.NumBits()) {
845 			if (currentLength > bestLength) {
846 				bestGroup = groupIndex;
847 				bestStart = currentStart;
848 				bestLength = currentLength;
849 			}
850 			if (canFindGroupLargest && currentLength > groupLargestLength) {
851 				groupLargestStart = currentStart;
852 				groupLargestLength = currentLength;
853 			}
854 		}
855 
856 		if (canFindGroupLargest && !group.fLargestValid
857 			&& groupLargestLength >= 0) {
858 			group.fLargestStart = groupLargestStart;
859 			group.fLargestLength = groupLargestLength;
860 			group.fLargestValid = true;
861 		}
862 
863 		if (bestLength >= maximum)
864 			break;
865 	}
866 
867 	// If we found a suitable range, mark the blocks as in use, and
868 	// write the updated block bitmap back to disk
869 	if (bestLength < minimum)
870 		return B_DEVICE_FULL;
871 	if (bestLength > maximum)
872 		bestLength = maximum;
873 
874 	if (fGroups[bestGroup].Allocate(transaction, bestStart, bestLength)
875 			< B_OK)
876 		RETURN_ERROR(B_IO_ERROR);
877 
878 	CHECK_ALLOCATION_GROUP(bestGroup);
879 
880 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(bestGroup);
881 	run.start = HOST_ENDIAN_TO_BFS_INT16(bestStart);
882 	run.length = HOST_ENDIAN_TO_BFS_INT16(bestLength);
883 
884 	fVolume->SuperBlock().used_blocks =
885 		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + bestLength);
886 		// We are not writing back the disk's super block - it's
887 		// either done by the journaling code, or when the disk
888 		// is unmounted.
889 		// If the value is not correct at mount time, it will be
890 		// fixed anyway.
891 
892 	// Discard any cached blocks in the newly allocated run, to make sure
893 	// stale block cache entries won't interfere with the file cache.
894 	block_cache_discard(fVolume->BlockCache(), fVolume->ToBlock(run),
895 		run.Length());
896 
897 	T(Allocate(run));
898 	return B_OK;
899 }
900 
901 
902 status_t
903 BlockAllocator::AllocateForInode(Transaction& transaction,
904 	const block_run* parent, mode_t type, block_run& run)
905 {
906 	// Apply some allocation policies here (AllocateBlocks() will break them
907 	// if necessary) - we will start with those described in Dominic Giampaolo's
908 	// "Practical File System Design", and see how well they work
909 
910 	// Files go into the same allocation group as their parent,
911 	// sub-directories will be inserted 8 allocation groups after
912 	// their parent's group
913 	uint16 group = parent->AllocationGroup();
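	// only plain directories are pushed away; index and attribute directories
	// have additional mode bits set and therefore stay close to their parent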
914 	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
915 		group += 8;
916 
917 	return AllocateBlocks(transaction, group, 0, 1, 1, run);
918 }
919 
920 
921 status_t
922 BlockAllocator::Allocate(Transaction& transaction, Inode* inode,
923 	off_t numBlocks, block_run& run, uint16 minimum)
924 {
925 	if (numBlocks <= 0)
926 		return B_ERROR;
927 
928 	// one block_run can't hold more data than there is in one allocation group
929 	if (numBlocks > fGroups[0].NumBits())
930 		numBlocks = fGroups[0].NumBits();
931 
932 	// since block_run.length is uint16, the largest number of blocks that
933 	// can be covered by a block_run is 65535
934 	// TODO: if we drop compatibility, couldn't we do this any better?
935 	// There are basically three possibilities:
936 	// a) since a length of zero doesn't make any sense, take that for 65536 -
937 	//    but that could cause many problems (bugs) in other areas
938 	// b) reduce the maximum amount of blocks per block_run, so that the
939 	//    remaining number of free blocks can be used in a useful manner
940 	//    (like 4 blocks) - but that would also reduce the maximum file size
941 	// c) have BlockRun::Length() return (length + 1).
942 	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
943 		numBlocks = MAX_BLOCK_RUN_LENGTH;
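	// (with 2048-byte blocks, a single block_run can therefore cover at most
	// 65535 * 2048 bytes, i.e. just under 128 MB)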
944 
945 	// Apply some allocation policies here (AllocateBlocks() will break them
946 	// if necessary)
947 	uint16 group = inode->BlockRun().AllocationGroup();
948 	uint16 start = 0;
949 
950 	// Are there already allocated blocks? (then just try to allocate near the
951 	// last one)
952 	if (inode->Size() > 0) {
953 		const data_stream& data = inode->Node().data;
954 		// TODO: we currently don't handle the case where the data stream
955 		// has already grown into the indirect ranges
956 		if (data.max_double_indirect_range == 0
957 			&& data.max_indirect_range == 0) {
958 			// Since size > 0, there must be a valid block run in this stream
959 			int32 last = 0;
960 			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
961 				if (data.direct[last + 1].IsZero())
962 					break;
963 
964 			group = data.direct[last].AllocationGroup();
965 			start = data.direct[last].Start() + data.direct[last].Length();
966 		}
967 	} else if (inode->IsContainer() || inode->IsSymLink()) {
968 		// directory and symbolic link data will go into the same allocation
969 		// group as the inode itself, starting right after the inode's position
970 		start = inode->BlockRun().Start();
971 	} else {
972 		// file data will start in the next allocation group
973 		group = inode->BlockRun().AllocationGroup() + 1;
974 	}
975 
976 	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
977 }
978 
979 
980 status_t
981 BlockAllocator::Free(Transaction& transaction, block_run run)
982 {
983 	MutexLocker lock(fLock);
984 
985 	int32 group = run.AllocationGroup();
986 	uint16 start = run.Start();
987 	uint16 length = run.Length();
988 
989 	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start,
990 		length));
991 	T(Free(run));
992 
993 	// We don't use Volume::IsValidBlockRun() here, because this lets us check
994 	// against the actual group size (the last group may be smaller)
995 	if (group < 0 || group >= fNumGroups
996 		|| start > fGroups[group].NumBits()
997 		|| uint32(start + length) > fGroups[group].NumBits()
998 		|| length == 0) {
999 		FATAL(("tried to free an invalid block_run (%d, %u, %u)\n", (int)group,
1000 			start, length));
1001 		DEBUGGER(("tried to free invalid block_run"));
1002 		return B_BAD_VALUE;
1003 	}
1004 	// check if someone tries to free reserved areas at the beginning of the
1005 	// drive
1006 	if (group == 0
1007 		&& start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
1008 		FATAL(("tried to free a reserved block_run (%d, %u, %u)\n", (int)group,
1009 			start, length));
1010 		DEBUGGER(("tried to free reserved block"));
1011 		return B_BAD_VALUE;
1012 	}
1013 #ifdef DEBUG
1014 	if (CheckBlockRun(run) < B_OK)
1015 		return B_BAD_DATA;
1016 #endif
1017 
1018 	CHECK_ALLOCATION_GROUP(group);
1019 
1020 	if (fGroups[group].Free(transaction, start, length) < B_OK)
1021 		RETURN_ERROR(B_IO_ERROR);
1022 
1023 	CHECK_ALLOCATION_GROUP(group);
1024 
1025 #ifdef DEBUG
1026 	if (CheckBlockRun(run, NULL, NULL, false) < B_OK) {
1027 		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just "
1028 			"freed)\n"));
1029 	}
1030 #endif
1031 
1032 	fVolume->SuperBlock().used_blocks =
1033 		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
1034 	return B_OK;
1035 }
1036 
1037 
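// Returns the size of the whole block bitmap in bytes.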
1038 size_t
1039 BlockAllocator::BitmapSize() const
1040 {
1041 	return fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
1042 }
1043 
1044 
1045 #ifdef DEBUG_ALLOCATION_GROUPS
1046 void
1047 BlockAllocator::_CheckGroup(int32 groupIndex) const
1048 {
1049 	AllocationBlock cached(fVolume);
1050 	ASSERT_LOCKED_MUTEX(&fLock);
1051 
1052 	AllocationGroup& group = fGroups[groupIndex];
1053 
1054 	int32 currentStart = 0, currentLength = 0;
1055 	int32 firstFree = -1;
1056 	int32 largestStart = -1;
1057 	int32 largestLength = 0;
1058 	int32 currentBit = 0;
1059 
1060 	for (uint32 block = 0; block < group.NumBlocks(); block++) {
1061 		if (cached.SetTo(group, block) < B_OK) {
1062 			panic("setting group block %d failed\n", (int)block);
1063 			return;
1064 		}
1065 
1066 		for (uint32 bit = 0; bit < cached.NumBlockBits(); bit++) {
1067 			if (!cached.IsUsed(bit)) {
1068 				if (firstFree < 0) {
1069 					firstFree = currentBit;
1070 					if (!group.fLargestValid) {
1071 						if (firstFree >= 0 && firstFree < group.fFirstFree) {
1072 							// mostly harmless but noteworthy
1073 							dprintf("group %d first free too late: should be "
1074 								"%d, is %d\n", (int)groupIndex, (int)firstFree,
1075 								(int)group.fFirstFree);
1076 						}
1077 						return;
1078 					}
1079 				}
1080 
1081 				if (currentLength == 0) {
1082 					// start new range
1083 					currentStart = currentBit;
1084 				}
1085 				currentLength++;
1086 			} else if (currentLength) {
1087 				// end of a range
1088 				if (currentLength > largestLength) {
1089 					largestStart = currentStart;
1090 					largestLength = currentLength;
1091 				}
1092 				currentLength = 0;
1093 			}
1094 			currentBit++;
1095 		}
1096 	}
1097 
1098 	if (currentLength > largestLength) {
1099 		largestStart = currentStart;
1100 		largestLength = currentLength;
1101 	}
1102 
1103 	if (firstFree >= 0 && firstFree < group.fFirstFree) {
1104 		// mostly harmless but noteworthy
1105 		dprintf("group %d first free too late: should be %d, is %d\n",
1106 			(int)groupIndex, (int)firstFree, (int)group.fFirstFree);
1107 	}
1108 	if (group.fLargestValid
1109 		&& (largestStart != group.fLargestStart
1110 			|| largestLength != group.fLargestLength)) {
1111 		panic("bfs %p: group %d largest differs: %d.%d, checked %d.%d.\n",
1112 			fVolume, (int)groupIndex, (int)group.fLargestStart,
1113 			(int)group.fLargestLength, (int)largestStart, (int)largestLength);
1114 	}
1115 }
1116 #endif	// DEBUG_ALLOCATION_GROUPS
1117 
1118 
1119 //	#pragma mark - Bitmap validity checking
1120 
1121 // TODO: implement new FS checking API
1122 // Functions to check the validity of the bitmap - they are used from
1123 // the "checkfs" command (since this does a bit more than that, maybe we
1124 // should move it someplace else?)
1125 
1126 
1127 bool
1128 BlockAllocator::_IsValidCheckControl(check_control* control)
1129 {
1130 	if (control == NULL
1131 		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
1132 		FATAL(("invalid check_control (%p)!\n", control));
1133 		return false;
1134 	}
1135 
1136 	return true;
1137 }
1138 
1139 
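/*!	Prepares the volume for a bitmap check: it locks the journal and the
	allocator, and builds an in-memory shadow bitmap (fCheckBitmap) that
	CheckNextNode()/CheckBlockRun() fill while traversing the file system.
	StopChecking() later compares it against the on-disk bitmap and can
	write it back to fix errors.
*/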
1140 status_t
1141 BlockAllocator::StartChecking(check_control* control)
1142 {
1143 	if (!_IsValidCheckControl(control))
1144 		return B_BAD_VALUE;
1145 
1146 	fVolume->GetJournal(0)->Lock(NULL, true);
1147 		// Lock the volume's journal
1148 
1149 	status_t status = mutex_lock(&fLock);
1150 	if (status != B_OK) {
1151 		fVolume->GetJournal(0)->Unlock(NULL, true);
1152 		return status;
1153 	}
1154 
1155 	size_t size = BitmapSize();
1156 	fCheckBitmap = (uint32*)malloc(size);
1157 	if (fCheckBitmap == NULL) {
1158 		mutex_unlock(&fLock);
1159 		fVolume->GetJournal(0)->Unlock(NULL, true);
1160 		return B_NO_MEMORY;
1161 	}
1162 
1163 	check_cookie* cookie = new check_cookie();
1164 	if (cookie == NULL) {
1165 		free(fCheckBitmap);
1166 		fCheckBitmap = NULL;
1167 		mutex_unlock(&fLock);
1168 		fVolume->GetJournal(0)->Unlock(NULL, true);
1169 
1170 		return B_NO_MEMORY;
1171 	}
1172 
1173 	// initialize bitmap
1174 	memset(fCheckBitmap, 0, size);
1175 	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length();
1176 			block-- > 0;) {
1177 		_SetCheckBitmapAt(block);
1178 	}
1179 
1180 	cookie->stack.Push(fVolume->Root());
1181 	cookie->stack.Push(fVolume->Indices());
1182 	cookie->iterator = NULL;
1183 	control->cookie = cookie;
1184 
1185 	fCheckCookie = cookie;
1186 		// to be able to restore nicely if "chkbfs" exited abnormally
1187 
1188 	// Put removed vnodes on the stack -- they are not reachable by traversing
1189 	// the file system anymore.
1190 	InodeList::Iterator iterator = fVolume->RemovedInodes().GetIterator();
1191 	while (Inode* inode = iterator.Next()) {
1192 		cookie->stack.Push(inode->BlockRun());
1193 	}
1194 
1195 	// TODO: check reserved area in bitmap!
1196 
1197 	return B_OK;
1198 }
1199 
1200 
1201 status_t
1202 BlockAllocator::StopChecking(check_control* control)
1203 {
1204 	check_cookie* cookie;
1205 	if (control == NULL)
1206 		cookie = fCheckCookie;
1207 	else
1208 		cookie = (check_cookie*)control->cookie;
1209 
1210 	if (cookie == NULL)
1211 		return B_ERROR;
1212 
1213 	if (cookie->iterator != NULL) {
1214 		delete cookie->iterator;
1215 		cookie->iterator = NULL;
1216 
1217 		// the current directory inode is still locked in memory
1218 		put_vnode(fVolume->FSVolume(), fVolume->ToVnode(cookie->current));
1219 	}
1220 
1221 	if (control != NULL && fVolume->IsReadOnly()) {
1222 		// We can't fix errors on this volume
1223 		control->flags &= ~BFS_FIX_BITMAP_ERRORS;
1224 	}
1225 
1226 	// if CheckNextNode() was able to run through completely, we can
1227 	// fix any damage to the bitmap
1228 	if (control != NULL && control->status == B_ENTRY_NOT_FOUND) {
1229 		// calculate the number of used blocks in the check bitmap
1230 		size_t size = BitmapSize();
1231 		off_t usedBlocks = 0LL;
1232 
1233 		// TODO: update the allocation groups used blocks info
1234 		for (uint32 i = size >> 2; i-- > 0;) {
1235 			uint32 compare = 1;
1236 			for (int16 j = 0; j < 32; j++, compare <<= 1) {
1237 				if (compare & fCheckBitmap[i])
1238 					usedBlocks++;
1239 			}
1240 		}
1241 
1242 		control->stats.freed = fVolume->UsedBlocks() - usedBlocks
1243 			+ control->stats.missing;
1244 		if (control->stats.freed < 0)
1245 			control->stats.freed = 0;
1246 
1247 		// Should we fix errors? Were there any errors we can fix?
1248 		if ((control->flags & BFS_FIX_BITMAP_ERRORS) != 0
1249 			&& (control->stats.freed != 0 || control->stats.missing != 0)) {
1250 			// if so, write the check bitmap back over the original one,
1251 			// and use transactions here to play safe - we even use several
1252 			// transactions, so that we don't blow the maximum log size
1253 			// on large disks - this doesn't need to be atomic anyway
1254 			fVolume->SuperBlock().used_blocks
1255 				= HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
1256 
1257 			int32 blocksInBitmap = fNumGroups * fBlocksPerGroup;
1258 			size_t blockSize = fVolume->BlockSize();
1259 
1260 			for (int32 i = 0; i < blocksInBitmap; i += 512) {
1261 				Transaction transaction(fVolume, 1 + i);
1262 
1263 				int32 blocksToWrite = 512;
1264 				if (blocksToWrite + i > blocksInBitmap)
1265 					blocksToWrite = blocksInBitmap - i;
1266 
1267 				status_t status = transaction.WriteBlocks(1 + i,
1268 					(uint8*)fCheckBitmap + i * blockSize, blocksToWrite);
1269 				if (status < B_OK) {
1270 					FATAL(("error writing bitmap: %s\n", strerror(status)));
1271 					break;
1272 				}
1273 				transaction.Done();
1274 			}
1275 		}
1276 	} else
1277 		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));
1278 
1279 	fVolume->SetCheckingThread(-1);
1280 
1281 	free(fCheckBitmap);
1282 	fCheckBitmap = NULL;
1283 	fCheckCookie = NULL;
1284 	delete cookie;
1285 	mutex_unlock(&fLock);
1286 	fVolume->GetJournal(0)->Unlock(NULL, true);
1287 
1288 	return B_OK;
1289 }
1290 
1291 
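/*!	Checks the next node in the iterative depth-first traversal of the file
	system: directories are pushed on the cookie's stack as block_runs, and
	their entries are walked via a B+tree TreeIterator; every inode that is
	visited gets its block runs verified by CheckInode().
*/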
1292 status_t
1293 BlockAllocator::CheckNextNode(check_control* control)
1294 {
1295 	if (!_IsValidCheckControl(control))
1296 		return B_BAD_VALUE;
1297 
1298 	check_cookie* cookie = (check_cookie*)control->cookie;
1299 	fVolume->SetCheckingThread(find_thread(NULL));
1300 
1301 	while (true) {
1302 		if (cookie->iterator == NULL) {
1303 			if (!cookie->stack.Pop(&cookie->current)) {
1304 				// no more runs on the stack, we are obviously finished!
1305 				control->status = B_ENTRY_NOT_FOUND;
1306 				return B_ENTRY_NOT_FOUND;
1307 			}
1308 
1309 			Vnode vnode(fVolume, cookie->current);
1310 			Inode* inode;
1311 			if (vnode.Get(&inode) < B_OK) {
1312 				FATAL(("check: Could not open inode at %Ld\n",
1313 					fVolume->ToBlock(cookie->current)));
1314 				continue;
1315 			}
1316 
1317 			control->inode = inode->ID();
1318 			control->mode = inode->Mode();
1319 
1320 			if (!inode->IsContainer()) {
1321 				// Check file
1322 				control->errors = 0;
1323 				control->status = CheckInode(inode, control);
1324 
1325 				if (inode->GetName(control->name) < B_OK)
1326 					strcpy(control->name, "(node has no name)");
1327 
1328 				return B_OK;
1329 			}
1330 
1331 			// get iterator for the next directory
1332 
1333 			BPlusTree* tree;
1334 			if (inode->GetTree(&tree) != B_OK) {
1335 				FATAL(("check: could not open b+tree from inode at %Ld\n",
1336 					fVolume->ToBlock(cookie->current)));
1337 				continue;
1338 			}
1339 
1340 			cookie->parent = inode;
1341 			cookie->parent_mode = inode->Mode();
1342 
1343 			cookie->iterator = new TreeIterator(tree);
1344 			if (cookie->iterator == NULL)
1345 				RETURN_ERROR(B_NO_MEMORY);
1346 
1347 			// the inode must stay locked in memory until the iterator is freed
1348 			vnode.Keep();
1349 
1350 			// check the inode of the directory
1351 			control->errors = 0;
1352 			control->status = CheckInode(inode, control);
1353 
1354 			if (inode->GetName(control->name) < B_OK)
1355 				strcpy(control->name, "(dir has no name)");
1356 
1357 			return B_OK;
1358 		}
1359 
1360 		char name[B_FILE_NAME_LENGTH];
1361 		uint16 length;
1362 		ino_t id;
1363 
1364 		status_t status = cookie->iterator->GetNextEntry(name, &length,
1365 			B_FILE_NAME_LENGTH, &id);
1366 		if (status == B_ENTRY_NOT_FOUND) {
1367 			// there are no more entries in this iterator, free it and go on
1368 			delete cookie->iterator;
1369 			cookie->iterator = NULL;
1370 
1371 			// unlock the directory's inode from memory
1372 			put_vnode(fVolume->FSVolume(), fVolume->ToVnode(cookie->current));
1373 
1374 			continue;
1375 		} else if (status == B_OK) {
1376 			// ignore "." and ".." entries
1377 			if (!strcmp(name, ".") || !strcmp(name, ".."))
1378 				continue;
1379 
1380 			// fill in the control data as soon as we have them
1381 			strlcpy(control->name, name, B_FILE_NAME_LENGTH);
1382 			control->inode = id;
1383 			control->errors = 0;
1384 
1385 			Vnode vnode(fVolume, id);
1386 			Inode* inode;
1387 			if (vnode.Get(&inode) < B_OK) {
1388 				FATAL(("Could not open inode ID %Ld!\n", id));
1389 				control->errors |= BFS_COULD_NOT_OPEN;
1390 				control->status = B_ERROR;
1391 				return B_OK;
1392 			}
1393 
1394 			// check if the inode's name is the same as in the b+tree
1395 			if (inode->IsRegularNode()) {
1396 				RecursiveLocker locker(inode->SmallDataLock());
1397 				NodeGetter node(fVolume, inode);
1398 
1399 				const char* localName = inode->Name(node.Node());
1400 				if (localName == NULL || strcmp(localName, name)) {
1401 					control->errors |= BFS_NAMES_DONT_MATCH;
1402 					FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name,
1403 						localName));
1404 				}
1405 			}
1406 
1407 			control->mode = inode->Mode();
1408 
1409 			// Check for the correct mode of the node (if the mode of the
1410 			// file doesn't fit its parent, there is a serious problem)
1411 			if (((cookie->parent_mode & S_ATTR_DIR) != 0
1412 					&& !inode->IsAttribute())
1413 				|| ((cookie->parent_mode & S_INDEX_DIR) != 0
1414 					&& !inode->IsIndex())
1415 				|| (is_directory(cookie->parent_mode)
1416 					&& !inode->IsRegularNode())) {
1417 				FATAL(("inode at %Ld is of wrong type: %o (parent %o at %Ld)!"
1418 					"\n", inode->BlockNumber(), inode->Mode(),
1419 					cookie->parent_mode, cookie->parent->BlockNumber()));
1420 
1421 				// if we are allowed to fix errors, we should remove the file
1422 				if ((control->flags & BFS_REMOVE_WRONG_TYPES) != 0
1423 					&& (control->flags & BFS_FIX_BITMAP_ERRORS) != 0) {
1424 					// it's safe to start a transaction, because Inode::Remove()
1425 					// won't touch the block bitmap (which we hold the lock for)
1426 					// if we set the INODE_DONT_FREE_SPACE flag - since we fix
1427 					// the bitmap anyway
1428 					Transaction transaction(fVolume,
1429 						cookie->parent->BlockNumber());
1430 
1431 					inode->Node().flags
1432 						|= HOST_ENDIAN_TO_BFS_INT32(INODE_DONT_FREE_SPACE);
1433 					status = cookie->parent->Remove(transaction, name, NULL,
1434 						inode->IsContainer());
1435 					if (status == B_OK)
1436 						transaction.Done();
1437 				} else
1438 					status = B_ERROR;
1439 
1440 				control->errors |= BFS_WRONG_TYPE;
1441 				control->status = status;
1442 				return B_OK;
1443 			}
1444 
1445 			// If the inode has an attribute directory, push it on the stack
1446 			if (!inode->Attributes().IsZero())
1447 				cookie->stack.Push(inode->Attributes());
1448 
1449 			// push the directory on the stack so that it will be scanned later
1450 			if (inode->IsContainer() && !inode->IsIndex())
1451 				cookie->stack.Push(inode->BlockRun());
1452 			else {
1453 				// check it now
1454 				control->status = CheckInode(inode, control);
1455 
1456 				return B_OK;
1457 			}
1458 		}
1459 	}
1460 	// is never reached
1461 }
1462 
1463 
1464 bool
1465 BlockAllocator::_CheckBitmapIsUsedAt(off_t block) const
1466 {
1467 	size_t size = BitmapSize();
1468 	uint32 index = block / 32;	// 32bit resolution
1469 	if (index >= size / 4)
1470 		return false;
1471 
1472 	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index])
1473 		& (1UL << (block & 0x1f));
1474 }
1475 
1476 
1477 void
1478 BlockAllocator::_SetCheckBitmapAt(off_t block)
1479 {
1480 	size_t size = BitmapSize();
1481 	uint32 index = block / 32;	// 32bit resolution
1482 	if (index >= size / 4)
1483 		return;
1484 
1485 	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
1486 }
1487 
1488 
1489 status_t
1490 BlockAllocator::CheckBlockRun(block_run run, const char* type,
1491 	check_control* control, bool allocated)
1492 {
1493 	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
1494 		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
1495 		|| uint32(run.Start() + run.Length())
1496 				> fGroups[run.AllocationGroup()].fNumBits
1497 		|| run.length == 0) {
1498 		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type,
1499 			run.AllocationGroup(), run.Start(), run.Length()));
1500 		if (control == NULL)
1501 			return B_BAD_DATA;
1502 
1503 		control->errors |= BFS_INVALID_BLOCK_RUN;
1504 		return B_OK;
1505 	}
1506 
1507 	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
1508 	uint32 block = run.Start() / bitsPerBlock;
1509 	uint32 pos = run.Start() % bitsPerBlock;
1510 	int32 length = 0;
1511 	off_t firstMissing = -1, firstSet = -1;
1512 	off_t firstGroupBlock
1513 		= (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();
1514 
1515 	AllocationBlock cached(fVolume);
1516 
1517 	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
1518 		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
1519 			RETURN_ERROR(B_IO_ERROR);
1520 
1521 		if (pos >= cached.NumBlockBits()) {
1522 			// something very strange has happened...
1523 			RETURN_ERROR(B_ERROR);
1524 		}
1525 
1526 		while (length < run.Length() && pos < cached.NumBlockBits()) {
1527 			if (cached.IsUsed(pos) != allocated) {
1528 				if (control == NULL) {
1529 					PRINT(("%s: block_run(%ld, %u, %u) is only partially "
1530 						"allocated (pos = %ld, length = %ld)!\n", type,
1531 						run.AllocationGroup(), run.Start(), run.Length(),
1532 						pos, length));
1533 					return B_BAD_DATA;
1534 				}
1535 				if (firstMissing == -1) {
1536 					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
1537 					control->errors |= BFS_MISSING_BLOCKS;
1538 				}
1539 				control->stats.missing++;
1540 			} else if (firstMissing != -1) {
1541 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are "
1542 					"%sallocated!\n", type, run.AllocationGroup(), run.Start(),
1543 					run.Length(), firstMissing,
1544 					firstGroupBlock + pos + block * bitsPerBlock - 1,
1545 					allocated ? "not " : ""));
1546 				firstMissing = -1;
1547 			}
1548 
1549 			if (fCheckBitmap != NULL) {
1550 				// Set the block in the check bitmap as well, but have a look
1551 				// if it is already allocated first
1552 				uint32 offset = pos + block * bitsPerBlock;
1553 				if (_CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
1554 					if (firstSet == -1) {
1555 						firstSet = firstGroupBlock + offset;
1556 						control->errors |= BFS_BLOCKS_ALREADY_SET;
1557 					}
1558 					control->stats.already_set++;
1559 				} else {
1560 					if (firstSet != -1) {
1561 						FATAL(("%s: block_run(%d, %u, %u): blocks %Ld - %Ld "
1562 							"are already set!\n", type,
1563 							(int)run.AllocationGroup(), run.Start(),
1564 							run.Length(), firstSet,
1565 							firstGroupBlock + offset - 1));
1566 						firstSet = -1;
1567 					}
1568 					_SetCheckBitmapAt(firstGroupBlock + offset);
1569 				}
1570 			}
1571 			length++;
1572 			pos++;
1573 		}
1574 
1575 		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
1576 			if (firstMissing != -1) {
1577 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not "
1578 					"allocated!\n", type, run.AllocationGroup(), run.Start(),
1579 					run.Length(), firstMissing,
1580 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1581 			}
1582 			if (firstSet != -1) {
1583 				FATAL(("%s: block_run(%d, %u, %u): blocks %Ld - %Ld are "
1584 					"already set!\n", type, (int)run.AllocationGroup(),
1585 					run.Start(), run.Length(), firstSet,
1586 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1587 			}
1588 		}
1589 	}
1590 
1591 	return B_OK;
1592 }
1593 
1594 
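/*!	Checks an inode's own block run plus every run in its direct, indirect,
	and double indirect ranges via CheckBlockRun(). Short symlinks are special
	cased, since they store their target inline instead of in a data stream.
*/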
1595 status_t
1596 BlockAllocator::CheckInode(Inode* inode, check_control* control)
1597 {
1598 	if (control != NULL && fCheckBitmap == NULL)
1599 		return B_NO_INIT;
1600 	if (inode == NULL)
1601 		return B_BAD_VALUE;
1602 
1603 	status_t status = CheckBlockRun(inode->BlockRun(), "inode", control);
1604 	if (status < B_OK)
1605 		return status;
1606 
1607 	if (inode->IsSymLink() && (inode->Flags() & INODE_LONG_SYMLINK) == 0) {
1608 		// short symlinks store their target inline and have no data stream
1609 		if (strlen(inode->Node().short_symlink) >= SHORT_SYMLINK_NAME_LENGTH)
1610 			return B_BAD_DATA;
1611 
1612 		return B_OK;
1613 	}
1614 
1615 	data_stream* data = &inode->Node().data;
1616 
1617 	// check the direct range
1618 
1619 	if (data->max_direct_range) {
1620 		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1621 			if (data->direct[i].IsZero())
1622 				break;
1623 
1624 			status = CheckBlockRun(data->direct[i], "direct", control);
1625 			if (status < B_OK)
1626 				return status;
1627 		}
1628 	}
1629 
1630 	CachedBlock cached(fVolume);
1631 
1632 	// check the indirect range
1633 
1634 	if (data->max_indirect_range) {
1635 		status = CheckBlockRun(data->indirect, "indirect", control);
1636 		if (status < B_OK)
1637 			return status;
1638 
1639 		off_t block = fVolume->ToBlock(data->indirect);
1640 
1641 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1642 			block_run* runs = (block_run*)cached.SetTo(block + i);
1643 			if (runs == NULL)
1644 				RETURN_ERROR(B_IO_ERROR);
1645 
1646 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1647 			int32 index = 0;
1648 			for (; index < runsPerBlock; index++) {
1649 				if (runs[index].IsZero())
1650 					break;
1651 
1652 				status = CheckBlockRun(runs[index], "indirect->run", control);
1653 				if (status < B_OK)
1654 					return status;
1655 			}
1656 			if (index < runsPerBlock)
1657 				break;
1658 		}
1659 	}
1660 
1661 	// check the double indirect range
1662 
1663 	if (data->max_double_indirect_range) {
1664 		status = CheckBlockRun(data->double_indirect, "double indirect",
1665 			control);
1666 		if (status < B_OK)
1667 			return status;
1668 
1669 		int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1670 		int32 runsPerArray = runsPerBlock << ARRAY_BLOCKS_SHIFT;
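		// each array in the double indirect range spans
		// (1 << ARRAY_BLOCKS_SHIFT) blocks and thus holds runsPerArray runs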
1671 
1672 		CachedBlock cachedDirect(fVolume);
1673 		int32 maxIndirectIndex = (data->double_indirect.Length()
1674 			<< fVolume->BlockShift()) / sizeof(block_run);
1675 
1676 		for (int32 indirectIndex = 0; indirectIndex < maxIndirectIndex;
1677 				indirectIndex++) {
1678 			// get the indirect array block
1679 			block_run* array = (block_run*)cached.SetTo(
1680 				fVolume->ToBlock(data->double_indirect)
1681 					+ indirectIndex / runsPerBlock);
1682 			if (array == NULL)
1683 				return B_IO_ERROR;
1684 
1685 			block_run indirect = array[indirectIndex % runsPerBlock];
1686 			// are we finished yet?
1687 			if (indirect.IsZero())
1688 				return B_OK;
1689 
1690 			status = CheckBlockRun(indirect, "double indirect->runs", control);
1691 			if (status < B_OK)
1692 				return status;
1693 
1694 			int32 maxIndex = (indirect.Length() << fVolume->BlockShift())
1695 				/ sizeof(block_run);
1696 
1697 			for (int32 index = 0; index < maxIndex; ) {
1698 				block_run* runs = (block_run*)cachedDirect.SetTo(
1699 					fVolume->ToBlock(indirect) + index / runsPerBlock);
1700 				if (runs == NULL)
1701 					return B_IO_ERROR;
1702 
1703 				do {
1704 					// are we finished yet?
1705 					if (runs[index % runsPerBlock].IsZero())
1706 						return B_OK;
1707 
1708 					status = CheckBlockRun(runs[index % runsPerBlock],
1709 						"double indirect->runs->run", control);
1710 					if (status < B_OK)
1711 						return status;
1712 				} while ((++index % runsPerBlock) != 0);
1713 			}
1714 		}
1715 	}
1716 
1717 	return B_OK;
1718 }
1719 
1720 
1721 //	#pragma mark - debugger commands
1722 
1723 
1724 #ifdef BFS_DEBUGGER_COMMANDS
1725 
1726 void
1727 BlockAllocator::Dump(int32 index)
1728 {
1729 	kprintf("allocation groups: %ld\n", fNumGroups);
1730 	kprintf("blocks per group: %ld\n", fBlocksPerGroup);
1731 
1732 	for (int32 i = 0; i < fNumGroups; i++) {
1733 		if (index != -1 && i != index)
1734 			continue;
1735 
1736 		AllocationGroup& group = fGroups[i];
1737 
1738 		kprintf("[%3ld] num bits:       %lu\n", i, group.NumBits());
1739 		kprintf("      num blocks:     %lu\n", group.NumBlocks());
1740 		kprintf("      start:          %ld\n", group.Start());
1741 		kprintf("      first free:     %ld\n", group.fFirstFree);
1742 		kprintf("      largest start:  %ld%s\n", group.fLargestStart,
1743 			group.fLargestValid ? "" : "  (invalid)");
1744 		kprintf("      largest length: %ld\n", group.fLargestLength);
1745 		kprintf("      free bits:      %ld\n", group.fFreeBits);
1746 	}
1747 }
1748 
1749 
1750 #if BFS_TRACING
1751 static char kTraceBuffer[256];
1752 
1753 
1754 int
1755 dump_block_allocator_blocks(int argc, char** argv)
1756 {
1757 	if (argc != 3 || !strcmp(argv[1], "--help")) {
1758 		kprintf("usage: %s <ptr-to-volume> <block>\n", argv[0]);
1759 		return 0;
1760 	}
1761 
1762 	Volume* volume = (Volume*)parse_expression(argv[1]);
1763 	off_t block = parse_expression(argv[2]);
1764 
1765 	// iterate over all tracing entries to find overlapping actions
1766 
1767 	using namespace BFSBlockTracing;
1768 
1769 	LazyTraceOutput out(kTraceBuffer, sizeof(kTraceBuffer), 0);
1770 	TraceEntryIterator iterator;
1771 	while (TraceEntry* _entry = iterator.Next()) {
1772 		if (Allocate* entry = dynamic_cast<Allocate*>(_entry)) {
1773 			off_t first = volume->ToBlock(entry->Run());
1774 			off_t last = first - 1 + entry->Run().Length();
1775 			if (block >= first && block <= last) {
1776 				out.Clear();
1777 				const char* dump = out.DumpEntry(entry);
1778 				kprintf("%5ld. %s\n", iterator.Index(), dump);
1779 			}
1780 		} else if (Free* entry = dynamic_cast<Free*>(_entry)) {
1781 			off_t first = volume->ToBlock(entry->Run());
1782 			off_t last = first - 1 + entry->Run().Length();
1783 			if (block >= first && block <= last) {
1784 				out.Clear();
1785 				const char* dump = out.DumpEntry(entry);
1786 				kprintf("%5ld. %s\n", iterator.Index(), dump);
1787 			}
1788 		}
1789 	}
1790 
1791 	return 0;
1792 }
1793 #endif
1794 
1795 
1796 int
1797 dump_block_allocator(int argc, char** argv)
1798 {
1799 	int32 group = -1;
1800 	if (argc == 3) {
1801 		group = parse_expression(argv[2]);
1802 		argc--;
1803 	}
1804 
1805 	if (argc != 2 || !strcmp(argv[1], "--help")) {
1806 		kprintf("usage: %s <ptr-to-volume> [group]\n", argv[0]);
1807 		return 0;
1808 	}
1809 
1810 	Volume* volume = (Volume*)parse_expression(argv[1]);
1811 	BlockAllocator& allocator = volume->Allocator();
1812 
1813 	allocator.Dump(group);
1814 	return 0;
1815 }
1816 
1817 #endif	// BFS_DEBUGGER_COMMANDS
1818 
1819