xref: /haiku/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp (revision 9760dcae2038d47442f4658c2575844c6cf92c40)
1 /*
2  * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! block bitmap handling and allocation policies
7 
8 
9 #include "BlockAllocator.h"
10 
11 #include "bfs_control.h"
12 #include "BPlusTree.h"
13 #include "Debug.h"
14 #include "Inode.h"
15 #include "Volume.h"
16 
17 
18 // Things the BlockAllocator should do:
19 
20 // - find a range of blocks of a certain size near a specific position
21 // - allocate a fuzzy (imprecisely sized) range of blocks for pre-allocation
22 // - free blocks
23 // - know how to deal with each allocation, special handling for directories,
24 //   files, symlinks, etc. (type-sensitive allocation policies)
25 
26 // What makes the code complicated is the fact that we are not just reading
27 // in the whole bitmap and operating on it in memory - e.g. a 13 GB partition
28 // with a block size of 2048 bytes already has an 800 kB bitmap, and the size
29 // of partitions will grow even more - so that's not an option.
30 // Instead, we read each bitmap block in as it is needed - since an allocation
31 // group can span several blocks in the block bitmap, the AllocationBlock
32 // class is there to make handling those easier.
33 
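// A quick worked example of that arithmetic:
//   13 GB / 2048 bytes per block   ->  ~6.5 million blocks (= bitmap bits)
//   ~6.5 million bits / 8          ->  ~800 kB of bitmap
//   ~800 kB / 2048 bytes per block ->  ~400 bitmap blocks to manage
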
34 // The current implementation is only slightly optimized and could probably
35 // be improved a lot. Furthermore, the allocation policies used here should
36 // be validated with some real-world tests.
37 
38 #if BFS_TRACING && !defined(BFS_SHELL)
39 namespace BFSBlockTracing {
40 
41 class Allocate : public AbstractTraceEntry {
42 public:
43 	Allocate(block_run run)
44 		:
45 		fRun(run)
46 	{
47 		Initialized();
48 	}
49 
50 	virtual void AddDump(TraceOutput& out)
51 	{
52 		out.Print("bfs:alloc %lu.%u.%u", fRun.AllocationGroup(),
53 			fRun.Start(), fRun.Length());
54 	}
55 
56 	const block_run& Run() const { return fRun; }
57 
58 private:
59 	block_run	fRun;
60 };
61 
62 class Free : public AbstractTraceEntry {
63 public:
64 	Free(block_run run)
65 		:
66 		fRun(run)
67 	{
68 		Initialized();
69 	}
70 
71 	virtual void AddDump(TraceOutput& out)
72 	{
73 		out.Print("bfs:free %lu.%u.%u", fRun.AllocationGroup(),
74 			fRun.Start(), fRun.Length());
75 	}
76 
77 	const block_run& Run() const { return fRun; }
78 
79 private:
80 	block_run	fRun;
81 };
82 
83 
84 static uint32
85 checksum(const uint8* data, size_t size)
86 {
87 	const uint32* data4 = (const uint32*)data;
88 	uint32 sum = 0;
89 	while (size >= 4) {
90 		sum += *data4;
91 		data4++;
92 		size -= 4;
93 	}
94 	return sum;
95 }
96 
97 
98 class Block : public AbstractTraceEntry {
99 public:
100 	Block(const char* label, off_t blockNumber, const uint8* data,
101 			size_t size, uint32 start = 0, uint32 length = 0)
102 		:
103 		fBlock(blockNumber),
104 		fData(data),
105 		fStart(start),
106 		fLength(length),
107 		fLabel(label)
108 	{
109 		fSum = checksum(data, size);
110 		Initialized();
111 	}
112 
113 	virtual void AddDump(TraceOutput& out)
114 	{
115 		out.Print("bfs:%s: block %Ld (%p), sum %lu, s/l %lu/%lu", fLabel,
116 			fBlock, fData, fSum, fStart, fLength);
117 	}
118 
119 private:
120 	off_t		fBlock;
121 	const uint8	*fData;
122 	uint32		fStart;
123 	uint32		fLength;
124 	uint32		fSum;
125 	const char*	fLabel;
126 };
127 
128 
129 class BlockChange : public AbstractTraceEntry {
130 public:
131 	BlockChange(const char* label, int32 block, uint32 oldData, uint32 newData)
132 		:
133 		fBlock(block),
134 		fOldData(oldData),
135 		fNewData(newData),
136 		fLabel(label)
137 	{
138 		Initialized();
139 	}
140 
141 	virtual void AddDump(TraceOutput& out)
142 	{
143 		out.Print("bfs:%s: block %ld, %08lx -> %08lx", fLabel,
144 			fBlock, fOldData, fNewData);
145 	}
146 
147 private:
148 	int32		fBlock;
149 	uint32		fOldData;
150 	uint32		fNewData;
151 	const char*	fLabel;
152 };
153 
154 }	// namespace BFSBlockTracing
155 
156 #	define T(x) new(std::nothrow) BFSBlockTracing::x;
157 #else
158 #	define T(x) ;
159 #endif
160 
161 #ifdef DEBUG_ALLOCATION_GROUPS
162 #	define CHECK_ALLOCATION_GROUP(group) _CheckGroup(group)
163 #else
164 #	define CHECK_ALLOCATION_GROUP(group) ;
165 #endif
166 
167 
168 struct check_cookie {
169 	check_cookie() {}
170 
171 	block_run			current;
172 	Inode*				parent;
173 	mode_t				parent_mode;
174 	Stack<block_run>	stack;
175 	TreeIterator*		iterator;
176 };
177 
178 
179 class AllocationBlock : public CachedBlock {
180 public:
181 	AllocationBlock(Volume* volume);
182 
183 	inline void Allocate(uint16 start, uint16 numBlocks);
184 	inline void Free(uint16 start, uint16 numBlocks);
185 	inline bool IsUsed(uint16 block);
186 
187 	status_t SetTo(AllocationGroup& group, uint16 block);
188 	status_t SetToWritable(Transaction& transaction, AllocationGroup& group,
189 		uint16 block);
190 
191 	uint32 NumBlockBits() const { return fNumBits; }
192 	uint32& Block(int32 index) { return ((uint32*)fBlock)[index]; }
193 	uint8* Block() const { return (uint8*)fBlock; }
194 
195 private:
196 	uint32	fNumBits;
197 #ifdef DEBUG
198 	bool	fWritable;
199 #endif
200 };
201 
202 
203 class AllocationGroup {
204 public:
205 	AllocationGroup();
206 
207 	void AddFreeRange(int32 start, int32 blocks);
208 	bool IsFull() const { return fFreeBits == 0; }
209 
210 	status_t Allocate(Transaction& transaction, uint16 start, int32 length);
211 	status_t Free(Transaction& transaction, uint16 start, int32 length);
212 
213 	uint32 NumBits() const { return fNumBits; }
214 	uint32 NumBlocks() const { return fNumBlocks; }
215 	int32 Start() const { return fStart; }
216 
217 private:
218 	friend class BlockAllocator;
219 
220 	uint32	fNumBits;
221 	uint32	fNumBlocks;
222 	int32	fStart;
223 	int32	fFirstFree;
224 	int32	fFreeBits;
225 
226 	int32	fLargestStart;
227 	int32	fLargestLength;
228 	bool	fLargestValid;
229 };
230 
231 
232 AllocationBlock::AllocationBlock(Volume* volume)
233 	: CachedBlock(volume)
234 {
235 }
236 
237 
238 status_t
239 AllocationBlock::SetTo(AllocationGroup& group, uint16 block)
240 {
241 	// 8 blocks per byte
242 	fNumBits = fVolume->BlockSize() << 3;
243 	// the last group may have fewer bits than the others
244 	if ((block + 1) * fNumBits > group.NumBits())
245 		fNumBits = group.NumBits() % fNumBits;
246 
247 #ifdef DEBUG
248 	fWritable = false;
249 #endif
250 	return CachedBlock::SetTo(group.Start() + block) != NULL ? B_OK : B_ERROR;
251 }
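
// For example, with 2048-byte blocks every bitmap block holds 16384 bits; if
// the (hypothetical) last group spanned only 20000 bits, its second bitmap
// block would be left with 20000 % 16384 = 3616 valid bits.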
252 
253 
254 status_t
255 AllocationBlock::SetToWritable(Transaction& transaction, AllocationGroup& group,
256 	uint16 block)
257 {
258 	// 8 blocks per byte
259 	fNumBits = fVolume->BlockSize() << 3;
260 	// the last group may have fewer bits in its last block
261 	if ((block + 1) * fNumBits > group.NumBits())
262 		fNumBits = group.NumBits() % fNumBits;
263 
264 #ifdef DEBUG
265 	fWritable = true;
266 #endif
267 	return CachedBlock::SetToWritable(transaction, group.Start() + block)
268 		!= NULL ? B_OK : B_ERROR;
269 }
270 
271 
272 bool
273 AllocationBlock::IsUsed(uint16 block)
274 {
275 	if (block >= fNumBits)
276 		return true;
277 
278 	// the block bitmap is accessed in 32-bit blocks
279 	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
280 }
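
// E.g. bit 70 of a bitmap block lives in 32-bit word 70 >> 5 = 2, at bit
// position 70 % 32 = 6; the words are stored in BFS byte order on disk,
// which is what the endian conversion of the mask accounts for.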
281 
282 
283 void
284 AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
285 {
286 	ASSERT(start < fNumBits);
287 	ASSERT(uint32(start + numBlocks) <= fNumBits);
288 #ifdef DEBUG
289 	ASSERT(fWritable);
290 #endif
291 
292 	T(Block("b-alloc-in", fBlockNumber, fBlock, fVolume->BlockSize(),
293 		start, numBlocks));
294 
295 	int32 block = start >> 5;
296 
297 	while (numBlocks > 0) {
298 		uint32 mask = 0;
299 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
300 			mask |= 1UL << i;
301 
302 		T(BlockChange("b-alloc", block, Block(block),
303 			Block(block) | HOST_ENDIAN_TO_BFS_INT32(mask)));
304 
305 #if KDEBUG
306 		// check for already set blocks
307 		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32*)fBlock)[block]) {
308 			FATAL(("AllocationBlock::Allocate(): some blocks are already "
309 				"allocated, start = %u, numBlocks = %u\n", start, numBlocks));
310 			panic("blocks already set!");
311 		}
312 #endif
313 
314 		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
315 		start = 0;
316 	}
317 	T(Block("b-alloc-out", fBlockNumber, fBlock, fVolume->BlockSize(),
318 		start, numBlocks));
319 }
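
// Mask sketch: start = 5 and numBlocks = 4 set bits 5..8 of the first word,
// i.e. mask = 0x000001e0; longer runs spill over into the following words
// with start reset to 0, so their masks begin at bit 0.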
320 
321 
322 void
323 AllocationBlock::Free(uint16 start, uint16 numBlocks)
324 {
325 	ASSERT(start < fNumBits);
326 	ASSERT(uint32(start + numBlocks) <= fNumBits);
327 #ifdef DEBUG
328 	ASSERT(fWritable);
329 #endif
330 
331 	int32 block = start >> 5;
332 
333 	while (numBlocks > 0) {
334 		uint32 mask = 0;
335 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
336 			mask |= 1UL << i;
337 
338 		T(BlockChange("b-free", block, Block(block),
339 			Block(block) & HOST_ENDIAN_TO_BFS_INT32(~mask)));
340 
341 		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
342 		start = 0;
343 	}
344 }
345 
346 
347 //	#pragma mark -
348 
349 
350 /*!	The allocation groups are created and initialized in
351 	BlockAllocator::Initialize() and BlockAllocator::InitializeAndClearBitmap()
352 	respectively.
353 */
354 AllocationGroup::AllocationGroup()
355 	:
356 	fFirstFree(-1),
357 	fFreeBits(0),
358 	fLargestValid(false)
359 {
360 }
361 
362 
363 void
364 AllocationGroup::AddFreeRange(int32 start, int32 blocks)
365 {
366 	//D(if (blocks > 512)
367 	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));
368 
369 	if (fFirstFree == -1)
370 		fFirstFree = start;
371 
372 	if (!fLargestValid || fLargestLength < blocks) {
373 		fLargestStart = start;
374 		fLargestLength = blocks;
375 		fLargestValid = true;
376 	}
377 
378 	fFreeBits += blocks;
379 }
380 
381 
382 /*!	Allocates the specified run in the allocation group.
383 	Doesn't check whether the run is valid or already partially allocated, nor
384 	does it maintain the free ranges hints or the volume's used blocks count.
385 	It only does the low-level work of allocating some bits in the block bitmap.
386 	Assumes that the block bitmap lock is held.
387 */
388 status_t
389 AllocationGroup::Allocate(Transaction& transaction, uint16 start, int32 length)
390 {
391 	ASSERT(start + length <= (int32)fNumBits);
392 
393 	// Update the allocation group info
394 	// TODO: this info will be incorrect if something goes wrong later
395 	// Note, the fFirstFree block doesn't have to be really free
396 	if (start == fFirstFree)
397 		fFirstFree = start + length;
398 	fFreeBits -= length;
399 
400 	if (fLargestValid) {
401 		bool cut = false;
402 		if (fLargestStart == start) {
403 			// cut from start
404 			fLargestStart += length;
405 			fLargestLength -= length;
406 			cut = true;
407 		} else if (start > fLargestStart
408 			&& start < fLargestStart + fLargestLength) {
409 			// cut from end
410 			fLargestLength = start - fLargestStart;
411 			cut = true;
412 		}
413 		if (cut && (fLargestLength < fLargestStart
414 				|| fLargestLength
415 						< (int32)fNumBits - (fLargestStart + fLargestLength))) {
416 			// might not be the largest block anymore
417 			fLargestValid = false;
418 		}
419 	}
420 
421 	Volume* volume = transaction.GetVolume();
422 
423 	// calculate block in the block bitmap and position within
424 	uint32 bitsPerBlock = volume->BlockSize() << 3;
425 	uint32 block = start / bitsPerBlock;
426 	start = start % bitsPerBlock;
427 
428 	AllocationBlock cached(volume);
429 
430 	while (length > 0) {
431 		if (cached.SetToWritable(transaction, *this, block) < B_OK) {
432 			fLargestValid = false;
433 			RETURN_ERROR(B_IO_ERROR);
434 		}
435 
436 		uint32 numBlocks = length;
437 		if (start + numBlocks > cached.NumBlockBits())
438 			numBlocks = cached.NumBlockBits() - start;
439 
440 		cached.Allocate(start, numBlocks);
441 
442 		length -= numBlocks;
443 		start = 0;
444 		block++;
445 	}
446 
447 	return B_OK;
448 }
449 
450 
451 /*!	Frees the specified run in the allocation group.
452 	Doesn't check whether the run is valid or even fully allocated, nor
453 	does it maintain the free ranges hints or the volume's used blocks count.
454 	It only does the low-level work of freeing some bits in the block bitmap.
455 	Assumes that the block bitmap lock is held.
456 */
457 status_t
458 AllocationGroup::Free(Transaction& transaction, uint16 start, int32 length)
459 {
460 	ASSERT(start + length <= (int32)fNumBits);
461 
462 	// Update the allocation group info
463 	// TODO: this info will be incorrect if something goes wrong later
464 	if (fFirstFree > start)
465 		fFirstFree = start;
466 	fFreeBits += length;
467 
468 	// The range to be freed cannot be part of the valid largest range
469 	ASSERT(!fLargestValid || start + length <= fLargestStart
470 		|| start > fLargestStart);
471 
472 	if (fLargestValid
473 		&& (start + length == fLargestStart
474 			|| fLargestStart + fLargestLength == start
475 			|| (start < fLargestStart && fLargestStart > fLargestLength)
476 			|| (start > fLargestStart
477 				&& (int32)fNumBits - (fLargestStart + fLargestLength)
478 						> fLargestLength))) {
479 		fLargestValid = false;
480 	}
481 
482 	Volume* volume = transaction.GetVolume();
483 
484 	// calculate block in the block bitmap and position within
485 	uint32 bitsPerBlock = volume->BlockSize() << 3;
486 	uint32 block = start / bitsPerBlock;
487 	start = start % bitsPerBlock;
488 
489 	AllocationBlock cached(volume);
490 
491 	while (length > 0) {
492 		if (cached.SetToWritable(transaction, *this, block) < B_OK)
493 			RETURN_ERROR(B_IO_ERROR);
494 
495 		T(Block("free-1", block, cached.Block(), volume->BlockSize()));
496 		uint16 freeLength = length;
497 		if (uint32(start + length) > cached.NumBlockBits())
498 			freeLength = cached.NumBlockBits() - start;
499 
500 		cached.Free(start, freeLength);
501 
502 		length -= freeLength;
503 		start = 0;
504 		T(Block("free-2", block, cached.Block(), volume->BlockSize()));
505 		block++;
506 	}
507 	return B_OK;
508 }
509 
510 
511 //	#pragma mark -
512 
513 
514 BlockAllocator::BlockAllocator(Volume* volume)
515 	:
516 	fVolume(volume),
517 	fGroups(NULL),
518 	fCheckBitmap(NULL)
519 {
520 	mutex_init(&fLock, "bfs allocator");
521 }
522 
523 
524 BlockAllocator::~BlockAllocator()
525 {
526 	mutex_destroy(&fLock);
527 	delete[] fGroups;
528 }
529 
530 
531 status_t
532 BlockAllocator::Initialize(bool full)
533 {
534 	fNumGroups = fVolume->AllocationGroups();
535 	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
536 	fNumBlocks = (fVolume->NumBlocks() + fVolume->BlockSize() * 8 - 1)
537 		/ (fVolume->BlockSize() * 8);
538 
539 	fGroups = new AllocationGroup[fNumGroups];
540 	if (fGroups == NULL)
541 		return B_NO_MEMORY;
542 
543 	if (!full)
544 		return B_OK;
545 
546 	mutex_lock(&fLock);
547 		// the lock will be released by the _Initialize() method
548 
549 	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::_Initialize,
550 		"bfs block allocator", B_LOW_PRIORITY, this);
551 	if (id < B_OK)
552 		return _Initialize(this);
553 
554 	mutex_transfer_lock(&fLock, id);
555 
556 	return resume_thread(id);
557 }
558 
559 
560 status_t
561 BlockAllocator::InitializeAndClearBitmap(Transaction& transaction)
562 {
563 	status_t status = Initialize(false);
564 	if (status != B_OK)
565 		return status;
566 
567 	uint32 numBits = 8 * fBlocksPerGroup * fVolume->BlockSize();
568 	uint32 blockShift = fVolume->BlockShift();
569 
570 	uint32* buffer = (uint32*)malloc(numBits >> 3);
571 	if (buffer == NULL)
572 		RETURN_ERROR(B_NO_MEMORY);
573 
574 	memset(buffer, 0, numBits >> 3);
575 
576 	off_t offset = 1;
577 		// the bitmap starts directly after the super block
578 
579 	// initialize the AllocationGroup objects and clear the on-disk bitmap
580 
581 	for (int32 i = 0; i < fNumGroups; i++) {
582 		if (write_pos(fVolume->Device(), offset << blockShift, buffer,
583 				fBlocksPerGroup << blockShift) < B_OK)
584 			return B_ERROR;
585 
586 		// the last allocation group may contain fewer blocks than the others
587 		if (i == fNumGroups - 1) {
588 			fGroups[i].fNumBits = fVolume->NumBlocks() - i * numBits;
589 			fGroups[i].fNumBlocks = 1 + ((fGroups[i].NumBits() - 1)
590 				>> (blockShift + 3));
591 		} else {
592 			fGroups[i].fNumBits = numBits;
593 			fGroups[i].fNumBlocks = fBlocksPerGroup;
594 		}
595 		fGroups[i].fStart = offset;
596 		fGroups[i].fFirstFree = fGroups[i].fLargestStart = 0;
597 		fGroups[i].fFreeBits = fGroups[i].fLargestLength = fGroups[i].fNumBits;
598 		fGroups[i].fLargestValid = true;
599 
600 		offset += fBlocksPerGroup;
601 	}
602 	free(buffer);
603 
604 	// reserve the boot block, the log area, and the block bitmap itself
605 	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();
606 
607 	if (fGroups[0].Allocate(transaction, 0, reservedBlocks) < B_OK) {
608 		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
609 		return B_ERROR;
610 	}
611 	fVolume->SuperBlock().used_blocks
612 		= HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);
613 
614 	return B_OK;
615 }
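
// A worked example, assuming one bitmap block per allocation group: with
// 2048-byte blocks a group covers 16384 blocks, so a ~13 GB volume ends up
// with ~400 groups whose bitmap occupies the blocks right after the super
// block. The Allocate() call above then reserves everything from the boot
// block up to the end of the log in one go.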
616 
617 
618 status_t
619 BlockAllocator::_Initialize(BlockAllocator* allocator)
620 {
621 	// The lock must already be held at this point
622 
623 	Volume* volume = allocator->fVolume;
624 	uint32 blocks = allocator->fBlocksPerGroup;
625 	uint32 blockShift = volume->BlockShift();
626 	off_t freeBlocks = 0;
627 
628 	uint32* buffer = (uint32*)malloc(blocks << blockShift);
629 	if (buffer == NULL) {
630 		mutex_unlock(&allocator->fLock);
631 		RETURN_ERROR(B_NO_MEMORY);
632 	}
633 
634 	AllocationGroup* groups = allocator->fGroups;
635 	off_t offset = 1;
636 	uint32 bitsPerGroup = 8 * (blocks << blockShift);
637 	int32 numGroups = allocator->fNumGroups;
638 
639 	for (int32 i = 0; i < numGroups; i++) {
640 		if (read_pos(volume->Device(), offset << blockShift, buffer,
641 				blocks << blockShift) < B_OK)
642 			break;
643 
644 		// the last allocation group may contain fewer blocks than the others
645 		if (i == numGroups - 1) {
646 			groups[i].fNumBits = volume->NumBlocks() - i * bitsPerGroup;
647 			groups[i].fNumBlocks = 1 + ((groups[i].NumBits() - 1)
648 				>> (blockShift + 3));
649 		} else {
650 			groups[i].fNumBits = bitsPerGroup;
651 			groups[i].fNumBlocks = blocks;
652 		}
653 		groups[i].fStart = offset;
654 
655 		// finds all free ranges in this allocation group
656 		int32 start = -1, range = 0;
657 		int32 numBits = groups[i].fNumBits, bit = 0;
658 		int32 count = (numBits + 31) / 32;
659 
660 		for (int32 k = 0; k < count; k++) {
661 			for (int32 j = 0; j < 32 && bit < numBits; j++, bit++) {
662 				if (buffer[k] & (1UL << j)) {
663 					// block is in use
664 					if (range > 0) {
665 						groups[i].AddFreeRange(start, range);
666 						range = 0;
667 					}
668 				} else if (range++ == 0) {
669 					// block is free, start new free range
670 					start = bit;
671 				}
672 			}
673 		}
674 		if (range)
675 			groups[i].AddFreeRange(start, range);
676 
677 		freeBlocks += groups[i].fFreeBits;
678 
679 		offset += blocks;
680 	}
681 	free(buffer);
682 
683 	// check if block bitmap and log area are reserved
684 	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
685 
686 	if (allocator->CheckBlocks(0, reservedBlocks) != B_OK) {
687 		if (volume->IsReadOnly()) {
688 			FATAL(("Space for block bitmap or log area is not reserved "
689 				"(volume is mounted read-only)!\n"));
690 		} else {
691 			Transaction transaction(volume, 0);
692 			if (groups[0].Allocate(transaction, 0, reservedBlocks) != B_OK) {
693 				FATAL(("Could not allocate reserved space for block "
694 					"bitmap/log!\n"));
695 				volume->Panic();
696 			} else {
697 				transaction.Done();
698 				FATAL(("Space for block bitmap or log area was not "
699 					"reserved!\n"));
700 			}
701 		}
702 	}
703 
704 	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
705 	if (volume->UsedBlocks() != usedBlocks) {
706 		// If the disk was in a dirty state at mount time, it's
707 		// normal that the values don't match
708 		INFORM(("volume reports %" B_PRIdOFF " used blocks, correct is %"
709 			B_PRIdOFF "\n", volume->UsedBlocks(), usedBlocks));
710 		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
711 	}
712 
713 	mutex_unlock(&allocator->fLock);
714 	return B_OK;
715 }
716 
717 
718 void
719 BlockAllocator::Uninitialize()
720 {
721 	// We only have to make sure that the initializer thread isn't running
722 	// anymore.
723 	mutex_lock(&fLock);
724 }
725 
726 
727 /*!	Tries to allocate between \a minimum and \a maximum blocks starting
728 	at group \a groupIndex with offset \a start. The resulting allocation
729 	is put into \a run.
730 
731 	The number of allocated blocks is always a multiple of \a minimum, which
732 	has to be a power of two.
733 */
734 status_t
735 BlockAllocator::AllocateBlocks(Transaction& transaction, int32 groupIndex,
736 	uint16 start, uint16 maximum, uint16 minimum, block_run& run)
737 {
738 	if (maximum == 0)
739 		return B_BAD_VALUE;
740 
741 	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n",
742 		groupIndex, start, maximum, minimum));
743 
744 	AllocationBlock cached(fVolume);
745 	MutexLocker lock(fLock);
746 
747 	uint32 bitsPerFullBlock = fVolume->BlockSize() << 3;
748 
749 	// Find the block_run that can fulfill the request best
750 	int32 bestGroup = -1;
751 	int32 bestStart = -1;
752 	int32 bestLength = -1;
753 
754 	for (int32 i = 0; i < fNumGroups + 1; i++, groupIndex++, start = 0) {
755 		groupIndex = groupIndex % fNumGroups;
756 		AllocationGroup& group = fGroups[groupIndex];
757 
758 		CHECK_ALLOCATION_GROUP(groupIndex);
759 
760 		if (start >= group.NumBits() || group.IsFull())
761 			continue;
762 
763 		// Skip ahead to the group's first (possibly) free bit; the group's
764 		// cached largest free range may let us take or skip it right away
765 
766 		if (start < group.fFirstFree)
767 			start = group.fFirstFree;
768 
769 		if (group.fLargestValid) {
770 			if (group.fLargestLength < bestLength)
771 				continue;
772 
773 			if (group.fLargestStart >= start) {
774 				if (group.fLargestLength >= bestLength) {
775 					bestGroup = groupIndex;
776 					bestStart = group.fLargestStart;
777 					bestLength = group.fLargestLength;
778 
779 					if (bestLength >= maximum)
780 						break;
781 				}
782 
783 				// We know everything about this group we have to, let's skip
784 				// to the next
785 				continue;
786 			}
787 		}
788 
789 		// There may be more than one block per allocation group - and
790 		// we iterate through it to find a place for the allocation.
791 		// (one allocation can't exceed one allocation group)
792 
793 		uint32 block = start / (fVolume->BlockSize() << 3);
794 		int32 currentStart = 0, currentLength = 0;
795 		int32 groupLargestStart = -1;
796 		int32 groupLargestLength = -1;
797 		int32 currentBit = start;
798 		bool canFindGroupLargest = start == 0;
799 
800 		for (; block < group.NumBlocks(); block++) {
801 			if (cached.SetTo(group, block) < B_OK)
802 				RETURN_ERROR(B_ERROR);
803 
804 			T(Block("alloc-in", group.Start() + block, cached.Block(),
805 				fVolume->BlockSize(), groupIndex, currentStart));
806 
807 			// find a block large enough to hold the allocation
808 			for (uint32 bit = start % bitsPerFullBlock;
809 					bit < cached.NumBlockBits(); bit++) {
810 				if (!cached.IsUsed(bit)) {
811 					if (currentLength == 0) {
812 						// start new range
813 						currentStart = currentBit;
814 					}
815 
816 					// have we found a range large enough for the wanted maximum?
817 					if (++currentLength >= maximum) {
818 						bestGroup = groupIndex;
819 						bestStart = currentStart;
820 						bestLength = currentLength;
821 						break;
822 					}
823 				} else {
824 					if (currentLength) {
825 						// end of a range
826 						if (currentLength > bestLength) {
827 							bestGroup = groupIndex;
828 							bestStart = currentStart;
829 							bestLength = currentLength;
830 						}
831 						if (currentLength > groupLargestLength) {
832 							groupLargestStart = currentStart;
833 							groupLargestLength = currentLength;
834 						}
835 						currentLength = 0;
836 					}
837 					if ((int32)group.NumBits() - currentBit
838 							<= groupLargestLength) {
839 						// We can't find a bigger block in this group anymore,
840 						// let's skip the rest.
841 						block = group.NumBlocks();
842 						break;
843 					}
844 				}
845 				currentBit++;
846 			}
847 
848 			T(Block("alloc-out", block, cached.Block(),
849 				fVolume->BlockSize(), groupIndex, currentStart));
850 
851 			if (bestLength >= maximum) {
852 				canFindGroupLargest = false;
853 				break;
854 			}
855 
856 			// start from the beginning of the next block
857 			start = 0;
858 		}
859 
860 		if (currentBit == (int32)group.NumBits()) {
861 			if (currentLength > bestLength) {
862 				bestGroup = groupIndex;
863 				bestStart = currentStart;
864 				bestLength = currentLength;
865 			}
866 			if (canFindGroupLargest && currentLength > groupLargestLength) {
867 				groupLargestStart = currentStart;
868 				groupLargestLength = currentLength;
869 			}
870 		}
871 
872 		if (canFindGroupLargest && !group.fLargestValid
873 			&& groupLargestLength >= 0) {
874 			group.fLargestStart = groupLargestStart;
875 			group.fLargestLength = groupLargestLength;
876 			group.fLargestValid = true;
877 		}
878 
879 		if (bestLength >= maximum)
880 			break;
881 	}
882 
883 	// If we found a suitable range, mark the blocks as in use, and
884 	// write the updated block bitmap back to disk
885 	if (bestLength < minimum)
886 		return B_DEVICE_FULL;
887 
888 	if (bestLength > maximum)
889 		bestLength = maximum;
890 	else if (minimum > 1) {
891 		// make sure bestLength is a multiple of minimum
892 		bestLength = round_down(bestLength, minimum);
893 	}
894 
895 	if (fGroups[bestGroup].Allocate(transaction, bestStart, bestLength) != B_OK)
896 		RETURN_ERROR(B_IO_ERROR);
897 
898 	CHECK_ALLOCATION_GROUP(bestGroup);
899 
900 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(bestGroup);
901 	run.start = HOST_ENDIAN_TO_BFS_INT16(bestStart);
902 	run.length = HOST_ENDIAN_TO_BFS_INT16(bestLength);
903 
904 	fVolume->SuperBlock().used_blocks
905 		= HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + bestLength);
906 		// We are not writing back the disk's super block - it's
907 		// either done by the journaling code, or when the disk
908 		// is unmounted.
909 		// If the value is not correct at mount time, it will be
910 		// fixed anyway.
911 
912 	// We need to discard any remaining cached blocks in the new allocation
913 	// to make sure they won't interfere with the file cache.
914 	block_cache_discard(fVolume->BlockCache(), fVolume->ToBlock(run),
915 		run.Length());
916 
917 	T(Allocate(run));
918 	return B_OK;
919 }
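
// Usage sketch (hypothetical values) - ask for 8 to 64 blocks near group 5:
//   block_run run;
//   if (allocator.AllocateBlocks(transaction, 5, 0, 64, 8, run) == B_OK)
//       ; // run now covers a multiple of 8 blocks, at most 64 of them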
920 
921 
922 status_t
923 BlockAllocator::AllocateForInode(Transaction& transaction,
924 	const block_run* parent, mode_t type, block_run& run)
925 {
926 	// Apply some allocation policies here (AllocateBlocks() will break them
927 	// if necessary) - we will start with those described in Dominic Giampaolo's
928 	// "Practical File System Design", and see how good they work
929 
930 	// Files go into the same allocation group as their parent,
931 	// sub-directories will be placed 8 allocation groups after
932 	// their parent's group
933 	uint16 group = parent->AllocationGroup();
934 	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
935 		group += 8;
936 
937 	return AllocateBlocks(transaction, group, 0, 1, 1, run);
938 }
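
// E.g. for a parent directory in allocation group 3, a new file starts its
// search in group 3 and a new sub-directory in group 11; AllocateBlocks()
// takes the group index modulo the group count, so this also holds near the
// last group.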
939 
940 
941 status_t
942 BlockAllocator::Allocate(Transaction& transaction, Inode* inode,
943 	off_t numBlocks, block_run& run, uint16 minimum)
944 {
945 	if (numBlocks <= 0)
946 		return B_ERROR;
947 
948 	// one block_run can't hold more data than there is in one allocation group
949 	if (numBlocks > fGroups[0].NumBits())
950 		numBlocks = fGroups[0].NumBits();
951 
952 	// since block_run.length is uint16, the largest number of blocks that
953 	// can be covered by a block_run is 65535
954 	// TODO: if we drop compatibility, couldn't we do this any better?
955 	// There are basically three possibilities:
956 	// a) since a length of zero doesn't make any sense, let it stand for 65536 -
957 	//    but that could cause many problems (bugs) in other areas
958 	// b) reduce the maximum amount of blocks per block_run, so that the
959 	//    remaining number of free blocks can be used in a useful manner
960 	//    (like 4 blocks) - but that would also reduce the maximum file size
961 	// c) have BlockRun::Length() return (length + 1).
962 	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
963 		numBlocks = MAX_BLOCK_RUN_LENGTH;
964 
965 	// Apply some allocation policies here (AllocateBlocks() will break them
966 	// if necessary)
967 	uint16 group = inode->BlockRun().AllocationGroup();
968 	uint16 start = 0;
969 
970 	// Are there already allocated blocks? (then just try to allocate near the
971 	// last one)
972 	if (inode->Size() > 0) {
973 		const data_stream& data = inode->Node().data;
974 		// TODO: we currently don't handle the case where the data stream
975 		// has already grown into the indirect ranges
976 		if (data.max_double_indirect_range == 0
977 			&& data.max_indirect_range == 0) {
978 			// Since size > 0, there must be a valid block run in this stream
979 			int32 last = 0;
980 			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
981 				if (data.direct[last + 1].IsZero())
982 					break;
983 
984 			group = data.direct[last].AllocationGroup();
985 			start = data.direct[last].Start() + data.direct[last].Length();
986 		}
987 	} else if (inode->IsContainer() || inode->IsSymLink()) {
988 		// directory and symbolic link data will go into the same allocation
989 		// group as the inode itself, but after the inode data
990 		start = inode->BlockRun().Start();
991 	} else {
992 		// file data will start in the next allocation group
993 		group = inode->BlockRun().AllocationGroup() + 1;
994 	}
995 
996 	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
997 }
998 
999 
1000 status_t
1001 BlockAllocator::Free(Transaction& transaction, block_run run)
1002 {
1003 	MutexLocker lock(fLock);
1004 
1005 	int32 group = run.AllocationGroup();
1006 	uint16 start = run.Start();
1007 	uint16 length = run.Length();
1008 
1009 	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start,
1010 		length));
1011 	T(Free(run));
1012 
1013 	// We don't use Volume::IsValidBlockRun() here, because we can check
1014 	// better against the group size (the last group may differ in length)
1015 	if (group < 0 || group >= fNumGroups
1016 		|| start > fGroups[group].NumBits()
1017 		|| uint32(start + length) > fGroups[group].NumBits()
1018 		|| length == 0) {
1019 		FATAL(("tried to free an invalid block_run (%d, %u, %u)\n", (int)group,
1020 			start, length));
1021 		DEBUGGER(("tried to free invalid block_run"));
1022 		return B_BAD_VALUE;
1023 	}
1024 	// check if someone tries to free reserved areas at the beginning of the
1025 	// drive
1026 	if (group == 0
1027 		&& start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
1028 		FATAL(("tried to free a reserved block_run (%d, %u, %u)\n", (int)group,
1029 			start, length));
1030 		DEBUGGER(("tried to free reserved block"));
1031 		return B_BAD_VALUE;
1032 	}
1033 #ifdef DEBUG
1034 	if (CheckBlockRun(run) != B_OK)
1035 		return B_BAD_DATA;
1036 #endif
1037 
1038 	CHECK_ALLOCATION_GROUP(group);
1039 
1040 	if (fGroups[group].Free(transaction, start, length) != B_OK)
1041 		RETURN_ERROR(B_IO_ERROR);
1042 
1043 	CHECK_ALLOCATION_GROUP(group);
1044 
1045 #ifdef DEBUG
1046 	if (CheckBlockRun(run, NULL, NULL, false) != B_OK) {
1047 		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just "
1048 			"freed)\n"));
1049 	}
1050 #endif
1051 
1052 	fVolume->SuperBlock().used_blocks =
1053 		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
1054 	return B_OK;
1055 }
1056 
1057 
1058 size_t
1059 BlockAllocator::BitmapSize() const
1060 {
1061 	return fVolume->BlockSize() * fNumBlocks;
1062 }
1063 
1064 
1065 #ifdef DEBUG_FRAGMENTER
1066 void
1067 BlockAllocator::Fragment()
1068 {
1069 	AllocationBlock cached(fVolume);
1070 	MutexLocker lock(fLock);
1071 
1072 	// only leave 4 block holes
1073 	// only leave holes of 4 blocks each
1074 	uint32 valuesPerBlock = fVolume->BlockSize() / 4;
1075 
1076 	for (int32 i = 0; i < fNumGroups; i++) {
1077 		AllocationGroup& group = fGroups[i];
1078 
1079 		for (uint32 block = 0; block < group.NumBlocks(); block++) {
1080 			Transaction transaction(fVolume, 0);
1081 
1082 			if (cached.SetToWritable(transaction, group, block) != B_OK)
1083 				return;
1084 
1085 			for (int32 index = 0; index < valuesPerBlock; index++) {
1086 				cached.Block(index) |= HOST_ENDIAN_TO_BFS_INT32(kMask);
1087 			}
1088 
1089 			transaction.Done();
1090 		}
1091 	}
1092 }
1093 #endif	// DEBUG_FRAGMENTER
1094 
1095 
1096 #ifdef DEBUG_ALLOCATION_GROUPS
1097 void
1098 BlockAllocator::_CheckGroup(int32 groupIndex) const
1099 {
1100 	AllocationBlock cached(fVolume);
1101 	ASSERT_LOCKED_MUTEX(&fLock);
1102 
1103 	AllocationGroup& group = fGroups[groupIndex];
1104 
1105 	int32 currentStart = 0, currentLength = 0;
1106 	int32 firstFree = -1;
1107 	int32 largestStart = -1;
1108 	int32 largestLength = 0;
1109 	int32 currentBit = 0;
1110 
1111 	for (uint32 block = 0; block < group.NumBlocks(); block++) {
1112 		if (cached.SetTo(group, block) < B_OK) {
1113 			panic("setting group block %d failed\n", (int)block);
1114 			return;
1115 		}
1116 
1117 		for (uint32 bit = 0; bit < cached.NumBlockBits(); bit++) {
1118 			if (!cached.IsUsed(bit)) {
1119 				if (firstFree < 0) {
1120 					firstFree = currentBit;
1121 					if (!group.fLargestValid) {
1122 						if (firstFree >= 0 && firstFree < group.fFirstFree) {
1123 							// mostly harmless but noteworthy
1124 							dprintf("group %d first free too late: should be "
1125 								"%d, is %d\n", (int)groupIndex, (int)firstFree,
1126 								(int)group.fFirstFree);
1127 						}
1128 						return;
1129 					}
1130 				}
1131 
1132 				if (currentLength == 0) {
1133 					// start new range
1134 					currentStart = currentBit;
1135 				}
1136 				currentLength++;
1137 			} else if (currentLength) {
1138 				// end of a range
1139 				if (currentLength > largestLength) {
1140 					largestStart = currentStart;
1141 					largestLength = currentLength;
1142 				}
1143 				currentLength = 0;
1144 			}
1145 			currentBit++;
1146 		}
1147 	}
1148 
1149 	if (currentLength > largestLength) {
1150 		largestStart = currentStart;
1151 		largestLength = currentLength;
1152 	}
1153 
1154 	if (firstFree >= 0 && firstFree < group.fFirstFree) {
1155 		// mostly harmless but noteworthy
1156 		dprintf("group %d first free too late: should be %d, is %d\n",
1157 			(int)groupIndex, (int)firstFree, (int)group.fFirstFree);
1158 	}
1159 	if (group.fLargestValid
1160 		&& (largestStart != group.fLargestStart
1161 			|| largestLength != group.fLargestLength)) {
1162 		panic("bfs %p: group %d largest differs: %d.%d, checked %d.%d.\n",
1163 			fVolume, (int)groupIndex, (int)group.fLargestStart,
1164 			(int)group.fLargestLength, (int)largestStart, (int)largestLength);
1165 	}
1166 }
1167 #endif	// DEBUG_ALLOCATION_GROUPS
1168 
1169 
1170 //	#pragma mark - Bitmap validity checking
1171 
1172 // TODO: implement new FS checking API
1173 // Functions to check the validity of the bitmap - they are used from
1174 // the "checkfs" command (since this does a bit more than just checking,
1175 // maybe we should move it somewhere else?)
1176 
1177 
1178 bool
1179 BlockAllocator::_IsValidCheckControl(check_control* control)
1180 {
1181 	if (control == NULL
1182 		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
1183 		FATAL(("invalid check_control (%p)!\n", control));
1184 		return false;
1185 	}
1186 
1187 	return true;
1188 }
1189 
1190 
1191 status_t
1192 BlockAllocator::StartChecking(check_control* control)
1193 {
1194 	if (!_IsValidCheckControl(control))
1195 		return B_BAD_VALUE;
1196 
1197 	fVolume->GetJournal(0)->Lock(NULL, true);
1198 		// Lock the volume's journal
1199 
1200 	status_t status = mutex_lock(&fLock);
1201 	if (status != B_OK) {
1202 		fVolume->GetJournal(0)->Unlock(NULL, true);
1203 		return status;
1204 	}
1205 
1206 	size_t size = BitmapSize();
1207 	fCheckBitmap = (uint32*)malloc(size);
1208 	if (fCheckBitmap == NULL) {
1209 		mutex_unlock(&fLock);
1210 		fVolume->GetJournal(0)->Unlock(NULL, true);
1211 		return B_NO_MEMORY;
1212 	}
1213 
1214 	check_cookie* cookie = new check_cookie();
1215 	if (cookie == NULL) {
1216 		free(fCheckBitmap);
1217 		fCheckBitmap = NULL;
1218 		mutex_unlock(&fLock);
1219 		fVolume->GetJournal(0)->Unlock(NULL, true);
1220 
1221 		return B_NO_MEMORY;
1222 	}
1223 
1224 	// initialize bitmap
1225 	memset(fCheckBitmap, 0, size);
1226 	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length();
1227 			block-- > 0;) {
1228 		_SetCheckBitmapAt(block);
1229 	}
1230 
1231 	cookie->stack.Push(fVolume->Root());
1232 	cookie->stack.Push(fVolume->Indices());
1233 	cookie->iterator = NULL;
1234 	control->cookie = cookie;
1235 
1236 	fCheckCookie = cookie;
1237 		// to be able to restore nicely if "chkbfs" exited abnormally
1238 
1239 	// Put removed vnodes onto the stack -- they are not reachable by traversing
1240 	// the file system anymore.
1241 	InodeList::Iterator iterator = fVolume->RemovedInodes().GetIterator();
1242 	while (Inode* inode = iterator.Next()) {
1243 		cookie->stack.Push(inode->BlockRun());
1244 	}
1245 
1246 	// TODO: check reserved area in bitmap!
1247 
1248 	return B_OK;
1249 }
1250 
1251 
1252 status_t
1253 BlockAllocator::StopChecking(check_control* control)
1254 {
1255 	check_cookie* cookie;
1256 	if (control == NULL)
1257 		cookie = fCheckCookie;
1258 	else
1259 		cookie = (check_cookie*)control->cookie;
1260 
1261 	if (cookie == NULL)
1262 		return B_ERROR;
1263 
1264 	if (cookie->iterator != NULL) {
1265 		delete cookie->iterator;
1266 		cookie->iterator = NULL;
1267 
1268 		// the current directory inode is still locked in memory
1269 		put_vnode(fVolume->FSVolume(), fVolume->ToVnode(cookie->current));
1270 	}
1271 
1272 	if (fVolume->IsReadOnly()) {
1273 		// We can't fix errors on this volume
1274 		control->flags &= ~BFS_FIX_BITMAP_ERRORS;
1275 	}
1276 
1277 	// if CheckNextNode() was able to run through completely, we can
1278 	// fix any damage to the bitmap
1279 	if (control != NULL && control->status == B_ENTRY_NOT_FOUND) {
1280 		// calculate the number of used blocks in the check bitmap
1281 		size_t size = BitmapSize();
1282 		off_t usedBlocks = 0LL;
1283 
1284 		// TODO: update the allocation groups used blocks info
1285 		for (uint32 i = size >> 2; i-- > 0;) {
1286 			uint32 compare = 1;
1287 			// Count the number of bits set
1288 			for (int16 j = 0; j < 32; j++, compare <<= 1) {
1289 				if ((compare & fCheckBitmap[i]) != 0)
1290 					usedBlocks++;
1291 			}
1292 		}
1293 
1294 		control->stats.freed = fVolume->UsedBlocks() - usedBlocks
1295 			+ control->stats.missing;
1296 		if (control->stats.freed < 0)
1297 			control->stats.freed = 0;
1298 
1299 		// Should we fix errors? Were there any errors we can fix?
1300 		if ((control->flags & BFS_FIX_BITMAP_ERRORS) != 0
1301 			&& (control->stats.freed != 0 || control->stats.missing != 0)) {
1302 			// If so, write the check bitmap back over the original one,
1303 			// and use transactions here to play safe - we even use several
1304 			// transactions, so that we don't blow the maximum log size
1305 			// on large disks, since we don't need to make this atomic.
1306 #if 0
1307 			// prints the blocks that differ
1308 			off_t block = 0;
1309 			for (int32 i = 0; i < fNumGroups; i++) {
1310 				AllocationBlock cached(fVolume);
1311 				for (uint32 j = 0; j < fGroups[i].NumBlocks(); j++) {
1312 					cached.SetTo(fGroups[i], j);
1313 					for (uint32 k = 0; k < cached.NumBlockBits(); k++) {
1314 						if (cached.IsUsed(k) != _CheckBitmapIsUsedAt(block)) {
1315 							dprintf("differ block %lld (should be %d)\n", block,
1316 								_CheckBitmapIsUsedAt(block));
1317 						}
1318 						block++;
1319 					}
1320 				}
1321 			}
1322 #endif
1323 
1324 			fVolume->SuperBlock().used_blocks
1325 				= HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
1326 
1327 			int32 blocksInBitmap = fNumGroups * fBlocksPerGroup;
1328 			size_t blockSize = fVolume->BlockSize();
1329 
1330 			for (int32 i = 0; i < blocksInBitmap; i += 512) {
1331 				Transaction transaction(fVolume, 1 + i);
1332 
1333 				int32 blocksToWrite = 512;
1334 				if (blocksToWrite + i > blocksInBitmap)
1335 					blocksToWrite = blocksInBitmap - i;
1336 
1337 				status_t status = transaction.WriteBlocks(1 + i,
1338 					(uint8*)fCheckBitmap + i * blockSize, blocksToWrite);
1339 				if (status < B_OK) {
1340 					FATAL(("error writing bitmap: %s\n", strerror(status)));
1341 					break;
1342 				}
1343 				transaction.Done();
1344 			}
1345 		}
1346 	} else
1347 		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));
1348 
1349 	fVolume->SetCheckingThread(-1);
1350 
1351 	free(fCheckBitmap);
1352 	fCheckBitmap = NULL;
1353 	fCheckCookie = NULL;
1354 	delete cookie;
1355 	mutex_unlock(&fLock);
1356 	fVolume->GetJournal(0)->Unlock(NULL, true);
1357 
1358 	return B_OK;
1359 }
1360 
1361 
1362 status_t
1363 BlockAllocator::CheckNextNode(check_control* control)
1364 {
1365 	if (!_IsValidCheckControl(control))
1366 		return B_BAD_VALUE;
1367 
1368 	check_cookie* cookie = (check_cookie*)control->cookie;
1369 	fVolume->SetCheckingThread(find_thread(NULL));
1370 
1371 	while (true) {
1372 		if (cookie->iterator == NULL) {
1373 			if (!cookie->stack.Pop(&cookie->current)) {
1374 				// no more runs on the stack, we are obviously finished!
1375 				control->status = B_ENTRY_NOT_FOUND;
1376 				return B_ENTRY_NOT_FOUND;
1377 			}
1378 
1379 			Vnode vnode(fVolume, cookie->current);
1380 			Inode* inode;
1381 			if (vnode.Get(&inode) != B_OK) {
1382 				FATAL(("check: Could not open inode at %" B_PRIdOFF "\n",
1383 					fVolume->ToBlock(cookie->current)));
1384 				continue;
1385 			}
1386 
1387 			control->inode = inode->ID();
1388 			control->mode = inode->Mode();
1389 
1390 			if (!inode->IsContainer()) {
1391 				// Check file
1392 				control->errors = 0;
1393 				control->status = CheckInode(inode, control);
1394 
1395 				if (inode->GetName(control->name) < B_OK)
1396 					strcpy(control->name, "(node has no name)");
1397 
1398 				return B_OK;
1399 			}
1400 
1401 			// get iterator for the next directory
1402 
1403 			BPlusTree* tree = inode->Tree();
1404 			if (tree == NULL) {
1405 				FATAL(("check: could not open b+tree from inode at %" B_PRIdOFF
1406 					"\n", fVolume->ToBlock(cookie->current)));
1407 				continue;
1408 			}
1409 
1410 			cookie->parent = inode;
1411 			cookie->parent_mode = inode->Mode();
1412 
1413 			cookie->iterator = new TreeIterator(tree);
1414 			if (cookie->iterator == NULL)
1415 				RETURN_ERROR(B_NO_MEMORY);
1416 
1417 			// the inode must stay locked in memory until the iterator is freed
1418 			vnode.Keep();
1419 
1420 			// check the inode of the directory
1421 			control->errors = 0;
1422 			control->status = CheckInode(inode, control);
1423 
1424 			if (inode->GetName(control->name) < B_OK)
1425 				strcpy(control->name, "(dir has no name)");
1426 
1427 			return B_OK;
1428 		}
1429 
1430 		char name[B_FILE_NAME_LENGTH];
1431 		uint16 length;
1432 		ino_t id;
1433 
1434 		status_t status = cookie->iterator->GetNextEntry(name, &length,
1435 			B_FILE_NAME_LENGTH, &id);
1436 		if (status == B_ENTRY_NOT_FOUND) {
1437 			// there are no more entries in this iterator, free it and go on
1438 			delete cookie->iterator;
1439 			cookie->iterator = NULL;
1440 
1441 			// unlock the directory's inode from memory
1442 			put_vnode(fVolume->FSVolume(), fVolume->ToVnode(cookie->current));
1443 
1444 			continue;
1445 		} else if (status == B_OK) {
1446 			// ignore "." and ".." entries
1447 			if (!strcmp(name, ".") || !strcmp(name, ".."))
1448 				continue;
1449 
1450 			// fill in the control data as soon as we have them
1451 			strlcpy(control->name, name, B_FILE_NAME_LENGTH);
1452 			control->inode = id;
1453 			control->errors = 0;
1454 
1455 			Vnode vnode(fVolume, id);
1456 			Inode* inode;
1457 			if (vnode.Get(&inode) != B_OK) {
1458 				FATAL(("Could not open inode ID %" B_PRIdINO "!\n", id));
1459 				control->errors |= BFS_COULD_NOT_OPEN;
1460 
1461 				if ((control->flags & BFS_REMOVE_INVALID) != 0) {
1462 					status = _RemoveInvalidNode(cookie->parent,
1463 						cookie->iterator->Tree(), NULL, name);
1464 				} else
1465 					status = B_ERROR;
1466 
1467 				control->status = status;
1468 				return B_OK;
1469 			}
1470 
1471 			// check if the inode's name is the same as in the b+tree
1472 			if (inode->IsRegularNode()) {
1473 				RecursiveLocker locker(inode->SmallDataLock());
1474 				NodeGetter node(fVolume, inode);
1475 
1476 				const char* localName = inode->Name(node.Node());
1477 				if (localName == NULL || strcmp(localName, name)) {
1478 					control->errors |= BFS_NAMES_DONT_MATCH;
1479 					FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name,
1480 						localName));
1481 
1482 					if ((control->flags & BFS_FIX_NAME_MISMATCHES) != 0) {
1483 						// Rename the inode
1484 						Transaction transaction(fVolume, inode->BlockNumber());
1485 
1486 						status = inode->SetName(transaction, name);
1487 						if (status == B_OK)
1488 							status = inode->WriteBack(transaction);
1489 						if (status == B_OK)
1490 							status = transaction.Done();
1491 						if (status != B_OK) {
1492 							control->status = status;
1493 							return B_OK;
1494 						}
1495 					}
1496 				}
1497 			}
1498 
1499 			control->mode = inode->Mode();
1500 
1501 			// Check for the correct mode of the node (if the mode of the
1502 			// file doesn't fit its parent's, there is a serious problem)
1503 			if (((cookie->parent_mode & S_ATTR_DIR) != 0
1504 					&& !inode->IsAttribute())
1505 				|| ((cookie->parent_mode & S_INDEX_DIR) != 0
1506 					&& !inode->IsIndex())
1507 				|| (is_directory(cookie->parent_mode)
1508 					&& !inode->IsRegularNode())) {
1509 				FATAL(("inode at %" B_PRIdOFF " is of wrong type: %o (parent "
1510 					"%o at %" B_PRIdOFF ")!\n", inode->BlockNumber(),
1511 					inode->Mode(), cookie->parent_mode,
1512 					cookie->parent->BlockNumber()));
1513 
1514 				// if we are allowed to fix errors, we should remove the file
1515 				if ((control->flags & BFS_REMOVE_WRONG_TYPES) != 0
1516 					&& (control->flags & BFS_FIX_BITMAP_ERRORS) != 0) {
1517 					status = _RemoveInvalidNode(cookie->parent, NULL, inode,
1518 						name);
1519 				} else
1520 					status = B_ERROR;
1521 
1522 				control->errors |= BFS_WRONG_TYPE;
1523 				control->status = status;
1524 				return B_OK;
1525 			}
1526 
1527 			// push the directory on the stack so that it will be scanned later
1528 			if (inode->IsContainer() && !inode->IsIndex())
1529 				cookie->stack.Push(inode->BlockRun());
1530 			else {
1531 				// check it now
1532 				control->status = CheckInode(inode, control);
1533 
1534 				return B_OK;
1535 			}
1536 		}
1537 	}
1538 	// is never reached
1539 }
1540 
1541 
1542 status_t
1543 BlockAllocator::_RemoveInvalidNode(Inode* parent, BPlusTree* tree, Inode* inode,
1544 	const char* name)
1545 {
1546 	// it's safe to start a transaction, because Inode::Remove()
1547 	// won't touch the block bitmap (which we hold the lock for)
1548 	// if we set the INODE_DONT_FREE_SPACE flag - since we fix
1549 	// the bitmap anyway
1550 	Transaction transaction(fVolume, parent->BlockNumber());
1551 	status_t status;
1552 
1553 	if (inode != NULL) {
1554 		inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DONT_FREE_SPACE);
1555 
1556 		status = parent->Remove(transaction, name, NULL, false, true);
1557 	} else {
1558 		parent->WriteLockInTransaction(transaction);
1559 
1560 		// does the file even exist?
1561 		off_t id;
1562 		status = tree->Find((uint8*)name, (uint16)strlen(name), &id);
1563 		if (status == B_OK)
1564 			status = tree->Remove(transaction, name, id);
1565 	}
1566 
1567 	if (status == B_OK)
1568 		transaction.Done();
1569 
1570 	return status;
1571 }
1572 
1573 
1574 bool
1575 BlockAllocator::_CheckBitmapIsUsedAt(off_t block) const
1576 {
1577 	size_t size = BitmapSize();
1578 	uint32 index = block / 32;	// 32bit resolution
1579 	if (index >= size / 4)
1580 		return false;
1581 
1582 	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index])
1583 		& (1UL << (block & 0x1f));
1584 }
1585 
1586 
1587 void
1588 BlockAllocator::_SetCheckBitmapAt(off_t block)
1589 {
1590 	size_t size = BitmapSize();
1591 	uint32 index = block / 32;	// 32bit resolution
1592 	if (index >= size / 4)
1593 		return;
1594 
1595 	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
1596 }
1597 
1598 
1599 /*!	Checks whether the specified block range is allocated or free,
1600 	depending on the \a allocated argument.
1601 */
1602 status_t
1603 BlockAllocator::CheckBlocks(off_t start, off_t length, bool allocated)
1604 {
1605 	if (start < 0 || start + length > fVolume->NumBlocks())
1606 		return B_BAD_VALUE;
1607 
1608 	uint32 group = start >> fVolume->AllocationGroupShift();
1609 	uint32 groupBlock = (start / (fVolume->BlockSize() << 3)) % fBlocksPerGroup;
1610 	uint32 blockOffset = start % (fVolume->BlockSize() << 3);
1611 
1612 	AllocationBlock cached(fVolume);
1613 
1614 	while (groupBlock < fGroups[group].NumBlocks() && length > 0) {
1615 		if (cached.SetTo(fGroups[group], groupBlock) != B_OK)
1616 			RETURN_ERROR(B_IO_ERROR);
1617 
1618 		for (; blockOffset < cached.NumBlockBits() && length > 0;
1619 				blockOffset++, length--) {
1620 			if (cached.IsUsed(blockOffset) != allocated) {
1621 				RETURN_ERROR(B_BAD_DATA);
1622 			}
1623 		}
1624 
1625 		blockOffset = 0;
1626 
1627 		if (++groupBlock >= fGroups[group].NumBlocks()) {
1628 			groupBlock = 0;
1629 			group++;
1630 		}
1631 	}
1632 
1633 	return B_OK;
1634 }
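
// Usage sketch - this mirrors the reserved area check in _Initialize():
//   off_t reserved = volume->Log().Start() + volume->Log().Length();
//   if (allocator->CheckBlocks(0, reserved) != B_OK)
//       ; // boot block, log, or the bitmap itself is not marked allocated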
1635 
1636 
1637 status_t
1638 BlockAllocator::CheckBlockRun(block_run run, const char* type,
1639 	check_control* control, bool allocated)
1640 {
1641 	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
1642 		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
1643 		|| uint32(run.Start() + run.Length())
1644 				> fGroups[run.AllocationGroup()].fNumBits
1645 		|| run.length == 0) {
1646 		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type,
1647 			run.AllocationGroup(), run.Start(), run.Length()));
1648 		if (control == NULL)
1649 			return B_BAD_DATA;
1650 
1651 		control->errors |= BFS_INVALID_BLOCK_RUN;
1652 		return B_OK;
1653 	}
1654 
1655 	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
1656 	uint32 block = run.Start() / bitsPerBlock;
1657 	uint32 pos = run.Start() % bitsPerBlock;
1658 	int32 length = 0;
1659 	off_t firstMissing = -1, firstSet = -1;
1660 	off_t firstGroupBlock
1661 		= (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();
1662 
1663 	AllocationBlock cached(fVolume);
1664 
1665 	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
1666 		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
1667 			RETURN_ERROR(B_IO_ERROR);
1668 
1669 		if (pos >= cached.NumBlockBits()) {
1670 			// something very strange has happened...
1671 			RETURN_ERROR(B_ERROR);
1672 		}
1673 
1674 		while (length < run.Length() && pos < cached.NumBlockBits()) {
1675 			if (cached.IsUsed(pos) != allocated) {
1676 				if (control == NULL) {
1677 					PRINT(("%s: block_run(%ld, %u, %u) is only partially "
1678 						"allocated (pos = %ld, length = %ld)!\n", type,
1679 						run.AllocationGroup(), run.Start(), run.Length(),
1680 						pos, length));
1681 					return B_BAD_DATA;
1682 				}
1683 				if (firstMissing == -1) {
1684 					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
1685 					control->errors |= BFS_MISSING_BLOCKS;
1686 				}
1687 				control->stats.missing++;
1688 			} else if (firstMissing != -1) {
1689 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are "
1690 					"%sallocated!\n", type, run.AllocationGroup(), run.Start(),
1691 					run.Length(), firstMissing,
1692 					firstGroupBlock + pos + block * bitsPerBlock - 1,
1693 					allocated ? "not " : ""));
1694 				firstMissing = -1;
1695 			}
1696 
1697 			if (fCheckBitmap != NULL) {
1698 				// Set the block in the check bitmap as well, but have a look
1699 				// if it is already allocated first
1700 				uint32 offset = pos + block * bitsPerBlock;
1701 				if (_CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
1702 					if (firstSet == -1) {
1703 						firstSet = firstGroupBlock + offset;
1704 						control->errors |= BFS_BLOCKS_ALREADY_SET;
1705 						dprintf("block %" B_PRIdOFF " is already set!!!\n",
1706 							firstGroupBlock + offset);
1707 					}
1708 					control->stats.already_set++;
1709 				} else {
1710 					if (firstSet != -1) {
1711 						FATAL(("%s: block_run(%d, %u, %u): blocks %" B_PRIdOFF
1712 							" - %" B_PRIdOFF " are already set!\n", type,
1713 							(int)run.AllocationGroup(), run.Start(),
1714 							run.Length(), firstSet,
1715 							firstGroupBlock + offset - 1));
1716 						firstSet = -1;
1717 					}
1718 					_SetCheckBitmapAt(firstGroupBlock + offset);
1719 				}
1720 			}
1721 			length++;
1722 			pos++;
1723 		}
1724 
1725 		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
1726 			if (firstMissing != -1) {
1727 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not "
1728 					"allocated!\n", type, run.AllocationGroup(), run.Start(),
1729 					run.Length(), firstMissing,
1730 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1731 			}
1732 			if (firstSet != -1) {
1733 				FATAL(("%s: block_run(%d, %u, %u): blocks %" B_PRIdOFF " - %"
1734 					B_PRIdOFF " are already set!\n", type,
1735 					(int)run.AllocationGroup(), run.Start(), run.Length(),
1736 					firstSet,
1737 					firstGroupBlock + pos + block * bitsPerBlock - 1));
1738 			}
1739 		}
1740 	}
1741 
1742 	return B_OK;
1743 }
1744 
1745 
1746 status_t
1747 BlockAllocator::CheckInode(Inode* inode, check_control* control)
1748 {
1749 	if (control != NULL && fCheckBitmap == NULL)
1750 		return B_NO_INIT;
1751 	if (inode == NULL)
1752 		return B_BAD_VALUE;
1753 
1754 	status_t status = CheckBlockRun(inode->BlockRun(), "inode", control);
1755 	if (status != B_OK)
1756 		return status;
1757 
1758 	// If the inode has an attribute directory, push it on the stack
1759 	if (!inode->Attributes().IsZero()) {
1760 		check_cookie* cookie = (check_cookie*)control->cookie;
1761 		cookie->stack.Push(inode->Attributes());
1762 	}
1763 
1764 	if (inode->IsSymLink() && (inode->Flags() & INODE_LONG_SYMLINK) == 0) {
1765 		// symlinks may not have a valid data stream
1766 		if (strlen(inode->Node().short_symlink) >= SHORT_SYMLINK_NAME_LENGTH)
1767 			return B_BAD_DATA;
1768 
1769 		return B_OK;
1770 	}
1771 
1772 	data_stream* data = &inode->Node().data;
1773 
1774 	// check the direct range
1775 
1776 	if (data->max_direct_range) {
1777 		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1778 			if (data->direct[i].IsZero())
1779 				break;
1780 
1781 			status = CheckBlockRun(data->direct[i], "direct", control);
1782 			if (status < B_OK)
1783 				return status;
1784 		}
1785 	}
1786 
1787 	CachedBlock cached(fVolume);
1788 
1789 	// check the indirect range
1790 
1791 	if (data->max_indirect_range) {
1792 		status = CheckBlockRun(data->indirect, "indirect", control);
1793 		if (status < B_OK)
1794 			return status;
1795 
1796 		off_t block = fVolume->ToBlock(data->indirect);
1797 
1798 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1799 			block_run* runs = (block_run*)cached.SetTo(block + i);
1800 			if (runs == NULL)
1801 				RETURN_ERROR(B_IO_ERROR);
1802 
1803 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1804 			int32 index = 0;
1805 			for (; index < runsPerBlock; index++) {
1806 				if (runs[index].IsZero())
1807 					break;
1808 
1809 				status = CheckBlockRun(runs[index], "indirect->run", control);
1810 				if (status < B_OK)
1811 					return status;
1812 			}
1813 			if (index < runsPerBlock)
1814 				break;
1815 		}
1816 	}
1817 
1818 	// check the double indirect range
1819 
1820 	if (data->max_double_indirect_range) {
1821 		status = CheckBlockRun(data->double_indirect, "double indirect",
1822 			control);
1823 		if (status != B_OK)
1824 			return status;
1825 
1826 		int32 runsPerBlock = runs_per_block(fVolume->BlockSize());
1827 		int32 runsPerArray = runsPerBlock * data->double_indirect.Length();
1828 
1829 		CachedBlock cachedDirect(fVolume);
1830 
1831 		for (int32 indirectIndex = 0; indirectIndex < runsPerArray;
1832 				indirectIndex++) {
1833 			// get the indirect array block
1834 			block_run* array = (block_run*)cached.SetTo(
1835 				fVolume->ToBlock(data->double_indirect)
1836 					+ indirectIndex / runsPerBlock);
1837 			if (array == NULL)
1838 				return B_IO_ERROR;
1839 
1840 			block_run indirect = array[indirectIndex % runsPerBlock];
1841 			// are we finished yet?
1842 			if (indirect.IsZero())
1843 				return B_OK;
1844 
1845 			status = CheckBlockRun(indirect, "double indirect->runs", control);
1846 			if (status != B_OK)
1847 				return status;
1848 
1849 			int32 maxIndex = (indirect.Length() << fVolume->BlockShift())
1850 				/ sizeof(block_run);
1851 
1852 			for (int32 index = 0; index < maxIndex; ) {
1853 				block_run* runs = (block_run*)cachedDirect.SetTo(
1854 					fVolume->ToBlock(indirect) + index / runsPerBlock);
1855 				if (runs == NULL)
1856 					return B_IO_ERROR;
1857 
1858 				do {
1859 					// are we finished yet?
1860 					if (runs[index % runsPerBlock].IsZero())
1861 						return B_OK;
1862 
1863 					status = CheckBlockRun(runs[index % runsPerBlock],
1864 						"double indirect->runs->run", control);
1865 					if (status != B_OK)
1866 						return status;
1867 				} while ((++index % runsPerBlock) != 0);
1868 			}
1869 		}
1870 	}
1871 
1872 	return B_OK;
1873 }
1874 
1875 
1876 //	#pragma mark - debugger commands
1877 
1878 
1879 #ifdef BFS_DEBUGGER_COMMANDS
1880 
1881 void
1882 BlockAllocator::Dump(int32 index)
1883 {
1884 	kprintf("allocation groups: %ld (base %p)\n", fNumGroups, fGroups);
1885 	kprintf("blocks per group: %ld\n", fBlocksPerGroup);
1886 
1887 	for (int32 i = 0; i < fNumGroups; i++) {
1888 		if (index != -1 && i != index)
1889 			continue;
1890 
1891 		AllocationGroup& group = fGroups[i];
1892 
1893 		kprintf("[%3ld] num bits:       %lu  (%p)\n", i, group.NumBits(),
1894 			&group);
1895 		kprintf("      num blocks:     %lu\n", group.NumBlocks());
1896 		kprintf("      start:          %ld\n", group.Start());
1897 		kprintf("      first free:     %ld\n", group.fFirstFree);
1898 		kprintf("      largest start:  %ld%s\n", group.fLargestStart,
1899 			group.fLargestValid ? "" : "  (invalid)");
1900 		kprintf("      largest length: %ld\n", group.fLargestLength);
1901 		kprintf("      free bits:      %ld\n", group.fFreeBits);
1902 	}
1903 }
1904 
1905 
1906 #if BFS_TRACING
1907 static char kTraceBuffer[256];
1908 
1909 
1910 int
1911 dump_block_allocator_blocks(int argc, char** argv)
1912 {
1913 	if (argc != 3 || !strcmp(argv[1], "--help")) {
1914 		kprintf("usage: %s <ptr-to-volume> <block>\n", argv[0]);
1915 		return 0;
1916 	}
1917 
1918 	Volume* volume = (Volume*)parse_expression(argv[1]);
1919 	off_t block = parse_expression(argv[2]);
1920 
1921 	// iterate over all tracing entries to find overlapping actions
1922 
1923 	using namespace BFSBlockTracing;
1924 
1925 	LazyTraceOutput out(kTraceBuffer, sizeof(kTraceBuffer), 0);
1926 	TraceEntryIterator iterator;
1927 	while (TraceEntry* _entry = iterator.Next()) {
1928 		if (Allocate* entry = dynamic_cast<Allocate*>(_entry)) {
1929 			off_t first = volume->ToBlock(entry->Run());
1930 			off_t last = first - 1 + entry->Run().Length();
1931 			if (block >= first && block <= last) {
1932 				out.Clear();
1933 				const char* dump = out.DumpEntry(entry);
1934 				kprintf("%5ld. %s\n", iterator.Index(), dump);
1935 			}
1936 		} else if (Free* entry = dynamic_cast<Free*>(_entry)) {
1937 			off_t first = volume->ToBlock(entry->Run());
1938 			off_t last = first - 1 + entry->Run().Length();
1939 			if (block >= first && block <= last) {
1940 				out.Clear();
1941 				const char* dump = out.DumpEntry(entry);
1942 				kprintf("%5ld. %s\n", iterator.Index(), dump);
1943 			}
1944 		}
1945 	}
1946 
1947 	return 0;
1948 }
1949 #endif
1950 
1951 
1952 int
1953 dump_block_allocator(int argc, char** argv)
1954 {
1955 	int32 group = -1;
1956 	if (argc == 3) {
1957 		group = parse_expression(argv[2]);
1958 		argc--;
1959 	}
1960 
1961 	if (argc != 2 || !strcmp(argv[1], "--help")) {
1962 		kprintf("usage: %s <ptr-to-volume> [group]\n", argv[0]);
1963 		return 0;
1964 	}
1965 
1966 	Volume* volume = (Volume*)parse_expression(argv[1]);
1967 	BlockAllocator& allocator = volume->Allocator();
1968 
1969 	allocator.Dump(group);
1970 	return 0;
1971 }
1972 
1973 #endif	// BFS_DEBUGGER_COMMANDS
1974 
1975