/* BlockAllocator - block bitmap handling and allocation policies
**
** Initial version by Axel Dörfler, axeld@pinc-software.de
** This file may be used under the terms of the OpenBeOS License.
*/


#include "cpp.h"
#include "Debug.h"
#include "BlockAllocator.h"
#include "Volume.h"
#include "Inode.h"
#include "BPlusTree.h"
#include "Stack.h"
#include "bfs_control.h"

#include <malloc.h>

#ifdef USER
#	define spawn_kernel_thread spawn_thread
#endif


// Things the BlockAllocator should do:

// - find a range of blocks of a certain size nearby a specific position
// - allocate an unsharp range of blocks for pre-allocation
// - free blocks
// - know how to deal with each allocation, special handling for directories,
//   files, symlinks, etc. (type sensitive allocation policies)

// What makes the code complicated is the fact that we are not just reading
// in the whole bitmap and operating on it in memory - e.g. a 13 GB partition
// with a block size of 2048 bytes already has an 800 kB bitmap, and the size
// of partitions will grow even more - so that's not an option.
// Instead we read in each bitmap block as it's needed - since an allocation
// group can span several blocks in the block bitmap, the AllocationBlock
// class is there to make handling those easier.
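// To illustrate the sizes involved (rough, illustrative arithmetic, not taken
// from any specific volume): 13 GB / 2048 bytes per block is roughly
// 6.3 million blocks; at one bit per block that is roughly 6.3 million / 8,
// i.e. about 800 kB of bitmap - which is where the figure above comes from,
// and clearly too much to keep in memory at once.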

// The current implementation is only slightly optimized and could probably
// be improved a lot. Furthermore, the allocation policies used here should
// be tested against real-world workloads.

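// One check_cookie is allocated per running "chkbfs" session: "stack" holds
// the block_runs of the directories that still have to be visited, "iterator"
// walks the b+tree of the directory currently being scanned ("current"), and
// "parent"/"parent_mode" let each entry be validated against the directory it
// was found in.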
44 
45 struct check_cookie {
46 	check_cookie() {}
47 
48 	block_run			current;
49 	Inode				*parent;
50 	mode_t				parent_mode;
51 	Stack<block_run>	stack;
52 	TreeIterator		*iterator;
53 };


class AllocationBlock : public CachedBlock {
	public:
		AllocationBlock(Volume *volume);

		void Allocate(uint16 start, uint16 numBlocks);
		void Free(uint16 start, uint16 numBlocks);
		inline bool IsUsed(uint16 block);

		status_t SetTo(AllocationGroup &group, uint16 block);

		uint32 NumBlockBits() const { return fNumBits; }
		uint32 &Block(int32 index) { return ((uint32 *)fBlock)[index]; }

	private:
		uint32 fNumBits;
};


class AllocationGroup {
	public:
		AllocationGroup();

		void AddFreeRange(int32 start, int32 blocks);
		bool IsFull() const { return fFreeBits == 0; }

		status_t Allocate(Transaction *transaction, uint16 start, int32 length);
		status_t Free(Transaction *transaction, uint16 start, int32 length);

		uint32 fNumBits;
		int32 fStart;
		int32 fFirstFree, fLargest, fLargestFirst;
			// ToDo: fLargest & fLargestFirst are not maintained (and therefore not used) yet!
		int32 fFreeBits;
};
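
// For a sense of scale (illustrative numbers): with a block size of 2048
// bytes, one bitmap block holds 2048 * 8 = 16384 bits, i.e. it tracks 16384
// blocks or 32 MB of disk space. An allocation group that spans 4 bitmap
// blocks (fNumBits = 65536) then covers 128 MB.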


AllocationBlock::AllocationBlock(Volume *volume)
	: CachedBlock(volume)
{
}


status_t
AllocationBlock::SetTo(AllocationGroup &group, uint16 block)
{
	// 8 blocks per byte
	fNumBits = fVolume->BlockSize() << 3;
	// the last group may have fewer bits in the last block
	if ((block + 1) * fNumBits > group.fNumBits)
		fNumBits = group.fNumBits % fNumBits;

	return CachedBlock::SetTo(group.fStart + block) != NULL ? B_OK : B_ERROR;
}
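
// Worked example for the partial last block (illustrative numbers): with a
// 1024 byte block size, each bitmap block holds 8192 bits. If the group has
// fNumBits = 20000, then for block 2 we get (2 + 1) * 8192 = 24576 > 20000,
// so only 20000 % 8192 = 3616 bits of that block are valid.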


bool
AllocationBlock::IsUsed(uint16 block)
{
	if (block >= fNumBits)
		return true;
	// the block bitmap is accessed in 32-bit words
	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
}


void
AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
{
	ASSERT(start < fNumBits);
	ASSERT(uint32(start + numBlocks) <= fNumBits);

	if (uint32(start + numBlocks) > fNumBits) {
		FATAL(("AllocationBlock::Allocate(): tried to allocate too many blocks: %u (numBits = %lu)!\n", numBlocks, fNumBits));
		DIE(("AllocationBlock::Allocate(): tried to allocate too many blocks"));
	}

	int32 block = start >> 5;

	while (numBlocks > 0) {
		uint32 mask = 0;
		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
			mask |= 1UL << i;

#ifdef DEBUG
		// check for already set blocks
		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32 *)fBlock)[block]) {
			FATAL(("AllocationBlock::Allocate(): some blocks are already allocated, start = %u, numBlocks = %u\n", start, numBlocks));
			DEBUGGER(("blocks already set!"));
		}
#endif
		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
		start = 0;
	}
}
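
// Example of the mask construction above (illustrative numbers): allocating
// numBlocks = 4 starting at start = 30 first builds a mask for bits 30-31 of
// word 0 (0xc0000000), then continues with bits 0-1 of word 1 (0x00000003).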


void
AllocationBlock::Free(uint16 start, uint16 numBlocks)
{
	ASSERT(start < fNumBits);
	ASSERT(uint32(start + numBlocks) <= fNumBits);

	if (uint32(start + numBlocks) > fNumBits) {
		FATAL(("AllocationBlock::Free(): tried to free too many blocks: %u (numBits = %lu)!\n", numBlocks, fNumBits));
		DIE(("AllocationBlock::Free(): tried to free too many blocks"));
	}

	int32 block = start >> 5;

	while (numBlocks > 0) {
		uint32 mask = 0;
		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
			mask |= 1UL << i;

		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
		start = 0;
	}
}


//	#pragma mark -


AllocationGroup::AllocationGroup()
	:
	fFirstFree(-1),
	fLargest(-1),
	fLargestFirst(-1),
	fFreeBits(0)
{
}


void
AllocationGroup::AddFreeRange(int32 start, int32 blocks)
{
	//D(if (blocks > 512)
	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));

	if (fFirstFree == -1)
		fFirstFree = start;

	if (fLargest < blocks) {
		fLargest = blocks;
		fLargestFirst = start;
	}

	fFreeBits += blocks;
}


/** Allocates the specified run in the allocation group.
 *	Doesn't check whether the run is valid or already partially
 *	allocated, nor does it maintain the free ranges hints
 *	or the volume's used blocks count.
 *	It only does the low-level work of setting some bits
 *	in the block bitmap.
 *	Assumes that the block bitmap lock is held.
 */

status_t
AllocationGroup::Allocate(Transaction *transaction, uint16 start, int32 length)
{
	if (start > fNumBits)
		return B_ERROR;

	// Update the allocation group info
	// ToDo: this info will be incorrect if something goes wrong later
	// Note, the fFirstFree block doesn't have to be really free
	if (start == fFirstFree)
		fFirstFree = start + length;
	fFreeBits -= length;

	Volume *volume = transaction->GetVolume();

	// calculate block in the block bitmap and position within
	uint32 bitsPerBlock = volume->BlockSize() << 3;
	uint32 block = start / bitsPerBlock;
	start = start % bitsPerBlock;

	AllocationBlock cached(volume);

	while (length > 0) {
		if (cached.SetTo(*this, block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		uint32 numBlocks = length;
		if (start + numBlocks > cached.NumBlockBits())
			numBlocks = cached.NumBlockBits() - start;

		cached.Allocate(start, numBlocks);

		if (cached.WriteBack(transaction) < B_OK)
			return B_IO_ERROR;

		length -= numBlocks;
		start = 0;
		block++;
	}

	return B_OK;
}
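
// For example (illustrative numbers): with 8192 bits per bitmap block, a run
// starting at bit 10000 within the group begins in bitmap block
// 10000 / 8192 = 1 at bit offset 10000 % 8192 = 1808; a run long enough to
// cross the end of that block is split and continues at bit 0 of block 2.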


/** Frees the specified run in the allocation group.
 *	Doesn't check whether the run is valid or was completely
 *	allocated, nor does it maintain the free ranges hints
 *	or the volume's used blocks count.
 *	It only does the low-level work of clearing some bits
 *	in the block bitmap.
 *	Assumes that the block bitmap lock is held.
 */

status_t
AllocationGroup::Free(Transaction *transaction, uint16 start, int32 length)
{
	if (start > fNumBits)
		return B_ERROR;

	// Update the allocation group info
	// ToDo: this info will be incorrect if something goes wrong later
	if (fFirstFree > start)
		fFirstFree = start;
	fFreeBits += length;

	Volume *volume = transaction->GetVolume();

	// calculate block in the block bitmap and position within
	uint32 bitsPerBlock = volume->BlockSize() << 3;
	uint32 block = start / bitsPerBlock;
	start = start % bitsPerBlock;

	AllocationBlock cached(volume);

	while (length > 0) {
		if (cached.SetTo(*this, block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		uint16 freeLength = length;
		if (uint32(start + length) > cached.NumBlockBits())
			freeLength = cached.NumBlockBits() - start;

		cached.Free(start, freeLength);

		if (cached.WriteBack(transaction) < B_OK)
			return B_IO_ERROR;

		length -= freeLength;
		start = 0;
		block++;
	}
	return B_OK;
}


//	#pragma mark -


BlockAllocator::BlockAllocator(Volume *volume)
	:
	fVolume(volume),
	fLock("bfs allocator"),
	fGroups(NULL),
	fCheckBitmap(NULL)
{
}


BlockAllocator::~BlockAllocator()
{
	delete[] fGroups;
}


status_t
BlockAllocator::Initialize()
{
	if (fLock.InitCheck() < B_OK)
		return B_ERROR;

	fNumGroups = fVolume->AllocationGroups();
	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
	fGroups = new AllocationGroup[fNumGroups];
	if (fGroups == NULL)
		return B_NO_MEMORY;

	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::initialize,
		"bfs block allocator", B_LOW_PRIORITY, (void *)this);
	if (id < B_OK)
		return initialize(this);

	return resume_thread(id);
}


status_t
BlockAllocator::initialize(BlockAllocator *allocator)
{
	Locker lock(allocator->fLock);

	Volume *volume = allocator->fVolume;
	uint32 blocks = allocator->fBlocksPerGroup;
	uint32 numBits = 8 * blocks * volume->BlockSize();
	off_t freeBlocks = 0;

	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
	if (buffer == NULL)
		RETURN_ERROR(B_NO_MEMORY);

	AllocationGroup *groups = allocator->fGroups;
	off_t offset = 1;
	int32 numGroups = allocator->fNumGroups;

	for (int32 i = 0; i < numGroups; i++) {
		if (cached_read(volume->Device(), offset, buffer, blocks, volume->BlockSize()) < B_OK)
			break;

		// the last allocation group may contain fewer blocks than the others
		groups[i].fNumBits = i == numGroups - 1 ? allocator->fVolume->NumBlocks() - i * numBits : numBits;
		groups[i].fStart = offset;

		// find all free ranges in this allocation group
		int32 start = -1, range = 0;
		int32 size = groups[i].fNumBits, num = 0;

		// one uint32 word of the buffer covers 32 bits; round up for a
		// partial last word
		for (int32 k = 0; k < ((size + 31) >> 5); k++) {
			for (int32 j = 0; j < 32 && num < size; j++, num++) {
				if (buffer[k] & (1UL << j)) {
					if (range > 0) {
						groups[i].AddFreeRange(start, range);
						range = 0;
					}
				} else if (range++ == 0)
					start = num;
			}
		}
		if (range)
			groups[i].AddFreeRange(start, range);

		freeBlocks += groups[i].fFreeBits;

		offset += blocks;
	}
	free(buffer);

	// check that the block bitmap and log area are marked as reserved
	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
	if (allocator->CheckBlockRun(block_run::Run(0, 0, reservedBlocks)) < B_OK) {
		Transaction transaction(volume, 0);
		if (groups[0].Allocate(&transaction, 0, reservedBlocks) < B_OK) {
			FATAL(("could not allocate reserved space for block bitmap/log!\n"));
			volume->Panic();
		} else {
			FATAL(("space for block bitmap or log area was not reserved!\n"));

			transaction.Done();
		}
	}

	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
	if (volume->UsedBlocks() != usedBlocks) {
		// If the disk was in a dirty state at mount time, it's
		// normal that the values don't match
		INFORM(("volume reports %Ld used blocks, correct is %Ld\n", volume->UsedBlocks(), usedBlocks));
		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
	}

	return B_OK;
}


status_t
BlockAllocator::AllocateBlocks(Transaction *transaction, int32 group, uint16 start,
	uint16 maximum, uint16 minimum, block_run &run)
{
	if (maximum == 0)
		return B_BAD_VALUE;

	AllocationBlock cached(fVolume);
	Locker lock(fLock);

	// the first scan through all allocation groups will look for the
	// wanted maximum of blocks, the second scan will just look to
	// satisfy the minimal requirement
	uint16 numBlocks = maximum;

	for (int32 i = 0; i < fNumGroups * 2; i++, group++, start = 0) {
		group = group % fNumGroups;

		if (start >= fGroups[group].fNumBits || fGroups[group].IsFull())
			continue;

		if (i >= fNumGroups) {
			// if the minimum is the same as the maximum, it's not necessary
			// to search through the allocation groups a second time
			if (maximum == minimum)
				return B_DEVICE_FULL;

			numBlocks = minimum;
		}

		// Skip the group if the number of blocks we are looking for is
		// already known to be larger than the group's largest free range
		// ToDo: disabled because fLargest is currently not maintained after the first allocation
		//if (numBlocks > fGroups[group].fLargest)
		//	continue;

		if (start < fGroups[group].fFirstFree)
			start = fGroups[group].fFirstFree;

		// there may be more than one block per allocation group - and
		// we iterate through them to find a place for the allocation.
		// (one allocation can't exceed one allocation group)

		uint32 block = start / (fVolume->BlockSize() << 3);
		int32 range = 0, rangeStart = 0;

		for (; block < fBlocksPerGroup; block++) {
			if (cached.SetTo(fGroups[group], block) < B_OK)
				RETURN_ERROR(B_ERROR);

			// find a free range large enough to hold the allocation
			for (uint32 bit = start % cached.NumBlockBits(); bit < cached.NumBlockBits(); bit++) {
				if (!cached.IsUsed(bit)) {
					if (range == 0) {
						// start new range
						rangeStart = block * cached.NumBlockBits() + bit;
					}

					// have we found a range large enough to hold numBlocks?
					if (++range >= maximum)
						break;
				} else if (i >= fNumGroups && range >= minimum) {
					// we have found a range at least as large as the
					// required minimum (second pass)
					break;
				} else {
					// end of a range
					range = 0;
				}
			}

			// if we found a suitable range, mark the blocks as in use, and write
			// the updated block bitmap back to disk
			if (range >= numBlocks) {
				// adjust allocation size
				if (numBlocks < maximum)
					numBlocks = range;

				if (fGroups[group].Allocate(transaction, rangeStart, numBlocks) < B_OK)
					RETURN_ERROR(B_IO_ERROR);

				run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(group);
				run.start = HOST_ENDIAN_TO_BFS_INT16(rangeStart);
				run.length = HOST_ENDIAN_TO_BFS_INT16(numBlocks);

				fVolume->SuperBlock().used_blocks =
					HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + numBlocks);
					// We are not writing back the disk's super block - that's
					// either done by the journaling code, or when the disk
					// is unmounted.
					// If the value is not correct at mount time, it will be
					// fixed anyway.

				return B_OK;
			}

			// start from the beginning of the next block
			start = 0;
		}
	}
	return B_DEVICE_FULL;
}
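
// A hypothetical call for illustration (the values are made up; "allocator"
// and "transaction" stand for a BlockAllocator and an open Transaction): to
// get up to 64 contiguous blocks somewhere in or after group 0, but accept a
// run of at least 8 blocks on a fragmented disk:
//
//	block_run run;
//	if (allocator->AllocateBlocks(&transaction, 0, 0, 64, 8, run) == B_OK)
//		;	// "run" now describes 8 to 64 newly allocated blocks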


status_t
BlockAllocator::AllocateForInode(Transaction *transaction, const block_run *parent,
	mode_t type, block_run &run)
{
	// apply some allocation policies here (AllocateBlocks() will break them
	// if necessary) - we will start with those described in Dominic Giampaolo's
	// "Practical File System Design", and see how well they work

	// files go into the same allocation group as their parent, sub-directories
	// will be inserted 8 allocation groups after the one of the parent
	uint16 group = parent->AllocationGroup();
	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
		group += 8;

	return AllocateBlocks(transaction, group, 0, 1, 1, run);
}
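
// For example (illustrative numbers): a new sub-directory whose parent inode
// lives in allocation group 5 is first tried in group 13 (5 + 8); since
// AllocateBlocks() takes the group number modulo fNumGroups and wraps around
// while scanning, this also works on volumes with fewer than 14 groups.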


status_t
BlockAllocator::Allocate(Transaction *transaction, const Inode *inode, off_t numBlocks,
	block_run &run, uint16 minimum)
{
	if (numBlocks <= 0)
		return B_ERROR;

	// one block_run can't hold more data than there is in one allocation group
	if (numBlocks > fGroups[0].fNumBits)
		numBlocks = fGroups[0].fNumBits;

	// since block_run.length is uint16, the largest number of blocks that
	// can be covered by a block_run is 65535
	// ToDo: if we drop compatibility, couldn't we do this any better?
	// There are basically two possibilities:
	// a) since a length of zero makes no sense, use that value for 65536 -
	//    but that could cause many problems (bugs) in other areas
	// b) reduce the maximum number of blocks per block_run, so that the
	//    remaining values can be used in a useful manner (like 4 blocks) -
	//    but that would also reduce the maximum file size
	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
		numBlocks = MAX_BLOCK_RUN_LENGTH;

	// apply some allocation policies here (AllocateBlocks() will break them
	// if necessary)
	uint16 group = inode->BlockRun().AllocationGroup();
	uint16 start = 0;

	// are there already allocated blocks? (then just try to allocate near the last one)
	if (inode->Size() > 0) {
		data_stream *data = &inode->Node()->data;
		// ToDo: we don't yet handle the case where the data stream
		// has already grown into the indirect ranges
		if (data->max_double_indirect_range == 0
			&& data->max_indirect_range == 0) {
			// Since size > 0, there must be a valid block run in this stream
			int32 last = 0;
			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
				if (data->direct[last + 1].IsZero())
					break;

			group = data->direct[last].AllocationGroup();
			start = data->direct[last].Start() + data->direct[last].Length();
		}
	} else if (inode->IsContainer() || inode->IsSymLink()) {
		// directory and symbolic link data will go into the same allocation
		// group as the inode is in, but after the inode data
		start = inode->BlockRun().Start();
	} else {
		// file data will start in the next allocation group
		group = inode->BlockRun().AllocationGroup() + 1;
	}

	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
}
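
// For example (illustrative numbers): a file inode whose last used direct
// run is (group 4, start 100, length 8) will have the next allocation tried
// at group 4, bit 108 - i.e. directly behind the existing data, so that the
// file stays as contiguous as possible.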


status_t
BlockAllocator::Free(Transaction *transaction, block_run run)
{
	Locker lock(fLock);

	int32 group = run.AllocationGroup();
	uint16 start = run.Start();
	uint16 length = run.Length();

	// we don't use Volume::IsValidBlockRun() here, because this check can be
	// more precise about the group size (the last group may have a different length)
	if (group < 0 || group >= fNumGroups
		|| start > fGroups[group].fNumBits
		|| uint32(start + length) > fGroups[group].fNumBits
		|| length == 0) {
		FATAL(("tried to free an invalid block_run (%ld, %u, %u)\n", group, start, length));
		DEBUGGER(("tried to free invalid block_run"));
		return B_BAD_VALUE;
	}
	// check if someone tries to free the reserved areas at the beginning of the drive
	if (group == 0 && start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
		FATAL(("tried to free a reserved block_run (%ld, %u, %u)\n", group, start, length));
		DEBUGGER(("tried to free reserved block"));
		return B_BAD_VALUE;
	}
#ifdef DEBUG
	if (CheckBlockRun(run) < B_OK)
		return B_BAD_DATA;
#endif

	if (fGroups[group].Free(transaction, start, length) < B_OK)
		RETURN_ERROR(B_IO_ERROR);

	fVolume->SuperBlock().used_blocks =
		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
	return B_OK;
}


//	#pragma mark -
//	Functions to check the validity of the bitmap - they are used from
//	the "chkbfs" command


bool
BlockAllocator::IsValidCheckControl(check_control *control)
{
	if (control == NULL
		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
		FATAL(("invalid check_control (%p)!\n", control));
		return false;
	}

	return true;
}


status_t
BlockAllocator::StartChecking(check_control *control)
{
	if (!IsValidCheckControl(control))
		return B_BAD_VALUE;

	status_t status = fLock.Lock();
	if (status < B_OK)
		return status;

	size_t size = fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
	fCheckBitmap = (uint32 *)malloc(size);
	if (fCheckBitmap == NULL) {
		fLock.Unlock();
		return B_NO_MEMORY;
	}

	check_cookie *cookie = new check_cookie();
	if (cookie == NULL) {
		free(fCheckBitmap);
		fCheckBitmap = NULL;
		fLock.Unlock();

		return B_NO_MEMORY;
	}

	// initialize the check bitmap; everything up to the end of the log
	// (super block, block bitmap, and the log itself) is marked as used
	memset(fCheckBitmap, 0, size);
	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length(); block-- > 0;)
		SetCheckBitmapAt(block);

	cookie->stack.Push(fVolume->Root());
	cookie->stack.Push(fVolume->Indices());
	cookie->iterator = NULL;
	control->cookie = cookie;

	fCheckCookie = cookie;
		// to be able to restore nicely if "chkbfs" exited abnormally

	// ToDo: check reserved area in bitmap!

	return B_OK;
}
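
// Note on sizes (illustrative numbers): the check bitmap is an in-memory copy
// of the whole on-disk bitmap, i.e. BlockSize() * fNumGroups * fBlocksPerGroup
// bytes - for example 2048 * 16 * 4 = 128 kB. Keeping the complete bitmap in
// memory is acceptable for an explicit check run, even though the allocator
// itself deliberately avoids it (see the comment at the top of the file).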


status_t
BlockAllocator::StopChecking(check_control *control)
{
	check_cookie *cookie;
	if (control == NULL)
		cookie = fCheckCookie;
	else
		cookie = (check_cookie *)control->cookie;

	if (cookie == NULL)
		return B_ERROR;

	if (cookie->iterator != NULL) {
		delete cookie->iterator;
		cookie->iterator = NULL;

		// the current directory inode is still locked in memory
		put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));
	}

	// if CheckNextNode() was able to work through all nodes,
	// we can fix any damage to the bitmap
	if (control != NULL && control->status == B_ENTRY_NOT_FOUND) {
		// calculate the number of used blocks in the check bitmap
		size_t size = fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
		off_t usedBlocks = 0LL;

		// ToDo: update the allocation groups used blocks info
		for (uint32 i = size >> 2; i-- > 0;) {
			uint32 compare = 1;
			for (int16 j = 0; j < 32; j++, compare <<= 1) {
				if (compare & fCheckBitmap[i])
					usedBlocks++;
			}
		}

		control->stats.freed = fVolume->UsedBlocks() - usedBlocks + control->stats.missing;
		if (control->stats.freed < 0)
			control->stats.freed = 0;

		// Should we fix errors? Were there any errors we can fix?
		if (control->flags & BFS_FIX_BITMAP_ERRORS
			&& (control->stats.freed != 0 || control->stats.missing != 0)) {
			// If so, write the check bitmap back over the original one.
			// We use transactions to play safe - in fact, several
			// transactions, so that we don't exceed the maximum log size
			// on large disks; this doesn't need to be atomic anyway.
			fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);

			int32 blocksInBitmap = fNumGroups * fBlocksPerGroup;
			int32 blockSize = fVolume->BlockSize();

			for (int32 i = 0; i < blocksInBitmap; i += 512) {
				Transaction transaction(fVolume, 1 + i);

				int32 blocksToWrite = 512;
				if (blocksToWrite + i > blocksInBitmap)
					blocksToWrite = blocksInBitmap - i;

				status_t status = transaction.WriteBlocks(1 + i,
					(uint8 *)fCheckBitmap + i * blockSize, blocksToWrite);
				if (status < B_OK) {
					FATAL(("error writing bitmap: %s\n", strerror(status)));
					break;
				}
				transaction.Done();
			}
		}
	} else
		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));

	free(fCheckBitmap);
	fCheckBitmap = NULL;
	fCheckCookie = NULL;
	delete cookie;
	fLock.Unlock();

	return B_OK;
}
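
// For example (illustrative numbers): a bitmap of 2000 blocks is written back
// in chunks of 512 blocks, i.e. four transactions covering 512, 512, 512, and
// finally 464 blocks - each one small enough to fit into the log on its own.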


status_t
BlockAllocator::CheckNextNode(check_control *control)
{
	if (!IsValidCheckControl(control))
		return B_BAD_VALUE;

	check_cookie *cookie = (check_cookie *)control->cookie;

	while (true) {
		if (cookie->iterator == NULL) {
			if (!cookie->stack.Pop(&cookie->current)) {
				// no more runs on the stack, we are obviously finished!
				control->status = B_ENTRY_NOT_FOUND;
				return B_ENTRY_NOT_FOUND;
			}

			// get iterator for the next directory

			Vnode vnode(fVolume, cookie->current);
			Inode *inode;
			if (vnode.Get(&inode) < B_OK) {
				FATAL(("check: Could not open inode at %Ld\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			if (!inode->IsContainer()) {
				FATAL(("check: inode at %Ld should have been a directory\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			BPlusTree *tree;
			if (inode->GetTree(&tree) != B_OK) {
				FATAL(("check: could not open b+tree from inode at %Ld\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			cookie->parent = inode;
			cookie->parent_mode = inode->Mode();

			cookie->iterator = new TreeIterator(tree);
			if (cookie->iterator == NULL)
				RETURN_ERROR(B_NO_MEMORY);

			// the inode must stay locked in memory until the iterator is freed
			vnode.Keep();

			// check the inode of the directory
			control->errors = 0;
			control->status = CheckInode(inode, control);

			if (inode->GetName(control->name) < B_OK)
				strcpy(control->name, "(node has no name)");

			control->inode = inode->ID();
			control->mode = inode->Mode();

			return B_OK;
		}

		char name[B_FILE_NAME_LENGTH];
		uint16 length;
		vnode_id id;

		status_t status = cookie->iterator->GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id);
		if (status == B_ENTRY_NOT_FOUND) {
			// there are no more entries in this iterator, free it and go on
			delete cookie->iterator;
			cookie->iterator = NULL;

			// unlock the directory's inode from memory
			put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));

			continue;
		} else if (status == B_OK) {
			// ignore "." and ".." entries
			if (!strcmp(name, ".") || !strcmp(name, ".."))
				continue;

			// fill in the control data as soon as we have it
			strlcpy(control->name, name, B_FILE_NAME_LENGTH);
			control->inode = id;
			control->errors = 0;

			Vnode vnode(fVolume, id);
			Inode *inode;
			if (vnode.Get(&inode) < B_OK) {
				FATAL(("Could not open inode ID %Ld!\n", id));
				control->errors |= BFS_COULD_NOT_OPEN;
				control->status = B_ERROR;
				return B_OK;
			}

			// check if the inode's name is the same as in the b+tree
			if (inode->IsRegularNode()) {
				SimpleLocker locker(inode->SmallDataLock());

				const char *localName = inode->Name();
				if (localName == NULL || strcmp(localName, name)) {
					control->errors |= BFS_NAMES_DONT_MATCH;
					FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name, localName));
				}
			}

			control->mode = inode->Mode();

			// Check for the correct mode of the node (if the mode of the
			// file doesn't fit its parent, there is a serious problem)
			if ((cookie->parent_mode & S_ATTR_DIR && !inode->IsAttribute())
				|| (cookie->parent_mode & S_INDEX_DIR && !inode->IsIndex())
				|| ((cookie->parent_mode & (S_DIRECTORY | S_ATTR_DIR | S_INDEX_DIR)) == S_DIRECTORY
					&& inode->Mode() & (S_ATTR | S_ATTR_DIR | S_INDEX_DIR))) {
				FATAL(("inode at %Ld is of wrong type: %o (parent %o at %Ld)!\n",
					inode->BlockNumber(), inode->Mode(), cookie->parent_mode, cookie->parent->BlockNumber()));

				// if we are allowed to fix errors, we should remove the file
				if (control->flags & BFS_REMOVE_WRONG_TYPES
					&& control->flags & BFS_FIX_BITMAP_ERRORS) {
					// it's safe to start a transaction, because Inode::Remove()
					// won't touch the block bitmap (which we hold the lock for)
					// if we set the INODE_DONT_FREE_SPACE flag - since we fix
					// the bitmap anyway
					Transaction transaction(fVolume, cookie->parent->BlockNumber());

					inode->Node()->flags |= INODE_DONT_FREE_SPACE;
					status = cookie->parent->Remove(&transaction, name, NULL, inode->IsContainer());
					if (status == B_OK)
						transaction.Done();
				} else
					status = B_ERROR;

				control->errors |= BFS_WRONG_TYPE;
				control->status = status;
				return B_OK;
			}

			// If the inode has an attribute directory, push it on the stack
			if (!inode->Attributes().IsZero())
				cookie->stack.Push(inode->Attributes());

			// push the directory on the stack so that it will be scanned later
			if (inode->IsContainer() && !inode->IsIndex())
				cookie->stack.Push(inode->BlockRun());
			else {
				// check it now
				control->status = CheckInode(inode, control);

				return B_OK;
			}
		}
	}
	// is never reached
}


bool
BlockAllocator::CheckBitmapIsUsedAt(off_t block) const
{
	size_t size = fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
	uint32 index = block / 32;	// 32bit resolution
	if (index >= size / 4)
		return false;

	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index]) & (1UL << (block & 0x1f));
}


void
BlockAllocator::SetCheckBitmapAt(off_t block)
{
	size_t size = fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
	uint32 index = block / 32;	// 32bit resolution
	if (index >= size / 4)
		return;

	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
}
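
// Bit addressing example (illustrative numbers): block 100 lives in
// fCheckBitmap[100 / 32] = fCheckBitmap[3], at bit 100 & 0x1f = 4 - the same
// one-bit-per-block layout the on-disk bitmap uses.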


status_t
BlockAllocator::CheckBlockRun(block_run run, const char *type, check_control *control)
{
	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
		|| uint32(run.Start() + run.Length()) > fGroups[run.AllocationGroup()].fNumBits
		|| run.Length() == 0) {
		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type, run.AllocationGroup(), run.Start(), run.Length()));
		if (control == NULL)
			return B_BAD_DATA;

		control->errors |= BFS_INVALID_BLOCK_RUN;
		return B_OK;
	}

	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
	uint32 block = run.Start() / bitsPerBlock;
	uint32 pos = run.Start() % bitsPerBlock;
	int32 length = 0;
	off_t firstMissing = -1, firstSet = -1;
	off_t firstGroupBlock = (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();

	AllocationBlock cached(fVolume);

	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		if (pos >= cached.NumBlockBits()) {
			// something very strange has happened...
			RETURN_ERROR(B_ERROR);
		}

		while (length < run.Length() && pos < cached.NumBlockBits()) {
			if (!cached.IsUsed(pos)) {
				if (control == NULL) {
					PRINT(("%s: block_run(%ld, %u, %u) is only partially allocated!\n", type, run.AllocationGroup(), run.Start(), run.Length()));
					return B_BAD_DATA;
				}
				if (firstMissing == -1) {
					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
					control->errors |= BFS_MISSING_BLOCKS;
				}
				control->stats.missing++;
			} else if (firstMissing != -1) {
				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not allocated!\n",
					type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing, firstGroupBlock + pos + block * bitsPerBlock - 1));
				firstMissing = -1;
			}

			if (fCheckBitmap != NULL) {
				// Set the block in the check bitmap as well, but have a look if it
				// is already allocated first
				uint32 offset = pos + block * bitsPerBlock;
				if (CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
					if (firstSet == -1) {
						firstSet = firstGroupBlock + offset;
						control->errors |= BFS_BLOCKS_ALREADY_SET;
					}
					control->stats.already_set++;
				} else {
					if (firstSet != -1) {
						FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n",
							type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + offset - 1));
						firstSet = -1;
					}
					SetCheckBitmapAt(firstGroupBlock + offset);
				}
			}
			length++;
			pos++;
		}

		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
			if (firstMissing != -1)
				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not allocated!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing, firstGroupBlock + pos + block * bitsPerBlock - 1));
			if (firstSet != -1)
				FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + pos + block * bitsPerBlock - 1));
		}
	}

	return B_OK;
}
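
// Mapping example (illustrative numbers): with 8192 bits per bitmap block,
// checking block_run(2, 9000, 500) starts in bitmap block 9000 / 8192 = 1 of
// group 2 at bit 9000 % 8192 = 808; the run's on-disk block numbers are
// firstGroupBlock + 9000 through firstGroupBlock + 9499, where
// firstGroupBlock is group 2 shifted by the volume's allocation group shift.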


status_t
BlockAllocator::CheckInode(Inode *inode, check_control *control)
{
	if (control != NULL && fCheckBitmap == NULL)
		return B_NO_INIT;
	if (inode == NULL)
		return B_BAD_VALUE;

	status_t status = CheckBlockRun(inode->BlockRun(), "inode", control);
	if (status < B_OK)
		return status;

	data_stream *data = &inode->Node()->data;

	// check the direct range

	if (data->max_direct_range) {
		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
			if (data->direct[i].IsZero())
				break;

			status = CheckBlockRun(data->direct[i], "direct", control);
			if (status < B_OK)
				return status;
		}
	}

	CachedBlock cached(fVolume);

	// check the indirect range

	if (data->max_indirect_range) {
		status = CheckBlockRun(data->indirect, "indirect", control);
		if (status < B_OK)
			return status;

		off_t block = fVolume->ToBlock(data->indirect);

		for (int32 i = 0; i < data->indirect.Length(); i++) {
			block_run *runs = (block_run *)cached.SetTo(block + i);
			if (runs == NULL)
				RETURN_ERROR(B_IO_ERROR);

			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
			int32 index = 0;
			for (; index < runsPerBlock; index++) {
				if (runs[index].IsZero())
					break;

				status = CheckBlockRun(runs[index], "indirect->run", control);
				if (status < B_OK)
					return status;
			}
			if (index < runsPerBlock)
				break;
		}
	}

	// check the double indirect range

	if (data->max_double_indirect_range) {
		status = CheckBlockRun(data->double_indirect, "double indirect", control);
		if (status < B_OK)
			return status;

		int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);

		CachedBlock cachedDirect(fVolume);
		int32 maxIndirectIndex = (data->double_indirect.Length() << fVolume->BlockShift())
									/ sizeof(block_run);

		for (int32 indirectIndex = 0; indirectIndex < maxIndirectIndex; indirectIndex++) {
			// get the indirect array block
			block_run *array = (block_run *)cached.SetTo(fVolume->ToBlock(data->double_indirect)
				+ indirectIndex / runsPerBlock);
			if (array == NULL)
				return B_IO_ERROR;

			block_run indirect = array[indirectIndex % runsPerBlock];
			// are we finished yet?
			if (indirect.IsZero())
				return B_OK;

			status = CheckBlockRun(indirect, "double indirect->runs", control);
			if (status < B_OK)
				return status;

			int32 maxIndex = (indirect.Length() << fVolume->BlockShift()) / sizeof(block_run);

			for (int32 index = 0; index < maxIndex; ) {
				block_run *runs = (block_run *)cachedDirect.SetTo(fVolume->ToBlock(indirect)
					+ index / runsPerBlock);
				if (runs == NULL)
					return B_IO_ERROR;

				// iterate over the runs of the currently cached block only;
				// the outer loop fetches the next block when needed
				do {
					// are we finished yet?
					if (runs[index % runsPerBlock].IsZero())
						return B_OK;

					status = CheckBlockRun(runs[index % runsPerBlock], "double indirect->runs->run", control);
					if (status < B_OK)
						return status;
				} while ((++index % runsPerBlock) != 0);
			}
		}
	}

	return B_OK;
}