/* BlockAllocator - block bitmap handling and allocation policies
**
** Initial version by Axel Dörfler, axeld@pinc-software.de
** This file may be used under the terms of the OpenBeOS License.
*/


#include "Debug.h"
#include "BlockAllocator.h"
#include "Volume.h"
#include "Inode.h"
#include "BPlusTree.h"
#include "Stack.h"
#include "bfs_control.h"

#include <util/kernel_cpp.h>
#include <stdlib.h>

#ifdef USER
#	define spawn_kernel_thread spawn_thread
#endif


// Things the BlockAllocator should do:

// - find a range of blocks of a certain size near a specific position
// - allocate a fuzzy range of blocks for pre-allocation
// - free blocks
// - know how to deal with each allocation, special handling for directories,
//   files, symlinks, etc. (type sensitive allocation policies)

// What makes the code complicated is the fact that we are not just reading
// in the whole bitmap and operating on it in memory - e.g. a 13 GB partition
// with a block size of 2048 bytes already has an 800 kB bitmap, and the size
// of partitions will only keep growing - so that's not an option.
// Instead, we read in a bitmap block only when it is used - since an
// allocation group can span several blocks in the block bitmap, the
// AllocationBlock class is there to make handling them easier.
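
// To put rough numbers on this (an illustrative calculation, not measured
// data): 13 GB / 2048 bytes per block is about 6.8 million blocks; at one
// bit per block the bitmap needs about 6.8M / 8 ≈ 830 KiB (the "800 kB"
// above), spread across roughly 416 bitmap blocks of 2048 bytes each.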

// The current implementation is only slightly optimized and could probably
// be improved a lot. Furthermore, the allocation policies used here should
// be tested against real world workloads.


struct check_cookie {
	check_cookie() {}

	block_run			current;
	Inode				*parent;
	mode_t				parent_mode;
	Stack<block_run>	stack;
	TreeIterator		*iterator;
};


class AllocationBlock : public CachedBlock {
	public:
		AllocationBlock(Volume *volume);

		void Allocate(uint16 start, uint16 numBlocks);
		void Free(uint16 start, uint16 numBlocks);
		inline bool IsUsed(uint16 block);

		status_t SetTo(AllocationGroup &group, uint16 block);

		uint32 NumBlockBits() const { return fNumBits; }
		uint32 &Block(int32 index) { return ((uint32 *)fBlock)[index]; }

	private:
		uint32 fNumBits;
};


class AllocationGroup {
	public:
		AllocationGroup();

		void AddFreeRange(int32 start, int32 blocks);
		bool IsFull() const { return fFreeBits == 0; }

		status_t Allocate(Transaction *transaction, uint16 start, int32 length);
		status_t Free(Transaction *transaction, uint16 start, int32 length);

		uint32 fNumBits;
		int32 fStart;
		int32 fFirstFree, fLargest, fLargestFirst;
			// ToDo: fLargest & fLargestFirst are not maintained (and therefore not used) yet!
		int32 fFreeBits;
};


AllocationBlock::AllocationBlock(Volume *volume)
	: CachedBlock(volume)
{
}


status_t
AllocationBlock::SetTo(AllocationGroup &group, uint16 block)
{
	// one bit per block, i.e. 8 blocks per bitmap byte
	fNumBits = fVolume->BlockSize() << 3;
	// the last group may have fewer bits in its last block
	if ((block + 1) * fNumBits > group.fNumBits)
		fNumBits = group.fNumBits % fNumBits;

	return CachedBlock::SetTo(group.fStart + block) != NULL ? B_OK : B_ERROR;
}


bool
AllocationBlock::IsUsed(uint16 block)
{
	if (block >= fNumBits)
		return true;
	// the block bitmap is accessed in 32-bit words
	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
}
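
// For illustration: bit 70 of this bitmap block lives in 32-bit word
// 70 >> 5 == 2, at bit position 70 % 32 == 6, so IsUsed(70) tests Block(2)
// against the mask 1UL << 6 (converted to the on-disk endianness).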


void
AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
{
	ASSERT(start < fNumBits);
	ASSERT(uint32(start + numBlocks) <= fNumBits);

	if (uint32(start + numBlocks) > fNumBits) {
		FATAL(("AllocationBlock::Allocate(): tried to allocate too many blocks: %u (fNumBits = %lu)!\n", numBlocks, fNumBits));
		DIE(("AllocationBlock::Allocate(): tried to allocate too many blocks"));
	}

	int32 block = start >> 5;

	while (numBlocks > 0) {
		uint32 mask = 0;
		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
			mask |= 1UL << (i % 32);

#ifdef DEBUG
		// check for already set blocks
		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32 *)fBlock)[block]) {
			FATAL(("AllocationBlock::Allocate(): some blocks are already allocated, start = %u, numBlocks = %u\n", start, numBlocks));
			DEBUGGER(("blocks already set!"));
		}
#endif
		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
		start = 0;
	}
}
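
// An example of the mask building above: Allocate(30, 4) first builds the
// mask for bits 30-31 (0xc0000000) and ORs it into word 0, then wraps
// around with start reset to 0 and ORs the mask for bits 0-1 (0x00000003)
// into word 1.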


void
AllocationBlock::Free(uint16 start, uint16 numBlocks)
{
	ASSERT(start < fNumBits);
	ASSERT(uint32(start + numBlocks) <= fNumBits);

	if (uint32(start + numBlocks) > fNumBits) {
		FATAL(("AllocationBlock::Free(): tried to free too many blocks: %u (fNumBits = %lu)!\n", numBlocks, fNumBits));
		DIE(("AllocationBlock::Free(): tried to free too many blocks"));
	}

	int32 block = start >> 5;

	while (numBlocks > 0) {
		uint32 mask = 0;
		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
			mask |= 1UL << (i % 32);

		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
		start = 0;
	}
}


//	#pragma mark -


AllocationGroup::AllocationGroup()
	:
	fFirstFree(-1),
	fLargest(-1),
	fLargestFirst(-1),
	fFreeBits(0)
{
}


void
AllocationGroup::AddFreeRange(int32 start, int32 blocks)
{
	//D(if (blocks > 512)
	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));

	if (fFirstFree == -1)
		fFirstFree = start;

	if (fLargest < blocks) {
		fLargest = blocks;
		fLargestFirst = start;
	}

	fFreeBits += blocks;
}


/** Allocates the specified run in the allocation group.
 *	Doesn't check if the run is valid or already partially
 *	allocated, nor does it maintain the free range hints
 *	or the volume's used blocks count.
 *	It only does the low-level work of allocating some bits
 *	in the block bitmap.
 *	Assumes that the block bitmap lock is held.
 */

status_t
AllocationGroup::Allocate(Transaction *transaction, uint16 start, int32 length)
{
	if (start > fNumBits)
		return B_ERROR;

	// Update the allocation group info
	// ToDo: this info will be incorrect if something goes wrong later
	// Note that fFirstFree doesn't have to point to a block that is
	// actually free
	if (start == fFirstFree)
		fFirstFree = start + length;
	fFreeBits -= length;

	Volume *volume = transaction->GetVolume();

	// calculate the block in the block bitmap and the position within it
	uint32 bitsPerBlock = volume->BlockSize() << 3;
	uint32 block = start / bitsPerBlock;
	start = start % bitsPerBlock;

	AllocationBlock cached(volume);

	while (length > 0) {
		if (cached.SetTo(*this, block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		uint32 numBlocks = length;
		if (start + numBlocks > cached.NumBlockBits())
			numBlocks = cached.NumBlockBits() - start;

		cached.Allocate(start, numBlocks);

		if (cached.WriteBack(transaction) < B_OK)
			return B_IO_ERROR;

		length -= numBlocks;
		start = 0;
		block++;
	}

	return B_OK;
}
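
// A sketch of how a run crosses bitmap blocks (assuming 2048-byte blocks,
// i.e. 16384 bits per bitmap block): Allocate(transaction, 16380, 8) sets
// bits 16380-16383 in bitmap block 0, then continues with bits 0-3 of
// bitmap block 1, writing each block back to the transaction in turn.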


/** Frees the specified run in the allocation group.
 *	Doesn't check if the run is valid or was not completely
 *	allocated, nor does it maintain the free range hints
 *	or the volume's used blocks count.
 *	It only does the low-level work of freeing some bits
 *	in the block bitmap.
 *	Assumes that the block bitmap lock is held.
 */

status_t
AllocationGroup::Free(Transaction *transaction, uint16 start, int32 length)
{
	if (start > fNumBits)
		return B_ERROR;

	// Update the allocation group info
	// ToDo: this info will be incorrect if something goes wrong later
	if (fFirstFree > start)
		fFirstFree = start;
	fFreeBits += length;

	Volume *volume = transaction->GetVolume();

	// calculate the block in the block bitmap and the position within it
	uint32 bitsPerBlock = volume->BlockSize() << 3;
	uint32 block = start / bitsPerBlock;
	start = start % bitsPerBlock;

	AllocationBlock cached(volume);

	while (length > 0) {
		if (cached.SetTo(*this, block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		uint16 freeLength = length;
		if (uint32(start + length) > cached.NumBlockBits())
			freeLength = cached.NumBlockBits() - start;

		cached.Free(start, freeLength);

		if (cached.WriteBack(transaction) < B_OK)
			return B_IO_ERROR;

		length -= freeLength;
		start = 0;
		block++;
	}
	return B_OK;
}


//	#pragma mark -


BlockAllocator::BlockAllocator(Volume *volume)
	:
	fVolume(volume),
	fLock("bfs allocator"),
	fGroups(NULL),
	fCheckBitmap(NULL)
{
}


BlockAllocator::~BlockAllocator()
{
	delete[] fGroups;
}


status_t
BlockAllocator::Initialize(bool full)
{
	if (fLock.InitCheck() < B_OK)
		return B_ERROR;

	fNumGroups = fVolume->AllocationGroups();
	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
	fGroups = new AllocationGroup[fNumGroups];
	if (fGroups == NULL)
		return B_NO_MEMORY;

	if (!full)
		return B_OK;

	fLock.Lock();
		// the lock will be released by the initialize() function

	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::initialize,
			"bfs block allocator", B_LOW_PRIORITY, (void *)this);
	if (id < B_OK)
		return initialize(this);

	return resume_thread(id);
}


status_t
BlockAllocator::InitializeAndClearBitmap(Transaction &transaction)
{
	status_t status = Initialize(false);
	if (status < B_OK)
		return status;

	uint32 blocks = fBlocksPerGroup;
	uint32 numBits = 8 * blocks * fVolume->BlockSize();

	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
	if (buffer == NULL)
		RETURN_ERROR(B_NO_MEMORY);

	memset(buffer, 0, numBits >> 3);

	off_t offset = 1;

	// initialize the AllocationGroup objects and clear the on-disk bitmap

	for (int32 i = 0; i < fNumGroups; i++) {
		if (cached_write(fVolume->Device(), offset, buffer, blocks, fVolume->BlockSize()) < B_OK)
			return B_ERROR;

		// the last allocation group may contain fewer blocks than the others
		fGroups[i].fNumBits = i == fNumGroups - 1 ? fVolume->NumBlocks() - i * numBits : numBits;
		fGroups[i].fStart = offset;
		fGroups[i].fFirstFree = fGroups[i].fLargestFirst = 0;
		fGroups[i].fFreeBits = fGroups[i].fLargest = fGroups[i].fNumBits;

		offset += blocks;
	}
	free(buffer);

	// reserve the boot block, the log area, and the block bitmap itself
	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();

	if (fGroups[0].Allocate(&transaction, 0, reservedBlocks) < B_OK) {
		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
		return B_ERROR;
	}
	fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);

	return B_OK;
}


status_t
BlockAllocator::initialize(BlockAllocator *allocator)
{
	// The lock must already be held at this point

	Volume *volume = allocator->fVolume;
	uint32 blocks = allocator->fBlocksPerGroup;
	uint32 numBits = 8 * blocks * volume->BlockSize();
	off_t freeBlocks = 0;

	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
	if (buffer == NULL) {
		allocator->fLock.Unlock();
		RETURN_ERROR(B_NO_MEMORY);
	}

	AllocationGroup *groups = allocator->fGroups;
	off_t offset = 1;
	int32 num = allocator->fNumGroups;

	for (int32 i = 0; i < num; i++) {
		if (cached_read(volume->Device(), offset, buffer, blocks, volume->BlockSize()) < B_OK)
			break;

		// the last allocation group may contain fewer blocks than the others
		groups[i].fNumBits = i == num - 1 ? volume->NumBlocks() - i * numBits : numBits;
		groups[i].fStart = offset;

		// finds all free ranges in this allocation group
		int32 start = -1, range = 0;
		int32 size = groups[i].fNumBits, bit = 0;

		for (int32 k = 0; k < ((size + 31) >> 5); k++) {
			for (int32 j = 0; j < 32 && bit < size; j++, bit++) {
				if (buffer[k] & (1UL << j)) {
					if (range > 0) {
						groups[i].AddFreeRange(start, range);
						range = 0;
					}
				} else if (range++ == 0)
					start = bit;
			}
		}
		if (range)
			groups[i].AddFreeRange(start, range);
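
		// Illustration of the scan above: a bitmap word of 0xfffffff0 has
		// bits 0-3 clear, so "start" is remembered at bit 0 and "range"
		// grows to 4; the set bit 4 then flushes the range through
		// AddFreeRange() and resets it to zero.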

		freeBlocks += groups[i].fFreeBits;

		offset += blocks;
	}
	free(buffer);

	// check if block bitmap and log area are reserved
	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
	if (allocator->CheckBlockRun(block_run::Run(0, 0, reservedBlocks)) < B_OK) {
		Transaction transaction(volume, 0);
		if (groups[0].Allocate(&transaction, 0, reservedBlocks) < B_OK) {
			FATAL(("could not allocate reserved space for block bitmap/log!\n"));
			volume->Panic();
		} else {
			FATAL(("space for block bitmap or log area was not reserved!\n"));

			transaction.Done();
		}
	}

	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
	if (volume->UsedBlocks() != usedBlocks) {
		// if the disk was in a dirty state at mount time, it's
		// normal that these values don't match
		INFORM(("volume reports %Ld used blocks, correct is %Ld\n", volume->UsedBlocks(), usedBlocks));
		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
	}

	allocator->fLock.Unlock();
	return B_OK;
}


status_t
BlockAllocator::AllocateBlocks(Transaction *transaction, int32 group, uint16 start,
	uint16 maximum, uint16 minimum, block_run &run)
{
	if (maximum == 0)
		return B_BAD_VALUE;

	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n", group, start, maximum, minimum));

	AllocationBlock cached(fVolume);
	Locker lock(fLock);

	// the first scan through all allocation groups will look for the
	// wanted maximum number of blocks; the second scan will just try
	// to satisfy the minimal requirement
	uint16 numBlocks = maximum;

	for (int32 i = 0; i < fNumGroups * 2; i++, group++, start = 0) {
		group = group % fNumGroups;

		if (start >= fGroups[group].fNumBits || fGroups[group].IsFull())
			continue;

		if (i >= fNumGroups) {
			// if the minimum is the same as the maximum, it's not necessary
			// to search the allocation groups a second time
			if (maximum == minimum)
				return B_DEVICE_FULL;

			numBlocks = minimum;
		}

		// Skip the group if the wanted number of blocks is larger than its
		// largest free run
		// ToDo: disabled because fLargest is currently not maintained after
		// the first allocation
		//if (numBlocks > fGroups[group].fLargest)
		//	continue;

		if (start < fGroups[group].fFirstFree)
			start = fGroups[group].fFirstFree;

		// an allocation group may span more than one block in the block
		// bitmap - we iterate through them to find a place for the
		// allocation (one allocation can't exceed a single allocation group)

		uint32 block = start / (fVolume->BlockSize() << 3);
		int32 range = 0, rangeStart = 0;

		for (; block < fBlocksPerGroup; block++) {
			if (cached.SetTo(fGroups[group], block) < B_OK)
				RETURN_ERROR(B_ERROR);

			// find a free range large enough to hold the allocation
			for (uint32 bit = start % cached.NumBlockBits(); bit < cached.NumBlockBits(); bit++) {
				if (!cached.IsUsed(bit)) {
					if (range == 0) {
						// start a new range
						rangeStart = block * cached.NumBlockBits() + bit;
					}

					// have we found a range large enough to hold numBlocks?
					if (++range >= maximum)
						break;
				} else if (i >= fNumGroups && range >= minimum) {
					// we have found a range larger than the required minimum (second pass)
					break;
				} else {
					// end of a range
					range = 0;
				}
			}

			// if we found a suitable range, mark the blocks as in use, and
			// write the updated block bitmap back to disk
			if (range >= numBlocks) {
				// adjust the allocation size
				if (numBlocks < maximum)
					numBlocks = range;

				if (fGroups[group].Allocate(transaction, rangeStart, numBlocks) < B_OK)
					RETURN_ERROR(B_IO_ERROR);

				run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(group);
				run.start = HOST_ENDIAN_TO_BFS_INT16(rangeStart);
				run.length = HOST_ENDIAN_TO_BFS_INT16(numBlocks);

				fVolume->SuperBlock().used_blocks =
					HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + numBlocks);
					// We are not writing back the disk's super block - it's
					// either done by the journaling code, or when the disk
					// is unmounted.
					// If the value is not correct at mount time, it will be
					// fixed anyway.

				return B_OK;
			}

			// start from the beginning of the next block
			start = 0;
		}
	}
	return B_DEVICE_FULL;
}
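
// A minimal usage sketch (hypothetical caller code, not taken from this
// file): try to get up to 16 contiguous blocks somewhere in group 0, but
// accept as few as one:
//
//	block_run run;
//	if (allocator.AllocateBlocks(&transaction, 0, 0, 16, 1, run) == B_OK) {
//		// "run" now describes 1 to 16 contiguous blocks; the bitmap
//		// changes are recorded in "transaction"
//	}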


status_t
BlockAllocator::AllocateForInode(Transaction *transaction, const block_run *parent,
	mode_t type, block_run &run)
{
	// apply some allocation policies here (AllocateBlocks() will break them
	// if necessary) - we will start with those described in Dominic Giampaolo's
	// "Practical File System Design", and see how well they work

	// files go into the same allocation group as their parent; sub-directories
	// are placed 8 allocation groups after the parent's
	uint16 group = parent->AllocationGroup();
	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
		group += 8;

	return AllocateBlocks(transaction, group, 0, 1, 1, run);
}
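
// For example (illustrative): an inode whose parent lives in allocation
// group 5 gets sub-directories placed around group 13 (5 + 8), while plain
// files, attribute directories, and index directories stay at group 5;
// AllocateBlocks() wraps the group number around if it gets too large.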


status_t
BlockAllocator::Allocate(Transaction *transaction, const Inode *inode, off_t numBlocks,
	block_run &run, uint16 minimum)
{
	if (numBlocks <= 0)
		return B_ERROR;

	// one block_run can't hold more data than there is in one allocation group
	if (numBlocks > fGroups[0].fNumBits)
		numBlocks = fGroups[0].fNumBits;

	// since block_run.length is a uint16, the largest number of blocks that
	// can be covered by a block_run is 65535
	// ToDo: if we drop compatibility, couldn't we do this any better?
	// There are basically two possibilities:
	// a) since a length of zero makes no sense, let that value stand for 65536 -
	//    but that could cause many problems (bugs) in other areas
	// b) reduce the maximum number of blocks per block_run, so that the remaining
	//    number of free blocks can be used in a useful manner (like 4 blocks) -
	//    but that would also reduce the maximum file size
	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
		numBlocks = MAX_BLOCK_RUN_LENGTH;
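
	// To give a feel for this limit (illustrative, assuming
	// MAX_BLOCK_RUN_LENGTH is 65535): with 1 KiB blocks a single block_run
	// covers at most ~64 MB, with 8 KiB blocks at most ~512 MB - larger
	// allocations simply end up split across several runs.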

	// apply some allocation policies here (AllocateBlocks() will break them
	// if necessary)
	uint16 group = inode->BlockRun().AllocationGroup();
	uint16 start = 0;

	// are there already allocated blocks? (then just try to allocate near the last one)
	if (inode->Size() > 0) {
		data_stream *data = &inode->Node()->data;
		// ToDo: we don't yet handle the case where the data stream has
		// already grown into the indirect ranges
		if (data->max_double_indirect_range == 0
			&& data->max_indirect_range == 0) {
			// since size > 0, there must be a valid block run in this stream
			int32 last = 0;
			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
				if (data->direct[last + 1].IsZero())
					break;

			group = data->direct[last].AllocationGroup();
			start = data->direct[last].Start() + data->direct[last].Length();
		}
	} else if (inode->IsContainer() || inode->IsSymLink()) {
		// directory and symbolic link data will go into the same allocation
		// group as the inode is in, but after the inode data
		start = inode->BlockRun().Start();
	} else {
		// file data will start in the next allocation group
		group = inode->BlockRun().AllocationGroup() + 1;
	}

	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
}


status_t
BlockAllocator::Free(Transaction *transaction, block_run run)
{
	Locker lock(fLock);

	int32 group = run.AllocationGroup();
	uint16 start = run.Start();
	uint16 length = run.Length();

	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start, length));

	// we don't use Volume::IsValidBlockRun() here, because this check can
	// take the real group size into account (the last group may be shorter
	// than the others)
	if (group < 0 || group >= fNumGroups
		|| start > fGroups[group].fNumBits
		|| uint32(start + length) > fGroups[group].fNumBits
		|| length == 0) {
		FATAL(("tried to free an invalid block_run (%ld, %u, %u)\n", group, start, length));
		DEBUGGER(("tried to free invalid block_run"));
		return B_BAD_VALUE;
	}
	// check if someone is trying to free the reserved areas at the beginning of the drive
	if (group == 0 && start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
		FATAL(("tried to free a reserved block_run (%ld, %u, %u)\n", group, start, length));
		DEBUGGER(("tried to free reserved block"));
		return B_BAD_VALUE;
	}
#ifdef DEBUG
	if (CheckBlockRun(run) < B_OK)
		return B_BAD_DATA;
#endif

	if (fGroups[group].Free(transaction, start, length) < B_OK)
		RETURN_ERROR(B_IO_ERROR);

#ifdef DEBUG
	if (CheckBlockRun(run, NULL, NULL, false) < B_OK)
		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just freed)\n"));
#endif

	fVolume->SuperBlock().used_blocks =
		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
	return B_OK;
}


size_t
BlockAllocator::BitmapSize() const
{
	return fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
}
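
// As an illustration (rough arithmetic, assuming one bitmap block per
// group): the 13 GB example volume from the top of this file has ~416
// groups, so the bitmap occupies 416 * 1 * 2048 bytes ≈ 830 KiB - the
// same figure estimated there.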


//	#pragma mark -
//	Functions to check the validity of the bitmap - they are used by
//	the "chkbfs" command


bool
BlockAllocator::IsValidCheckControl(check_control *control)
{
	if (control == NULL
		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
		FATAL(("invalid check_control (%p)!\n", control));
		return false;
	}

	return true;
}


status_t
BlockAllocator::StartChecking(check_control *control)
{
	if (!IsValidCheckControl(control))
		return B_BAD_VALUE;

	status_t status = fLock.Lock();
	if (status < B_OK)
		return status;

	size_t size = BitmapSize();
	fCheckBitmap = (uint32 *)malloc(size);
	if (fCheckBitmap == NULL) {
		fLock.Unlock();
		return B_NO_MEMORY;
	}

	check_cookie *cookie = new check_cookie();
	if (cookie == NULL) {
		free(fCheckBitmap);
		fCheckBitmap = NULL;
		fLock.Unlock();

		return B_NO_MEMORY;
	}

	// initialize the check bitmap with the reserved blocks already set
	memset(fCheckBitmap, 0, size);
	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length(); block-- > 0;)
		SetCheckBitmapAt(block);

	cookie->stack.Push(fVolume->Root());
	cookie->stack.Push(fVolume->Indices());
	cookie->iterator = NULL;
	control->cookie = cookie;

	fCheckCookie = cookie;
		// to be able to clean up nicely if "chkbfs" exits abnormally

	// ToDo: check the reserved area in the bitmap!

	return B_OK;
}


status_t
BlockAllocator::StopChecking(check_control *control)
{
	check_cookie *cookie;
	if (control == NULL)
		cookie = fCheckCookie;
	else
		cookie = (check_cookie *)control->cookie;

	if (cookie == NULL)
		return B_ERROR;

	if (cookie->iterator != NULL) {
		delete cookie->iterator;
		cookie->iterator = NULL;

		// the current directory inode is still locked in memory
		put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));
	}

	// if CheckNextNode() was able to work its way through completely,
	// we can fix any damage to the bitmap
	if (control != NULL && control->status == B_ENTRY_NOT_FOUND) {
		// calculate the number of used blocks in the check bitmap
		size_t size = BitmapSize();
		off_t usedBlocks = 0LL;

		// ToDo: update the allocation groups' used blocks info
		for (uint32 i = size >> 2; i-- > 0;) {
			uint32 compare = 1;
			for (int16 j = 0; j < 32; j++, compare <<= 1) {
				if (compare & fCheckBitmap[i])
					usedBlocks++;
			}
		}

		control->stats.freed = fVolume->UsedBlocks() - usedBlocks + control->stats.missing;
		if (control->stats.freed < 0)
			control->stats.freed = 0;

		// Should we fix errors? Were there any errors we can fix?
		if (control->flags & BFS_FIX_BITMAP_ERRORS
			&& (control->stats.freed != 0 || control->stats.missing != 0)) {
			// If so, write the check bitmap back over the original one.
			// We use transactions here to play safe - in fact, several
			// transactions, so that we don't blow the maximum log size
			// on large disks; that's fine since we don't need to make
			// this atomic.
			fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);

			int32 blocksInBitmap = fNumGroups * fBlocksPerGroup;
			int32 blockSize = fVolume->BlockSize();

			for (int32 i = 0; i < blocksInBitmap; i += 512) {
				Transaction transaction(fVolume, 1 + i);

				int32 blocksToWrite = 512;
				if (blocksToWrite + i > blocksInBitmap)
					blocksToWrite = blocksInBitmap - i;

				status_t status = transaction.WriteBlocks(1 + i,
					(uint8 *)fCheckBitmap + i * blockSize, blocksToWrite);
				if (status < B_OK) {
					FATAL(("error writing bitmap: %s\n", strerror(status)));
					break;
				}
				transaction.Done();
			}
		}
	} else
		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));

	free(fCheckBitmap);
	fCheckBitmap = NULL;
	fCheckCookie = NULL;
	delete cookie;
	fLock.Unlock();

	return B_OK;
}


status_t
BlockAllocator::CheckNextNode(check_control *control)
{
	if (!IsValidCheckControl(control))
		return B_BAD_VALUE;

	check_cookie *cookie = (check_cookie *)control->cookie;

	while (true) {
		if (cookie->iterator == NULL) {
			if (!cookie->stack.Pop(&cookie->current)) {
				// no more runs on the stack, we are obviously finished!
				control->status = B_ENTRY_NOT_FOUND;
				return B_ENTRY_NOT_FOUND;
			}

			// get an iterator for the next directory

#ifdef UNSAFE_GET_VNODE
			RecursiveLocker locker(fVolume->Lock());
#endif
			Vnode vnode(fVolume, cookie->current);
			Inode *inode;
			if (vnode.Get(&inode) < B_OK) {
				FATAL(("check: Could not open inode at %Ld\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			if (!inode->IsContainer()) {
				FATAL(("check: inode at %Ld should have been a directory\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			BPlusTree *tree;
			if (inode->GetTree(&tree) != B_OK) {
				FATAL(("check: could not open b+tree from inode at %Ld\n", fVolume->ToBlock(cookie->current)));
				continue;
			}

			cookie->parent = inode;
			cookie->parent_mode = inode->Mode();

			cookie->iterator = new TreeIterator(tree);
			if (cookie->iterator == NULL)
				RETURN_ERROR(B_NO_MEMORY);

			// the inode must stay locked in memory until the iterator is freed
			vnode.Keep();

			// check the inode of the directory
			control->errors = 0;
			control->status = CheckInode(inode, control);

			if (inode->GetName(control->name) < B_OK)
				strcpy(control->name, "(node has no name)");

			control->inode = inode->ID();
			control->mode = inode->Mode();

			return B_OK;
		}

		char name[B_FILE_NAME_LENGTH];
		uint16 length;
		vnode_id id;

		status_t status = cookie->iterator->GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id);
		if (status == B_ENTRY_NOT_FOUND) {
			// there are no more entries in this iterator, free it and go on
			delete cookie->iterator;
			cookie->iterator = NULL;

			// unlock the directory's inode from memory
			put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));

			continue;
		} else if (status == B_OK) {
			// ignore "." and ".." entries
			if (!strcmp(name, ".") || !strcmp(name, ".."))
				continue;

			// fill in the control data as soon as we have it
			strlcpy(control->name, name, B_FILE_NAME_LENGTH);
			control->inode = id;
			control->errors = 0;

			Vnode vnode(fVolume, id);
			Inode *inode;
			if (vnode.Get(&inode) < B_OK) {
				FATAL(("Could not open inode ID %Ld!\n", id));
				control->errors |= BFS_COULD_NOT_OPEN;
				control->status = B_ERROR;
				return B_OK;
			}

			// check if the inode's name is the same as in the b+tree
			if (inode->IsRegularNode()) {
				SimpleLocker locker(inode->SmallDataLock());

				const char *localName = inode->Name();
				if (localName == NULL || strcmp(localName, name)) {
					control->errors |= BFS_NAMES_DONT_MATCH;
					FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name, localName));
				}
			}

			control->mode = inode->Mode();

			// Check for the correct mode of the node (if the mode of the
			// file doesn't fit its parent, there is a serious problem)
			if ((cookie->parent_mode & S_ATTR_DIR && !inode->IsAttribute())
				|| (cookie->parent_mode & S_INDEX_DIR && !inode->IsIndex())
				|| ((cookie->parent_mode & (S_DIRECTORY | S_ATTR_DIR | S_INDEX_DIR)) == S_DIRECTORY
					&& inode->Mode() & (S_ATTR | S_ATTR_DIR | S_INDEX_DIR))) {
				FATAL(("inode at %Ld is of wrong type: %o (parent %o at %Ld)!\n",
					inode->BlockNumber(), inode->Mode(), cookie->parent_mode, cookie->parent->BlockNumber()));

				// if we are allowed to fix errors, we should remove the file
				if (control->flags & BFS_REMOVE_WRONG_TYPES
					&& control->flags & BFS_FIX_BITMAP_ERRORS) {
#ifdef UNSAFE_GET_VNODE
					RecursiveLocker locker(fVolume->Lock());
#endif
					// it's safe to start a transaction, because Inode::Remove()
					// won't touch the block bitmap (which we hold the lock for)
					// if we set the INODE_DONT_FREE_SPACE flag - since we fix
					// the bitmap anyway
					Transaction transaction(fVolume, cookie->parent->BlockNumber());

					inode->Node()->flags |= INODE_DONT_FREE_SPACE;
					status = cookie->parent->Remove(&transaction, name, NULL, inode->IsContainer());
					if (status == B_OK)
						transaction.Done();
				} else
					status = B_ERROR;

				control->errors |= BFS_WRONG_TYPE;
				control->status = status;
				return B_OK;
			}

			// if the inode has an attribute directory, push it on the stack
			if (!inode->Attributes().IsZero())
				cookie->stack.Push(inode->Attributes());

			// push the directory on the stack so that it will be scanned later
			if (inode->IsContainer() && !inode->IsIndex())
				cookie->stack.Push(inode->BlockRun());
			else {
				// check it now
				control->status = CheckInode(inode, control);

				return B_OK;
			}
		}
	}
	// never reached
}


bool
BlockAllocator::CheckBitmapIsUsedAt(off_t block) const
{
	size_t size = BitmapSize();
	uint32 index = block / 32;	// 32-bit resolution
	if (index >= size / 4)
		return false;

	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index]) & (1UL << (block & 0x1f));
}


void
BlockAllocator::SetCheckBitmapAt(off_t block)
{
	size_t size = BitmapSize();
	uint32 index = block / 32;	// 32-bit resolution
	if (index >= size / 4)
		return;

	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
}


status_t
BlockAllocator::CheckBlockRun(block_run run, const char *type, check_control *control, bool allocated)
{
	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
		|| uint32(run.Start() + run.Length()) > fGroups[run.AllocationGroup()].fNumBits
		|| run.Length() == 0) {
		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type, run.AllocationGroup(), run.Start(), run.Length()));
		if (control == NULL)
			return B_BAD_DATA;

		control->errors |= BFS_INVALID_BLOCK_RUN;
		return B_OK;
	}

	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
	uint32 block = run.Start() / bitsPerBlock;
	uint32 pos = run.Start() % bitsPerBlock;
	int32 length = 0;
	off_t firstMissing = -1, firstSet = -1;
	off_t firstGroupBlock = (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();

	AllocationBlock cached(fVolume);

	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
			RETURN_ERROR(B_IO_ERROR);

		if (pos >= cached.NumBlockBits()) {
			// something very strange has happened...
			RETURN_ERROR(B_ERROR);
		}

		while (length < run.Length() && pos < cached.NumBlockBits()) {
			if (cached.IsUsed(pos) != allocated) {
				if (control == NULL) {
					PRINT(("%s: block_run(%ld, %u, %u) is only partially allocated (pos = %ld, length = %ld)!\n",
						type, run.AllocationGroup(), run.Start(), run.Length(), pos, length));
					return B_BAD_DATA;
				}
				if (firstMissing == -1) {
					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
					control->errors |= BFS_MISSING_BLOCKS;
				}
				control->stats.missing++;
			} else if (firstMissing != -1) {
				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are %sallocated!\n",
					type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing,
					firstGroupBlock + pos + block * bitsPerBlock - 1, allocated ? "not " : ""));
				firstMissing = -1;
			}

			if (fCheckBitmap != NULL) {
				// Set the block in the check bitmap as well, but have a look if it
				// is already allocated first
				uint32 offset = pos + block * bitsPerBlock;
				if (CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
					if (firstSet == -1) {
						firstSet = firstGroupBlock + offset;
						control->errors |= BFS_BLOCKS_ALREADY_SET;
					}
					control->stats.already_set++;
				} else {
					if (firstSet != -1) {
						FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n",
							type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + offset - 1));
						firstSet = -1;
					}
					SetCheckBitmapAt(firstGroupBlock + offset);
				}
			}
			length++;
			pos++;
		}

		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
			if (firstMissing != -1)
				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not allocated!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing, firstGroupBlock + pos + block * bitsPerBlock - 1));
			if (firstSet != -1)
				FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + pos + block * bitsPerBlock - 1));
		}
	}

	return B_OK;
}


status_t
BlockAllocator::CheckInode(Inode *inode, check_control *control)
{
	if (control != NULL && fCheckBitmap == NULL)
		return B_NO_INIT;
	if (inode == NULL)
		return B_BAD_VALUE;

	status_t status = CheckBlockRun(inode->BlockRun(), "inode", control);
	if (status < B_OK)
		return status;

	data_stream *data = &inode->Node()->data;

	// check the direct range

	if (data->max_direct_range) {
		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
			if (data->direct[i].IsZero())
				break;

			status = CheckBlockRun(data->direct[i], "direct", control);
			if (status < B_OK)
				return status;
		}
	}

	CachedBlock cached(fVolume);

	// check the indirect range

	if (data->max_indirect_range) {
		status = CheckBlockRun(data->indirect, "indirect", control);
		if (status < B_OK)
			return status;

		off_t block = fVolume->ToBlock(data->indirect);

		for (int32 i = 0; i < data->indirect.Length(); i++) {
			block_run *runs = (block_run *)cached.SetTo(block + i);
			if (runs == NULL)
				RETURN_ERROR(B_IO_ERROR);

			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
			int32 index = 0;
			for (; index < runsPerBlock; index++) {
				if (runs[index].IsZero())
					break;

				status = CheckBlockRun(runs[index], "indirect->run", control);
				if (status < B_OK)
					return status;
			}
			if (index < runsPerBlock)
				break;
		}
	}

	// check the double indirect range

	if (data->max_double_indirect_range) {
		status = CheckBlockRun(data->double_indirect, "double indirect", control);
		if (status < B_OK)
			return status;

		int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);

		CachedBlock cachedDirect(fVolume);
		int32 maxIndirectIndex = (data->double_indirect.Length() << fVolume->BlockShift())
									/ sizeof(block_run);

		for (int32 indirectIndex = 0; indirectIndex < maxIndirectIndex; indirectIndex++) {
			// get the indirect array block
			block_run *array = (block_run *)cached.SetTo(fVolume->ToBlock(data->double_indirect)
				+ indirectIndex / runsPerBlock);
			if (array == NULL)
				return B_IO_ERROR;

			block_run indirect = array[indirectIndex % runsPerBlock];
			// are we finished yet?
			if (indirect.IsZero())
				return B_OK;

			status = CheckBlockRun(indirect, "double indirect->runs", control);
			if (status < B_OK)
				return status;

			int32 maxIndex = (indirect.Length() << fVolume->BlockShift()) / sizeof(block_run);

			for (int32 index = 0; index < maxIndex; ) {
				block_run *runs = (block_run *)cachedDirect.SetTo(fVolume->ToBlock(indirect)
					+ index / runsPerBlock);
				if (runs == NULL)
					return B_IO_ERROR;

				do {
					// are we finished yet?
					if (runs[index % runsPerBlock].IsZero())
						return B_OK;

					status = CheckBlockRun(runs[index % runsPerBlock], "double indirect->runs->run", control);
					if (status < B_OK)
						return status;
				} while ((++index % runsPerBlock) != 0);
					// stay within the block that is currently cached
			}
		}
	}

	return B_OK;
}
