xref: /haiku/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp (revision 95bac3fda53a4cb21880712d7b43f8c21db32a2e)
1 /* BlockAllocator - block bitmap handling and allocation policies
2  *
3  * Copyright 2001-2004, Axel Dörfler, axeld@pinc-software.de.
4  * This file may be used under the terms of the MIT License.
5  */
6 
7 
8 #include "Debug.h"
9 #include "BlockAllocator.h"
10 #include "Volume.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Stack.h"
14 #include "bfs_control.h"
15 
16 #include <util/kernel_cpp.h>
17 #include <stdlib.h>
18 
19 #ifdef USER
20 #	define spawn_kernel_thread spawn_thread
21 #endif
22 
23 
24 // Things the BlockAllocator should do:
25 
26 // - find a range of blocks of a certain size near a specific position
27 // - allocate a loosely specified range of blocks for pre-allocation
28 // - free blocks
29 // - know how to deal with each allocation, special handling for directories,
30 //   files, symlinks, etc. (type-sensitive allocation policies)
31 
32 // What makes the code complicated is the fact that we are not just reading
33 // in the whole bitmap and operating on it in memory - e.g. a 13 GB partition
34 // with a block size of 2048 bytes already has an 800 kB bitmap, and the size
35 // of partitions will grow even more - so that's not an option.
36 // Instead, we read in each bitmap block only when it is needed - since an allocation
37 // group can span several blocks in the block bitmap, the AllocationBlock
38 // class is there to make handling those easier.
39 
40 // The current implementation is only slightly optimized and could probably
41 // be improved a lot. Furthermore, the allocation policies used here should
42 // be tested against real-world workloads.
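
// For illustration - with the 2048-byte blocks from the example above, one
// bitmap block maps BlockSize * 8 = 16384 file system blocks, so a
// group-relative bitmap position splits up as follows (a sketch, not code
// from this file):
//
//	uint32 bitsPerBlock = 2048 << 3;				// 16384 bits per bitmap block
//	uint32 bitmapBlock = position / bitsPerBlock;	// block within the group
//	uint32 bitInBlock = position % bitsPerBlock;	// bit within that block
//
// e.g. bit 40000 of a group lies in its bitmap block 2, at bit 7232.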
43 
44 
45 struct check_cookie {
46 	check_cookie() {}
47 
48 	block_run			current;
49 	Inode				*parent;
50 	mode_t				parent_mode;
51 	Stack<block_run>	stack;
52 	TreeIterator		*iterator;
53 };
54 
55 
56 class AllocationBlock : public CachedBlock {
57 	public:
58 		AllocationBlock(Volume *volume);
59 
60 		void Allocate(uint16 start, uint16 numBlocks);
61 		void Free(uint16 start, uint16 numBlocks);
62 		inline bool IsUsed(uint16 block);
63 
64 		status_t SetTo(AllocationGroup &group, uint16 block);
65 		status_t SetToWritable(Transaction &transaction, AllocationGroup &group, uint16 block);
66 
67 		uint32 NumBlockBits() const { return fNumBits; }
68 		uint32 &Block(int32 index) { return ((uint32 *)fBlock)[index]; }
69 
70 	private:
71 		uint32	fNumBits;
72 #ifdef DEBUG
73 		bool	fWritable;
74 #endif
75 };
76 
77 
78 class AllocationGroup {
79 	public:
80 		AllocationGroup();
81 
82 		void AddFreeRange(int32 start, int32 blocks);
83 		bool IsFull() const { return fFreeBits == 0; }
84 
85 		status_t Allocate(Transaction &transaction, uint16 start, int32 length);
86 		status_t Free(Transaction &transaction, uint16 start, int32 length);
87 
88 		uint32 fNumBits;
89 		int32 fStart;
90 		int32 fFirstFree, fLargest, fLargestFirst;
91 			// ToDo: fLargest & fLargestFirst are not maintained (and therefore used) yet!
92 		int32 fFreeBits;
93 };
94 
95 
96 AllocationBlock::AllocationBlock(Volume *volume)
97 	: CachedBlock(volume)
98 {
99 }
100 
101 
102 status_t
103 AllocationBlock::SetTo(AllocationGroup &group, uint16 block)
104 {
105 	// 8 blocks per byte
106 	fNumBits = fVolume->BlockSize() << 3;
107 	// the last group may have fewer bits in its last block
108 	if ((block + 1) * fNumBits > group.fNumBits)
109 		fNumBits = group.fNumBits % fNumBits;
110 
111 #ifdef DEBUG
112 	fWritable = false;
113 #endif
114 	return CachedBlock::SetTo(group.fStart + block) != NULL ? B_OK : B_ERROR;
115 }
116 
117 
118 status_t
119 AllocationBlock::SetToWritable(Transaction &transaction, AllocationGroup &group, uint16 block)
120 {
121 	// 8 blocks per byte
122 	fNumBits = fVolume->BlockSize() << 3;
123 	// the last group may have fewer bits in its last block
124 	if ((block + 1) * fNumBits > group.fNumBits)
125 		fNumBits = group.fNumBits % fNumBits;
126 
127 #ifdef DEBUG
128 	fWritable = true;
129 #endif
130 	return CachedBlock::SetToWritable(transaction, group.fStart + block) != NULL ? B_OK : B_ERROR;
131 }
132 
133 
134 bool
135 AllocationBlock::IsUsed(uint16 block)
136 {
137 	if (block >= fNumBits)
138 		return true;
139 	// the block bitmap is accessed in 32-bit blocks
140 	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
141 }
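
// For example, IsUsed(70) inspects word 70 >> 5 == 2 with mask
// 1UL << (70 % 32) == 1UL << 6, i.e. bit 6 of the third 32-bit word
// (after converting the mask to BFS endianness).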
142 
143 
144 void
145 AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
146 {
147 	ASSERT(start < fNumBits);
148 	ASSERT(uint32(start + numBlocks) <= fNumBits);
149 #ifdef DEBUG
150 	ASSERT(fWritable);
151 #endif
152 
153 	if (uint32(start + numBlocks) > fNumBits) {
154 		FATAL(("Allocation::Allocate(): tried to allocate too many blocks: %u (numBlocks = %lu)!\n", numBlocks, fNumBits));
155 		DIE(("Allocation::Allocate(): tried to allocate too many blocks"));
156 	}
157 
158 	int32 block = start >> 5;
159 
160 	while (numBlocks > 0) {
161 		uint32 mask = 0;
162 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
163 			mask |= 1UL << (i % 32);
164 
165 #ifdef DEBUG
166 		// check for already set blocks
167 		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32 *)fBlock)[block]) {
168 			FATAL(("AllocationBlock::Allocate(): some blocks are already allocated, start = %u, numBlocks = %u\n", start, numBlocks));
169 			DEBUGGER(("blocks already set!"));
170 		}
171 #endif
172 		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
173 		start = 0;
174 	}
175 }
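
// A worked example of the mask loop above: Allocate(start = 30, numBlocks = 4)
// touches two 32-bit words of this bitmap block:
//
//	first pass:  i = 30..31  ->  mask = 0xc0000000	(bits 30 and 31)
//	second pass: i = 0..1    ->  mask = 0x00000003	(bits 0 and 1)
//
// each mask is converted to BFS endianness before it is OR'ed into the word.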
176 
177 
178 void
179 AllocationBlock::Free(uint16 start, uint16 numBlocks)
180 {
181 	ASSERT(start < fNumBits);
182 	ASSERT(uint32(start + numBlocks) <= fNumBits);
183 #ifdef DEBUG
184 	ASSERT(fWritable);
185 #endif
186 
187 	if (uint32(start + numBlocks) > fNumBits) {
188 		FATAL(("Allocation::Free(): tried to free too many blocks: %u (numBlocks = %lu)!\n", numBlocks, fNumBits));
189 		DIE(("Allocation::Free(): tried to free too many blocks"));
190 	}
191 
192 	int32 block = start >> 5;
193 
194 	while (numBlocks > 0) {
195 		uint32 mask = 0;
196 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
197 			mask |= 1UL << (i % 32);
198 
199 		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
200 		start = 0;
201 	}
202 }
203 
204 
205 //	#pragma mark -
206 
207 
208 AllocationGroup::AllocationGroup()
209 	:
210 	fFirstFree(-1),
211 	fLargest(-1),
212 	fLargestFirst(-1),
213 	fFreeBits(0)
214 {
215 }
216 
217 
218 void
219 AllocationGroup::AddFreeRange(int32 start, int32 blocks)
220 {
221 	//D(if (blocks > 512)
222 	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));
223 
224 	if (fFirstFree == -1)
225 		fFirstFree = start;
226 
227 	if (fLargest < blocks) {
228 		fLargest = blocks;
229 		fLargestFirst = start;
230 	}
231 
232 	fFreeBits += blocks;
233 }
234 
235 
236 /** Allocates the specified run in the allocation group.
237  *	Doesn't check if the run is valid or already allocated
238  *	partially, nor does it maintain the free range hints
239  *	or the volume's used blocks count.
240  *	It only does the low-level work of allocating some bits
241  *	in the block bitmap.
242  *	Assumes that the block bitmap lock is held.
243  */
244 
245 status_t
246 AllocationGroup::Allocate(Transaction &transaction, uint16 start, int32 length)
247 {
248 	if (start >= fNumBits)
249 		return B_ERROR;
250 
251 	// Update the allocation group info
252 	// ToDo: this info will be incorrect if something goes wrong later
253 	// Note that the fFirstFree block is not necessarily free itself
254 	if (start == fFirstFree)
255 		fFirstFree = start + length;
256 	fFreeBits -= length;
257 
258 	Volume *volume = transaction.GetVolume();
259 
260 	// calculate block in the block bitmap and position within
261 	uint32 bitsPerBlock = volume->BlockSize() << 3;
262 	uint32 block = start / bitsPerBlock;
263 	start = start % bitsPerBlock;
264 
265 	AllocationBlock cached(volume);
266 
267 	while (length > 0) {
268 		if (cached.SetToWritable(transaction, *this, block) < B_OK)
269 			RETURN_ERROR(B_IO_ERROR);
270 
271 		uint32 numBlocks = length;
272 		if (start + numBlocks > cached.NumBlockBits())
273 			numBlocks = cached.NumBlockBits() - start;
274 
275 		cached.Allocate(start, numBlocks);
276 
277 		length -= numBlocks;
278 		start = 0;
279 		block++;
280 	}
281 
282 	return B_OK;
283 }
284 
285 
286 /** Frees the specified run in the allocation group.
287  *	Doesn't check if the run is valid or was not completely
288  *	allocated, nor does it maintain the free ranges hints
289  *	or the volume's used blocks count.
290  *	It only does the low-level work of freeing some bits
291  *	in the block bitmap.
292  *	Assumes that the block bitmap lock is held.
293  */
294 
295 status_t
296 AllocationGroup::Free(Transaction &transaction, uint16 start, int32 length)
297 {
298 	if (start >= fNumBits)
299 		return B_ERROR;
300 
301 	// Update the allocation group info
302 	// ToDo: this info will be incorrect if something goes wrong later
303 	if (fFirstFree > start)
304 		fFirstFree = start;
305 	fFreeBits += length;
306 
307 	Volume *volume = transaction.GetVolume();
308 
309 	// calculate block in the block bitmap and position within
310 	uint32 bitsPerBlock = volume->BlockSize() << 3;
311 	uint32 block = start / bitsPerBlock;
312 	start = start % bitsPerBlock;
313 
314 	AllocationBlock cached(volume);
315 
316 	while (length > 0) {
317 		if (cached.SetToWritable(transaction, *this, block) < B_OK)
318 			RETURN_ERROR(B_IO_ERROR);
319 
320 		uint16 freeLength = length;
321 		if (uint32(start + length) > cached.NumBlockBits())
322 			freeLength = cached.NumBlockBits() - start;
323 
324 		cached.Free(start, freeLength);
325 
326 		length -= freeLength;
327 		start = 0;
328 		block++;
329 	}
330 	return B_OK;
331 }
332 
333 
334 //	#pragma mark -
335 
336 
337 BlockAllocator::BlockAllocator(Volume *volume)
338 	:
339 	fVolume(volume),
340 	fLock("bfs allocator"),
341 	fGroups(NULL),
342 	fCheckBitmap(NULL)
343 {
344 }
345 
346 
347 BlockAllocator::~BlockAllocator()
348 {
349 	delete[] fGroups;
350 }
351 
352 
353 status_t
354 BlockAllocator::Initialize(bool full)
355 {
356 	if (fLock.InitCheck() < B_OK)
357 		return B_ERROR;
358 
359 	fNumGroups = fVolume->AllocationGroups();
360 	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
361 	fGroups = new AllocationGroup[fNumGroups];
362 	if (fGroups == NULL)
363 		return B_NO_MEMORY;
364 
365 	if (!full)
366 		return B_OK;
367 
368 	fLock.Lock();
369 		// the lock will be released by the initialize() function
370 
371 	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::initialize,
372 			"bfs block allocator", B_LOW_PRIORITY, (void *)this);
373 	if (id < B_OK)
374 		return initialize(this);
375 
376 	return resume_thread(id);
377 }
378 
379 
380 status_t
381 BlockAllocator::InitializeAndClearBitmap(Transaction &transaction)
382 {
383 	status_t status = Initialize(false);
384 	if (status < B_OK)
385 		return status;
386 
387 	uint32 blocks = fBlocksPerGroup;
388 	uint32 numBits = 8 * blocks * fVolume->BlockSize();
389 	uint32 blockShift = fVolume->BlockShift();
390 
391 	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
392 	if (buffer == NULL)
393 		RETURN_ERROR(B_NO_MEMORY);
394 
395 	memset(buffer, 0, numBits >> 3);
396 
397 	off_t offset = 1;
398 
399 	// initialize the AllocationGroup objects and clear the on-disk bitmap
400 
401 	for (int32 i = 0; i < fNumGroups; i++) {
402 		if (write_pos(fVolume->Device(), offset << blockShift, buffer, blocks << blockShift) < B_OK)
403 			return B_ERROR;
404 
405 		// the last allocation group may contain fewer blocks than the others
406 		fGroups[i].fNumBits = i == fNumGroups - 1 ? fVolume->NumBlocks() - i * numBits : numBits;
407 		fGroups[i].fStart = offset;
408 		fGroups[i].fFirstFree = fGroups[i].fLargestFirst = 0;
409 		fGroups[i].fFreeBits = fGroups[i].fLargest = fGroups[i].fNumBits;
410 
411 		offset += blocks;
412 	}
413 	free(buffer);
414 
415 	// reserve the boot block, the log area, and the block bitmap itself
416 	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();
417 
418 	if (fGroups[0].Allocate(transaction, 0, reservedBlocks) < B_OK) {
419 		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
420 		return B_ERROR;
421 	}
422 	fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);
423 
424 	return B_OK;
425 }
426 
427 
428 status_t
429 BlockAllocator::initialize(BlockAllocator *allocator)
430 {
431 	// The lock must already be held at this point
432 
433 	Volume *volume = allocator->fVolume;
434 	uint32 blocks = allocator->fBlocksPerGroup;
435 	uint32 numBits = 8 * blocks * volume->BlockSize();
436 	uint32 blockShift = volume->BlockShift();
437 	off_t freeBlocks = 0;
438 
439 	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
440 	if (buffer == NULL) {
441 		allocator->fLock.Unlock();
442 		RETURN_ERROR(B_NO_MEMORY);
443 	}
444 
445 	AllocationGroup *groups = allocator->fGroups;
446 	off_t offset = 1;
447 	int32 num = allocator->fNumGroups;
448 
449 	for (int32 i = 0; i < num; i++) {
450 		if (read_pos(volume->Device(), offset << blockShift, buffer, blocks << blockShift) < B_OK)
451 			break;
452 
453 		// the last allocation group may contain fewer blocks than the others
454 		groups[i].fNumBits = i == num - 1 ? volume->NumBlocks() - i * numBits : numBits;
455 		groups[i].fStart = offset;
456 
457 		// finds all free ranges in this allocation group
458 		int32 start = -1, range = 0;
459 		int32 size = groups[i].fNumBits, bit = 0;
460 
461 		for (int32 k = 0; k < ((size + 31) >> 5); k++) {	// 32 bits per uint32, round up
462 			for (int32 j = 0; j < 32 && bit < size; j++, bit++) {
463 				if (buffer[k] & (1UL << j)) {
464 					if (range > 0) {
465 						groups[i].AddFreeRange(start, range);
466 						range = 0;
467 					}
468 				} else if (range++ == 0)
469 					start = bit;
470 			}
471 		}
472 		if (range > 0)
473 			groups[i].AddFreeRange(start, range);
474 
475 		freeBlocks += groups[i].fFreeBits;
476 
477 		offset += blocks;
478 	}
479 	free(buffer);
480 
481 	// check if block bitmap and log area are reserved
482 	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
483 	if (allocator->CheckBlockRun(block_run::Run(0, 0, reservedBlocks)) < B_OK) {
484 		Transaction transaction(volume, 0);
485 		if (groups[0].Allocate(transaction, 0, reservedBlocks) < B_OK) {
486 			FATAL(("could not allocate reserved space for block bitmap/log!\n"));
487 			volume->Panic();
488 		} else {
489 			FATAL(("space for block bitmap or log area was not reserved!\n"));
490 
491 			transaction.Done();
492 		}
493 	}
494 
495 	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
496 	if (volume->UsedBlocks() != usedBlocks) {
497 		// If the disk was in a dirty state at mount time, it's
498 		// normal for the values not to match
499 		INFORM(("volume reports %Ld used blocks, correct is %Ld\n", volume->UsedBlocks(), usedBlocks));
500 		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
501 	}
502 
503 	allocator->fLock.Unlock();
504 	return B_OK;
505 }
506 
507 
508 status_t
509 BlockAllocator::AllocateBlocks(Transaction &transaction, int32 group, uint16 start,
510 	uint16 maximum, uint16 minimum, block_run &run)
511 {
512 	if (maximum == 0)
513 		return B_BAD_VALUE;
514 
515 	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n", group, start, maximum, minimum));
516 
517 	AllocationBlock cached(fVolume);
518 	Locker lock(fLock);
519 
520 	// the first scan through all allocation groups will look for the
521 	// wanted maximum of blocks, the second scan will just look to
522 	// satisfy the minimal requirement
523 	uint16 numBlocks = maximum;
524 
525 	for (int32 i = 0; i < fNumGroups * 2; i++, group++, start = 0) {
526 		group = group % fNumGroups;
527 
528 		if (start >= fGroups[group].fNumBits || fGroups[group].IsFull())
529 			continue;
530 
531 		if (i >= fNumGroups) {
532 			// if the minimum is the same as the maximum, it's not necessary to
533 			// search the allocation groups a second time
534 			if (maximum == minimum)
535 				return B_DEVICE_FULL;
536 
537 			numBlocks = minimum;
538 		}
539 
540 		// Skip this group if its largest free range is already smaller
541 		// than the number of blocks we are looking for
542 		// ToDo: disabled because fLargest is currently not maintained after the first allocation
543 		//if (numBlocks > fGroups[group].fLargest)
544 		//	continue;
545 
546 		if (start < fGroups[group].fFirstFree)
547 			start = fGroups[group].fFirstFree;
548 
549 		// there may be more than one block per allocation group - and
550 		// we iterate through them to find a place for the allocation.
551 		// (one allocation can't exceed one allocation group)
552 
553 		uint32 bitsPerBlock = fVolume->BlockSize() << 3, block = start / bitsPerBlock;
554 		int32 range = 0, rangeStart = 0;
555 
556 		for (; block < fBlocksPerGroup; block++) {
557 			if (cached.SetTo(fGroups[group], block) < B_OK)
558 				RETURN_ERROR(B_ERROR);
559 
560 			// find a block large enough to hold the allocation
561 			for (uint32 bit = start % bitsPerBlock; bit < cached.NumBlockBits(); bit++) {
562 				if (!cached.IsUsed(bit)) {
563 					if (range == 0) {
564 						// start new range
565 						rangeStart = block * bitsPerBlock + bit;	// NumBlockBits() may be smaller in the last bitmap block
566 					}
567 
568 					// have we found a range large enough to hold numBlocks?
569 					if (++range >= maximum)
570 						break;
571 				} else if (i >= fNumGroups && range >= minimum) {
572 					// we have found a range that satisfies the required minimum (second pass)
573 					break;
574 				} else {
575 					// end of a range
576 					range = 0;
577 				}
578 			}
579 
580 			// if we found a suitable range, mark the blocks as in use, and write
581 			// the updated block bitmap back to disk
582 			if (range >= numBlocks) {
583 				// adjust allocation size
584 				if (numBlocks < maximum)
585 					numBlocks = range;
586 
587 				if (fGroups[group].Allocate(transaction, rangeStart, numBlocks) < B_OK)
588 					RETURN_ERROR(B_IO_ERROR);
589 
590 				run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(group);
591 				run.start = HOST_ENDIAN_TO_BFS_INT16(rangeStart);
592 				run.length = HOST_ENDIAN_TO_BFS_INT16(numBlocks);
593 
594 				fVolume->SuperBlock().used_blocks =
595 					HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + numBlocks);
596 					// We are not writing back the disk's super block - it's
597 					// either done by the journaling code, or when the disk
598 					// is unmounted.
599 					// If the value is not correct at mount time, it will be
600 					// fixed anyway.
601 
602 				return B_OK;
603 			}
604 
605 			// start from the beginning of the next block
606 			start = 0;
607 		}
608 	}
609 	return B_DEVICE_FULL;
610 }
611 
612 
613 status_t
614 BlockAllocator::AllocateForInode(Transaction &transaction, const block_run *parent,
615 	mode_t type, block_run &run)
616 {
617 	// apply some allocation policies here (AllocateBlocks() will break them
618 	// if necessary) - we will start with those described in Dominic Giampaolo's
619 	// "Practical File System Design", and see how good they work
620 
621 	// files go into the same allocation group as their parent, sub-directories
622 	// are placed 8 allocation groups after their parent's group
623 	uint16 group = parent->AllocationGroup();
624 	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
625 		group += 8;
626 
627 	return AllocateBlocks(transaction, group, 0, 1, 1, run);
628 }
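
// For example, an inode in allocation group 5 would get its file data placed
// in group 5 as well, while a new sub-directory would start out in group 13 -
// AllocateBlocks() wraps around and falls back to other groups if the
// preferred one is full.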
629 
630 
631 status_t
632 BlockAllocator::Allocate(Transaction &transaction, Inode *inode, off_t numBlocks,
633 	block_run &run, uint16 minimum)
634 {
635 	if (numBlocks <= 0)
636 		return B_ERROR;
637 
638 	// one block_run can't hold more data than there is in one allocation group
639 	if (numBlocks > fGroups[0].fNumBits)
640 		numBlocks = fGroups[0].fNumBits;
641 
642 	// since block_run.length is uint16, the largest number of blocks that
643 	// can be covered by a block_run is 65535
644 	// ToDo: if we drop compatibility, couldn't we do this any better?
645 	// There are basically three possibilities:
646 	// a) since a length of zero doesn't have any sense, take that for 65536 -
647 	//    but that could cause many problems (bugs) in other areas
648 	// b) reduce the maximum amount of blocks per block_run, so that the remaining
649 	//    number of free blocks can be used in a useful manner (like 4 blocks) -
650 	//    but that would also reduce the maximum file size
651 	// c) have BlockRun::Length() return (length + 1).
652 	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
653 		numBlocks = MAX_BLOCK_RUN_LENGTH;
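
	// To put this limit into perspective: with 1024-byte blocks, a single
	// block_run can cover at most 65535 blocks, i.e. roughly 64 MB of data;
	// with 8192-byte blocks it is about 512 MB.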
654 
655 	// apply some allocation policies here (AllocateBlocks() will break them
656 	// if necessary)
657 	uint16 group = inode->BlockRun().AllocationGroup();
658 	uint16 start = 0;
659 
660 	// are there already allocated blocks? (then just try to allocate near the last one)
661 	if (inode->Size() > 0) {
662 		const data_stream &data = inode->Node().data;
663 		// ToDo: we currently don't handle the case where the data stream
664 		// has already grown into the indirect ranges
665 		if (data.max_double_indirect_range == 0
666 			&& data.max_indirect_range == 0) {
667 			// Since size > 0, there must be a valid block run in this stream
668 			int32 last = 0;
669 			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
670 				if (data.direct[last + 1].IsZero())
671 					break;
672 
673 			group = data.direct[last].AllocationGroup();
674 			start = data.direct[last].Start() + data.direct[last].Length();
675 		}
676 	} else if (inode->IsContainer() || inode->IsSymLink()) {
677 		// directory and symbolic link data will go into the same allocation
678 		// group as the inode, but located behind it
679 		start = inode->BlockRun().Start();
680 	} else {
681 		// file data will start in the next allocation group
682 		group = inode->BlockRun().AllocationGroup() + 1;
683 	}
684 
685 	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
686 }
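
/* A hypothetical caller (not part of this file) appending blocks to an
 * existing file - "allocator", "transaction", and "inode" are assumed to
 * be in scope:
 *
 *	block_run run;
 *	status_t status = allocator->Allocate(transaction, inode, 16, run, 1);
 *	if (status == B_OK) {
 *		// "run" now describes 1 to 16 newly allocated blocks placed near
 *		// the inode's last direct range; the superblock's used block
 *		// count has already been updated.
 *	}
 */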
687 
688 
689 status_t
690 BlockAllocator::Free(Transaction &transaction, block_run run)
691 {
692 	Locker lock(fLock);
693 
694 	int32 group = run.AllocationGroup();
695 	uint16 start = run.Start();
696 	uint16 length = run.Length();
697 
698 	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start, length));
699 
700 	// We don't use Volume::IsValidBlockRun() here, because this method can check
701 	// better against the group size (the last group may have a different length)
702 	if (group < 0 || group >= fNumGroups
703 		|| start > fGroups[group].fNumBits
704 		|| uint32(start + length) > fGroups[group].fNumBits
705 		|| length == 0) {
706 		FATAL(("tried to free an invalid block_run (%ld, %u, %u)\n", group, start, length));
707 		DEBUGGER(("tried to free invalid block_run"));
708 		return B_BAD_VALUE;
709 	}
710 	// check if someone tries to free reserved areas at the beginning of the drive
711 	if (group == 0 && start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
712 		FATAL(("tried to free a reserved block_run (%ld, %u, %u)\n", group, start, length));
713 		DEBUGGER(("tried to free reserved block"));
714 		return B_BAD_VALUE;
715 	}
716 #ifdef DEBUG
717 	if (CheckBlockRun(run) < B_OK)
718 		return B_BAD_DATA;
719 #endif
720 
721 	if (fGroups[group].Free(transaction, start, length) < B_OK)
722 		RETURN_ERROR(B_IO_ERROR);
723 
724 #ifdef DEBUG
725 	if (CheckBlockRun(run, NULL, NULL, false) < B_OK)
726 		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just freed)\n"));
727 #endif
728 
729 	fVolume->SuperBlock().used_blocks =
730 		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
731 	return B_OK;
732 }
733 
734 
735 size_t
736 BlockAllocator::BitmapSize() const
737 {
738 	return fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
739 }
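
// Following the 13 GB example from the top of this file (2048-byte blocks,
// hence 6,815,744 blocks to map): assuming one bitmap block per allocation
// group, that makes 416 groups, and BitmapSize() yields
// 2048 * 416 * 1 = 851,968 bytes - the ~800 kB figure mentioned above.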
740 
741 
742 //	#pragma mark -
743 //	Functions to check the validity of the bitmap - they are used by
744 //	the "chkbfs" command
745 
746 
747 bool
748 BlockAllocator::IsValidCheckControl(check_control *control)
749 {
750 	if (control == NULL
751 		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
752 		FATAL(("invalid check_control (%p)!\n", control));
753 		return false;
754 	}
755 
756 	return true;
757 }
758 
759 
760 status_t
761 BlockAllocator::StartChecking(check_control *control)
762 {
763 	if (!IsValidCheckControl(control))
764 		return B_BAD_VALUE;
765 
766 	status_t status = fLock.Lock();
767 	if (status < B_OK)
768 		return status;
769 
770 	size_t size = BitmapSize();
771 	fCheckBitmap = (uint32 *)malloc(size);
772 	if (fCheckBitmap == NULL) {
773 		fLock.Unlock();
774 		return B_NO_MEMORY;
775 	}
776 
777 	check_cookie *cookie = new check_cookie();
778 	if (cookie == NULL) {
779 		free(fCheckBitmap);
780 		fCheckBitmap = NULL;
781 		fLock.Unlock();
782 
783 		return B_NO_MEMORY;
784 	}
785 
786 	// initialize bitmap
787 	memset(fCheckBitmap, 0, size);
788 	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length(); block-- > 0;)
789 		SetCheckBitmapAt(block);
790 
791 	cookie->stack.Push(fVolume->Root());
792 	cookie->stack.Push(fVolume->Indices());
793 	cookie->iterator = NULL;
794 	control->cookie = cookie;
795 
796 	fCheckCookie = cookie;
797 		// to be able to restore nicely if "chkbfs" exited abnormally
798 
799 	// ToDo: check reserved area in bitmap!
800 
801 	return B_OK;
802 }
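
/* A sketch of how "chkbfs" presumably drives this interface (the ioctl
 * plumbing is omitted and the names are illustrative only):
 *
 *	check_control control;
 *	memset(&control, 0, sizeof(control));
 *	control.magic = BFS_IOCTL_CHECK_MAGIC;
 *	control.flags = BFS_FIX_BITMAP_ERRORS;
 *
 *	allocator->StartChecking(&control);
 *	while (allocator->CheckNextNode(&control) == B_OK)
 *		;	// inspect control.name, control.errors, control.status here
 *	allocator->StopChecking(&control);	// writes fixes back if requested
 */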
803 
804 
805 status_t
806 BlockAllocator::StopChecking(check_control *control)
807 {
808 	check_cookie *cookie;
809 	if (control == NULL)
810 		cookie = fCheckCookie;
811 	else
812 		cookie = (check_cookie *)control->cookie;
813 
814 	if (cookie == NULL)
815 		return B_ERROR;
816 
817 	if (cookie->iterator != NULL) {
818 		delete cookie->iterator;
819 		cookie->iterator = NULL;
820 
821 		// the current directory inode is still locked in memory
822 		put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));
823 	}
824 
825 	// if CheckNextNode() was able to run through completely, we can
826 	// fix any damage to the bitmap
827 	if (control != NULL && control->status == B_ENTRY_NOT_FOUND) {
828 		// calculate the number of used blocks in the check bitmap
829 		size_t size = BitmapSize();
830 		off_t usedBlocks = 0LL;
831 
832 		// ToDo: update the allocation groups used blocks info
833 		for (uint32 i = size >> 2; i-- > 0;) {
834 			uint32 compare = 1;
835 			for (int16 j = 0; j < 32; j++, compare <<= 1) {
836 				if (compare & fCheckBitmap[i])
837 					usedBlocks++;
838 			}
839 		}
840 
841 		control->stats.freed = fVolume->UsedBlocks() - usedBlocks + control->stats.missing;
842 		if (control->stats.freed < 0)
843 			control->stats.freed = 0;
844 
845 		// Should we fix errors? Were there any errors we can fix?
846 		if (control->flags & BFS_FIX_BITMAP_ERRORS
847 			&& (control->stats.freed != 0 || control->stats.missing != 0)) {
848 			// if so, write the check bitmap back over the original one,
849 			// and use transactions here to play safe - we even use several
850 			// transactions, so that we don't blow the maximum log size
851 			// on large disks - we don't need this to be atomic anyway
852 			fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
853 
854 			int32 blocksInBitmap = fNumGroups * fBlocksPerGroup;
855 			int32 blockSize = fVolume->BlockSize();
856 
857 			for (int32 i = 0; i < blocksInBitmap; i += 512) {
858 				Transaction transaction(fVolume, 1 + i);
859 
860 				int32 blocksToWrite = 512;
861 				if (blocksToWrite + i > blocksInBitmap)
862 					blocksToWrite = blocksInBitmap - i;
863 
864 				status_t status = transaction.WriteBlocks(1 + i,
865 					(uint8 *)fCheckBitmap + i * blockSize, blocksToWrite);
866 				if (status < B_OK) {
867 					FATAL(("error writing bitmap: %s\n", strerror(status)));
868 					break;
869 				}
870 				transaction.Done();
871 			}
872 		}
873 	} else
874 		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));
875 
876 	free(fCheckBitmap);
877 	fCheckBitmap = NULL;
878 	fCheckCookie = NULL;
879 	delete cookie;
880 	fLock.Unlock();
881 
882 	return B_OK;
883 }
884 
885 
886 status_t
887 BlockAllocator::CheckNextNode(check_control *control)
888 {
889 	if (!IsValidCheckControl(control))
890 		return B_BAD_VALUE;
891 
892 	check_cookie *cookie = (check_cookie *)control->cookie;
893 
894 	while (true) {
895 		if (cookie->iterator == NULL) {
896 			if (!cookie->stack.Pop(&cookie->current)) {
897 				// no more runs on the stack, we are obviously finished!
898 				control->status = B_ENTRY_NOT_FOUND;
899 				return B_ENTRY_NOT_FOUND;
900 			}
901 
902 			// get iterator for the next directory
903 
904 #ifdef UNSAFE_GET_VNODE
905 			RecursiveLocker locker(fVolume->Lock());
906 #endif
907 			Vnode vnode(fVolume, cookie->current);
908 			Inode *inode;
909 			if (vnode.Get(&inode) < B_OK) {
910 				FATAL(("check: Could not open inode at %Ld\n", fVolume->ToBlock(cookie->current)));
911 				continue;
912 			}
913 
914 			if (!inode->IsContainer()) {
915 				FATAL(("check: inode at %Ld should have been a directory\n", fVolume->ToBlock(cookie->current)));
916 				continue;
917 			}
918 
919 			BPlusTree *tree;
920 			if (inode->GetTree(&tree) != B_OK) {
921 				FATAL(("check: could not open b+tree from inode at %Ld\n", fVolume->ToBlock(cookie->current)));
922 				continue;
923 			}
924 
925 			cookie->parent = inode;
926 			cookie->parent_mode = inode->Mode();
927 
928 			cookie->iterator = new TreeIterator(tree);
929 			if (cookie->iterator == NULL)
930 				RETURN_ERROR(B_NO_MEMORY);
931 
932 			// the inode must stay locked in memory until the iterator is freed
933 			vnode.Keep();
934 
935 			// check the inode of the directory
936 			control->errors = 0;
937 			control->status = CheckInode(inode, control);
938 
939 			if (inode->GetName(control->name) < B_OK)
940 				strcpy(control->name, "(node has no name)");
941 
942 			control->inode = inode->ID();
943 			control->mode = inode->Mode();
944 
945 			return B_OK;
946 		}
947 
948 		char name[B_FILE_NAME_LENGTH];
949 		uint16 length;
950 		vnode_id id;
951 
952 		status_t status = cookie->iterator->GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id);
953 		if (status == B_ENTRY_NOT_FOUND) {
954 			// there are no more entries in this iterator, free it and go on
955 			delete cookie->iterator;
956 			cookie->iterator = NULL;
957 
958 			// unlock the directory's inode from memory
959 			put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));
960 
961 			continue;
962 		} else if (status == B_OK) {
963 			// ignore "." and ".." entries
964 			if (!strcmp(name, ".") || !strcmp(name, ".."))
965 				continue;
966 
967 			// fill in the control data as soon as we have them
968 			strlcpy(control->name, name, B_FILE_NAME_LENGTH);
969 			control->inode = id;
970 			control->errors = 0;
971 
972 			Vnode vnode(fVolume, id);
973 			Inode *inode;
974 			if (vnode.Get(&inode) < B_OK) {
975 				FATAL(("Could not open inode ID %Ld!\n", id));
976 				control->errors |= BFS_COULD_NOT_OPEN;
977 				control->status = B_ERROR;
978 				return B_OK;
979 			}
980 
981 			// check if the inode's name is the same as in the b+tree
982 			if (inode->IsRegularNode()) {
983 				SimpleLocker locker(inode->SmallDataLock());
984 				NodeGetter node(fVolume, inode);
985 
986 				const char *localName = inode->Name(node.Node());
987 				if (localName == NULL || strcmp(localName, name)) {
988 					control->errors |= BFS_NAMES_DONT_MATCH;
989 					FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name, localName));
990 				}
991 			}
992 
993 			control->mode = inode->Mode();
994 
995 			// Check for the correct mode of the node (if the mode of the
996 			// file doesn't fit its parent, there is a serious problem)
997 			if ((cookie->parent_mode & S_ATTR_DIR && !inode->IsAttribute())
998 				|| (cookie->parent_mode & S_INDEX_DIR && !inode->IsIndex())
999 				|| ((cookie->parent_mode & (S_DIRECTORY | S_ATTR_DIR | S_INDEX_DIR)) == S_DIRECTORY
1000 					&& inode->Mode() & (S_ATTR | S_ATTR_DIR | S_INDEX_DIR))) {
1001 				FATAL(("inode at %Ld is of wrong type: %o (parent %o at %Ld)!\n",
1002 					inode->BlockNumber(), inode->Mode(), cookie->parent_mode, cookie->parent->BlockNumber()));
1003 
1004 				// if we are allowed to fix errors, we should remove the file
1005 				if (control->flags & BFS_REMOVE_WRONG_TYPES
1006 					&& control->flags & BFS_FIX_BITMAP_ERRORS) {
1007 #ifdef UNSAFE_GET_VNODE
1008 					RecursiveLocker locker(fVolume->Lock());
1009 #endif
1010 					// it's safe to start a transaction, because Inode::Remove()
1011 					// won't touch the block bitmap (which we hold the lock for)
1012 					// if we set the INODE_DONT_FREE_SPACE flag - since we fix
1013 					// the bitmap anyway
1014 					Transaction transaction(fVolume, cookie->parent->BlockNumber());
1015 
1016 					inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_DONT_FREE_SPACE);
1017 					status = cookie->parent->Remove(transaction, name, NULL, inode->IsContainer());
1018 					if (status == B_OK)
1019 						transaction.Done();
1020 				} else
1021 					status = B_ERROR;
1022 
1023 				control->errors |= BFS_WRONG_TYPE;
1024 				control->status = status;
1025 				return B_OK;
1026 			}
1027 
1028 			// If the inode has an attribute directory, push it on the stack
1029 			if (!inode->Attributes().IsZero())
1030 				cookie->stack.Push(inode->Attributes());
1031 
1032 			// push the directory on the stack so that it will be scanned later
1033 			if (inode->IsContainer() && !inode->IsIndex())
1034 				cookie->stack.Push(inode->BlockRun());
1035 			else {
1036 				// check it now
1037 				control->status = CheckInode(inode, control);
1038 
1039 				return B_OK;
1040 			}
1041 		}
1042 	}
1043 	// is never reached
1044 }
1045 
1046 
1047 bool
1048 BlockAllocator::CheckBitmapIsUsedAt(off_t block) const
1049 {
1050 	size_t size = BitmapSize();
1051 	uint32 index = block / 32;	// 32bit resolution
1052 	if (index >= size / 4)
1053 		return false;
1054 
1055 	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index]) & (1UL << (block & 0x1f));
1056 }
1057 
1058 
1059 void
1060 BlockAllocator::SetCheckBitmapAt(off_t block)
1061 {
1062 	size_t size = BitmapSize();
1063 	uint32 index = block / 32;	// 32bit resolution
1064 	if (index >= size / 4)
1065 		return;
1066 
1067 	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
1068 }
1069 
1070 
1071 status_t
1072 BlockAllocator::CheckBlockRun(block_run run, const char *type, check_control *control, bool allocated)
1073 {
1074 	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
1075 		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
1076 		|| uint32(run.Start() + run.Length()) > fGroups[run.AllocationGroup()].fNumBits
1077 		|| run.length == 0) {
1078 		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type, run.AllocationGroup(), run.Start(), run.Length()));
1079 		if (control == NULL)
1080 			return B_BAD_DATA;
1081 
1082 		control->errors |= BFS_INVALID_BLOCK_RUN;
1083 		return B_OK;
1084 	}
1085 
1086 	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
1087 	uint32 block = run.Start() / bitsPerBlock;
1088 	uint32 pos = run.Start() % bitsPerBlock;
1089 	int32 length = 0;
1090 	off_t firstMissing = -1, firstSet = -1;
1091 	off_t firstGroupBlock = (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();
1092 
1093 	AllocationBlock cached(fVolume);
1094 
1095 	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
1096 		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
1097 			RETURN_ERROR(B_IO_ERROR);
1098 
1099 		if (pos >= cached.NumBlockBits()) {
1100 			// something very strange has happened...
1101 			RETURN_ERROR(B_ERROR);
1102 		}
1103 
1104 		while (length < run.Length() && pos < cached.NumBlockBits()) {
1105 			if (cached.IsUsed(pos) != allocated) {
1106 				if (control == NULL) {
1107 					PRINT(("%s: block_run(%ld, %u, %u) is only partially allocated (pos = %ld, length = %ld)!\n",
1108 						type, run.AllocationGroup(), run.Start(), run.Length(), pos, length));
1109 					return B_BAD_DATA;
1110 				}
1111 				if (firstMissing == -1) {
1112 					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
1113 					control->errors |= BFS_MISSING_BLOCKS;
1114 				}
1115 				control->stats.missing++;
1116 			} else if (firstMissing != -1) {
1117 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are %sallocated!\n",
1118 					type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing,
1119 					firstGroupBlock + pos + block * bitsPerBlock - 1, allocated ? "not " : ""));
1120 				firstMissing = -1;
1121 			}
1122 
1123 			if (fCheckBitmap != NULL) {
1124 				// Set the block in the check bitmap as well, but first have a look
1125 				// whether it is already set there
1126 				uint32 offset = pos + block * bitsPerBlock;
1127 				if (CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
1128 					if (firstSet == -1) {
1129 						firstSet = firstGroupBlock + offset;
1130 						control->errors |= BFS_BLOCKS_ALREADY_SET;
1131 					}
1132 					control->stats.already_set++;
1133 				} else {
1134 					if (firstSet != -1) {
1135 						FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n",
1136 							type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + offset - 1));
1137 						firstSet = -1;
1138 					}
1139 					SetCheckBitmapAt(firstGroupBlock + offset);
1140 				}
1141 			}
1142 			length++;
1143 			pos++;
1144 		}
1145 
1146 		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
1147 			if (firstMissing != -1)
1148 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not allocated!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing, firstGroupBlock + pos + block * bitsPerBlock - 1));
1149 			if (firstSet != -1)
1150 				FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + pos + block * bitsPerBlock - 1));
1151 		}
1152 	}
1153 
1154 	return B_OK;
1155 }
1156 
1157 
1158 status_t
1159 BlockAllocator::CheckInode(Inode *inode, check_control *control)
1160 {
1161 	if (control != NULL && fCheckBitmap == NULL)
1162 		return B_NO_INIT;
1163 	if (inode == NULL)
1164 		return B_BAD_VALUE;
1165 
1166 	status_t status = CheckBlockRun(inode->BlockRun(), "inode", control);
1167 	if (status < B_OK)
1168 		return status;
1169 
1170 	data_stream *data = &inode->Node().data;
1171 
1172 	// check the direct range
1173 
1174 	if (data->max_direct_range) {
1175 		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1176 			if (data->direct[i].IsZero())
1177 				break;
1178 
1179 			status = CheckBlockRun(data->direct[i], "direct", control);
1180 			if (status < B_OK)
1181 				return status;
1182 		}
1183 	}
1184 
1185 	CachedBlock cached(fVolume);
1186 
1187 	// check the indirect range
1188 
1189 	if (data->max_indirect_range) {
1190 		status = CheckBlockRun(data->indirect, "indirect", control);
1191 		if (status < B_OK)
1192 			return status;
1193 
1194 		off_t block = fVolume->ToBlock(data->indirect);
1195 
1196 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1197 			block_run *runs = (block_run *)cached.SetTo(block + i);
1198 			if (runs == NULL)
1199 				RETURN_ERROR(B_IO_ERROR);
1200 
1201 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1202 			int32 index = 0;
1203 			for (; index < runsPerBlock; index++) {
1204 				if (runs[index].IsZero())
1205 					break;
1206 
1207 				status = CheckBlockRun(runs[index], "indirect->run", control);
1208 				if (status < B_OK)
1209 					return status;
1210 			}
1211 			if (index < runsPerBlock)
1212 				break;
1213 		}
1214 	}
1215 
1216 	// check the double indirect range
1217 
1218 	if (data->max_double_indirect_range) {
1219 		status = CheckBlockRun(data->double_indirect, "double indirect", control);
1220 		if (status < B_OK)
1221 			return status;
1222 
1223 		int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1224 		int32 runsPerArray = runsPerBlock << ARRAY_BLOCKS_SHIFT;
1225 
1226 		CachedBlock cachedDirect(fVolume);
1227 		int32 maxIndirectIndex = (data->double_indirect.Length() << fVolume->BlockShift())
1228 									/ sizeof(block_run);
1229 
1230 		for (int32 indirectIndex = 0; indirectIndex < maxIndirectIndex; indirectIndex++) {
1231 			// get the indirect array block
1232 			block_run *array = (block_run *)cached.SetTo(fVolume->ToBlock(data->double_indirect)
1233 				+ indirectIndex / runsPerBlock);
1234 			if (array == NULL)
1235 				return B_IO_ERROR;
1236 
1237 			block_run indirect = array[indirectIndex % runsPerBlock];
1238 			// are we finished yet?
1239 			if (indirect.IsZero())
1240 				return B_OK;
1241 
1242 			status = CheckBlockRun(indirect, "double indirect->runs", control);
1243 			if (status < B_OK)
1244 				return status;
1245 
1246 			int32 maxIndex = (indirect.Length() << fVolume->BlockShift()) / sizeof(block_run);
1247 
1248 			for (int32 index = 0; index < maxIndex; ) {
1249 				block_run *runs = (block_run *)cachedDirect.SetTo(fVolume->ToBlock(indirect)
1250 					+ index / runsPerBlock);
1251 				if (runs == NULL)
1252 					return B_IO_ERROR;
1253 
1254 				do {
1255 					// are we finished yet?
1256 					if (runs[index % runsPerBlock].IsZero())
1257 						return B_OK;
1258 
1259 					status = CheckBlockRun(runs[index % runsPerBlock], "double indirect->runs->run", control);
1260 					if (status < B_OK)
1261 						return status;
1262 				} while ((++index % runsPerBlock) != 0);	// advance to the next cached block
1263 			}
1264 		}
1265 	}
1266 
1267 	return B_OK;
1268 }
1269 
1270