xref: /haiku/src/add-ons/kernel/file_systems/bfs/BlockAllocator.cpp (revision 9eb55bc1d104b8fda80898f8b25c94d8000c8255)
1 /* BlockAllocator - block bitmap handling and allocation policies
2 **
3 ** Initial version by Axel Dörfler, axeld@pinc-software.de
4 ** This file may be used under the terms of the OpenBeOS License.
5 */
6 
7 
8 #include "Debug.h"
9 #include "BlockAllocator.h"
10 #include "Volume.h"
11 #include "Inode.h"
12 #include "BPlusTree.h"
13 #include "Stack.h"
14 #include "bfs_control.h"
15 
16 #include <kernel_cpp.h>
17 #include <stdlib.h>
18 
19 #ifdef USER
20 #	define spawn_kernel_thread spawn_thread
21 #endif
22 
23 
24 // Things the BlockAllocator should do:
25 
26 // - find a range of blocks of a certain size near a specific position
27 // - allocate a fuzzy (approximate) range of blocks for pre-allocation
28 // - free blocks
29 // - know how to deal with each allocation, special handling for directories,
30 //   files, symlinks, etc. (type sensitive allocation policies)
31 
32 // What makes the code complicated is the fact that we do not just read in
33 // the whole bitmap and operate on it in memory - e.g. a 13 GB partition
34 // with a block size of 2048 bytes already has an 800 kB bitmap, and the size
35 // of partitions will grow even more - so that's not an option.
36 // Instead, we read in each bitmap block as it is used - since an allocation
37 // group can span several blocks in the block bitmap, the AllocationBlock
38 // class is there to make handling those easier.
39 
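// As a worked example of the numbers above (an illustration added for
// clarity, using assumed-typical values): a 13 GB partition divided into
// 2048-byte blocks has 13 * 2^30 / 2048 = 6,815,744 blocks; at one bit per
// block, the bitmap alone occupies
//
//	off_t numBlocks = (off_t)13 << 30 >> 11;	// 6,815,744 blocks
//	off_t bitmapBytes = numBlocks >> 3;			// 851,968 bytes, ~832 KiB
//
// which is clearly too much to keep in memory at all times.
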
40 // The current implementation is only slightly optimized and could probably
41 // be improved a lot. Furthermore, the allocation policies used here should
42 // get some real-world testing.
43 
44 
45 struct check_cookie {
46 	check_cookie() {}
47 
48 	block_run			current;
49 	Inode				*parent;
50 	mode_t				parent_mode;
51 	Stack<block_run>	stack;
52 	TreeIterator		*iterator;
53 };
54 
55 
56 class AllocationBlock : public CachedBlock {
57 	public:
58 		AllocationBlock(Volume *volume);
59 
60 		void Allocate(uint16 start, uint16 numBlocks);
61 		void Free(uint16 start, uint16 numBlocks);
62 		inline bool IsUsed(uint16 block);
63 
64 		status_t SetTo(AllocationGroup &group, uint16 block);
65 
66 		uint32 NumBlockBits() const { return fNumBits; }
67 		uint32 &Block(int32 index) { return ((uint32 *)fBlock)[index]; }
68 
69 	private:
70 		uint32 fNumBits;
71 };
72 
73 
74 class AllocationGroup {
75 	public:
76 		AllocationGroup();
77 
78 		void AddFreeRange(int32 start, int32 blocks);
79 		bool IsFull() const { return fFreeBits == 0; }
80 
81 		status_t Allocate(Transaction *transaction, uint16 start, int32 length);
82 		status_t Free(Transaction *transaction, uint16 start, int32 length);
83 
84 		uint32 fNumBits;
85 		int32 fStart;
86 		int32 fFirstFree, fLargest, fLargestFirst;
87 			// ToDo: fLargest & fLargestFirst are not maintained (and therefore used) yet!
88 		int32 fFreeBits;
89 };
90 
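// A small sketch of the on-disk relationship these classes manage (added
// for illustration; the numbers are made up): with 2048-byte blocks and 4
// bitmap blocks per allocation group, every group describes
// 4 * 2048 * 8 = 65536 disk blocks, and the bitmap blocks of group i start
// right after the super block, at disk block 1 + i * 4:
//
//	uint32 bitsPerBlock = 2048 << 3;			// 16384 bits per bitmap block
//	uint32 bitsPerGroup = 4 * bitsPerBlock;		// 65536 disk blocks covered
//	off_t groupBitmapStart = 1 + (off_t)i * 4;	// AllocationGroup::fStart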
91 
92 AllocationBlock::AllocationBlock(Volume *volume)
93 	: CachedBlock(volume)
94 {
95 }
96 
97 
98 status_t
99 AllocationBlock::SetTo(AllocationGroup &group, uint16 block)
100 {
101 	// 8 blocks per byte
102 	fNumBits = fVolume->BlockSize() << 3;
103 	// the last group may have fewer bits in its last block
104 	if ((block + 1) * fNumBits > group.fNumBits)
105 		fNumBits = group.fNumBits % fNumBits;
106 
107 	return CachedBlock::SetTo(group.fStart + block) != NULL ? B_OK : B_ERROR;
108 }
109 
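// An example of SetTo()'s fNumBits computation above (illustration only):
// with 2048-byte blocks, fNumBits starts out as 16384; if the (last) group
// only contains 20000 bits, block 0 keeps all 16384 bits, while for
// block 1 the test (1 + 1) * 16384 > 20000 triggers, and fNumBits becomes
// 20000 % 16384 = 3616.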
110 
111 bool
112 AllocationBlock::IsUsed(uint16 block)
113 {
114 	if (block >= fNumBits)
115 		return true;
116 	// the block bitmap is accessed in 32-bit words
117 	return Block(block >> 5) & HOST_ENDIAN_TO_BFS_INT32(1UL << (block % 32));
118 }
119 
120 
121 void
122 AllocationBlock::Allocate(uint16 start, uint16 numBlocks)
123 {
124 	ASSERT(start < fNumBits);
125 	ASSERT(uint32(start + numBlocks) <= fNumBits);
126 
127 	if (uint32(start + numBlocks) > fNumBits) {
128 		FATAL(("Allocation::Allocate(): tried to allocate too many blocks: %u (numBits = %lu)!\n", numBlocks, fNumBits));
129 		DIE(("Allocation::Allocate(): tried to allocate too many blocks"));
130 	}
131 
132 	int32 block = start >> 5;
133 
134 	while (numBlocks > 0) {
135 		uint32 mask = 0;
136 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
137 			mask |= 1UL << (i % 32);
138 
139 #ifdef DEBUG
140 		// check for already set blocks
141 		if (HOST_ENDIAN_TO_BFS_INT32(mask) & ((uint32 *)fBlock)[block]) {
142 			FATAL(("AllocationBlock::Allocate(): some blocks are already allocated, start = %u, numBlocks = %u\n", start, numBlocks));
143 			DEBUGGER(("blocks already set!"));
144 		}
145 #endif
146 		Block(block++) |= HOST_ENDIAN_TO_BFS_INT32(mask);
147 		start = 0;
148 	}
149 }
150 
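// A worked example of the mask building above (added for illustration):
// Allocate(30, 4) first sets bits 30..31 in 32-bit word 0, then resets
// "start" to 0 and sets bits 0..1 in word 1:
//
//	uint32 maskWord0 = (1UL << 30) | (1UL << 31);	// 0xc0000000
//	uint32 maskWord1 = (1UL << 0) | (1UL << 1);		// 0x00000003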
151 
152 void
153 AllocationBlock::Free(uint16 start, uint16 numBlocks)
154 {
155 	ASSERT(start < fNumBits);
156 	ASSERT(uint32(start + numBlocks) <= fNumBits);
157 
158 	if (uint32(start + numBlocks) > fNumBits) {
159 		FATAL(("Allocation::Free(): tried to free too many blocks: %u (numBits = %lu)!\n", numBlocks, fNumBits));
160 		DIE(("Allocation::Free(): tried to free too many blocks"));
161 	}
162 
163 	int32 block = start >> 5;
164 
165 	while (numBlocks > 0) {
166 		uint32 mask = 0;
167 		for (int32 i = start % 32; i < 32 && numBlocks; i++, numBlocks--)
168 			mask |= 1UL << (i % 32);
169 
170 		Block(block++) &= HOST_ENDIAN_TO_BFS_INT32(~mask);
171 		start = 0;
172 	}
173 }
174 
175 
176 //	#pragma mark -
177 
178 
179 AllocationGroup::AllocationGroup()
180 	:
181 	fFirstFree(-1),
182 	fLargest(-1),
183 	fLargestFirst(-1),
184 	fFreeBits(0)
185 {
186 }
187 
188 
189 void
190 AllocationGroup::AddFreeRange(int32 start, int32 blocks)
191 {
192 	//D(if (blocks > 512)
193 	//	PRINT(("range of %ld blocks starting at %ld\n",blocks,start)));
194 
195 	if (fFirstFree == -1)
196 		fFirstFree = start;
197 
198 	if (fLargest < blocks) {
199 		fLargest = blocks;
200 		fLargestFirst = start;
201 	}
202 
203 	fFreeBits += blocks;
204 }
205 
206 
207 /** Allocates the specified run in the allocation group.
208  *	Doesn't check if the run is valid or already allocated
209  *	partially, nor does it maintain the free range hints
210  *	or the volume's used blocks count.
211  *	It only does the low-level work of allocating some bits
212  *	in the block bitmap.
213  *	Assumes that the block bitmap lock is held.
214  */
215 
216 status_t
217 AllocationGroup::Allocate(Transaction *transaction, uint16 start, int32 length)
218 {
219 	if (start > fNumBits)
220 		return B_ERROR;
221 
222 	// Update the allocation group info
223 	// ToDo: this info will be incorrect if something goes wrong later
224 	// Note that the fFirstFree block is not necessarily free itself
225 	if (start == fFirstFree)
226 		fFirstFree = start + length;
227 	fFreeBits -= length;
228 
229 	Volume *volume = transaction->GetVolume();
230 
231 	// calculate block in the block bitmap and position within
232 	uint32 bitsPerBlock = volume->BlockSize() << 3;
233 	uint32 block = start / bitsPerBlock;
234 	start = start % bitsPerBlock;
235 
236 	AllocationBlock cached(volume);
237 
238 	while (length > 0) {
239 		if (cached.SetTo(*this, block) < B_OK)
240 			RETURN_ERROR(B_IO_ERROR);
241 
242 		uint32 numBlocks = length;
243 		if (start + numBlocks > cached.NumBlockBits())
244 			numBlocks = cached.NumBlockBits() - start;
245 
246 		cached.Allocate(start, numBlocks);
247 
248 		if (cached.WriteBack(transaction) < B_OK)
249 			return B_IO_ERROR;
250 
251 		length -= numBlocks;
252 		start = 0;
253 		block++;
254 	}
255 
256 	return B_OK;
257 }
258 
259 
260 /** Frees the specified run in the allocation group.
261  *	Doesn't check if the run is valid or was not completely
262  *	allocated, nor does it maintain the free range hints
263  *	or the volume's used blocks count.
264  *	It only does the low-level work of freeing some bits
265  *	in the block bitmap.
266  *	Assumes that the block bitmap lock is held.
267  */
268 
269 status_t
270 AllocationGroup::Free(Transaction *transaction, uint16 start, int32 length)
271 {
272 	if (start > fNumBits)
273 		return B_ERROR;
274 
275 	// Update the allocation group info
276 	// ToDo: this info will be incorrect if something goes wrong later
277 	if (fFirstFree > start)
278 		fFirstFree = start;
279 	fFreeBits += length;
280 
281 	Volume *volume = transaction->GetVolume();
282 
283 	// calculate block in the block bitmap and position within
284 	uint32 bitsPerBlock = volume->BlockSize() << 3;
285 	uint32 block = start / bitsPerBlock;
286 	start = start % bitsPerBlock;
287 
288 	AllocationBlock cached(volume);
289 
290 	while (length > 0) {
291 		if (cached.SetTo(*this, block) < B_OK)
292 			RETURN_ERROR(B_IO_ERROR);
293 
294 		uint16 freeLength = length;
295 		if (uint32(start + length) > cached.NumBlockBits())
296 			freeLength = cached.NumBlockBits() - start;
297 
298 		cached.Free(start, freeLength);
299 
300 		if (cached.WriteBack(transaction) < B_OK)
301 			return B_IO_ERROR;
302 
303 		length -= freeLength;
304 		start = 0;
305 		block++;
306 	}
307 	return B_OK;
308 }
309 
310 
311 //	#pragma mark -
312 
313 
314 BlockAllocator::BlockAllocator(Volume *volume)
315 	:
316 	fVolume(volume),
317 	fLock("bfs allocator"),
318 	fGroups(NULL),
319 	fCheckBitmap(NULL)
320 {
321 }
322 
323 
324 BlockAllocator::~BlockAllocator()
325 {
326 	delete[] fGroups;
327 }
328 
329 
330 status_t
331 BlockAllocator::Initialize(bool full)
332 {
333 	if (fLock.InitCheck() < B_OK)
334 		return B_ERROR;
335 
336 	fNumGroups = fVolume->AllocationGroups();
337 	fBlocksPerGroup = fVolume->SuperBlock().BlocksPerAllocationGroup();
338 	fGroups = new AllocationGroup[fNumGroups];
339 	if (fGroups == NULL)
340 		return B_NO_MEMORY;
341 
342 	if (!full)
343 		return B_OK;
344 
345 	thread_id id = spawn_kernel_thread((thread_func)BlockAllocator::initialize,
346 			"bfs block allocator", B_LOW_PRIORITY, (void *)this);
347 	if (id < B_OK)
348 		return initialize(this);
349 
350 	return resume_thread(id);
351 }
352 
353 
354 status_t
355 BlockAllocator::InitializeAndClearBitmap(Transaction &transaction)
356 {
357 	status_t status = Initialize(false);
358 	if (status < B_OK)
359 		return status;
360 
361 	uint32 blocks = fBlocksPerGroup;
362 	uint32 numBits = 8 * blocks * fVolume->BlockSize();
363 
364 	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
365 	if (buffer == NULL)
366 		RETURN_ERROR(B_NO_MEMORY);
367 
368 	memset(buffer, 0, numBits >> 3);
369 
370 	off_t offset = 1;
371 
372 	// initialize the AllocationGroup objects and clear the on-disk bitmap
373 
374 	for (int32 i = 0; i < fNumGroups; i++) {
375 		if (cached_write(fVolume->Device(), offset, buffer, blocks, fVolume->BlockSize()) < B_OK)
376 			return B_ERROR;
377 
378 		// the last allocation group may contain less blocks than the others
379 		fGroups[i].fNumBits = i == fNumGroups - 1 ? fVolume->NumBlocks() - i * numBits : numBits;
380 		fGroups[i].fStart = offset;
381 		fGroups[i].fFirstFree = fGroups[i].fLargestFirst = 0;
382 		fGroups[i].fFreeBits = fGroups[i].fLargest = fGroups[i].fNumBits;
383 
384 		offset += blocks;
385 	}
386 	free(buffer);
387 
388 	// reserve the boot block, the log area, and the block bitmap itself
389 	uint32 reservedBlocks = fVolume->Log().Start() + fVolume->Log().Length();
390 
391 	if (fGroups[0].Allocate(&transaction, 0, reservedBlocks) < B_OK) {
392 		FATAL(("could not allocate reserved space for block bitmap/log!\n"));
393 		return B_ERROR;
394 	}
395 	fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(reservedBlocks);
396 
397 	return B_OK;
398 }
399 
400 
401 status_t
402 BlockAllocator::initialize(BlockAllocator *allocator)
403 {
404 	Locker lock(allocator->fLock);
405 
406 	Volume *volume = allocator->fVolume;
407 	uint32 blocks = allocator->fBlocksPerGroup;
408 	uint32 numBits = 8 * blocks * volume->BlockSize();
409 	off_t freeBlocks = 0;
410 
411 	uint32 *buffer = (uint32 *)malloc(numBits >> 3);
412 	if (buffer == NULL)
413 		RETURN_ERROR(B_NO_MEMORY);
414 
415 	AllocationGroup *groups = allocator->fGroups;
416 	off_t offset = 1;
417 	int32 num = allocator->fNumGroups;
418 
419 	for (int32 i = 0; i < num; i++) {
420 		if (cached_read(volume->Device(), offset, buffer, blocks, volume->BlockSize()) < B_OK)
421 			break;
422 
423 		// the last allocation group may contain less blocks than the others
424 		groups[i].fNumBits = i == num - 1 ? volume->NumBlocks() - i * numBits : numBits;
425 		groups[i].fStart = offset;
426 
427 		// find all free ranges in this allocation group
428 		int32 start = -1, range = 0;
429 		int32 size = groups[i].fNumBits, bit = 0;
430 
431 		for (int32 k = 0; k < ((size + 31) >> 5); k++) {
432 			for (int32 j = 0; j < 32 && bit < size; j++, bit++) {
433 				if (buffer[k] & (1UL << j)) {
434 					if (range > 0) {
435 						groups[i].AddFreeRange(start, range);
436 						range = 0;
437 					}
438 				} else if (range++ == 0)
439 					start = bit;
440 			}
441 		}
442 		if (range)
443 			groups[i].AddFreeRange(start, range);
444 
445 		freeBlocks += groups[i].fFreeBits;
446 
447 		offset += blocks;
448 	}
449 	free(buffer);
450 
451 	// check if block bitmap and log area are reserved
452 	uint32 reservedBlocks = volume->Log().Start() + volume->Log().Length();
453 	if (allocator->CheckBlockRun(block_run::Run(0, 0, reservedBlocks)) < B_OK) {
454 		Transaction transaction(volume, 0);
455 		if (groups[0].Allocate(&transaction, 0, reservedBlocks) < B_OK) {
456 			FATAL(("could not allocate reserved space for block bitmap/log!\n"));
457 			volume->Panic();
458 		} else {
459 			FATAL(("space for block bitmap or log area was not reserved!\n"));
460 
461 			transaction.Done();
462 		}
463 	}
464 
465 	off_t usedBlocks = volume->NumBlocks() - freeBlocks;
466 	if (volume->UsedBlocks() != usedBlocks) {
467 		// If the disk was in a dirty state at mount time, it's
468 		// normal that these values don't match
469 		INFORM(("volume reports %Ld used blocks, correct is %Ld\n", volume->UsedBlocks(), usedBlocks));
470 		volume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
471 	}
472 
473 	return B_OK;
474 }
475 
476 
477 status_t
478 BlockAllocator::AllocateBlocks(Transaction *transaction, int32 group, uint16 start,
479 	uint16 maximum, uint16 minimum, block_run &run)
480 {
481 	if (maximum == 0)
482 		return B_BAD_VALUE;
483 
484 	FUNCTION_START(("group = %ld, start = %u, maximum = %u, minimum = %u\n", group, start, maximum, minimum));
485 
486 	AllocationBlock cached(fVolume);
487 	Locker lock(fLock);
488 
489 	// the first scan through all allocation groups looks for the wanted
490 	// maximum number of blocks, while the second scan only tries to
491 	// satisfy the minimum requirement
492 	uint16 numBlocks = maximum;
493 
494 	for (int32 i = 0; i < fNumGroups * 2; i++, group++, start = 0) {
495 		group = group % fNumGroups;
496 
497 		if (start >= fGroups[group].fNumBits || fGroups[group].IsFull())
498 			continue;
499 
500 		if (i >= fNumGroups) {
501 			// if the minimum is the same as the maximum, it's not necessary
502 			// to search the allocation groups a second time
503 			if (maximum == minimum)
504 				return B_DEVICE_FULL;
505 
506 			numBlocks = minimum;
507 		}
508 
509 		// Skip the group if its largest free range is smaller than the
510 		// wanted number of blocks
511 		// ToDo: disabled because fLargest is currently not maintained after the first allocation
512 		//if (numBlocks > fGroups[group].fLargest)
513 		//	continue;
514 
515 		if (start < fGroups[group].fFirstFree)
516 			start = fGroups[group].fFirstFree;
517 
518 		// there may be more than one bitmap block per allocation group -
519 		// we iterate through them to find a place for the allocation
520 		// (a single allocation cannot span more than one allocation group)
521 
522 		uint32 block = start / (fVolume->BlockSize() << 3);
523 		int32 range = 0, rangeStart = 0;
524 
525 		for (; block < fBlocksPerGroup; block++) {
526 			if (cached.SetTo(fGroups[group], block) < B_OK)
527 				RETURN_ERROR(B_ERROR);
528 
529 			// find a free range large enough to hold the allocation
530 			for (uint32 bit = start % cached.NumBlockBits(); bit < cached.NumBlockBits(); bit++) {
531 				if (!cached.IsUsed(bit)) {
532 					if (range == 0) {
533 						// start new range
534 						rangeStart = block * (fVolume->BlockSize() << 3) + bit;	// full bits-per-block count - the last bitmap block may contain fewer bits
535 					}
536 
537 					// have we found a range large enough to hold the wanted maximum?
538 					if (++range >= maximum)
539 						break;
540 				} else if (i >= fNumGroups && range >= minimum) {
541 					// we have found a range satisfying the required minimum (second pass)
542 					break;
543 				} else {
544 					// end of a range
545 					range = 0;
546 				}
547 			}
548 
549 			// if we found a suitable block, mark the blocks as in use, and write
550 			// the updated block bitmap back to disk
551 			if (range >= numBlocks) {
552 				// adjust allocation size
553 				if (numBlocks < maximum)
554 					numBlocks = range;
555 
556 				if (fGroups[group].Allocate(transaction, rangeStart, numBlocks) < B_OK)
557 					RETURN_ERROR(B_IO_ERROR);
558 
559 				run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(group);
560 				run.start = HOST_ENDIAN_TO_BFS_INT16(rangeStart);
561 				run.length = HOST_ENDIAN_TO_BFS_INT16(numBlocks);
562 
563 				fVolume->SuperBlock().used_blocks =
564 					HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() + numBlocks);
565 					// We are not writing back the disk's super block - it's
566 					// either done by the journaling code, or when the disk
567 					// is unmounted.
568 					// If the value is not correct at mount time, it will be
569 					// fixed anyway.
570 
571 				return B_OK;
572 			}
573 
574 			// start from the beginning of the next block
575 			start = 0;
576 		}
577 	}
578 	return B_DEVICE_FULL;
579 }
580 
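// A hypothetical call (sketch for illustration only, error handling
// omitted): try to allocate up to 16 contiguous blocks near allocation
// group 4, but accept a smaller run of at least 1 block before giving up:
//
//	block_run run;
//	status_t status = allocator->AllocateBlocks(transaction, 4, 0, 16, 1, run);
//		// the first pass scans all groups for 16 free bits, the
//		// second pass settles for any smaller free run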
581 
582 status_t
583 BlockAllocator::AllocateForInode(Transaction *transaction, const block_run *parent,
584 	mode_t type, block_run &run)
585 {
586 	// apply some allocation policies here (AllocateBlocks() will break them
587 	// if necessary) - we will start with those described in Dominic Giampaolo's
588 	// "Practical File System Design", and see how well they work
589 
590 	// files go into the same allocation group as their parent; sub-directories
591 	// are placed 8 allocation groups after their parent's group
592 	uint16 group = parent->AllocationGroup();
593 	if ((type & (S_DIRECTORY | S_INDEX_DIR | S_ATTR_DIR)) == S_DIRECTORY)
594 		group += 8;
595 
596 	return AllocateBlocks(transaction, group, 0, 1, 1, run);
597 }
598 
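// For example (illustrative numbers): a regular file whose parent
// directory's inode sits in allocation group 3 starts its search at
// group 3 as well, while a new sub-directory of that parent starts at
// group 3 + 8 = 11 - AllocateBlocks() wraps the group number around
// modulo the number of groups if necessary.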
599 
600 status_t
601 BlockAllocator::Allocate(Transaction *transaction, const Inode *inode, off_t numBlocks,
602 	block_run &run, uint16 minimum)
603 {
604 	if (numBlocks <= 0)
605 		return B_ERROR;
606 
607 	// one block_run can't hold more data than there is in one allocation group
608 	if (numBlocks > fGroups[0].fNumBits)
609 		numBlocks = fGroups[0].fNumBits;
610 
611 	// since block_run.length is uint16, the largest number of blocks that
612 	// can be covered by a block_run is 65535
613 	// ToDo: if we drop compatibility, couldn't we do this any better?
614 	// There are basically two possibilities:
615 	// a) since a length of zero doesn't make any sense, let it stand for 65536 -
616 	//    but that could cause many problems (bugs) in other areas
617 	// b) reduce the maximum amount of blocks per block_run, so that the remaining
618 	//    number of free blocks can be used in a useful manner (like 4 blocks) -
619 	//    but that would also reduce the maximum file size
620 	if (numBlocks > MAX_BLOCK_RUN_LENGTH)
621 		numBlocks = MAX_BLOCK_RUN_LENGTH;
622 
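	// To put this limit into numbers (an added illustration): with
	// MAX_BLOCK_RUN_LENGTH == 65535 and 2048-byte blocks, a single
	// block_run can describe at most 65535 * 2048 bytes, i.e. slightly
	// less than 128 MB of contiguous data.
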
623 	// apply some allocation policies here (AllocateBlocks() will break them
624 	// if necessary)
625 	uint16 group = inode->BlockRun().AllocationGroup();
626 	uint16 start = 0;
627 
628 	// are there already allocated blocks? (then just try to allocate near the last one)
629 	if (inode->Size() > 0) {
630 		data_stream *data = &inode->Node()->data;
631 		// ToDo: we currently don't handle the case where the data stream
632 		// has already grown into the indirect ranges
633 		if (data->max_double_indirect_range == 0
634 			&& data->max_indirect_range == 0) {
635 			// Since size > 0, there must be a valid block run in this stream
636 			int32 last = 0;
637 			for (; last < NUM_DIRECT_BLOCKS - 1; last++)
638 				if (data->direct[last + 1].IsZero())
639 					break;
640 
641 			group = data->direct[last].AllocationGroup();
642 			start = data->direct[last].Start() + data->direct[last].Length();
643 		}
644 	} else if (inode->IsContainer() || inode->IsSymLink()) {
645 		// directory and symbolic link data go into the same allocation
646 		// group as the inode itself, but after the inode's own position
647 		start = inode->BlockRun().Start();
648 	} else {
649 		// file data will start in the next allocation group
650 		group = inode->BlockRun().AllocationGroup() + 1;
651 	}
652 
653 	return AllocateBlocks(transaction, group, start, numBlocks, minimum, run);
654 }
655 
656 
657 status_t
658 BlockAllocator::Free(Transaction *transaction, block_run run)
659 {
660 	Locker lock(fLock);
661 
662 	int32 group = run.AllocationGroup();
663 	uint16 start = run.Start();
664 	uint16 length = run.Length();
665 
666 	FUNCTION_START(("group = %ld, start = %u, length = %u\n", group, start, length));
667 
668 	// we don't use Volume::IsValidBlockRun() here, because this code can
669 	// check better against the group size (the last group may be shorter)
670 	if (group < 0 || group >= fNumGroups
671 		|| start > fGroups[group].fNumBits
672 		|| uint32(start + length) > fGroups[group].fNumBits
673 		|| length == 0) {
674 		FATAL(("tried to free an invalid block_run (%ld, %u, %u)\n", group, start, length));
675 		DEBUGGER(("tried to free invalid block_run"));
676 		return B_BAD_VALUE;
677 	}
678 	// check if someone tries to free reserved areas at the beginning of the drive
679 	if (group == 0 && start < uint32(fVolume->Log().Start() + fVolume->Log().Length())) {
680 		FATAL(("tried to free a reserved block_run (%ld, %u, %u)\n", group, start, length));
681 		DEBUGGER(("tried to free reserved block"));
682 		return B_BAD_VALUE;
683 	}
684 #ifdef DEBUG
685 	if (CheckBlockRun(run) < B_OK)
686 		return B_BAD_DATA;
687 #endif
688 
689 	if (fGroups[group].Free(transaction, start, length) < B_OK)
690 		RETURN_ERROR(B_IO_ERROR);
691 
692 #ifdef DEBUG
693 	if (CheckBlockRun(run) == B_OK)
694 		DEBUGGER(("CheckBlockRun() reports allocated blocks (which were just freed)\n"));
695 #endif
696 
697 	fVolume->SuperBlock().used_blocks =
698 		HOST_ENDIAN_TO_BFS_INT64(fVolume->UsedBlocks() - run.Length());
699 	return B_OK;
700 }
701 
702 
703 size_t
704 BlockAllocator::BitmapSize() const
705 {
706 	return fVolume->BlockSize() * fNumGroups * fBlocksPerGroup;
707 }
708 
709 
710 //	#pragma mark -
711 //	Functions to check the validity of the bitmap - they are used by
712 //	the "chkbfs" command
713 
714 
715 bool
716 BlockAllocator::IsValidCheckControl(check_control *control)
717 {
718 	if (control == NULL
719 		|| control->magic != BFS_IOCTL_CHECK_MAGIC) {
720 		FATAL(("invalid check_control (%p)!\n", control));
721 		return false;
722 	}
723 
724 	return true;
725 }
726 
727 
728 status_t
729 BlockAllocator::StartChecking(check_control *control)
730 {
731 	if (!IsValidCheckControl(control))
732 		return B_BAD_VALUE;
733 
734 	status_t status = fLock.Lock();
735 	if (status < B_OK)
736 		return status;
737 
738 	size_t size = BitmapSize();
739 	fCheckBitmap = (uint32 *)malloc(size);
740 	if (fCheckBitmap == NULL) {
741 		fLock.Unlock();
742 		return B_NO_MEMORY;
743 	}
744 
745 	check_cookie *cookie = new check_cookie();
746 	if (cookie == NULL) {
747 		free(fCheckBitmap);
748 		fCheckBitmap = NULL;
749 		fLock.Unlock();
750 
751 		return B_NO_MEMORY;
752 	}
753 
754 	// initialize bitmap
755 	memset(fCheckBitmap, 0, size);
756 	for (int32 block = fVolume->Log().Start() + fVolume->Log().Length(); block-- > 0;)
757 		SetCheckBitmapAt(block);
758 
759 	cookie->stack.Push(fVolume->Root());
760 	cookie->stack.Push(fVolume->Indices());
761 	cookie->iterator = NULL;
762 	control->cookie = cookie;
763 
764 	fCheckCookie = cookie;
765 		// so that we can clean up nicely if "chkbfs" exits abnormally
766 
767 	// ToDo: check reserved area in bitmap!
768 
769 	return B_OK;
770 }
771 
772 
773 status_t
774 BlockAllocator::StopChecking(check_control *control)
775 {
776 	check_cookie *cookie;
777 	if (control == NULL)
778 		cookie = fCheckCookie;
779 	else
780 		cookie = (check_cookie *)control->cookie;
781 
782 	if (cookie == NULL)
783 		return B_ERROR;
784 
785 	if (cookie->iterator != NULL) {
786 		delete cookie->iterator;
787 		cookie->iterator = NULL;
788 
789 		// the current directory inode is still locked in memory
790 		put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));
791 	}
792 
793 	// if CheckNextNode() ran all the way through, we can
794 	// fix any damage to the bitmap
795 	if (control != NULL && control->status == B_ENTRY_NOT_FOUND) {
796 		// calculate the number of used blocks in the check bitmap
797 		size_t size = BitmapSize();
798 		off_t usedBlocks = 0LL;
799 
800 		// ToDo: update the allocation groups used blocks info
801 		for (uint32 i = size >> 2; i-- > 0;) {
802 			uint32 compare = 1;
803 			for (int16 j = 0; j < 32; j++, compare <<= 1) {
804 				if (compare & fCheckBitmap[i])
805 					usedBlocks++;
806 			}
807 		}
808 
809 		control->stats.freed = fVolume->UsedBlocks() - usedBlocks + control->stats.missing;
810 		if (control->stats.freed < 0)
811 			control->stats.freed = 0;
812 
813 		// Should we fix errors? Were there any errors we can fix?
814 		if (control->flags & BFS_FIX_BITMAP_ERRORS
815 			&& (control->stats.freed != 0 || control->stats.missing != 0)) {
816 			// if so, write the check bitmap back over the original one,
817 			// and use transactions to play it safe - in fact, we use several
818 			// transactions so that we don't blow the maximum log size
819 			// on large disks, since this doesn't need to be atomic anyway
820 			fVolume->SuperBlock().used_blocks = HOST_ENDIAN_TO_BFS_INT64(usedBlocks);
821 
822 			int32 blocksInBitmap = fNumGroups * fBlocksPerGroup;
823 			int32 blockSize = fVolume->BlockSize();
824 
825 			for (int32 i = 0; i < blocksInBitmap; i += 512) {
826 				Transaction transaction(fVolume, 1 + i);
827 
828 				int32 blocksToWrite = 512;
829 				if (blocksToWrite + i > blocksInBitmap)
830 					blocksToWrite = blocksInBitmap - i;
831 
832 				status_t status = transaction.WriteBlocks(1 + i,
833 					(uint8 *)fCheckBitmap + i * blockSize, blocksToWrite);
834 				if (status < B_OK) {
835 					FATAL(("error writing bitmap: %s\n", strerror(status)));
836 					break;
837 				}
838 				transaction.Done();
839 			}
840 		}
841 	} else
842 		FATAL(("BlockAllocator::CheckNextNode() didn't run through\n"));
843 
844 	free(fCheckBitmap);
845 	fCheckBitmap = NULL;
846 	fCheckCookie = NULL;
847 	delete cookie;
848 	fLock.Unlock();
849 
850 	return B_OK;
851 }
852 
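// The intended calling sequence of the three check functions (a sketch of
// how "chkbfs" drives them via ioctl(); error handling omitted):
//
//	check_control control;
//	memset(&control, 0, sizeof(control));
//	control.magic = BFS_IOCTL_CHECK_MAGIC;
//	allocator->StartChecking(&control);
//	while (allocator->CheckNextNode(&control) == B_OK)
//		;	// inspect control.status, control.name, control.errors, ...
//	allocator->StopChecking(&control);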
853 
854 status_t
855 BlockAllocator::CheckNextNode(check_control *control)
856 {
857 	if (!IsValidCheckControl(control))
858 		return B_BAD_VALUE;
859 
860 	check_cookie *cookie = (check_cookie *)control->cookie;
861 
862 	while (true) {
863 		if (cookie->iterator == NULL) {
864 			if (!cookie->stack.Pop(&cookie->current)) {
865 				// no more runs on the stack, we are obviously finished!
866 				control->status = B_ENTRY_NOT_FOUND;
867 				return B_ENTRY_NOT_FOUND;
868 			}
869 
870 			// get iterator for the next directory
871 
872 #ifdef UNSAFE_GET_VNODE
873 			RecursiveLocker locker(fVolume->Lock());
874 #endif
875 			Vnode vnode(fVolume, cookie->current);
876 			Inode *inode;
877 			if (vnode.Get(&inode) < B_OK) {
878 				FATAL(("check: Could not open inode at %Ld\n", fVolume->ToBlock(cookie->current)));
879 				continue;
880 			}
881 
882 			if (!inode->IsContainer()) {
883 				FATAL(("check: inode at %Ld should have been a directory\n", fVolume->ToBlock(cookie->current)));
884 				continue;
885 			}
886 
887 			BPlusTree *tree;
888 			if (inode->GetTree(&tree) != B_OK) {
889 				FATAL(("check: could not open b+tree from inode at %Ld\n", fVolume->ToBlock(cookie->current)));
890 				continue;
891 			}
892 
893 			cookie->parent = inode;
894 			cookie->parent_mode = inode->Mode();
895 
896 			cookie->iterator = new TreeIterator(tree);
897 			if (cookie->iterator == NULL)
898 				RETURN_ERROR(B_NO_MEMORY);
899 
900 			// the inode must stay locked in memory until the iterator is freed
901 			vnode.Keep();
902 
903 			// check the inode of the directory
904 			control->errors = 0;
905 			control->status = CheckInode(inode, control);
906 
907 			if (inode->GetName(control->name) < B_OK)
908 				strcpy(control->name, "(node has no name)");
909 
910 			control->inode = inode->ID();
911 			control->mode = inode->Mode();
912 
913 			return B_OK;
914 		}
915 
916 		char name[B_FILE_NAME_LENGTH];
917 		uint16 length;
918 		vnode_id id;
919 
920 		status_t status = cookie->iterator->GetNextEntry(name, &length, B_FILE_NAME_LENGTH, &id);
921 		if (status == B_ENTRY_NOT_FOUND) {
922 			// there are no more entries in this iterator, free it and go on
923 			delete cookie->iterator;
924 			cookie->iterator = NULL;
925 
926 			// unlock the directory's inode from memory
927 			put_vnode(fVolume->ID(), fVolume->ToVnode(cookie->current));
928 
929 			continue;
930 		} else if (status == B_OK) {
931 			// ignore "." and ".." entries
932 			if (!strcmp(name, ".") || !strcmp(name, ".."))
933 				continue;
934 
935 			// fill in the control data as soon as we have them
936 			strlcpy(control->name, name, B_FILE_NAME_LENGTH);
937 			control->inode = id;
938 			control->errors = 0;
939 
940 			Vnode vnode(fVolume, id);
941 			Inode *inode;
942 			if (vnode.Get(&inode) < B_OK) {
943 				FATAL(("Could not open inode ID %Ld!\n", id));
944 				control->errors |= BFS_COULD_NOT_OPEN;
945 				control->status = B_ERROR;
946 				return B_OK;
947 			}
948 
949 			// check if the inode's name is the same as in the b+tree
950 			if (inode->IsRegularNode()) {
951 				SimpleLocker locker(inode->SmallDataLock());
952 
953 				const char *localName = inode->Name();
954 				if (localName == NULL || strcmp(localName, name)) {
955 					control->errors |= BFS_NAMES_DONT_MATCH;
956 					FATAL(("Names differ: tree \"%s\", inode \"%s\"\n", name, localName));
957 				}
958 			}
959 
960 			control->mode = inode->Mode();
961 
962 			// Check for the correct mode of the node (if the mode of the
963 			// file doesn't fit its parent, there is a serious problem)
964 			if ((cookie->parent_mode & S_ATTR_DIR && !inode->IsAttribute())
965 				|| (cookie->parent_mode & S_INDEX_DIR && !inode->IsIndex())
966 				|| ((cookie->parent_mode & (S_DIRECTORY | S_ATTR_DIR | S_INDEX_DIR)) == S_DIRECTORY
967 					&& inode->Mode() & (S_ATTR | S_ATTR_DIR | S_INDEX_DIR))) {
968 				FATAL(("inode at %Ld is of wrong type: %o (parent %o at %Ld)!\n",
969 					inode->BlockNumber(), inode->Mode(), cookie->parent_mode, cookie->parent->BlockNumber()));
970 
971 				// if we are allowed to fix errors, we should remove the file
972 				if (control->flags & BFS_REMOVE_WRONG_TYPES
973 					&& control->flags & BFS_FIX_BITMAP_ERRORS) {
974 #ifdef UNSAFE_GET_VNODE
975 					RecursiveLocker locker(fVolume->Lock());
976 #endif
977 					// it's safe to start a transaction, because Inode::Remove()
978 					// won't touch the block bitmap (which we hold the lock for)
979 					// if we set the INODE_DONT_FREE_SPACE flag - since we fix
980 					// the bitmap anyway
981 					Transaction transaction(fVolume, cookie->parent->BlockNumber());
982 
983 					inode->Node()->flags |= INODE_DONT_FREE_SPACE;
984 					status = cookie->parent->Remove(&transaction, name, NULL, inode->IsContainer());
985 					if (status == B_OK)
986 						transaction.Done();
987 				} else
988 					status = B_ERROR;
989 
990 				control->errors |= BFS_WRONG_TYPE;
991 				control->status = status;
992 				return B_OK;
993 			}
994 
995 			// If the inode has an attribute directory, push it on the stack
996 			if (!inode->Attributes().IsZero())
997 				cookie->stack.Push(inode->Attributes());
998 
999 			// push the directory on the stack so that it will be scanned later
1000 			if (inode->IsContainer() && !inode->IsIndex())
1001 				cookie->stack.Push(inode->BlockRun());
1002 			else {
1003 				// check it now
1004 				control->status = CheckInode(inode, control);
1005 
1006 				return B_OK;
1007 			}
1008 		}
1009 	}
1010 	// is never reached
1011 }
1012 
1013 
1014 bool
1015 BlockAllocator::CheckBitmapIsUsedAt(off_t block) const
1016 {
1017 	size_t size = BitmapSize();
1018 	uint32 index = block / 32;	// 32bit resolution
1019 	if (index >= size / 4)
1020 		return false;
1021 
1022 	return BFS_ENDIAN_TO_HOST_INT32(fCheckBitmap[index]) & (1UL << (block & 0x1f));
1023 }
1024 
1025 
1026 void
1027 BlockAllocator::SetCheckBitmapAt(off_t block)
1028 {
1029 	size_t size = BitmapSize();
1030 	uint32 index = block / 32;	// 32bit resolution
1031 	if (index >= size / 4)
1032 		return;
1033 
1034 	fCheckBitmap[index] |= HOST_ENDIAN_TO_BFS_INT32(1UL << (block & 0x1f));
1035 }
1036 
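// Example of the bit addressing used above (illustration only): disk
// block 100 is tracked by fCheckBitmap[100 / 32] == fCheckBitmap[3], at
// bit 100 & 0x1f == 4, i.e. mask 1UL << 4 (stored in BFS endianness).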
1037 
1038 status_t
1039 BlockAllocator::CheckBlockRun(block_run run, const char *type, check_control *control)
1040 {
1041 	if (run.AllocationGroup() < 0 || run.AllocationGroup() >= fNumGroups
1042 		|| run.Start() > fGroups[run.AllocationGroup()].fNumBits
1043 		|| uint32(run.Start() + run.Length()) > fGroups[run.AllocationGroup()].fNumBits
1044 		|| run.Length() == 0) {
1045 		PRINT(("%s: block_run(%ld, %u, %u) is invalid!\n", type, run.AllocationGroup(), run.Start(), run.Length()));
1046 		if (control == NULL)
1047 			return B_BAD_DATA;
1048 
1049 		control->errors |= BFS_INVALID_BLOCK_RUN;
1050 		return B_OK;
1051 	}
1052 
1053 	uint32 bitsPerBlock = fVolume->BlockSize() << 3;
1054 	uint32 block = run.Start() / bitsPerBlock;
1055 	uint32 pos = run.Start() % bitsPerBlock;
1056 	int32 length = 0;
1057 	off_t firstMissing = -1, firstSet = -1;
1058 	off_t firstGroupBlock = (off_t)run.AllocationGroup() << fVolume->AllocationGroupShift();
1059 
1060 	AllocationBlock cached(fVolume);
1061 
1062 	for (; block < fBlocksPerGroup && length < run.Length(); block++, pos = 0) {
1063 		if (cached.SetTo(fGroups[run.AllocationGroup()], block) < B_OK)
1064 			RETURN_ERROR(B_IO_ERROR);
1065 
1066 		if (pos >= cached.NumBlockBits()) {
1067 			// something very strange has happened...
1068 			RETURN_ERROR(B_ERROR);
1069 		}
1070 
1071 		while (length < run.Length() && pos < cached.NumBlockBits()) {
1072 			if (!cached.IsUsed(pos)) {
1073 				if (control == NULL) {
1074 					PRINT(("%s: block_run(%ld, %u, %u) is only partially allocated!\n", type, run.AllocationGroup(), run.Start(), run.Length()));
1075 					return B_BAD_DATA;
1076 				}
1077 				if (firstMissing == -1) {
1078 					firstMissing = firstGroupBlock + pos + block * bitsPerBlock;
1079 					control->errors |= BFS_MISSING_BLOCKS;
1080 				}
1081 				control->stats.missing++;
1082 			} else if (firstMissing != -1) {
1083 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not allocated!\n",
1084 					type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing, firstGroupBlock + pos + block * bitsPerBlock - 1));
1085 				firstMissing = -1;
1086 			}
1087 
1088 			if (fCheckBitmap != NULL) {
1089 				// Set the block in the check bitmap as well, but first have
1090 				// a look at whether it is already set there
1091 				uint32 offset = pos + block * bitsPerBlock;
1092 				if (CheckBitmapIsUsedAt(firstGroupBlock + offset)) {
1093 					if (firstSet == -1) {
1094 						firstSet = firstGroupBlock + offset;
1095 						control->errors |= BFS_BLOCKS_ALREADY_SET;
1096 					}
1097 					control->stats.already_set++;
1098 				} else {
1099 					if (firstSet != -1) {
1100 						FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n",
1101 							type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + offset - 1));
1102 						firstSet = -1;
1103 					}
1104 					SetCheckBitmapAt(firstGroupBlock + offset);
1105 				}
1106 			}
1107 			length++;
1108 			pos++;
1109 		}
1110 
1111 		if (block + 1 >= fBlocksPerGroup || length >= run.Length()) {
1112 			if (firstMissing != -1)
1113 				PRINT(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are not allocated!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstMissing, firstGroupBlock + pos + block * bitsPerBlock - 1));
1114 			if (firstSet != -1)
1115 				FATAL(("%s: block_run(%ld, %u, %u): blocks %Ld - %Ld are already set!\n", type, run.AllocationGroup(), run.Start(), run.Length(), firstSet, firstGroupBlock + pos + block * bitsPerBlock - 1));
1116 		}
1117 	}
1118 
1119 	return B_OK;
1120 }
1121 
1122 
1123 status_t
1124 BlockAllocator::CheckInode(Inode *inode, check_control *control)
1125 {
1126 	if (control != NULL && fCheckBitmap == NULL)
1127 		return B_NO_INIT;
1128 	if (inode == NULL)
1129 		return B_BAD_VALUE;
1130 
1131 	status_t status = CheckBlockRun(inode->BlockRun(), "inode", control);
1132 	if (status < B_OK)
1133 		return status;
1134 
1135 	data_stream *data = &inode->Node()->data;
1136 
1137 	// check the direct range
1138 
1139 	if (data->max_direct_range) {
1140 		for (int32 i = 0; i < NUM_DIRECT_BLOCKS; i++) {
1141 			if (data->direct[i].IsZero())
1142 				break;
1143 
1144 			status = CheckBlockRun(data->direct[i], "direct", control);
1145 			if (status < B_OK)
1146 				return status;
1147 		}
1148 	}
1149 
1150 	CachedBlock cached(fVolume);
1151 
1152 	// check the indirect range
1153 
1154 	if (data->max_indirect_range) {
1155 		status = CheckBlockRun(data->indirect, "indirect", control);
1156 		if (status < B_OK)
1157 			return status;
1158 
1159 		off_t block = fVolume->ToBlock(data->indirect);
1160 
1161 		for (int32 i = 0; i < data->indirect.Length(); i++) {
1162 			block_run *runs = (block_run *)cached.SetTo(block + i);
1163 			if (runs == NULL)
1164 				RETURN_ERROR(B_IO_ERROR);
1165 
1166 			int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1167 			int32 index = 0;
1168 			for (; index < runsPerBlock; index++) {
1169 				if (runs[index].IsZero())
1170 					break;
1171 
1172 				status = CheckBlockRun(runs[index], "indirect->run", control);
1173 				if (status < B_OK)
1174 					return status;
1175 			}
1176 			if (index < runsPerBlock)
1177 				break;
1178 		}
1179 	}
1180 
1181 	// check the double indirect range
1182 
1183 	if (data->max_double_indirect_range) {
1184 		status = CheckBlockRun(data->double_indirect, "double indirect", control);
1185 		if (status < B_OK)
1186 			return status;
1187 
1188 		int32 runsPerBlock = fVolume->BlockSize() / sizeof(block_run);
1189 		int32 runsPerArray = runsPerBlock << ARRAY_BLOCKS_SHIFT;
1190 
1191 		CachedBlock cachedDirect(fVolume);
1192 		int32 maxIndirectIndex = (data->double_indirect.Length() << fVolume->BlockShift())
1193 									/ sizeof(block_run);
1194 
1195 		for (int32 indirectIndex = 0; indirectIndex < maxIndirectIndex; indirectIndex++) {
1196 			// get the indirect array block
1197 			block_run *array = (block_run *)cached.SetTo(fVolume->ToBlock(data->double_indirect)
1198 				+ indirectIndex / runsPerBlock);
1199 			if (array == NULL)
1200 				return B_IO_ERROR;
1201 
1202 			block_run indirect = array[indirectIndex % runsPerBlock];
1203 			// are we finished yet?
1204 			if (indirect.IsZero())
1205 				return B_OK;
1206 
1207 			status = CheckBlockRun(indirect, "double indirect->runs", control);
1208 			if (status < B_OK)
1209 				return status;
1210 
1211 			int32 maxIndex = (indirect.Length() << fVolume->BlockShift()) / sizeof(block_run);
1212 
1213 			for (int32 index = 0; index < maxIndex; ) {
1214 				block_run *runs = (block_run *)cachedDirect.SetTo(fVolume->ToBlock(indirect)
1215 					+ index / runsPerBlock);
1216 				if (runs == NULL)
1217 					return B_IO_ERROR;
1218 
1219 				do {
1220 					// are we finished yet?
1221 					if (runs[index % runsPerBlock].IsZero())
1222 						return B_OK;
1223 
1224 					status = CheckBlockRun(runs[index % runsPerBlock], "double indirect->runs->run", control);
1225 					if (status < B_OK)
1226 						return status;
1227 				} while ((++index % runsPerArray) != 0);
1228 			}
1229 		}
1230 	}
1231 
1232 	return B_OK;
1233 }
1234 
1235