xref: /haiku/src/add-ons/kernel/file_systems/bfs/Volume.cpp (revision ddac407426cd3b3d0b4589d7a161b300b3539a2a)
/*
 * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */

//! super block, mounting, etc.


#include "Debug.h"
#include "Volume.h"
#include "Journal.h"
#include "Inode.h"
#include "Query.h"


static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups we aim for when
	// initializing a new disk.
	// It only matters for smaller disks, though, since any of today's
	// disk sizes already reaches the maximum length of an allocation
	// group (65536 blocks).
	// With this setting, smaller disks still get reasonable numbers
	// (i.e. you can create a 400 MB file on a 1 GB disk without
	// needing double indirect blocks).
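	// (As a rough illustration, assuming the common 1 KiB block size:
	// a full-size allocation group of 65536 blocks spans 64 MB, so a
	// disk of only a few GB already reaches that cap.)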


class DeviceOpener {
public:
						DeviceOpener(int fd, int mode);
						DeviceOpener(const char* device, int mode);
						~DeviceOpener();

			int			Open(const char* device, int mode);
			int			Open(int fd, int mode);
			void*		InitCache(off_t numBlocks, uint32 blockSize);
			void		RemoveCache(bool allowWrites);

			void		Keep();

			int			Device() const { return fDevice; }
			int			Mode() const { return fMode; }
			bool		IsReadOnly() const { return _IsReadOnly(fMode); }

			status_t	GetSize(off_t* _size, uint32* _blockSize = NULL);

private:
	static	bool		_IsReadOnly(int mode)
							{ return (mode & O_RWMASK) == O_RDONLY;}
	static	bool		_IsReadWrite(int mode)
							{ return (mode & O_RWMASK) == O_RDWR;}

			int			fDevice;
			int			fMode;
			void*		fBlockCache;
};


DeviceOpener::DeviceOpener(const char* device, int mode)
	:
	fBlockCache(NULL)
{
	Open(device, mode);
}


DeviceOpener::DeviceOpener(int fd, int mode)
	:
	fBlockCache(NULL)
{
	Open(fd, mode);
}


DeviceOpener::~DeviceOpener()
{
	if (fDevice >= 0) {
		RemoveCache(false);
		close(fDevice);
	}
}


int
DeviceOpener::Open(const char* device, int mode)
{
	fDevice = open(device, mode | O_NOCACHE);
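	// on Haiku, errno holds a negative B_* status code, so assigning it
	// below keeps fDevice negative on failure and lets it double as a
	// status value for the caller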
	if (fDevice < 0)
		fDevice = errno;

	if (fDevice < 0 && _IsReadWrite(mode)) {
		// try again to open read-only (don't rely on a specific error code)
		return Open(device, O_RDONLY | O_NOCACHE);
	}

	if (fDevice >= 0) {
		// opening succeeded
		fMode = mode;
		if (_IsReadWrite(mode)) {
			// check whether the device really allows read/write access
			device_geometry geometry;
			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
				if (geometry.read_only) {
					// reopen device read-only
					close(fDevice);
					return Open(device, O_RDONLY | O_NOCACHE);
				}
			}
		}
	}

	return fDevice;
}


int
DeviceOpener::Open(int fd, int mode)
{
	fDevice = dup(fd);
	if (fDevice < 0)
		return errno;

	fMode = mode;

	return fDevice;
}


void*
DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
{
	return fBlockCache = block_cache_create(fDevice, numBlocks, blockSize,
		IsReadOnly());
}


void
DeviceOpener::RemoveCache(bool allowWrites)
{
	if (fBlockCache == NULL)
		return;

	block_cache_delete(fBlockCache, allowWrites);
	fBlockCache = NULL;
}


void
DeviceOpener::Keep()
{
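	// The destructor only closes the device and tears down the cache
	// while fDevice is valid, so resetting it here hands ownership of
	// the descriptor (and block cache) over to the caller.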
	fDevice = -1;
}


/*!	Returns the size of the device in bytes. It uses B_GET_GEOMETRY
	to compute the size, or fstat() if that fails.
*/
status_t
DeviceOpener::GetSize(off_t* _size, uint32* _blockSize)
{
	device_geometry geometry;
	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
		// maybe it's just a file
		struct stat stat;
		if (fstat(fDevice, &stat) < 0)
			return B_ERROR;

		if (_size)
			*_size = stat.st_size;
		if (_blockSize)	// that shouldn't cause us any problems
			*_blockSize = 512;

		return B_OK;
	}

	if (_size) {
		*_size = 1LL * geometry.head_count * geometry.cylinder_count
			* geometry.sectors_per_track * geometry.bytes_per_sector;
	}
	if (_blockSize)
		*_blockSize = geometry.bytes_per_sector;

	return B_OK;
}


//	#pragma mark -


bool
disk_super_block::IsValid()
{
	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
		|| (int32)block_size != inode_size
		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
		|| (1UL << BlockShift()) != BlockSize()
		|| AllocationGroups() < 1
		|| AllocationGroupShift() < 1
		|| BlocksPerAllocationGroup() < 1
		|| NumBlocks() < 10
		|| AllocationGroups() != divide_roundup(NumBlocks(),
			1L << AllocationGroupShift()))
		return false;

	return true;
}


void
disk_super_block::Initialize(const char* diskName, off_t numBlocks,
	uint32 blockSize)
{
	memset(this, 0, sizeof(disk_super_block));

	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

	strlcpy(name, diskName, sizeof(name));

	int32 blockShift = 9;
	while ((1UL << blockShift) < blockSize) {
		blockShift++;
	}

	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);

	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
	used_blocks = 0;

	// Get the minimum ag_shift (that's determined by the block size)

	int32 bitsPerBlock = blockSize << 3;
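	// one block of the bitmap tracks bitsPerBlock blocks; round up so
	// the bitmap covers the whole disk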
	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
	int32 blocksPerGroup = 1;
	int32 groupShift = 13;

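	// 8192 bits fit into a 1 KiB bitmap block; with larger block sizes a
	// single bitmap block covers more blocks, so the minimum group shift
	// grows accordingly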
	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
		groupShift++;
	}

	// Many allocation groups help in applying allocation policies, but if
	// they are too small, we will need too many block_runs to cover large
	// files (see above for an explanation of the kDesiredAllocationGroups
	// constant).

	int32 numGroups;

	while (true) {
		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
		if (numGroups > kDesiredAllocationGroups) {
			if (groupShift == 16)
				break;

			groupShift++;
			blocksPerGroup *= 2;
		} else
			break;
	}
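	// (Example, assuming 1 KiB blocks: a 1 GB disk has about 2^20 blocks
	// and thus 128 bitmap blocks. The loop above settles on 4 bitmap
	// blocks per group, i.e. 32 groups of 2^15 blocks each, which is the
	// first group size that does not exceed kDesiredAllocationGroups
	// groups while staying below the 65536-block limit.)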

	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup);
	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
}


//	#pragma mark -


Volume::Volume(fs_volume* volume)
	:
	fVolume(volume),
	fBlockAllocator(this),
	fRootNode(NULL),
	fIndicesNode(NULL),
	fDirtyCachedBlocks(0),
	fFlags(0),
	fCheckingThread(-1)
{
	mutex_init(&fLock, "bfs volume");
	mutex_init(&fQueryLock, "bfs queries");
}


Volume::~Volume()
{
	mutex_destroy(&fQueryLock);
	mutex_destroy(&fLock);
}


bool
Volume::IsValidSuperBlock()
{
	return fSuperBlock.IsValid();
}


void
Volume::Panic()
{
	FATAL(("we have to panic... switch to read-only mode!\n"));
	fFlags |= VOLUME_READ_ONLY;
#ifdef DEBUG
	kernel_debugger("BFS panics!");
#endif
}


status_t
Volume::Mount(const char* deviceName, uint32 flags)
{
	// TODO: validate the FS in write mode as well!
#if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
	// if the on-disk byte order doesn't match the host, we only mount
	// read-only for now
	flags |= B_MOUNT_READ_ONLY;
#endif

	DeviceOpener opener(deviceName, (flags & B_MOUNT_READ_ONLY) != 0
		? O_RDONLY : O_RDWR);
	fDevice = opener.Device();
	if (fDevice < B_OK)
		RETURN_ERROR(fDevice);

	if (opener.IsReadOnly())
		fFlags |= VOLUME_READ_ONLY;

	// read the super block
	if (Identify(fDevice, &fSuperBlock) != B_OK) {
		FATAL(("invalid super block!\n"));
		return B_BAD_VALUE;
	}

	// initialize shorthands to the super block (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// check if the device size is large enough to hold the file system
	off_t diskSize;
	if (opener.GetSize(&diskSize) != B_OK)
		RETURN_ERROR(B_ERROR);
	if (diskSize < (NumBlocks() << BlockShift()))
		RETURN_ERROR(B_BAD_VALUE);

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new Journal(this);
	if (fJournal == NULL)
		return B_NO_MEMORY;

	status_t status = fJournal->InitCheck();
	if (status < B_OK) {
		FATAL(("could not initialize journal: %s!\n", strerror(status)));
		return status;
	}

	// replaying the log is the first thing we will do on this disk
	status = fJournal->ReplayLog();
	if (status != B_OK) {
		FATAL(("Replaying log failed, data may be corrupted, volume "
			"read-only.\n"));
		fFlags |= VOLUME_READ_ONLY;
			// TODO: if this is the boot volume, Bootscript will assume this
			// is a CD...
			// TODO: it would be nice to have a user visible alert instead
			// of the user only finding out about this in the syslog.
	}

	status = fBlockAllocator.Initialize();
	if (status != B_OK) {
		FATAL(("could not initialize block bitmap allocator!\n"));
		return status;
	}

	fRootNode = new Inode(this, ToVnode(Root()));
	if (fRootNode != NULL && fRootNode->InitCheck() == B_OK) {
		status = publish_vnode(fVolume, ToVnode(Root()), (void*)fRootNode,
			&gBFSVnodeOps, fRootNode->Mode(), 0);
		if (status == B_OK) {
			// try to get indices root dir

			if (!Indices().IsZero())
				fIndicesNode = new Inode(this, ToVnode(Indices()));

			if (fIndicesNode == NULL
				|| fIndicesNode->InitCheck() < B_OK
				|| !fIndicesNode->IsContainer()) {
				INFORM(("bfs: volume doesn't have indices!\n"));

				if (fIndicesNode) {
					// if this happens, the indices root node has gone bad,
					// and BFS switches to read-only mode
					fFlags |= VOLUME_READ_ONLY;
					delete fIndicesNode;
					fIndicesNode = NULL;
				}
			} else {
				// we don't use the vnode layer to access the indices node
			}

			// all went fine
			opener.Keep();
			return B_OK;
		} else
			FATAL(("could not create root node: publish_vnode() failed!\n"));

		delete fRootNode;
	} else {
		status = B_BAD_VALUE;
		FATAL(("could not create root node!\n"));
	}

	return status;
}


status_t
Volume::Unmount()
{
	put_vnode(fVolume, ToVnode(Root()));

	fBlockAllocator.Uninitialize();

	// This will also flush the log & all blocks to disk
	delete fJournal;
	fJournal = NULL;

	delete fIndicesNode;

	block_cache_delete(fBlockCache, !IsReadOnly());
	close(fDevice);

	return B_OK;
}


status_t
Volume::Sync()
{
	return fJournal->FlushLogAndBlocks();
}


status_t
Volume::ValidateBlockRun(block_run run)
{
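	// A run must name an existing allocation group, must not be empty,
	// and must not reach beyond the end of its group
	// (1 << AllocationGroupShift() blocks).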
	if (run.AllocationGroup() < 0
		|| run.AllocationGroup() > (int32)AllocationGroups()
		|| run.Start() > (1UL << AllocationGroupShift())
		|| run.length == 0
		|| uint32(run.Length() + run.Start())
				> (1UL << AllocationGroupShift())) {
		Panic();
		FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(),
			run.Start(), run.Length()));
		return B_BAD_DATA;
	}
	return B_OK;
}


block_run
Volume::ToBlockRun(off_t block) const
{
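	// Split the flat block number into an (allocation group, start offset)
	// pair; the resulting run always has a length of one block.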
	block_run run;
	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(
		block >> AllocationGroupShift());
	run.start = HOST_ENDIAN_TO_BFS_INT16(
		block & ((1LL << AllocationGroupShift()) - 1));
	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
	return run;
}


status_t
Volume::CreateIndicesRoot(Transaction& transaction)
{
	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id,
		&fIndicesNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.indices = ToBlockRun(id);
	return WriteSuperBlock();
}


status_t
Volume::AllocateForInode(Transaction& transaction, const Inode* parent,
	mode_t type, block_run& run)
{
	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(),
		type, run);
}


status_t
Volume::WriteSuperBlock()
{
	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block))
			!= sizeof(disk_super_block))
		return B_IO_ERROR;

	return B_OK;
}


void
Volume::UpdateLiveQueries(Inode* inode, const char* attribute, int32 type,
	const uint8* oldKey, size_t oldLength, const uint8* newKey,
	size_t newLength)
{
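	// Tell every registered live query about the attribute change, so
	// that it can add the inode to or remove it from its result set.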
	MutexLocker _(fQueryLock);

	SinglyLinkedList<Query>::Iterator iterator = fQueries.GetIterator();
	while (iterator.HasNext()) {
		Query* query = iterator.Next();
		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey,
			newLength);
	}
}


/*!	Checks if there is a live query whose results depend on the presence
	or value of the specified attribute.
	Don't use it if you already have all the data together to evaluate
	the queries - it wouldn't save you anything in this case.
*/
bool
Volume::CheckForLiveQuery(const char* attribute)
{
	// TODO: check for a live query that depends on the specified attribute
	return true;
}


void
Volume::AddQuery(Query* query)
{
	MutexLocker _(fQueryLock);
	fQueries.Add(query);
}


void
Volume::RemoveQuery(Query* query)
{
	MutexLocker _(fQueryLock);
	fQueries.Remove(query);
}


//	#pragma mark - Disk scanning and initialization


/*static*/ status_t
Volume::CheckSuperBlock(const uint8* data, uint32* _offset)
{
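	// On x86 volumes, the super block sits at byte offset 512, leaving
	// the first 512 bytes (the boot block area) untouched.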
	disk_super_block* superBlock = (disk_super_block*)(data + 512);
	if (superBlock->IsValid()) {
		if (_offset != NULL)
			*_offset = 512;
		return B_OK;
	}

#ifndef BFS_LITTLE_ENDIAN_ONLY
	// For PPC, the super block might be located at offset 0
	superBlock = (disk_super_block*)data;
	if (superBlock->IsValid()) {
		if (_offset != NULL)
			*_offset = 0;
		return B_OK;
	}
#endif

	return B_BAD_VALUE;
}


/*static*/ status_t
Volume::Identify(int fd, disk_super_block* superBlock)
{
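	// 1024 bytes are enough to cover both possible super block locations
	// (offset 512 on x86, offset 0 on PPC)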
	uint8 buffer[1024];
	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
		return B_IO_ERROR;

	uint32 offset;
	if (CheckSuperBlock(buffer, &offset) != B_OK)
		return B_BAD_VALUE;

	memcpy(superBlock, buffer + offset, sizeof(disk_super_block));
	return B_OK;
}


status_t
Volume::Initialize(int fd, const char* name, uint32 blockSize,
	uint32 flags)
{
	// although there is no really good reason for it, we won't accept
	// '/' in disk names (mkbfs does this, too, since Tracker names
	// mounted volumes after them)
	if (strchr(name, '/') != NULL)
		return B_BAD_VALUE;

	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096
		&& blockSize != 8192)
		return B_BAD_VALUE;

	DeviceOpener opener(fd, O_RDWR);
	if (opener.Device() < B_OK)
		return B_BAD_VALUE;

	if (opener.IsReadOnly())
		return B_READ_ONLY_DEVICE;

	fDevice = opener.Device();

	uint32 deviceBlockSize;
	off_t deviceSize;
	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
		return B_ERROR;

	off_t numBlocks = deviceSize / blockSize;

	// create valid super block

	fSuperBlock.Initialize(name, numBlocks, blockSize);

	// initialize shorthands to the super block (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// determine log size depending on the size of the volume
	off_t logSize = 2048;
	if (numBlocks <= 20480)
		logSize = 512;
	if (deviceSize > 1LL * 1024 * 1024 * 1024)
		logSize = 4096;

	// since the allocator has not been initialized yet, we
	// cannot use BlockAllocator::BitmapSize() here
	off_t bitmapBlocks = (numBlocks + blockSize * 8 - 1) / (blockSize * 8);

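	// the log directly follows the block bitmap, which starts at block 1
	// (block 0 contains the super block)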
	fSuperBlock.log_blocks = ToBlockRun(bitmapBlocks + 1);
	fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(logSize);
	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(
		ToBlock(Log()));

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if (!IsValidSuperBlock())
		RETURN_ERROR(B_ERROR);

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new Journal(this);
	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
		RETURN_ERROR(B_ERROR);

	// ready to write data to disk

	Transaction transaction(this, 0);
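		// the block bitmap, the root directory, and the indices below are
		// all created within this single journaled transaction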

	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
		RETURN_ERROR(B_ERROR);

	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.root_dir = ToBlockRun(id);

	if ((flags & VOLUME_NO_INDICES) == 0) {
		// The indices root directory will be created automatically
		// when the standard indices are created (or any other).
		Index index(this);
		status = index.Create(transaction, "name", B_STRING_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "BEOS:APP_SIG", B_STRING_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "size", B_INT64_TYPE);
		if (status < B_OK)
			return status;
	}

	WriteSuperBlock();
	transaction.Done();

	Sync();
	opener.RemoveCache(true);
	return B_OK;
}