xref: /haiku/src/add-ons/kernel/file_systems/bfs/Volume.cpp (revision 9d6d3fcf5fe8308cd020cecf89dede440346f8c4)
1 /* Volume - BFS super block, mounting, etc.
2  *
3  * Copyright 2001-2006, Axel Dörfler, axeld@pinc-software.de.
4  * This file may be used under the terms of the MIT License.
5  */
6 
7 
8 #include "Debug.h"
9 #include "Volume.h"
10 #include "Journal.h"
11 #include "Inode.h"
12 #include "Query.h"
13 
14 #include <util/kernel_cpp.h>
15 #include <KernelExport.h>
16 #include <Drivers.h>
17 #include <fs_volume.h>
18 
19 #include <ctype.h>
20 #include <errno.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 
25 
26 static const int32 kDesiredAllocationGroups = 56;
27 	// This is the number of allocation groups that will be tried
28 	// to be given for newly initialized disks.
29 	// That's only relevant for smaller disks, though, since any
30 	// of today's disk sizes already reach the maximum length
31 	// of an allocation group (65536 blocks).
32 	// It seems to create appropriate numbers for smaller disks
33 	// with this setting, though (i.e. you can create a 400 MB
34 	// file on a 1 GB disk without the need for double indirect
35 	// blocks).
36 
37 
38 class DeviceOpener {
39 	public:
40 		DeviceOpener(const char *device, int mode);
41 		~DeviceOpener();
42 
43 		int Open(const char *device, int mode);
44 		void *InitCache(off_t numBlocks, uint32 blockSize);
45 		void RemoveCache(bool allowWrites);
46 
47 		void Keep();
48 
49 		int Device() const { return fDevice; }
50 		int Mode() const { return fMode; }
51 
52 		status_t GetSize(off_t *_size, uint32 *_blockSize = NULL);
53 
54 	private:
55 		int		fDevice;
56 		int		fMode;
57 		void	*fBlockCache;
58 };
59 
60 
61 DeviceOpener::DeviceOpener(const char *device, int mode)
62 	:
63 	fBlockCache(NULL)
64 {
65 	Open(device, mode);
66 }
67 
68 
69 DeviceOpener::~DeviceOpener()
70 {
71 	if (fDevice >= B_OK) {
72 		RemoveCache(false);
73 		close(fDevice);
74 	}
75 }
76 
77 
78 int
79 DeviceOpener::Open(const char *device, int mode)
80 {
81 	fDevice = open(device, mode);
82 	if (fDevice < 0)
83 		fDevice = errno;
84 
85 	if (fDevice < 0 && mode == O_RDWR) {
86 		// try again to open read-only (don't rely on a specific error code)
87 		return Open(device, O_RDONLY);
88 	}
89 
90 	if (fDevice >= 0) {
91 		// opening succeeded
92 		fMode = mode;
93 		if (mode == O_RDWR) {
94 			// check out if the device really allows for read/write access
95 			device_geometry geometry;
96 			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
97 				if (geometry.read_only) {
98 					// reopen device read-only
99 					close(fDevice);
100 					return Open(device, O_RDONLY);
101 				}
102 			}
103 		}
104 	}
105 
106 	return fDevice;
107 }
108 
109 
110 void *
111 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
112 {
113 	return block_cache_create(fDevice, numBlocks, blockSize, fMode == O_RDONLY);
114 }
115 
116 
117 void
118 DeviceOpener::RemoveCache(bool allowWrites)
119 {
120 	if (fBlockCache == NULL)
121 		return;
122 
123 	block_cache_delete(fBlockCache, allowWrites);
124 	fBlockCache = NULL;
125 }
126 
127 
128 void
129 DeviceOpener::Keep()
130 {
131 	fDevice = -1;
132 }
133 
134 
135 /** Returns the size of the device in bytes. It uses B_GET_GEOMETRY
136  *	to compute the size, or fstat() if that failed.
137  */
138 
139 status_t
140 DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize)
141 {
142 	device_geometry geometry;
143 	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
144 		// maybe it's just a file
145 		struct stat stat;
146 		if (fstat(fDevice, &stat) < 0)
147 			return B_ERROR;
148 
149 		if (_size)
150 			*_size = stat.st_size;
151 		if (_blockSize)	// that shouldn't cause us any problems
152 			*_blockSize = 512;
153 
154 		return B_OK;
155 	}
156 
157 	if (_size) {
158 		*_size = 1LL * geometry.head_count * geometry.cylinder_count
159 			* geometry.sectors_per_track * geometry.bytes_per_sector;
160 	}
161 	if (_blockSize)
162 		*_blockSize = geometry.bytes_per_sector;
163 
164 	return B_OK;
165 }
166 
167 
168 //	#pragma mark -
169 
170 
171 bool
172 disk_super_block::IsValid()
173 {
174 	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
175 		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
176 		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
177 		|| (int32)block_size != inode_size
178 		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
179 		|| (1UL << BlockShift()) != BlockSize()
180 		|| AllocationGroups() < 1
181 		|| AllocationGroupShift() < 1
182 		|| BlocksPerAllocationGroup() < 1
183 		|| NumBlocks() < 10
184 		|| AllocationGroups() != divide_roundup(NumBlocks(),
185 			1L << AllocationGroupShift()))
186 		return false;
187 
188 	return true;
189 }
190 
191 
192 void
193 disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize)
194 {
195 	memset(this, 0, sizeof(disk_super_block));
196 
197 	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
198 	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
199 	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
200 	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
201 	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);
202 
203 	strlcpy(name, diskName, sizeof(name));
204 
205 	int32 blockShift = 9;
206 	while ((1UL << blockShift) < blockSize) {
207 		blockShift++;
208 	}
209 
210 	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
211 	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);
212 
213 	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
214 	used_blocks = 0;
215 
216 	// Get the minimum ag_shift (that's determined by the block size)
217 
218 	int32 bitsPerBlock = blockSize << 3;
219 	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
220 	int32 blocksPerGroup = 1;
221 	int32 groupShift = 13;
222 
223 	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
224 		groupShift++;
225 	}
226 
227 	// Many allocation groups help applying allocation policies, but if
228 	// they are too small, we will need to many block_runs to cover large
229 	// files (see above to get an explanation of the kDesiredAllocationGroups
230 	// constant).
231 
232 	int32 numGroups;
233 
234 	while (true) {
235 		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
236 		if (numGroups > kDesiredAllocationGroups) {
237 			if (groupShift == 16)
238 				break;
239 
240 			groupShift++;
241 			blocksPerGroup *= 2;
242 		} else
243 			break;
244 	}
245 
246 	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
247 	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(1);
248 	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
249 }
250 
251 
252 //	#pragma mark -
253 
254 
255 Volume::Volume(mount_id id)
256 	:
257 	fID(id),
258 	fBlockAllocator(this),
259 	fLock("bfs volume"),
260 	fRootNode(NULL),
261 	fIndicesNode(NULL),
262 	fDirtyCachedBlocks(0),
263 	fUniqueID(0),
264 	fFlags(0)
265 {
266 }
267 
268 
269 Volume::~Volume()
270 {
271 }
272 
273 
274 bool
275 Volume::IsValidSuperBlock()
276 {
277 	return fSuperBlock.IsValid();
278 }
279 
280 
281 void
282 Volume::Panic()
283 {
284 	FATAL(("we have to panic... switch to read-only mode!\n"));
285 	fFlags |= VOLUME_READ_ONLY;
286 #ifdef USER
287 	debugger("BFS panics!");
288 #elif defined(DEBUG)
289 	kernel_debugger("BFS panics!");
290 #endif
291 }
292 
293 
294 status_t
295 Volume::Mount(const char *deviceName, uint32 flags)
296 {
297 	// ToDo: validate the FS in write mode as well!
298 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
299 	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
300 	// in big endian mode, we only mount read-only for now
301 	flags |= B_MOUNT_READ_ONLY;
302 #endif
303 
304 	DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR);
305 	fDevice = opener.Device();
306 	if (fDevice < B_OK)
307 		RETURN_ERROR(fDevice);
308 
309 	if (opener.Mode() == O_RDONLY)
310 		fFlags |= VOLUME_READ_ONLY;
311 
312 	// check if it's a regular file, and if so, disable the cache for the
313 	// underlaying file system
314 	struct stat stat;
315 	if (fstat(fDevice, &stat) < 0)
316 		RETURN_ERROR(B_ERROR);
317 
318 // TODO: allow turning off caching of the underlying file (once O_NOCACHE works)
319 #if 0
320 #ifndef NO_FILE_UNCACHED_IO
321 	if ((stat.st_mode & S_FILE) != 0 && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) {
322 		// mount read-only if the cache couldn't be disabled
323 #	ifdef DEBUG
324 		FATAL(("couldn't disable cache for image file - system may dead-lock!\n"));
325 #	else
326 		FATAL(("couldn't disable cache for image file!\n"));
327 		Panic();
328 #	endif
329 	}
330 #endif
331 #endif
332 
333 	// read the super block
334 	if (Identify(fDevice, &fSuperBlock) != B_OK) {
335 		FATAL(("invalid super block!\n"));
336 		return B_BAD_VALUE;
337 	}
338 
339 	// initialize short hands to the super block (to save byte swapping)
340 	fBlockSize = fSuperBlock.BlockSize();
341 	fBlockShift = fSuperBlock.BlockShift();
342 	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
343 
344 	// check if the device size is large enough to hold the file system
345 	off_t diskSize;
346 	if (opener.GetSize(&diskSize) < B_OK)
347 		RETURN_ERROR(B_ERROR);
348 	if (diskSize < (NumBlocks() << BlockShift()))
349 		RETURN_ERROR(B_BAD_VALUE);
350 
351 	// set the current log pointers, so that journaling will work correctly
352 	fLogStart = fSuperBlock.LogStart();
353 	fLogEnd = fSuperBlock.LogEnd();
354 
355 	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
356 		return B_ERROR;
357 
358 	fJournal = new Journal(this);
359 	// replaying the log is the first thing we will do on this disk
360 	if (fJournal && fJournal->InitCheck() < B_OK
361 		|| fBlockAllocator.Initialize() < B_OK) {
362 		// ToDo: improve error reporting for a bad journal
363 		FATAL(("could not initialize journal/block bitmap allocator!\n"));
364 		return B_NO_MEMORY;
365 	}
366 
367 	status_t status = B_OK;
368 
369 	fRootNode = new Inode(this, ToVnode(Root()));
370 	if (fRootNode && fRootNode->InitCheck() == B_OK) {
371 		status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode);
372 		if (status == B_OK) {
373 			// try to get indices root dir
374 
375 			// question: why doesn't get_vnode() work here??
376 			// answer: we have not yet backpropagated the pointer to the
377 			// volume in bfs_mount(), so bfs_read_vnode() can't get it.
378 			// But it's not needed to do that anyway.
379 
380 			if (!Indices().IsZero())
381 				fIndicesNode = new Inode(this, ToVnode(Indices()));
382 
383 			if (fIndicesNode == NULL
384 				|| fIndicesNode->InitCheck() < B_OK
385 				|| !fIndicesNode->IsContainer()) {
386 				INFORM(("bfs: volume doesn't have indices!\n"));
387 
388 				if (fIndicesNode) {
389 					// if this is the case, the index root node is gone bad, and
390 					// BFS switch to read-only mode
391 					fFlags |= VOLUME_READ_ONLY;
392 					delete fIndicesNode;
393 					fIndicesNode = NULL;
394 				}
395 			}
396 
397 			// all went fine
398 			opener.Keep();
399 			return B_OK;
400 		} else
401 			FATAL(("could not create root node: publish_vnode() failed!\n"));
402 
403 		delete fRootNode;
404 	} else {
405 		status = B_BAD_VALUE;
406 		FATAL(("could not create root node!\n"));
407 	}
408 
409 	return status;
410 }
411 
412 
413 status_t
414 Volume::Unmount()
415 {
416 	// Unlike in BeOS, we need to put the reference to our root node ourselves
417 	put_vnode(fID, ToVnode(Root()));
418 
419 	// This will also flush the log & all blocks to disk
420 	delete fJournal;
421 	fJournal = NULL;
422 
423 	delete fIndicesNode;
424 
425 	block_cache_delete(fBlockCache, !IsReadOnly());
426 	close(fDevice);
427 
428 	return B_OK;
429 }
430 
431 
432 status_t
433 Volume::Sync()
434 {
435 	return fJournal->FlushLogAndBlocks();
436 }
437 
438 
439 status_t
440 Volume::ValidateBlockRun(block_run run)
441 {
442 	if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups()
443 		|| run.Start() > (1UL << AllocationGroupShift())
444 		|| run.length == 0
445 		|| uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) {
446 		Panic();
447 		FATAL(("*** invalid run(%ld,%d,%d)\n", run.AllocationGroup(), run.Start(), run.Length()));
448 		return B_BAD_DATA;
449 	}
450 	return B_OK;
451 }
452 
453 
454 block_run
455 Volume::ToBlockRun(off_t block) const
456 {
457 	block_run run;
458 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift());
459 	run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1));
460 	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
461 	return run;
462 }
463 
464 
465 status_t
466 Volume::CreateIndicesRoot(Transaction &transaction)
467 {
468 	off_t id;
469 	status_t status = Inode::Create(transaction, NULL, NULL,
470 		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, &id, &fIndicesNode);
471 	if (status < B_OK)
472 		RETURN_ERROR(status);
473 
474 	fSuperBlock.indices = ToBlockRun(id);
475 	return WriteSuperBlock();
476 }
477 
478 
479 status_t
480 Volume::AllocateForInode(Transaction &transaction, const Inode *parent, mode_t type, block_run &run)
481 {
482 	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run);
483 }
484 
485 
486 status_t
487 Volume::WriteSuperBlock()
488 {
489 	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block))
490 		return B_IO_ERROR;
491 
492 	return B_OK;
493 }
494 
495 
496 void
497 Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, const uint8 *oldKey,
498 	size_t oldLength, const uint8 *newKey, size_t newLength)
499 {
500 	if (fQueryLock.Lock() < B_OK)
501 		return;
502 
503 	Query *query = NULL;
504 	while ((query = fQueries.Next(query)) != NULL)
505 		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength);
506 
507 	fQueryLock.Unlock();
508 }
509 
510 
511 /** Checks if there is a live query whose results depend on the presence
512  *	or value of the specified attribute.
513  *	Don't use it if you already have all the data together to evaluate
514  *	the queries - it wouldn't safe you anything in this case.
515  */
516 
517 bool
518 Volume::CheckForLiveQuery(const char *attribute)
519 {
520 	// ToDo: check for a live query that depends on the specified attribute
521 	return true;
522 }
523 
524 
525 void
526 Volume::AddQuery(Query *query)
527 {
528 	if (fQueryLock.Lock() < B_OK)
529 		return;
530 
531 	fQueries.Add(query);
532 
533 	fQueryLock.Unlock();
534 }
535 
536 
537 void
538 Volume::RemoveQuery(Query *query)
539 {
540 	if (fQueryLock.Lock() < B_OK)
541 		return;
542 
543 	fQueries.Remove(query);
544 
545 	fQueryLock.Unlock();
546 }
547 
548 
549 //	#pragma mark -
550 //	Disk scanning and initialization
551 
552 
553 status_t
554 Volume::Identify(int fd, disk_super_block *superBlock)
555 {
556 	char buffer[1024];
557 	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
558 		return B_IO_ERROR;
559 
560 	// Note: that does work only for x86, for PowerPC, the super block
561 	// may be located at offset 0!
562 	memcpy(superBlock, buffer + 512, sizeof(disk_super_block));
563 	if (!superBlock->IsValid()) {
564 #ifndef BFS_LITTLE_ENDIAN_ONLY
565 		memcpy(superBlock, buffer, sizeof(disk_super_block));
566 		if (!superBlock->IsValid())
567 			return B_BAD_VALUE;
568 #else
569 		return B_BAD_VALUE;
570 #endif
571 	}
572 
573 	return B_OK;
574 }
575 
#ifdef USER
extern "C" void kill_device_vnodes(dev_t id);
	// This call is only available in the userland fs_shell

/**	Creates a new, empty file system on the specified \a device.
 *	\a name becomes the name of the volume (it must not contain a '/'),
 *	and \a blockSize must be one of 1024, 2048, 4096, or 8192. Unless
 *	VOLUME_NO_INDICES is set in \a flags, the "name", "last_modified",
 *	and "size" indices are created as well.
 *	Returns B_OK on success, and an error code otherwise - this includes
 *	failures to write the super block, or to flush the changes to disk.
 */

status_t
Volume::Initialize(const char *device, const char *name, uint32 blockSize, uint32 flags)
{
	// although there is no really good reason for it, we won't
	// accept '/' in disk names (mkbfs does this, too - and since
	// Tracker names mounted volumes like their name)
	if (strchr(name, '/') != NULL)
		return B_BAD_VALUE;

	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 && blockSize != 8192)
		return B_BAD_VALUE;

	DeviceOpener opener(device, O_RDWR);
	if (opener.Device() < B_OK)
		return B_BAD_VALUE;

	fDevice = opener.Device();

	off_t deviceSize;
	if (opener.GetSize(&deviceSize) < B_OK)
		return B_ERROR;

	off_t numBlocks = deviceSize / blockSize;

	// create valid super block

	fSuperBlock.Initialize(name, numBlocks, blockSize);

	// initialize short hands to the super block (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// since the allocator has not been initialized yet, we
	// cannot use BlockAllocator::BitmapSize() here
	fSuperBlock.log_blocks = ToBlockRun(AllocationGroups()
		* fSuperBlock.BlocksPerAllocationGroup() + 1);
	fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048);
		// ToDo: set the log size depending on the disk size
	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(ToBlock(Log()));

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if (!IsValidSuperBlock())
		RETURN_ERROR(B_ERROR);

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new Journal(this);
	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
		RETURN_ERROR(B_ERROR);

	// ready to write data to disk

	Transaction transaction(this, 0);

	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
		RETURN_ERROR(B_ERROR);

	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_DIRECTORY | 0755, 0, 0, &id, &fRootNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.root_dir = ToBlockRun(id);

	if ((flags & VOLUME_NO_INDICES) == 0) {
		// The indices root directory will be created automatically
		// when the standard indices are created (or any other).
		Index index(this);
		status = index.Create(transaction, "name", B_STRING_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "size", B_INT64_TYPE);
		if (status < B_OK)
			return status;
	}

	// Remember a failure to write the super block, but run through the
	// rest of the sequence unchanged, so that the references are put in
	// any case
	status = WriteSuperBlock();
	transaction.Done();

	put_vnode(ID(), fRootNode->ID());
	if (fIndicesNode != NULL)
		put_vnode(ID(), fIndicesNode->ID());

	kill_device_vnodes(ID());
		// This call is only available in the userland fs_shell

	status_t syncStatus = Sync();
	opener.RemoveCache(true);

	// report the first failure, if there was any
	if (status >= B_OK && syncStatus < B_OK)
		status = syncStatus;
	if (status < B_OK)
		RETURN_ERROR(status);

	return B_OK;
}
#endif
683