xref: /haiku/src/add-ons/kernel/file_systems/bfs/Volume.cpp (revision cd552c7a15cc10c36dae8d7439ba1d6c0bb168c5)
1 /* Volume - BFS super block, mounting, etc.
2  *
3  * Copyright 2001-2006, Axel Dörfler, axeld@pinc-software.de.
4  * This file may be used under the terms of the MIT License.
5  */
6 
7 
8 #include "Debug.h"
9 #include "Volume.h"
10 #include "Journal.h"
11 #include "Inode.h"
12 #include "Query.h"
13 
14 #include <util/kernel_cpp.h>
15 #include <KernelExport.h>
16 #include <Drivers.h>
17 #include <fs_volume.h>
18 
19 #include <stdlib.h>
20 #include <stdio.h>
21 #include <string.h>
22 #include <ctype.h>
23 
24 
25 static const int32 kDesiredAllocationGroups = 56;
26 	// This is the number of allocation groups that will be tried
27 	// to be given for newly initialized disks.
28 	// That's only relevant for smaller disks, though, since any
29 	// of today's disk sizes already reach the maximum length
30 	// of an allocation group (65536 blocks).
31 	// It seems to create appropriate numbers for smaller disks
32 	// with this setting, though (i.e. you can create a 400 MB
33 	// file on a 1 GB disk without the need for double indirect
34 	// blocks).
35 
36 
37 class DeviceOpener {
38 	public:
39 		DeviceOpener(const char *device, int mode);
40 		~DeviceOpener();
41 
42 		int Open(const char *device, int mode);
43 		void *InitCache(off_t numBlocks, uint32 blockSize);
44 		void RemoveCache(bool allowWrites);
45 
46 		void Keep();
47 
48 		int Device() const { return fDevice; }
49 		int Mode() const { return fMode; }
50 
51 		status_t GetSize(off_t *_size, uint32 *_blockSize = NULL);
52 
53 	private:
54 		int		fDevice;
55 		int		fMode;
56 		void	*fBlockCache;
57 };
58 
59 
60 DeviceOpener::DeviceOpener(const char *device, int mode)
61 	:
62 	fBlockCache(NULL)
63 {
64 	Open(device, mode);
65 }
66 
67 
68 DeviceOpener::~DeviceOpener()
69 {
70 	if (fDevice >= B_OK) {
71 		RemoveCache(false);
72 		close(fDevice);
73 	}
74 }
75 
76 
77 int
78 DeviceOpener::Open(const char *device, int mode)
79 {
80 	fDevice = open(device, mode);
81 	if (fDevice < 0 && mode == O_RDWR) {
82 		// try again to open read-only (don't rely on a specific error code)
83 		return Open(device, O_RDONLY);
84 	}
85 
86 	if (fDevice >= 0) {
87 		// opening succeeded
88 		fMode = mode;
89 		if (mode == O_RDWR) {
90 			// check out if the device really allows for read/write access
91 			device_geometry geometry;
92 			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
93 				if (geometry.read_only) {
94 					// reopen device read-only
95 					close(fDevice);
96 					return Open(device, O_RDONLY);
97 				}
98 			}
99 		}
100 	}
101 
102 	return fDevice;
103 }
104 
105 
106 void *
107 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
108 {
109 	return block_cache_create(fDevice, numBlocks, blockSize);
110 }
111 
112 
113 void
114 DeviceOpener::RemoveCache(bool allowWrites)
115 {
116 	if (fBlockCache == NULL)
117 		return;
118 
119 	block_cache_delete(fBlockCache, allowWrites);
120 	fBlockCache = NULL;
121 }
122 
123 
124 void
125 DeviceOpener::Keep()
126 {
127 	fDevice = -1;
128 }
129 
130 
131 /** Returns the size of the device in bytes. It uses B_GET_GEOMETRY
132  *	to compute the size, or fstat() if that failed.
133  */
134 
135 status_t
136 DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize)
137 {
138 	device_geometry geometry;
139 	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
140 		// maybe it's just a file
141 		struct stat stat;
142 		if (fstat(fDevice, &stat) < 0)
143 			return B_ERROR;
144 
145 		if (_size)
146 			*_size = stat.st_size;
147 		if (_blockSize)	// that shouldn't cause us any problems
148 			*_blockSize = 512;
149 
150 		return B_OK;
151 	}
152 
153 	if (_size) {
154 		*_size = 1LL * geometry.head_count * geometry.cylinder_count
155 					* geometry.sectors_per_track * geometry.bytes_per_sector;
156 	}
157 	if (_blockSize)
158 		*_blockSize = geometry.bytes_per_sector;
159 
160 	return B_OK;
161 }
162 
163 
164 //	#pragma mark -
165 
166 
167 bool
168 disk_super_block::IsValid()
169 {
170 	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
171 		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
172 		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
173 		|| (int32)block_size != inode_size
174 		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
175 		|| (1UL << BlockShift()) != BlockSize()
176 		|| AllocationGroups() < 1
177 		|| AllocationGroupShift() < 1
178 		|| BlocksPerAllocationGroup() < 1
179 		|| NumBlocks() < 10
180 		|| AllocationGroups() != divide_roundup(NumBlocks(),
181 			1L << AllocationGroupShift()))
182 		return false;
183 
184 	return true;
185 }
186 
187 
188 void
189 disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize)
190 {
191 	memset(this, 0, sizeof(disk_super_block));
192 
193 	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
194 	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
195 	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
196 	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
197 	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);
198 
199 	strlcpy(name, diskName, sizeof(name));
200 
201 	int32 blockShift = 9;
202 	while ((1UL << blockShift) < blockSize) {
203 		blockShift++;
204 	}
205 
206 	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
207 	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);
208 
209 	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
210 	used_blocks = 0;
211 
212 	// Get the minimum ag_shift (that's determined by the block size)
213 
214 	int32 bitsPerBlock = blockSize << 3;
215 	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
216 	int32 blocksPerGroup = 1;
217 	int32 groupShift = 13;
218 
219 	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
220 		groupShift++;
221 	}
222 
223 	// Many allocation groups help applying allocation policies, but if
224 	// they are too small, we will need to many block_runs to cover large
225 	// files (see above to get an explanation of the kDesiredAllocationGroups
226 	// constant).
227 
228 	int32 numGroups;
229 
230 	while (true) {
231 		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
232 		if (numGroups > kDesiredAllocationGroups) {
233 			if (groupShift == 16)
234 				break;
235 
236 			groupShift++;
237 			blocksPerGroup *= 2;
238 		} else
239 			break;
240 	}
241 
242 	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
243 	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(1);
244 	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
245 }
246 
247 
248 //	#pragma mark -
249 
250 
251 Volume::Volume(mount_id id)
252 	:
253 	fID(id),
254 	fBlockAllocator(this),
255 	fLock("bfs volume"),
256 	fRootNode(NULL),
257 	fIndicesNode(NULL),
258 	fDirtyCachedBlocks(0),
259 	fUniqueID(0),
260 	fFlags(0)
261 {
262 }
263 
264 
265 Volume::~Volume()
266 {
267 }
268 
269 
270 bool
271 Volume::IsValidSuperBlock()
272 {
273 	return fSuperBlock.IsValid();
274 }
275 
276 
277 void
278 Volume::Panic()
279 {
280 	FATAL(("we have to panic... switch to read-only mode!\n"));
281 	fFlags |= VOLUME_READ_ONLY;
282 #ifdef USER
283 	debugger("BFS panics!");
284 #elif defined(DEBUG)
285 	kernel_debugger("BFS panics!");
286 #endif
287 }
288 
289 
290 status_t
291 Volume::Mount(const char *deviceName, uint32 flags)
292 {
293 	// ToDo: validate the FS in write mode as well!
294 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
295 	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
296 	// in big endian mode, we only mount read-only for now
297 	flags |= B_MOUNT_READ_ONLY;
298 #endif
299 
300 	DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR);
301 	fDevice = opener.Device();
302 	if (fDevice < B_OK)
303 		RETURN_ERROR(fDevice);
304 
305 	if (opener.Mode() == O_RDONLY)
306 		fFlags |= VOLUME_READ_ONLY;
307 
308 	// check if it's a regular file, and if so, disable the cache for the
309 	// underlaying file system
310 	struct stat stat;
311 	if (fstat(fDevice, &stat) < 0)
312 		RETURN_ERROR(B_ERROR);
313 
314 // TODO: allow turning off caching of the underlying file (once O_NOCACHE works)
315 #if 0
316 #ifndef NO_FILE_UNCACHED_IO
317 	if ((stat.st_mode & S_FILE) != 0 && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) {
318 		// mount read-only if the cache couldn't be disabled
319 #	ifdef DEBUG
320 		FATAL(("couldn't disable cache for image file - system may dead-lock!\n"));
321 #	else
322 		FATAL(("couldn't disable cache for image file!\n"));
323 		Panic();
324 #	endif
325 	}
326 #endif
327 #endif
328 
329 	// read the super block
330 	if (Identify(fDevice, &fSuperBlock) != B_OK) {
331 		FATAL(("invalid super block!\n"));
332 		return B_BAD_VALUE;
333 	}
334 
335 	// initialize short hands to the super block (to save byte swapping)
336 	fBlockSize = fSuperBlock.BlockSize();
337 	fBlockShift = fSuperBlock.BlockShift();
338 	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
339 
340 	// check if the device size is large enough to hold the file system
341 	off_t diskSize;
342 	if (opener.GetSize(&diskSize) < B_OK)
343 		RETURN_ERROR(B_ERROR);
344 	if (diskSize < (NumBlocks() << BlockShift()))
345 		RETURN_ERROR(B_BAD_VALUE);
346 
347 	// set the current log pointers, so that journaling will work correctly
348 	fLogStart = fSuperBlock.LogStart();
349 	fLogEnd = fSuperBlock.LogEnd();
350 
351 	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
352 		return B_ERROR;
353 
354 	fJournal = new Journal(this);
355 	// replaying the log is the first thing we will do on this disk
356 	if (fJournal && fJournal->InitCheck() < B_OK
357 		|| fBlockAllocator.Initialize() < B_OK) {
358 		// ToDo: improve error reporting for a bad journal
359 		FATAL(("could not initialize journal/block bitmap allocator!\n"));
360 		return B_NO_MEMORY;
361 	}
362 
363 	status_t status = B_OK;
364 
365 	fRootNode = new Inode(this, ToVnode(Root()));
366 	if (fRootNode && fRootNode->InitCheck() == B_OK) {
367 		status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode);
368 		if (status == B_OK) {
369 			// try to get indices root dir
370 
371 			// question: why doesn't get_vnode() work here??
372 			// answer: we have not yet backpropagated the pointer to the
373 			// volume in bfs_mount(), so bfs_read_vnode() can't get it.
374 			// But it's not needed to do that anyway.
375 
376 			if (!Indices().IsZero())
377 				fIndicesNode = new Inode(this, ToVnode(Indices()));
378 
379 			if (fIndicesNode == NULL
380 				|| fIndicesNode->InitCheck() < B_OK
381 				|| !fIndicesNode->IsContainer()) {
382 				INFORM(("bfs: volume doesn't have indices!\n"));
383 
384 				if (fIndicesNode) {
385 					// if this is the case, the index root node is gone bad, and
386 					// BFS switch to read-only mode
387 					fFlags |= VOLUME_READ_ONLY;
388 					delete fIndicesNode;
389 					fIndicesNode = NULL;
390 				}
391 			}
392 
393 			// all went fine
394 			opener.Keep();
395 			return B_OK;
396 		} else
397 			FATAL(("could not create root node: publish_vnode() failed!\n"));
398 
399 		delete fRootNode;
400 	} else {
401 		status = B_BAD_VALUE;
402 		FATAL(("could not create root node!\n"));
403 	}
404 
405 	return status;
406 }
407 
408 
409 status_t
410 Volume::Unmount()
411 {
412 	// Unlike in BeOS, we need to put the reference to our root node ourselves
413 	put_vnode(fID, ToVnode(Root()));
414 
415 	// This will also flush the log & all blocks to disk
416 	delete fJournal;
417 	fJournal = NULL;
418 
419 	delete fIndicesNode;
420 
421 	block_cache_delete(fBlockCache, !IsReadOnly());
422 	close(fDevice);
423 
424 	return B_OK;
425 }
426 
427 
428 status_t
429 Volume::Sync()
430 {
431 	return fJournal->FlushLogAndBlocks();
432 }
433 
434 
435 status_t
436 Volume::ValidateBlockRun(block_run run)
437 {
438 	if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups()
439 		|| run.Start() > (1UL << AllocationGroupShift())
440 		|| run.length == 0
441 		|| uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) {
442 		Panic();
443 		FATAL(("*** invalid run(%ld,%d,%d)\n", run.AllocationGroup(), run.Start(), run.Length()));
444 		return B_BAD_DATA;
445 	}
446 	return B_OK;
447 }
448 
449 
450 block_run
451 Volume::ToBlockRun(off_t block) const
452 {
453 	block_run run;
454 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift());
455 	run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1));
456 	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
457 	return run;
458 }
459 
460 
461 status_t
462 Volume::CreateIndicesRoot(Transaction &transaction)
463 {
464 	off_t id;
465 	status_t status = Inode::Create(transaction, NULL, NULL,
466 		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, &id, &fIndicesNode);
467 	if (status < B_OK)
468 		RETURN_ERROR(status);
469 
470 	fSuperBlock.indices = ToBlockRun(id);
471 	return WriteSuperBlock();
472 }
473 
474 
475 status_t
476 Volume::AllocateForInode(Transaction &transaction, const Inode *parent, mode_t type, block_run &run)
477 {
478 	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run);
479 }
480 
481 
482 status_t
483 Volume::WriteSuperBlock()
484 {
485 	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block))
486 		return B_IO_ERROR;
487 
488 	return B_OK;
489 }
490 
491 
492 void
493 Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, const uint8 *oldKey,
494 	size_t oldLength, const uint8 *newKey, size_t newLength)
495 {
496 	if (fQueryLock.Lock() < B_OK)
497 		return;
498 
499 	Query *query = NULL;
500 	while ((query = fQueries.Next(query)) != NULL)
501 		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength);
502 
503 	fQueryLock.Unlock();
504 }
505 
506 
507 /** Checks if there is a live query whose results depend on the presence
508  *	or value of the specified attribute.
509  *	Don't use it if you already have all the data together to evaluate
510  *	the queries - it wouldn't safe you anything in this case.
511  */
512 
513 bool
514 Volume::CheckForLiveQuery(const char *attribute)
515 {
516 	// ToDo: check for a live query that depends on the specified attribute
517 	return true;
518 }
519 
520 
521 void
522 Volume::AddQuery(Query *query)
523 {
524 	if (fQueryLock.Lock() < B_OK)
525 		return;
526 
527 	fQueries.Add(query);
528 
529 	fQueryLock.Unlock();
530 }
531 
532 
533 void
534 Volume::RemoveQuery(Query *query)
535 {
536 	if (fQueryLock.Lock() < B_OK)
537 		return;
538 
539 	fQueries.Remove(query);
540 
541 	fQueryLock.Unlock();
542 }
543 
544 
545 //	#pragma mark -
546 //	Disk scanning and initialization
547 
548 
549 status_t
550 Volume::Identify(int fd, disk_super_block *superBlock)
551 {
552 	char buffer[1024];
553 	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
554 		return B_IO_ERROR;
555 
556 	// Note: that does work only for x86, for PowerPC, the super block
557 	// may be located at offset 0!
558 	memcpy(superBlock, buffer + 512, sizeof(disk_super_block));
559 	if (!superBlock->IsValid()) {
560 #ifndef BFS_LITTLE_ENDIAN_ONLY
561 		memcpy(superBlock, buffer, sizeof(disk_super_block));
562 		if (!superBlock->IsValid())
563 			return B_BAD_VALUE;
564 #else
565 		return B_BAD_VALUE;
566 #endif
567 	}
568 
569 	return B_OK;
570 }
571 
572 #ifdef USER
573 extern "C" void kill_device_vnodes(dev_t id);
574 	// This call is only available in the userland fs_shell
575 
576 status_t
577 Volume::Initialize(const char *device, const char *name, uint32 blockSize, uint32 flags)
578 {
579 	// although there is no really good reason for it, we won't
580 	// accept '/' in disk names (mkbfs does this, too - and since
581 	// Tracker names mounted volumes like their name)
582 	if (strchr(name, '/') != NULL)
583 		return B_BAD_VALUE;
584 
585 	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 && blockSize != 8192)
586 		return B_BAD_VALUE;
587 
588 	DeviceOpener opener(device, O_RDWR);
589 	if (opener.Device() < B_OK)
590 		return B_BAD_VALUE;
591 
592 	fDevice = opener.Device();
593 
594 	uint32 deviceBlockSize;
595 	off_t deviceSize;
596 	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
597 		return B_ERROR;
598 
599 	off_t numBlocks = deviceSize / blockSize;
600 
601 	// create valid super block
602 
603 	fSuperBlock.Initialize(name, numBlocks, blockSize);
604 
605 	// initialize short hands to the super block (to save byte swapping)
606 	fBlockSize = fSuperBlock.BlockSize();
607 	fBlockShift = fSuperBlock.BlockShift();
608 	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
609 
610 	// since the allocator has not been initialized yet, we
611 	// cannot use BlockAllocator::BitmapSize() here
612 	fSuperBlock.log_blocks = ToBlockRun(AllocationGroups()
613 		* fSuperBlock.BlocksPerAllocationGroup() + 1);
614 	fSuperBlock.log_blocks.length = 2048;
615 		// ToDo: set the log size depending on the disk size
616 	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(ToBlock(Log()));
617 
618 	// set the current log pointers, so that journaling will work correctly
619 	fLogStart = fSuperBlock.LogStart();
620 	fLogEnd = fSuperBlock.LogEnd();
621 
622 	if (!IsValidSuperBlock())
623 		RETURN_ERROR(B_ERROR);
624 
625 	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
626 		return B_ERROR;
627 
628 	fJournal = new Journal(this);
629 	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
630 		RETURN_ERROR(B_ERROR);
631 
632 	// ready to write data to disk
633 
634 	Transaction transaction(this, 0);
635 
636 	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
637 		RETURN_ERROR(B_ERROR);
638 
639 	off_t id;
640 	status_t status = Inode::Create(transaction, NULL, NULL,
641 		S_DIRECTORY | 0755, 0, 0, &id, &fRootNode);
642 	if (status < B_OK)
643 		RETURN_ERROR(status);
644 
645 	fSuperBlock.root_dir = ToBlockRun(id);
646 
647 	if ((flags & VOLUME_NO_INDICES) == 0) {
648 		// The indices root directory will be created automatically
649 		// when the standard indices are created (or any other).
650 		Index index(this);
651 		status = index.Create(transaction, "name", B_STRING_TYPE);
652 		if (status < B_OK)
653 			return status;
654 
655 		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
656 		if (status < B_OK)
657 			return status;
658 
659 		status = index.Create(transaction, "size", B_INT64_TYPE);
660 		if (status < B_OK)
661 			return status;
662 	}
663 
664 	WriteSuperBlock();
665 	transaction.Done();
666 
667 	put_vnode(ID(), fRootNode->ID());
668 	if (fIndicesNode != NULL)
669 		put_vnode(ID(), fIndicesNode->ID());
670 
671 	kill_device_vnodes(ID());
672 		// This call is only available in the userland fs_shell
673 
674 	Sync();
675 	opener.RemoveCache(true);
676 	return B_OK;
677 }
678 #endif
679