xref: /haiku/src/add-ons/kernel/file_systems/bfs/Volume.cpp (revision b028e77473189065f2baefc6f5e10d451cf591e2)
/*
 * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */

//! super block, mounting, etc.


#include "Debug.h"
#include "Volume.h"
#include "Journal.h"
#include "Inode.h"
#include "Query.h"


static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups we aim for when
	// initializing a new disk.
	// It's only relevant for smaller disks, though, since any of
	// today's disk sizes already reach the maximum length of an
	// allocation group (65536 blocks).
	// This setting produces reasonable numbers for smaller disks
	// nevertheless (i.e. you can create a 400 MB file on a 1 GB
	// disk without needing double indirect blocks).


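// Small RAII helper used by Mount() and Initialize(): it opens the given
// device (or file) on construction and closes it again in the destructor,
// unless Keep() is called to hand ownership of the descriptor to the caller.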
class DeviceOpener {
	public:
		DeviceOpener(int fd, int mode);
		DeviceOpener(const char *device, int mode);
		~DeviceOpener();

		int Open(const char *device, int mode);
		int Open(int fd, int mode);
		void *InitCache(off_t numBlocks, uint32 blockSize);
		void RemoveCache(bool allowWrites);

		void Keep();

		int Device() const { return fDevice; }
		int Mode() const { return fMode; }

		status_t GetSize(off_t *_size, uint32 *_blockSize = NULL);

	private:
		int		fDevice;
		int		fMode;
		void	*fBlockCache;
};


DeviceOpener::DeviceOpener(const char *device, int mode)
	:
	fBlockCache(NULL)
{
	Open(device, mode);
}


DeviceOpener::DeviceOpener(int fd, int mode)
	:
	fBlockCache(NULL)
{
	Open(fd, mode);
}


DeviceOpener::~DeviceOpener()
{
	if (fDevice >= B_OK) {
		RemoveCache(false);
		close(fDevice);
	}
}


int
DeviceOpener::Open(const char *device, int mode)
{
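	// open() reports failure by returning -1; errno then holds the actual
	// (on Haiku, negative) error code, which is what we hand to our caller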
	fDevice = open(device, mode);
	if (fDevice < 0)
		fDevice = errno;

	if (fDevice < 0 && mode == O_RDWR) {
		// try again to open read-only (don't rely on a specific error code)
		return Open(device, O_RDONLY);
	}

	if (fDevice >= 0) {
		// opening succeeded
		fMode = mode;
		if (mode == O_RDWR) {
			// check whether the device really allows read/write access
			device_geometry geometry;
			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
				if (geometry.read_only) {
					// reopen device read-only
					close(fDevice);
					return Open(device, O_RDONLY);
				}
			}
		}
	}

	return fDevice;
}


int
DeviceOpener::Open(int fd, int mode)
{
	fDevice = dup(fd);
	if (fDevice < 0)
		return errno;

	fMode = mode;

	return fDevice;
}


void *
DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
{
	return fBlockCache = block_cache_create(fDevice, numBlocks, blockSize,
		fMode == O_RDONLY);
}


void
DeviceOpener::RemoveCache(bool allowWrites)
{
	if (fBlockCache == NULL)
		return;

	block_cache_delete(fBlockCache, allowWrites);
	fBlockCache = NULL;
}


void
DeviceOpener::Keep()
{
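	// hand the descriptor (and the block cache) over to the caller; the
	// destructor will then no longer close or delete anything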
	fDevice = -1;
}


/** Returns the size of the device in bytes. It uses B_GET_GEOMETRY
 *	to compute the size, or fstat() if that fails.
 */

status_t
DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize)
{
	device_geometry geometry;
	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
		// maybe it's just a file
		struct stat stat;
		if (fstat(fDevice, &stat) < 0)
			return B_ERROR;

		if (_size)
			*_size = stat.st_size;
		if (_blockSize)	// that shouldn't cause us any problems
			*_blockSize = 512;

		return B_OK;
	}

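	// the 1LL factor below forces the multiplication into 64 bit, so the
	// computed size doesn't overflow for disks larger than 4 GB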
	if (_size) {
		*_size = 1LL * geometry.head_count * geometry.cylinder_count
			* geometry.sectors_per_track * geometry.bytes_per_sector;
	}
	if (_blockSize)
		*_blockSize = geometry.bytes_per_sector;

	return B_OK;
}


//	#pragma mark -


bool
disk_super_block::IsValid()
{
	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
		|| (int32)block_size != inode_size
		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
		|| (1UL << BlockShift()) != BlockSize()
		|| AllocationGroups() < 1
		|| AllocationGroupShift() < 1
		|| BlocksPerAllocationGroup() < 1
		|| NumBlocks() < 10
		|| AllocationGroups() != divide_roundup(NumBlocks(),
			1L << AllocationGroupShift()))
		return false;

	return true;
}


void
disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize)
{
	memset(this, 0, sizeof(disk_super_block));

	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

	strlcpy(name, diskName, sizeof(name));

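	// compute the block shift, i.e. log2(blockSize) - for example, a block
	// size of 2048 bytes results in a block shift of 11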
	int32 blockShift = 9;
	while ((1UL << blockShift) < blockSize) {
		blockShift++;
	}

	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);

	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
	used_blocks = 0;

	// Get the minimum ag_shift (that's determined by the block size)

	int32 bitsPerBlock = blockSize << 3;
	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
	int32 blocksPerGroup = 1;
	int32 groupShift = 13;

	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
		groupShift++;
	}

	// Many allocation groups help in applying allocation policies, but if
	// they are too small, we will need too many block_runs to cover large
	// files (see above for an explanation of the kDesiredAllocationGroups
	// constant).
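	// For example, with 1 KiB blocks one bitmap block covers 8192 blocks,
	// so a 1 GB disk (2^20 blocks) needs 128 bitmap blocks; the loop below
	// then settles on 32 groups of 4 bitmap blocks (32768 blocks) each.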

	int32 numGroups;

	while (true) {
		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
		if (numGroups > kDesiredAllocationGroups) {
			if (groupShift == 16)
				break;

			groupShift++;
			blocksPerGroup *= 2;
		} else
			break;
	}

	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup);
	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
}


//	#pragma mark -


Volume::Volume(dev_t id)
	:
	fID(id),
	fBlockAllocator(this),
	fLock("bfs volume"),
	fRootNode(NULL),
	fIndicesNode(NULL),
	fDirtyCachedBlocks(0),
	fUniqueID(0),
	fFlags(0)
{
}


Volume::~Volume()
{
}


bool
Volume::IsValidSuperBlock()
{
	return fSuperBlock.IsValid();
}


void
Volume::Panic()
{
	FATAL(("we have to panic... switch to read-only mode!\n"));
	fFlags |= VOLUME_READ_ONLY;
#ifdef DEBUG
	kernel_debugger("BFS panics!");
#endif
}


status_t
Volume::Mount(const char *deviceName, uint32 flags)
{
	// ToDo: validate the FS in write mode as well!
#if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
	// in big endian mode, we only mount read-only for now
	flags |= B_MOUNT_READ_ONLY;
#endif

	DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR);
	fDevice = opener.Device();
	if (fDevice < B_OK)
		RETURN_ERROR(fDevice);

	if (opener.Mode() == O_RDONLY)
		fFlags |= VOLUME_READ_ONLY;

	// check if it's a regular file, and if so, disable the cache for the
	// underlying file system
	struct stat stat;
	if (fstat(fDevice, &stat) < 0)
		RETURN_ERROR(B_ERROR);

// TODO: allow turning off caching of the underlying file (once O_NOCACHE works)
#if 0
#ifndef NO_FILE_UNCACHED_IO
	if ((stat.st_mode & S_FILE) != 0 && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) {
		// mount read-only if the cache couldn't be disabled
#	ifdef DEBUG
		FATAL(("couldn't disable cache for image file - system may dead-lock!\n"));
#	else
		FATAL(("couldn't disable cache for image file!\n"));
		Panic();
#	endif
	}
#endif
#endif

	// read the super block
	if (Identify(fDevice, &fSuperBlock) != B_OK) {
		FATAL(("invalid super block!\n"));
		return B_BAD_VALUE;
	}

	// initialize short hands to the super block (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// check if the device size is large enough to hold the file system
	off_t diskSize;
	if (opener.GetSize(&diskSize) < B_OK)
		RETURN_ERROR(B_ERROR);
	if (diskSize < (NumBlocks() << BlockShift()))
		RETURN_ERROR(B_BAD_VALUE);

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new Journal(this);
	// replaying the log is the first thing we will do on this disk
	if (fJournal == NULL || fJournal->InitCheck() < B_OK
		|| fBlockAllocator.Initialize() < B_OK) {
		// ToDo: improve error reporting for a bad journal
		FATAL(("could not initialize journal/block bitmap allocator!\n"));
		return B_NO_MEMORY;
	}

	status_t status = B_OK;

	fRootNode = new Inode(this, ToVnode(Root()));
	if (fRootNode && fRootNode->InitCheck() == B_OK) {
		status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode);
		if (status == B_OK) {
			// try to get indices root dir

			// question: why doesn't get_vnode() work here?
			// answer: we have not yet backpropagated the volume pointer
			// in bfs_mount(), so bfs_read_vnode() can't get it - but we
			// don't need to go through get_vnode() here anyway.

			if (!Indices().IsZero())
				fIndicesNode = new Inode(this, ToVnode(Indices()));

			if (fIndicesNode == NULL
				|| fIndicesNode->InitCheck() < B_OK
				|| !fIndicesNode->IsContainer()) {
				INFORM(("bfs: volume doesn't have indices!\n"));

				if (fIndicesNode) {
					// if this is the case, the index root node has gone bad,
					// and BFS will switch to read-only mode
					fFlags |= VOLUME_READ_ONLY;
					delete fIndicesNode;
					fIndicesNode = NULL;
				}
			}

			// all went fine
			opener.Keep();
			return B_OK;
		} else
			FATAL(("could not create root node: publish_vnode() failed!\n"));

		delete fRootNode;
	} else {
		status = B_BAD_VALUE;
		FATAL(("could not create root node!\n"));
	}

	return status;
}


status_t
Volume::Unmount()
{
	// Unlike in BeOS, we need to put the reference to our root node ourselves
	put_vnode(fID, ToVnode(Root()));

	// This will also flush the log & all blocks to disk
	delete fJournal;
	fJournal = NULL;

	delete fIndicesNode;

	block_cache_delete(fBlockCache, !IsReadOnly());
	close(fDevice);

	return B_OK;
}


status_t
Volume::Sync()
{
	return fJournal->FlushLogAndBlocks();
}


status_t
Volume::ValidateBlockRun(block_run run)
{
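	// a run is invalid if its allocation group is out of range, if it is
	// empty, or if it extends beyond the end of its allocation group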
	if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups()
		|| run.Start() > (1UL << AllocationGroupShift())
		|| run.length == 0
		|| uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) {
		Panic();
		FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(), run.Start(), run.Length()));
		return B_BAD_DATA;
	}
	return B_OK;
}


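// Converts a plain block number into a block_run of length 1: the upper
// bits (above the allocation group shift) select the allocation group, the
// lower bits are the start offset within that group.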
block_run
Volume::ToBlockRun(off_t block) const
{
	block_run run;
	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift());
	run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1));
	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
	return run;
}


status_t
Volume::CreateIndicesRoot(Transaction &transaction)
{
	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id,
		&fIndicesNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.indices = ToBlockRun(id);
	return WriteSuperBlock();
}


status_t
Volume::AllocateForInode(Transaction &transaction, const Inode *parent, mode_t type, block_run &run)
{
	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run);
}


status_t
Volume::WriteSuperBlock()
{
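	// the super block always resides at byte offset 512 of the volume
	// (see Identify() above)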
	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block))
		return B_IO_ERROR;

	return B_OK;
}


void
Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, const uint8 *oldKey,
	size_t oldLength, const uint8 *newKey, size_t newLength)
{
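	// tell every registered live query about the changed attribute, so it
	// can add or remove the inode from its results as needed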
	if (fQueryLock.Lock() < B_OK)
		return;

	Query *query = NULL;
	while ((query = fQueries.Next(query)) != NULL)
		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength);

	fQueryLock.Unlock();
}


/*!
	Checks if there is a live query whose results depend on the presence
	or value of the specified attribute.
	Don't use it if you already have all the data together to evaluate
	the queries - it wouldn't save you anything in this case.
*/
bool
Volume::CheckForLiveQuery(const char *attribute)
{
	// ToDo: check for a live query that depends on the specified attribute
	return true;
}


void
Volume::AddQuery(Query *query)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	fQueries.Add(query);

	fQueryLock.Unlock();
}


void
Volume::RemoveQuery(Query *query)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	fQueries.Remove(query);

	fQueryLock.Unlock();
}


//	#pragma mark - Disk scanning and initialization


status_t
Volume::Identify(int fd, disk_super_block *superBlock)
{
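	// read the first two 512 byte blocks in one go, so that both possible
	// super block locations (offset 0 and offset 512) are available below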
	char buffer[1024];
	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
		return B_IO_ERROR;

	// Note: this only works on x86 - on PowerPC, the super block
	// may be located at offset 0 instead!
	memcpy(superBlock, buffer + 512, sizeof(disk_super_block));
	if (!superBlock->IsValid()) {
#ifndef BFS_LITTLE_ENDIAN_ONLY
		memcpy(superBlock, buffer, sizeof(disk_super_block));
		if (!superBlock->IsValid())
			return B_BAD_VALUE;
#else
		return B_BAD_VALUE;
#endif
	}

	return B_OK;
}


status_t
Volume::Initialize(int fd, const char *name, uint32 blockSize,
	uint32 flags)
{
	// although there is no really good reason for it, we won't
	// accept '/' in disk names (mkbfs does this, too - and Tracker
	// names mounted volumes after the volume name)
	if (strchr(name, '/') != NULL)
		return B_BAD_VALUE;

	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096
		&& blockSize != 8192)
		return B_BAD_VALUE;

	DeviceOpener opener(fd, O_RDWR);
	if (opener.Device() < B_OK)
		return B_BAD_VALUE;

	fDevice = opener.Device();

	uint32 deviceBlockSize;
	off_t deviceSize;
	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
		return B_ERROR;

	off_t numBlocks = deviceSize / blockSize;

	// create valid super block

	fSuperBlock.Initialize(name, numBlocks, blockSize);

	// initialize short hands to the super block (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// since the allocator has not been initialized yet, we
	// cannot use BlockAllocator::BitmapSize() here
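	// (the computed block is the first one behind the block bitmap, which
	// starts at block 1 - hence the "+ 1")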
	fSuperBlock.log_blocks = ToBlockRun(AllocationGroups()
		* fSuperBlock.BlocksPerAllocationGroup() + 1);
	fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048);
		// ToDo: set the log size depending on the disk size
	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(
		ToBlock(Log()));

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if (!IsValidSuperBlock())
		RETURN_ERROR(B_ERROR);

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new Journal(this);
	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
		RETURN_ERROR(B_ERROR);

	// ready to write data to disk

	Transaction transaction(this, 0);

	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
		RETURN_ERROR(B_ERROR);

	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.root_dir = ToBlockRun(id);

	if ((flags & VOLUME_NO_INDICES) == 0) {
		// The indices root directory will be created automatically
		// when the standard indices are created (or any other).
		Index index(this);
		status = index.Create(transaction, "name", B_STRING_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "size", B_INT64_TYPE);
		if (status < B_OK)
			return status;
	}

	WriteSuperBlock();
	transaction.Done();

// 	put_vnode(ID(), fRootNode->ID());
// 	if (fIndicesNode != NULL)
// 		put_vnode(ID(), fIndicesNode->ID());

	Sync();
	opener.RemoveCache(true);
	return B_OK;
}