xref: /haiku/src/add-ons/kernel/file_systems/bfs/Volume.cpp (revision 14e3d1b5768e7110b3d5c0855833267409b71dbb)
1 /*
2  * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! super block, mounting, etc.
7 
8 
9 #include "Debug.h"
10 #include "Volume.h"
11 #include "Journal.h"
12 #include "Inode.h"
13 #include "Query.h"
14 
15 
16 static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups that we aim for when
	// initializing a new disk.
	// It's only relevant for smaller disks, though, since any of
	// today's disk sizes already reaches the maximum length of an
	// allocation group (65536 blocks).
	// This setting seems to produce appropriate numbers for smaller
	// disks, though (i.e. you can create a 400 MB file on a 1 GB
	// disk without the need for double indirect blocks).
26 
27 
28 class DeviceOpener {
29 	public:
30 		DeviceOpener(const char *device, int mode);
31 		~DeviceOpener();
32 
33 		int Open(const char *device, int mode);
34 		void *InitCache(off_t numBlocks, uint32 blockSize);
35 		void RemoveCache(bool allowWrites);
36 
37 		void Keep();
38 
39 		int Device() const { return fDevice; }
40 		int Mode() const { return fMode; }
41 
42 		status_t GetSize(off_t *_size, uint32 *_blockSize = NULL);
43 
44 	private:
45 		int		fDevice;
46 		int		fMode;
47 		void	*fBlockCache;
48 };
49 
50 
51 DeviceOpener::DeviceOpener(const char *device, int mode)
52 	:
53 	fBlockCache(NULL)
54 {
55 	Open(device, mode);
56 }
57 
58 
59 DeviceOpener::~DeviceOpener()
60 {
61 	if (fDevice >= B_OK) {
62 		RemoveCache(false);
63 		close(fDevice);
64 	}
65 }
66 
67 
68 int
69 DeviceOpener::Open(const char *device, int mode)
70 {
71 	fDevice = open(device, mode);
72 	if (fDevice < 0)
73 		fDevice = errno;
74 
75 	if (fDevice < 0 && mode == O_RDWR) {
76 		// try again to open read-only (don't rely on a specific error code)
77 		return Open(device, O_RDONLY);
78 	}
79 
80 	if (fDevice >= 0) {
81 		// opening succeeded
82 		fMode = mode;
83 		if (mode == O_RDWR) {
84 			// check out if the device really allows for read/write access
85 			device_geometry geometry;
86 			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
87 				if (geometry.read_only) {
88 					// reopen device read-only
89 					close(fDevice);
90 					return Open(device, O_RDONLY);
91 				}
92 			}
93 		}
94 	}
95 
96 	return fDevice;
97 }
98 
99 
100 void *
101 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
102 {
103 	return block_cache_create(fDevice, numBlocks, blockSize, fMode == O_RDONLY);
104 }
105 
106 
107 void
108 DeviceOpener::RemoveCache(bool allowWrites)
109 {
110 	if (fBlockCache == NULL)
111 		return;
112 
113 	block_cache_delete(fBlockCache, allowWrites);
114 	fBlockCache = NULL;
115 }
116 
117 
118 void
119 DeviceOpener::Keep()
120 {
121 	fDevice = -1;
122 }
123 
124 
125 /** Returns the size of the device in bytes. It uses B_GET_GEOMETRY
126  *	to compute the size, or fstat() if that failed.
127  */
128 
129 status_t
130 DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize)
131 {
132 	device_geometry geometry;
133 	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
134 		// maybe it's just a file
135 		struct stat stat;
136 		if (fstat(fDevice, &stat) < 0)
137 			return B_ERROR;
138 
139 		if (_size)
140 			*_size = stat.st_size;
141 		if (_blockSize)	// that shouldn't cause us any problems
142 			*_blockSize = 512;
143 
144 		return B_OK;
145 	}
146 
147 	if (_size) {
148 		*_size = 1LL * geometry.head_count * geometry.cylinder_count
149 			* geometry.sectors_per_track * geometry.bytes_per_sector;
150 	}
151 	if (_blockSize)
152 		*_blockSize = geometry.bytes_per_sector;
153 
154 	return B_OK;
155 }
156 
157 
158 //	#pragma mark -
159 
160 
161 bool
162 disk_super_block::IsValid()
163 {
164 	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
165 		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
166 		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
167 		|| (int32)block_size != inode_size
168 		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
169 		|| (1UL << BlockShift()) != BlockSize()
170 		|| AllocationGroups() < 1
171 		|| AllocationGroupShift() < 1
172 		|| BlocksPerAllocationGroup() < 1
173 		|| NumBlocks() < 10
174 		|| AllocationGroups() != divide_roundup(NumBlocks(),
175 			1L << AllocationGroupShift()))
176 		return false;
177 
178 	return true;
179 }
180 
181 
182 void
183 disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize)
184 {
185 	memset(this, 0, sizeof(disk_super_block));
186 
187 	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
188 	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
189 	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
190 	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
191 	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);
192 
193 	strlcpy(name, diskName, sizeof(name));
194 
195 	int32 blockShift = 9;
196 	while ((1UL << blockShift) < blockSize) {
197 		blockShift++;
198 	}
199 
200 	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
201 	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);
202 
203 	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
204 	used_blocks = 0;
205 
206 	// Get the minimum ag_shift (that's determined by the block size)
207 
208 	int32 bitsPerBlock = blockSize << 3;
209 	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
210 	int32 blocksPerGroup = 1;
211 	int32 groupShift = 13;
212 
213 	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
214 		groupShift++;
215 	}
216 
217 	// Many allocation groups help applying allocation policies, but if
218 	// they are too small, we will need to many block_runs to cover large
219 	// files (see above to get an explanation of the kDesiredAllocationGroups
220 	// constant).
221 
222 	int32 numGroups;
223 
224 	while (true) {
225 		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
226 		if (numGroups > kDesiredAllocationGroups) {
227 			if (groupShift == 16)
228 				break;
229 
230 			groupShift++;
231 			blocksPerGroup *= 2;
232 		} else
233 			break;
234 	}
235 
236 	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
237 	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup);
238 	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
239 }
240 
241 
242 //	#pragma mark -
243 
244 
245 Volume::Volume(mount_id id)
246 	:
247 	fID(id),
248 	fBlockAllocator(this),
249 	fLock("bfs volume"),
250 	fRootNode(NULL),
251 	fIndicesNode(NULL),
252 	fDirtyCachedBlocks(0),
253 	fUniqueID(0),
254 	fFlags(0)
255 {
256 }
257 
258 
259 Volume::~Volume()
260 {
261 }
262 
263 
264 bool
265 Volume::IsValidSuperBlock()
266 {
267 	return fSuperBlock.IsValid();
268 }
269 
270 
271 void
272 Volume::Panic()
273 {
274 	FATAL(("we have to panic... switch to read-only mode!\n"));
275 	fFlags |= VOLUME_READ_ONLY;
276 #ifdef DEBUG
277 	kernel_debugger("BFS panics!");
278 #endif
279 }
280 
281 
282 status_t
283 Volume::Mount(const char *deviceName, uint32 flags)
284 {
285 	// ToDo: validate the FS in write mode as well!
286 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
287 	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
288 	// in big endian mode, we only mount read-only for now
289 	flags |= B_MOUNT_READ_ONLY;
290 #endif
291 
292 	DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR);
293 	fDevice = opener.Device();
294 	if (fDevice < B_OK)
295 		RETURN_ERROR(fDevice);
296 
297 	if (opener.Mode() == O_RDONLY)
298 		fFlags |= VOLUME_READ_ONLY;
299 
300 	// check if it's a regular file, and if so, disable the cache for the
301 	// underlaying file system
302 	struct stat stat;
303 	if (fstat(fDevice, &stat) < 0)
304 		RETURN_ERROR(B_ERROR);
305 
306 // TODO: allow turning off caching of the underlying file (once O_NOCACHE works)
307 #if 0
308 #ifndef NO_FILE_UNCACHED_IO
309 	if ((stat.st_mode & S_FILE) != 0 && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) {
310 		// mount read-only if the cache couldn't be disabled
311 #	ifdef DEBUG
312 		FATAL(("couldn't disable cache for image file - system may dead-lock!\n"));
313 #	else
314 		FATAL(("couldn't disable cache for image file!\n"));
315 		Panic();
316 #	endif
317 	}
318 #endif
319 #endif
320 
321 	// read the super block
322 	if (Identify(fDevice, &fSuperBlock) != B_OK) {
323 		FATAL(("invalid super block!\n"));
324 		return B_BAD_VALUE;
325 	}
326 
327 	// initialize short hands to the super block (to save byte swapping)
328 	fBlockSize = fSuperBlock.BlockSize();
329 	fBlockShift = fSuperBlock.BlockShift();
330 	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
331 
332 	// check if the device size is large enough to hold the file system
333 	off_t diskSize;
334 	if (opener.GetSize(&diskSize) < B_OK)
335 		RETURN_ERROR(B_ERROR);
336 	if (diskSize < (NumBlocks() << BlockShift()))
337 		RETURN_ERROR(B_BAD_VALUE);
338 
339 	// set the current log pointers, so that journaling will work correctly
340 	fLogStart = fSuperBlock.LogStart();
341 	fLogEnd = fSuperBlock.LogEnd();
342 
343 	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
344 		return B_ERROR;
345 
346 	fJournal = new Journal(this);
347 	// replaying the log is the first thing we will do on this disk
348 	if (fJournal && fJournal->InitCheck() < B_OK
349 		|| fBlockAllocator.Initialize() < B_OK) {
350 		// ToDo: improve error reporting for a bad journal
351 		FATAL(("could not initialize journal/block bitmap allocator!\n"));
352 		return B_NO_MEMORY;
353 	}
354 
355 	status_t status = B_OK;
356 
357 	fRootNode = new Inode(this, ToVnode(Root()));
358 	if (fRootNode && fRootNode->InitCheck() == B_OK) {
359 		status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode);
360 		if (status == B_OK) {
361 			// try to get indices root dir
362 
363 			// question: why doesn't get_vnode() work here??
364 			// answer: we have not yet backpropagated the pointer to the
365 			// volume in bfs_mount(), so bfs_read_vnode() can't get it.
366 			// But it's not needed to do that anyway.
367 
368 			if (!Indices().IsZero())
369 				fIndicesNode = new Inode(this, ToVnode(Indices()));
370 
371 			if (fIndicesNode == NULL
372 				|| fIndicesNode->InitCheck() < B_OK
373 				|| !fIndicesNode->IsContainer()) {
374 				INFORM(("bfs: volume doesn't have indices!\n"));
375 
376 				if (fIndicesNode) {
377 					// if this is the case, the index root node is gone bad, and
378 					// BFS switch to read-only mode
379 					fFlags |= VOLUME_READ_ONLY;
380 					delete fIndicesNode;
381 					fIndicesNode = NULL;
382 				}
383 			}
384 
385 			// all went fine
386 			opener.Keep();
387 			return B_OK;
388 		} else
389 			FATAL(("could not create root node: publish_vnode() failed!\n"));
390 
391 		delete fRootNode;
392 	} else {
393 		status = B_BAD_VALUE;
394 		FATAL(("could not create root node!\n"));
395 	}
396 
397 	return status;
398 }
399 
400 
401 status_t
402 Volume::Unmount()
403 {
404 	// Unlike in BeOS, we need to put the reference to our root node ourselves
405 	put_vnode(fID, ToVnode(Root()));
406 
407 	// This will also flush the log & all blocks to disk
408 	delete fJournal;
409 	fJournal = NULL;
410 
411 	delete fIndicesNode;
412 
413 	block_cache_delete(fBlockCache, !IsReadOnly());
414 	close(fDevice);
415 
416 	return B_OK;
417 }
418 
419 
420 status_t
421 Volume::Sync()
422 {
423 	return fJournal->FlushLogAndBlocks();
424 }
425 
426 
427 status_t
428 Volume::ValidateBlockRun(block_run run)
429 {
430 	if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups()
431 		|| run.Start() > (1UL << AllocationGroupShift())
432 		|| run.length == 0
433 		|| uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) {
434 		Panic();
435 		FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(), run.Start(), run.Length()));
436 		return B_BAD_DATA;
437 	}
438 	return B_OK;
439 }
440 
441 
442 block_run
443 Volume::ToBlockRun(off_t block) const
444 {
445 	block_run run;
446 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift());
447 	run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1));
448 	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
449 	return run;
450 }
451 
452 
453 status_t
454 Volume::CreateIndicesRoot(Transaction &transaction)
455 {
456 	off_t id;
457 	status_t status = Inode::Create(transaction, NULL, NULL,
458 		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id,
459 		&fIndicesNode);
460 	if (status < B_OK)
461 		RETURN_ERROR(status);
462 
463 	fSuperBlock.indices = ToBlockRun(id);
464 	return WriteSuperBlock();
465 }
466 
467 
468 status_t
469 Volume::AllocateForInode(Transaction &transaction, const Inode *parent, mode_t type, block_run &run)
470 {
471 	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run);
472 }
473 
474 
475 status_t
476 Volume::WriteSuperBlock()
477 {
478 	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block))
479 		return B_IO_ERROR;
480 
481 	return B_OK;
482 }
483 
484 
485 void
486 Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, const uint8 *oldKey,
487 	size_t oldLength, const uint8 *newKey, size_t newLength)
488 {
489 	if (fQueryLock.Lock() < B_OK)
490 		return;
491 
492 	Query *query = NULL;
493 	while ((query = fQueries.Next(query)) != NULL)
494 		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength);
495 
496 	fQueryLock.Unlock();
497 }
498 
499 
500 /*!
501 	Checks if there is a live query whose results depend on the presence
502 	or value of the specified attribute.
503 	Don't use it if you already have all the data together to evaluate
504 	the queries - it wouldn't safe you anything in this case.
505 */
506 bool
507 Volume::CheckForLiveQuery(const char *attribute)
508 {
509 	// ToDo: check for a live query that depends on the specified attribute
510 	return true;
511 }
512 
513 
514 void
515 Volume::AddQuery(Query *query)
516 {
517 	if (fQueryLock.Lock() < B_OK)
518 		return;
519 
520 	fQueries.Add(query);
521 
522 	fQueryLock.Unlock();
523 }
524 
525 
526 void
527 Volume::RemoveQuery(Query *query)
528 {
529 	if (fQueryLock.Lock() < B_OK)
530 		return;
531 
532 	fQueries.Remove(query);
533 
534 	fQueryLock.Unlock();
535 }
536 
537 
538 //	#pragma mark - Disk scanning and initialization
539 
540 
541 status_t
542 Volume::Identify(int fd, disk_super_block *superBlock)
543 {
544 	char buffer[1024];
545 	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
546 		return B_IO_ERROR;
547 
548 	// Note: that does work only for x86, for PowerPC, the super block
549 	// may be located at offset 0!
550 	memcpy(superBlock, buffer + 512, sizeof(disk_super_block));
551 	if (!superBlock->IsValid()) {
552 #ifndef BFS_LITTLE_ENDIAN_ONLY
553 		memcpy(superBlock, buffer, sizeof(disk_super_block));
554 		if (!superBlock->IsValid())
555 			return B_BAD_VALUE;
556 #else
557 		return B_BAD_VALUE;
558 #endif
559 	}
560 
561 	return B_OK;
562 }
563 
564 
565 status_t
566 Volume::Initialize(const char *device, const char *name, uint32 blockSize,
567 	uint32 flags)
568 {
569 	// although there is no really good reason for it, we won't
570 	// accept '/' in disk names (mkbfs does this, too - and since
571 	// Tracker names mounted volumes like their name)
572 	if (strchr(name, '/') != NULL)
573 		return B_BAD_VALUE;
574 
575 	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096
576 		&& blockSize != 8192)
577 		return B_BAD_VALUE;
578 
579 	DeviceOpener opener(device, O_RDWR);
580 	if (opener.Device() < B_OK)
581 		return B_BAD_VALUE;
582 
583 	fDevice = opener.Device();
584 
585 	uint32 deviceBlockSize;
586 	off_t deviceSize;
587 	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
588 		return B_ERROR;
589 
590 	off_t numBlocks = deviceSize / blockSize;
591 
592 	// create valid super block
593 
594 	fSuperBlock.Initialize(name, numBlocks, blockSize);
595 
596 	// initialize short hands to the super block (to save byte swapping)
597 	fBlockSize = fSuperBlock.BlockSize();
598 	fBlockShift = fSuperBlock.BlockShift();
599 	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
600 
601 	// since the allocator has not been initialized yet, we
602 	// cannot use BlockAllocator::BitmapSize() here
603 	fSuperBlock.log_blocks = ToBlockRun(AllocationGroups()
604 		* fSuperBlock.BlocksPerAllocationGroup() + 1);
605 	fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048);
606 		// ToDo: set the log size depending on the disk size
607 	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(
608 		ToBlock(Log()));
609 
610 	// set the current log pointers, so that journaling will work correctly
611 	fLogStart = fSuperBlock.LogStart();
612 	fLogEnd = fSuperBlock.LogEnd();
613 
614 	if (!IsValidSuperBlock())
615 		RETURN_ERROR(B_ERROR);
616 
617 	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
618 		return B_ERROR;
619 
620 	fJournal = new Journal(this);
621 	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
622 		RETURN_ERROR(B_ERROR);
623 
624 	// ready to write data to disk
625 
626 	Transaction transaction(this, 0);
627 
628 	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
629 		RETURN_ERROR(B_ERROR);
630 
631 	off_t id;
632 	status_t status = Inode::Create(transaction, NULL, NULL,
633 		S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode);
634 	if (status < B_OK)
635 		RETURN_ERROR(status);
636 
637 	fSuperBlock.root_dir = ToBlockRun(id);
638 
639 	if ((flags & VOLUME_NO_INDICES) == 0) {
640 		// The indices root directory will be created automatically
641 		// when the standard indices are created (or any other).
642 		Index index(this);
643 		status = index.Create(transaction, "name", B_STRING_TYPE);
644 		if (status < B_OK)
645 			return status;
646 
647 		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
648 		if (status < B_OK)
649 			return status;
650 
651 		status = index.Create(transaction, "size", B_INT64_TYPE);
652 		if (status < B_OK)
653 			return status;
654 	}
655 
656 	WriteSuperBlock();
657 	transaction.Done();
658 
659 // 	put_vnode(ID(), fRootNode->ID());
660 // 	if (fIndicesNode != NULL)
661 // 		put_vnode(ID(), fIndicesNode->ID());
662 
663 	Sync();
664 	opener.RemoveCache(true);
665 	return B_OK;
666 }
667