xref: /haiku/src/add-ons/kernel/file_systems/bfs/Volume.cpp (revision a381c8a06378de22ff08adf4282b4e3f7e50d250)
1 /*
2  * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! super block, mounting, etc.
7 
8 
9 #include "Debug.h"
10 #include "Volume.h"
11 #include "Journal.h"
12 #include "Inode.h"
13 #include "Query.h"
14 
15 
static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups that will be aimed for
	// when a disk is newly initialized.
	// That's only relevant for smaller disks, though, since any
	// of today's disk sizes already reach the maximum length
	// of an allocation group (65536 blocks).
	// It seems to create appropriate numbers for smaller disks
	// with this setting, though (i.e. you can create a 400 MB
	// file on a 1 GB disk without the need for double indirect
	// blocks).
27 
// RAII helper that opens a device (or duplicates an already opened file
// descriptor) and, unless Keep() has been called, removes its block cache
// and closes the descriptor again in the destructor.
class DeviceOpener {
	public:
		DeviceOpener(int fd, int mode);
		DeviceOpener(const char *device, int mode);
		~DeviceOpener();

		// Both Open() variants return the non-negative file descriptor on
		// success, or a negative error code on failure.
		int Open(const char *device, int mode);
		int Open(int fd, int mode);
		void *InitCache(off_t numBlocks, uint32 blockSize);
		void RemoveCache(bool allowWrites);

		// Relinquish ownership - the destructor will then neither remove
		// the cache nor close the device.
		void Keep();

		int Device() const { return fDevice; }
		int Mode() const { return fMode; }

		status_t GetSize(off_t *_size, uint32 *_blockSize = NULL);

	private:
		int		fDevice;		// open descriptor, or a negative error code
		int		fMode;			// mode the device was actually opened with
		void	*fBlockCache;	// cache created by InitCache(), if any
};
51 
52 
// Opens the named device; check Device() for the result.
DeviceOpener::DeviceOpener(const char *device, int mode)
	:
	fBlockCache(NULL)
{
	Open(device, mode);
}
59 
60 
// Duplicates the given descriptor; check Device() for the result.
DeviceOpener::DeviceOpener(int fd, int mode)
	:
	fBlockCache(NULL)
{
	Open(fd, mode);
}
67 
68 
DeviceOpener::~DeviceOpener()
{
	// only clean up if we still own the device (Keep() sets fDevice to -1)
	if (fDevice >= B_OK) {
		RemoveCache(false);
		close(fDevice);
	}
}
76 
77 
78 int
79 DeviceOpener::Open(const char *device, int mode)
80 {
81 	fDevice = open(device, mode | O_NOCACHE);
82 	if (fDevice < 0)
83 		fDevice = errno;
84 
85 	if (fDevice < 0 && mode == O_RDWR) {
86 		// try again to open read-only (don't rely on a specific error code)
87 		return Open(device, O_RDONLY | O_NOCACHE);
88 	}
89 
90 	if (fDevice >= 0) {
91 		// opening succeeded
92 		fMode = mode;
93 		if (mode == O_RDWR) {
94 			// check out if the device really allows for read/write access
95 			device_geometry geometry;
96 			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
97 				if (geometry.read_only) {
98 					// reopen device read-only
99 					close(fDevice);
100 					return Open(device, O_RDONLY | O_NOCACHE);
101 				}
102 			}
103 		}
104 	}
105 
106 	return fDevice;
107 }
108 
109 
110 int
111 DeviceOpener::Open(int fd, int mode)
112 {
113 	fDevice = dup(fd);
114 	if (fDevice < 0)
115 		return errno;
116 
117 	fMode = mode;
118 
119 	return fDevice;
120 }
121 
122 
123 void *
124 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
125 {
126 	return block_cache_create(fDevice, numBlocks, blockSize, fMode == O_RDONLY);
127 }
128 
129 
130 void
131 DeviceOpener::RemoveCache(bool allowWrites)
132 {
133 	if (fBlockCache == NULL)
134 		return;
135 
136 	block_cache_delete(fBlockCache, allowWrites);
137 	fBlockCache = NULL;
138 }
139 
140 
/*!	Transfers ownership of the device to the caller: resetting fDevice
	prevents the destructor from removing the cache and closing the
	descriptor.
*/
void
DeviceOpener::Keep()
{
	fDevice = -1;
}
146 
147 
148 /** Returns the size of the device in bytes. It uses B_GET_GEOMETRY
149  *	to compute the size, or fstat() if that failed.
150  */
151 
152 status_t
153 DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize)
154 {
155 	device_geometry geometry;
156 	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
157 		// maybe it's just a file
158 		struct stat stat;
159 		if (fstat(fDevice, &stat) < 0)
160 			return B_ERROR;
161 
162 		if (_size)
163 			*_size = stat.st_size;
164 		if (_blockSize)	// that shouldn't cause us any problems
165 			*_blockSize = 512;
166 
167 		return B_OK;
168 	}
169 
170 	if (_size) {
171 		*_size = 1LL * geometry.head_count * geometry.cylinder_count
172 			* geometry.sectors_per_track * geometry.bytes_per_sector;
173 	}
174 	if (_blockSize)
175 		*_blockSize = geometry.bytes_per_sector;
176 
177 	return B_OK;
178 }
179 
180 
181 //	#pragma mark -
182 
183 
184 bool
185 disk_super_block::IsValid()
186 {
187 	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
188 		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
189 		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
190 		|| (int32)block_size != inode_size
191 		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
192 		|| (1UL << BlockShift()) != BlockSize()
193 		|| AllocationGroups() < 1
194 		|| AllocationGroupShift() < 1
195 		|| BlocksPerAllocationGroup() < 1
196 		|| NumBlocks() < 10
197 		|| AllocationGroups() != divide_roundup(NumBlocks(),
198 			1L << AllocationGroupShift()))
199 		return false;
200 
201 	return true;
202 }
203 
204 
/*!	Fills in this super block for a newly initialized volume named
	\a diskName with \a numBlocks blocks of \a blockSize bytes each.
	Besides the magic numbers and sizes, this computes the allocation
	group layout (group count, blocks per group, and group shift).
*/
void
disk_super_block::Initialize(const char *diskName, off_t numBlocks,
	uint32 blockSize)
{
	memset(this, 0, sizeof(disk_super_block));

	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

	strlcpy(name, diskName, sizeof(name));

	// compute the block shift, i.e. log2(blockSize)
	int32 blockShift = 9;
	while ((1UL << blockShift) < blockSize) {
		blockShift++;
	}

	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);

	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
	used_blocks = 0;

	// Get the minimum ag_shift (that's determined by the block size)

	int32 bitsPerBlock = blockSize << 3;
		// number of block bitmap bits that fit into a single block
	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
	int32 blocksPerGroup = 1;
	int32 groupShift = 13;
		// start out with 2^13 == 8192 blocks per allocation group

	// raise the group shift until a group spans at least as many blocks
	// as one bitmap block can describe
	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
		groupShift++;
	}

	// Many allocation groups help applying allocation policies, but if
	// they are too small, we will need too many block_runs to cover large
	// files (see above to get an explanation of the kDesiredAllocationGroups
	// constant).

	int32 numGroups;

	// double the group size (up to the maximum shift of 16) until the
	// group count drops to the desired number
	while (true) {
		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
		if (numGroups > kDesiredAllocationGroups) {
			if (groupShift == 16)
				break;

			groupShift++;
			blocksPerGroup *= 2;
		} else
			break;
	}

	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup);
	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
}
264 
265 
266 //	#pragma mark -
267 
268 
269 Volume::Volume(dev_t id)
270 	:
271 	fID(id),
272 	fBlockAllocator(this),
273 	fLock("bfs volume"),
274 	fRootNode(NULL),
275 	fIndicesNode(NULL),
276 	fDirtyCachedBlocks(0),
277 	fUniqueID(0),
278 	fFlags(0)
279 {
280 }
281 
282 
/*!	The actual tear down (flushing the journal, deleting the nodes, and
	closing the device) happens in Unmount() - nothing is left to do here.
*/
Volume::~Volume()
{
}
287 
// Convenience wrapper around disk_super_block::IsValid() for the volume's
// own super block.
bool
Volume::IsValidSuperBlock()
{
	return fSuperBlock.IsValid();
}
293 
294 
/*!	Called when a fatal inconsistency has been detected: the volume is
	switched to read-only mode to prevent further damage to the on-disk
	structures. In DEBUG builds the kernel debugger is invoked as well.
*/
void
Volume::Panic()
{
	FATAL(("we have to panic... switch to read-only mode!\n"));
	fFlags |= VOLUME_READ_ONLY;
#ifdef DEBUG
	kernel_debugger("BFS panics!");
#endif
}
304 
305 
306 status_t
307 Volume::Mount(const char *deviceName, uint32 flags)
308 {
309 	// ToDo: validate the FS in write mode as well!
310 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
311 	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
312 	// in big endian mode, we only mount read-only for now
313 	flags |= B_MOUNT_READ_ONLY;
314 #endif
315 
316 	DeviceOpener opener(deviceName, (flags & B_MOUNT_READ_ONLY) != 0
317 		? O_RDONLY : O_RDWR);
318 	fDevice = opener.Device();
319 	if (fDevice < B_OK)
320 		RETURN_ERROR(fDevice);
321 
322 	if (opener.Mode() == O_RDONLY)
323 		fFlags |= VOLUME_READ_ONLY;
324 
325 	// check if it's a regular file, and if so, disable the cache for the
326 	// underlaying file system
327 	struct stat stat;
328 	if (fstat(fDevice, &stat) < 0)
329 		RETURN_ERROR(B_ERROR);
330 
331 	// read the super block
332 	if (Identify(fDevice, &fSuperBlock) != B_OK) {
333 		FATAL(("invalid super block!\n"));
334 		return B_BAD_VALUE;
335 	}
336 
337 	// initialize short hands to the super block (to save byte swapping)
338 	fBlockSize = fSuperBlock.BlockSize();
339 	fBlockShift = fSuperBlock.BlockShift();
340 	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
341 
342 	// check if the device size is large enough to hold the file system
343 	off_t diskSize;
344 	if (opener.GetSize(&diskSize) < B_OK)
345 		RETURN_ERROR(B_ERROR);
346 	if (diskSize < (NumBlocks() << BlockShift()))
347 		RETURN_ERROR(B_BAD_VALUE);
348 
349 	// set the current log pointers, so that journaling will work correctly
350 	fLogStart = fSuperBlock.LogStart();
351 	fLogEnd = fSuperBlock.LogEnd();
352 
353 	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
354 		return B_ERROR;
355 
356 	fJournal = new Journal(this);
357 	// replaying the log is the first thing we will do on this disk
358 	if (fJournal && fJournal->InitCheck() < B_OK
359 		|| fBlockAllocator.Initialize() < B_OK) {
360 		// ToDo: improve error reporting for a bad journal
361 		FATAL(("could not initialize journal/block bitmap allocator!\n"));
362 		return B_NO_MEMORY;
363 	}
364 
365 	status_t status = B_OK;
366 
367 	fRootNode = new Inode(this, ToVnode(Root()));
368 	if (fRootNode && fRootNode->InitCheck() == B_OK) {
369 		status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode);
370 		if (status == B_OK) {
371 			// try to get indices root dir
372 
373 			if (!Indices().IsZero())
374 				fIndicesNode = new Inode(this, ToVnode(Indices()));
375 
376 			if (fIndicesNode == NULL
377 				|| fIndicesNode->InitCheck() < B_OK
378 				|| !fIndicesNode->IsContainer()) {
379 				INFORM(("bfs: volume doesn't have indices!\n"));
380 
381 				if (fIndicesNode) {
382 					// if this is the case, the index root node is gone bad,
383 					// and BFS switch to read-only mode
384 					fFlags |= VOLUME_READ_ONLY;
385 					delete fIndicesNode;
386 					fIndicesNode = NULL;
387 				}
388 			}
389 
390 			// all went fine
391 			opener.Keep();
392 			return B_OK;
393 		} else
394 			FATAL(("could not create root node: publish_vnode() failed!\n"));
395 
396 		delete fRootNode;
397 	} else {
398 		status = B_BAD_VALUE;
399 		FATAL(("could not create root node!\n"));
400 	}
401 
402 	return status;
403 }
404 
405 
/*!	Releases everything a successful Mount() acquired. The order matters:
	the journal has to be deleted (and thereby flushed) before the block
	cache it writes through is torn down.
*/
status_t
Volume::Unmount()
{
	// Unlike in BeOS, we need to put the reference to our root node ourselves
	put_vnode(fID, ToVnode(Root()));

	// This will also flush the log & all blocks to disk
	delete fJournal;
	fJournal = NULL;

	delete fIndicesNode;

	// write back dirty blocks only if we were mounted writable
	block_cache_delete(fBlockCache, !IsReadOnly());
	close(fDevice);

	return B_OK;
}
423 
424 
/*!	Flushes the log and all dirty blocks to disk. May only be called on a
	mounted volume, i.e. fJournal must be valid.
*/
status_t
Volume::Sync()
{
	return fJournal->FlushLogAndBlocks();
}
430 
431 
432 status_t
433 Volume::ValidateBlockRun(block_run run)
434 {
435 	if (run.AllocationGroup() < 0
436 		|| run.AllocationGroup() > (int32)AllocationGroups()
437 		|| run.Start() > (1UL << AllocationGroupShift())
438 		|| run.length == 0
439 		|| uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) {
440 		Panic();
441 		FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(),
442 			run.Start(), run.Length()));
443 		return B_BAD_DATA;
444 	}
445 	return B_OK;
446 }
447 
448 
449 block_run
450 Volume::ToBlockRun(off_t block) const
451 {
452 	block_run run;
453 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(
454 		block >> AllocationGroupShift());
455 	run.start = HOST_ENDIAN_TO_BFS_INT16(
456 		block & ((1LL << AllocationGroupShift()) - 1));
457 	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
458 	return run;
459 }
460 
461 
/*!	Creates the indices root directory (a string-index directory with mode
	0700), stores its location in the super block, and writes the super
	block back to disk. fIndicesNode is set as a side effect of
	Inode::Create().
*/
status_t
Volume::CreateIndicesRoot(Transaction &transaction)
{
	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id,
		&fIndicesNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.indices = ToBlockRun(id);
	return WriteSuperBlock();
}
475 
476 
/*!	Allocates a block run for a new inode of type \a type, near the
	\a parent directory's own block run.
	NOTE(review): \a parent is dereferenced unconditionally here, so the
	callers apparently guarantee it is never NULL - confirm before passing
	anything else.
*/
status_t
Volume::AllocateForInode(Transaction &transaction, const Inode *parent,
	mode_t type, block_run &run)
{
	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(),
		type, run);
}
484 
485 
/*!	Writes the in-memory super block back to its on-disk location at byte
	offset 512. Returns B_IO_ERROR on a short or failed write.
*/
status_t
Volume::WriteSuperBlock()
{
	// TODO: this assumes a block size of 512 bytes of the underlying device
	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block))
			!= sizeof(disk_super_block))
		return B_IO_ERROR;

	return B_OK;
}
496 
497 
498 void
499 Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type,
500 	const uint8 *oldKey, size_t oldLength, const uint8 *newKey,
501 	size_t newLength)
502 {
503 	if (fQueryLock.Lock() < B_OK)
504 		return;
505 
506 	Query *query = NULL;
507 	while ((query = fQueries.Next(query)) != NULL) {
508 		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey,
509 			newLength);
510 	}
511 
512 	fQueryLock.Unlock();
513 }
514 
515 
/*!
	Checks if there is a live query whose results depend on the presence
	or value of the specified attribute.
	Don't use it if you already have all the data together to evaluate
	the queries - it wouldn't save you anything in this case.
*/
bool
Volume::CheckForLiveQuery(const char *attribute)
{
	// TODO: check for a live query that depends on the specified attribute;
	// until then we conservatively claim that there always is one
	return true;
}
528 
529 
/*!	Registers \a query as a live query, so that it will be notified via
	UpdateLiveQueries(). Fails silently if the query lock cannot be
	acquired.
*/
void
Volume::AddQuery(Query *query)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	fQueries.Add(query);

	fQueryLock.Unlock();
}
540 
541 
/*!	Unregisters a live query previously added with AddQuery(). Fails
	silently if the query lock cannot be acquired.
*/
void
Volume::RemoveQuery(Query *query)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	fQueries.Remove(query);

	fQueryLock.Unlock();
}
552 
553 
554 //	#pragma mark - Disk scanning and initialization
555 
556 
/*!	Reads the super block from \a fd into \a superBlock and validates it.
	The super block is expected at byte offset 512; on builds that are not
	little-endian-only, offset 0 is tried as well (old PPC disk layout).
	Returns B_IO_ERROR on a short read and B_BAD_VALUE for an invalid
	super block.
*/
status_t
Volume::Identify(int fd, disk_super_block *superBlock)
{
	// read the first two 512-byte sectors in one go
	char buffer[1024];
	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
		return B_IO_ERROR;

	memcpy(superBlock, buffer + 512, sizeof(disk_super_block));
	if (!superBlock->IsValid()) {
#ifndef BFS_LITTLE_ENDIAN_ONLY
		// For PPC, the super block might be located at offset 0
		memcpy(superBlock, buffer, sizeof(disk_super_block));
		if (!superBlock->IsValid())
			return B_BAD_VALUE;
#else
		return B_BAD_VALUE;
#endif
	}

	return B_OK;
}
578 
579 
/*!	Creates a new BFS file system on the device referred to by \a fd:
	writes a fresh super block, clears the block bitmap, sets up the log,
	creates the root directory, and - unless VOLUME_NO_INDICES is set in
	\a flags - the standard "name", "last_modified", and "size" indices.
	\a blockSize must be one of 1024, 2048, 4096, or 8192 bytes.
*/
status_t
Volume::Initialize(int fd, const char *name, uint32 blockSize,
	uint32 flags)
{
	// although there is no really good reason for it, we won't
	// accept '/' in disk names (mkbfs does this, too - and Tracker
	// names mounted volumes after the disk name)
	if (strchr(name, '/') != NULL)
		return B_BAD_VALUE;

	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096
		&& blockSize != 8192)
		return B_BAD_VALUE;

	DeviceOpener opener(fd, O_RDWR);
	if (opener.Device() < B_OK)
		return B_BAD_VALUE;

	fDevice = opener.Device();

	uint32 deviceBlockSize;
	off_t deviceSize;
	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
		return B_ERROR;

	off_t numBlocks = deviceSize / blockSize;

	// create valid super block

	fSuperBlock.Initialize(name, numBlocks, blockSize);

	// initialize short hands to the super block (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// since the allocator has not been initialized yet, we
	// cannot use BlockAllocator::BitmapSize() here; the log is placed
	// directly behind the block bitmap
	fSuperBlock.log_blocks = ToBlockRun(AllocationGroups()
		* fSuperBlock.BlocksPerAllocationGroup() + 1);
	fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048);
		// TODO: set the log size depending on the disk size
	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(
		ToBlock(Log()));

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if (!IsValidSuperBlock())
		RETURN_ERROR(B_ERROR);

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new Journal(this);
	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
		RETURN_ERROR(B_ERROR);

	// ready to write data to disk

	Transaction transaction(this, 0);

	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
		RETURN_ERROR(B_ERROR);

	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.root_dir = ToBlockRun(id);

	if ((flags & VOLUME_NO_INDICES) == 0) {
		// The indices root directory will be created automatically
		// when the standard indices are created (or any other).
		Index index(this);
		status = index.Create(transaction, "name", B_STRING_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "size", B_INT64_TYPE);
		if (status < B_OK)
			return status;
	}

	WriteSuperBlock();
	transaction.Done();

// 	put_vnode(ID(), fRootNode->ID());
// 	if (fIndicesNode != NULL)
// 		put_vnode(ID(), fIndicesNode->ID());

	// flush everything and tear down the temporary cache, writing back
	// dirty blocks
	Sync();
	opener.RemoveCache(true);
	return B_OK;
}
682