xref: /haiku/src/add-ons/kernel/file_systems/bfs/Volume.cpp (revision 5115ca085884f7b604a3d607688f0ca20fb7cf57)
1 /* Volume - BFS super block, mounting, etc.
2  *
3  * Copyright 2001-2006, Axel Dörfler, axeld@pinc-software.de.
4  * This file may be used under the terms of the MIT License.
5  */
6 
7 
8 #include "Debug.h"
9 #include "Volume.h"
10 #include "Journal.h"
11 #include "Inode.h"
12 #include "Query.h"
13 
14 
15 static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups that will be aimed for
	// when initializing a new disk.
	// It is only relevant for smaller disks, since any of today's disk
	// sizes already reach the maximum length of an allocation group
	// (65536 blocks).
	// This setting seems to produce appropriate group counts for smaller
	// disks (i.e. you can create a 400 MB file on a 1 GB disk without
	// needing double indirect blocks).

26 
// RAII helper used while mounting/initializing: opens a device (or image
// file) and closes it again on destruction, unless Keep() was called to
// hand ownership over to the Volume.
class DeviceOpener {
	public:
		DeviceOpener(const char *device, int mode);
		~DeviceOpener();

		// Opens the device; falls back to read-only when read/write fails
		// or the device reports itself as read-only.
		int Open(const char *device, int mode);
		// Creates a block cache for the device (read-only iff the device
		// was opened O_RDONLY); returns NULL on failure.
		void *InitCache(off_t numBlocks, uint32 blockSize);
		// Deletes the block cache; when allowWrites is true, dirty blocks
		// are written back first.
		void RemoveCache(bool allowWrites);

		// Relinquishes ownership so the destructor no longer cleans up.
		void Keep();

		int Device() const { return fDevice; }
		int Mode() const { return fMode; }

		// Reports the device size (and optionally its block size).
		status_t GetSize(off_t *_size, uint32 *_blockSize = NULL);

	private:
		int		fDevice;		// file descriptor, or a negative error code
		int		fMode;			// mode actually used (may have been downgraded
								// from O_RDWR to O_RDONLY)
		void	*fBlockCache;	// cache handle, NULL until created
};
48 
49 
// Opens the given device right away; callers check Device() for success.
DeviceOpener::DeviceOpener(const char *device, int mode)
	:
	fBlockCache(NULL)
{
	Open(device, mode);
}
56 
57 
DeviceOpener::~DeviceOpener()
{
	// Only clean up if we still own the device - Keep() sets fDevice to -1.
	if (fDevice >= B_OK) {
		RemoveCache(false);
			// discard the cache without writing dirty blocks back
		close(fDevice);
	}
}
65 
66 
67 int
68 DeviceOpener::Open(const char *device, int mode)
69 {
70 	fDevice = open(device, mode);
71 	if (fDevice < 0)
72 		fDevice = errno;
73 
74 	if (fDevice < 0 && mode == O_RDWR) {
75 		// try again to open read-only (don't rely on a specific error code)
76 		return Open(device, O_RDONLY);
77 	}
78 
79 	if (fDevice >= 0) {
80 		// opening succeeded
81 		fMode = mode;
82 		if (mode == O_RDWR) {
83 			// check out if the device really allows for read/write access
84 			device_geometry geometry;
85 			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
86 				if (geometry.read_only) {
87 					// reopen device read-only
88 					close(fDevice);
89 					return Open(device, O_RDONLY);
90 				}
91 			}
92 		}
93 	}
94 
95 	return fDevice;
96 }
97 
98 
99 void *
100 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
101 {
102 	return block_cache_create(fDevice, numBlocks, blockSize, fMode == O_RDONLY);
103 }
104 
105 
106 void
107 DeviceOpener::RemoveCache(bool allowWrites)
108 {
109 	if (fBlockCache == NULL)
110 		return;
111 
112 	block_cache_delete(fBlockCache, allowWrites);
113 	fBlockCache = NULL;
114 }
115 
116 
// Transfers ownership of the device to the caller: marks the descriptor
// as no longer ours, so the destructor neither closes the device nor
// removes the block cache.
void
DeviceOpener::Keep()
{
	fDevice = -1;
}
122 
123 
124 /** Returns the size of the device in bytes. It uses B_GET_GEOMETRY
125  *	to compute the size, or fstat() if that failed.
126  */
127 
128 status_t
129 DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize)
130 {
131 	device_geometry geometry;
132 	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
133 		// maybe it's just a file
134 		struct stat stat;
135 		if (fstat(fDevice, &stat) < 0)
136 			return B_ERROR;
137 
138 		if (_size)
139 			*_size = stat.st_size;
140 		if (_blockSize)	// that shouldn't cause us any problems
141 			*_blockSize = 512;
142 
143 		return B_OK;
144 	}
145 
146 	if (_size) {
147 		*_size = 1LL * geometry.head_count * geometry.cylinder_count
148 			* geometry.sectors_per_track * geometry.bytes_per_sector;
149 	}
150 	if (_blockSize)
151 		*_blockSize = geometry.bytes_per_sector;
152 
153 	return B_OK;
154 }
155 
156 
157 //	#pragma mark -
158 
159 
160 bool
161 disk_super_block::IsValid()
162 {
163 	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
164 		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
165 		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
166 		|| (int32)block_size != inode_size
167 		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
168 		|| (1UL << BlockShift()) != BlockSize()
169 		|| AllocationGroups() < 1
170 		|| AllocationGroupShift() < 1
171 		|| BlocksPerAllocationGroup() < 1
172 		|| NumBlocks() < 10
173 		|| AllocationGroups() != divide_roundup(NumBlocks(),
174 			1L << AllocationGroupShift()))
175 		return false;
176 
177 	return true;
178 }
179 
180 
181 void
182 disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize)
183 {
184 	memset(this, 0, sizeof(disk_super_block));
185 
186 	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
187 	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
188 	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
189 	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
190 	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);
191 
192 	strlcpy(name, diskName, sizeof(name));
193 
194 	int32 blockShift = 9;
195 	while ((1UL << blockShift) < blockSize) {
196 		blockShift++;
197 	}
198 
199 	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
200 	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);
201 
202 	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
203 	used_blocks = 0;
204 
205 	// Get the minimum ag_shift (that's determined by the block size)
206 
207 	int32 bitsPerBlock = blockSize << 3;
208 	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
209 	int32 blocksPerGroup = 1;
210 	int32 groupShift = 13;
211 
212 	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
213 		groupShift++;
214 	}
215 
216 	// Many allocation groups help applying allocation policies, but if
217 	// they are too small, we will need to many block_runs to cover large
218 	// files (see above to get an explanation of the kDesiredAllocationGroups
219 	// constant).
220 
221 	int32 numGroups;
222 
223 	while (true) {
224 		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
225 		if (numGroups > kDesiredAllocationGroups) {
226 			if (groupShift == 16)
227 				break;
228 
229 			groupShift++;
230 			blocksPerGroup *= 2;
231 		} else
232 			break;
233 	}
234 
235 	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
236 	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(1);
237 	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
238 }
239 
240 
241 //	#pragma mark -
242 
243 
// Constructs an unmounted volume for the given mount ID; all real setup
// happens in Mount()/Initialize().
Volume::Volume(mount_id id)
	:
	fID(id),
	fBlockAllocator(this),
	fLock("bfs volume"),
	fRootNode(NULL),
	fIndicesNode(NULL),
	fDirtyCachedBlocks(0),
	fUniqueID(0),
	fFlags(0)
{
	// NOTE(review): fDevice, fJournal, and fBlockCache are not initialized
	// here - they are only assigned in Mount()/Initialize(). Confirm no
	// code path touches them on a volume that was never mounted.
}
256 
257 
Volume::~Volume()
{
	// nothing to do here - resources are released in Unmount()
}
261 
262 
// Returns whether the super block read in by Mount()/Identify() is valid.
bool
Volume::IsValidSuperBlock()
{
	return fSuperBlock.IsValid();
}
268 
269 
/*!	Called when a severe inconsistency is detected: switches the volume
	to read-only mode, and drops into the debugger in userland (USER) or
	kernel debug (DEBUG) builds.
*/
void
Volume::Panic()
{
	FATAL(("we have to panic... switch to read-only mode!\n"));
	fFlags |= VOLUME_READ_ONLY;
#ifdef USER
	debugger("BFS panics!");
#elif defined(DEBUG)
	kernel_debugger("BFS panics!");
#endif
}
281 
282 
283 status_t
284 Volume::Mount(const char *deviceName, uint32 flags)
285 {
286 	// ToDo: validate the FS in write mode as well!
287 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
288 	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
289 	// in big endian mode, we only mount read-only for now
290 	flags |= B_MOUNT_READ_ONLY;
291 #endif
292 
293 	DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR);
294 	fDevice = opener.Device();
295 	if (fDevice < B_OK)
296 		RETURN_ERROR(fDevice);
297 
298 	if (opener.Mode() == O_RDONLY)
299 		fFlags |= VOLUME_READ_ONLY;
300 
301 	// check if it's a regular file, and if so, disable the cache for the
302 	// underlaying file system
303 	struct stat stat;
304 	if (fstat(fDevice, &stat) < 0)
305 		RETURN_ERROR(B_ERROR);
306 
307 // TODO: allow turning off caching of the underlying file (once O_NOCACHE works)
308 #if 0
309 #ifndef NO_FILE_UNCACHED_IO
310 	if ((stat.st_mode & S_FILE) != 0 && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) {
311 		// mount read-only if the cache couldn't be disabled
312 #	ifdef DEBUG
313 		FATAL(("couldn't disable cache for image file - system may dead-lock!\n"));
314 #	else
315 		FATAL(("couldn't disable cache for image file!\n"));
316 		Panic();
317 #	endif
318 	}
319 #endif
320 #endif
321 
322 	// read the super block
323 	if (Identify(fDevice, &fSuperBlock) != B_OK) {
324 		FATAL(("invalid super block!\n"));
325 		return B_BAD_VALUE;
326 	}
327 
328 	// initialize short hands to the super block (to save byte swapping)
329 	fBlockSize = fSuperBlock.BlockSize();
330 	fBlockShift = fSuperBlock.BlockShift();
331 	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
332 
333 	// check if the device size is large enough to hold the file system
334 	off_t diskSize;
335 	if (opener.GetSize(&diskSize) < B_OK)
336 		RETURN_ERROR(B_ERROR);
337 	if (diskSize < (NumBlocks() << BlockShift()))
338 		RETURN_ERROR(B_BAD_VALUE);
339 
340 	// set the current log pointers, so that journaling will work correctly
341 	fLogStart = fSuperBlock.LogStart();
342 	fLogEnd = fSuperBlock.LogEnd();
343 
344 	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
345 		return B_ERROR;
346 
347 	fJournal = new Journal(this);
348 	// replaying the log is the first thing we will do on this disk
349 	if (fJournal && fJournal->InitCheck() < B_OK
350 		|| fBlockAllocator.Initialize() < B_OK) {
351 		// ToDo: improve error reporting for a bad journal
352 		FATAL(("could not initialize journal/block bitmap allocator!\n"));
353 		return B_NO_MEMORY;
354 	}
355 
356 	status_t status = B_OK;
357 
358 	fRootNode = new Inode(this, ToVnode(Root()));
359 	if (fRootNode && fRootNode->InitCheck() == B_OK) {
360 		status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode);
361 		if (status == B_OK) {
362 			// try to get indices root dir
363 
364 			// question: why doesn't get_vnode() work here??
365 			// answer: we have not yet backpropagated the pointer to the
366 			// volume in bfs_mount(), so bfs_read_vnode() can't get it.
367 			// But it's not needed to do that anyway.
368 
369 			if (!Indices().IsZero())
370 				fIndicesNode = new Inode(this, ToVnode(Indices()));
371 
372 			if (fIndicesNode == NULL
373 				|| fIndicesNode->InitCheck() < B_OK
374 				|| !fIndicesNode->IsContainer()) {
375 				INFORM(("bfs: volume doesn't have indices!\n"));
376 
377 				if (fIndicesNode) {
378 					// if this is the case, the index root node is gone bad, and
379 					// BFS switch to read-only mode
380 					fFlags |= VOLUME_READ_ONLY;
381 					delete fIndicesNode;
382 					fIndicesNode = NULL;
383 				}
384 			}
385 
386 			// all went fine
387 			opener.Keep();
388 			return B_OK;
389 		} else
390 			FATAL(("could not create root node: publish_vnode() failed!\n"));
391 
392 		delete fRootNode;
393 	} else {
394 		status = B_BAD_VALUE;
395 		FATAL(("could not create root node!\n"));
396 	}
397 
398 	return status;
399 }
400 
401 
/*!	Releases everything Mount() acquired: puts the root vnode reference,
	flushes and deletes the journal, deletes the indices node, removes
	the block cache, and closes the device.
*/
status_t
Volume::Unmount()
{
	// Unlike in BeOS, we need to put the reference to our root node ourselves
	put_vnode(fID, ToVnode(Root()));

	// This will also flush the log & all blocks to disk
	delete fJournal;
	fJournal = NULL;

	delete fIndicesNode;

	// write back dirty blocks unless we are mounted read-only
	block_cache_delete(fBlockCache, !IsReadOnly());
	close(fDevice);

	return B_OK;
}
419 
420 
// Flushes the log and all cached blocks to disk.
// NOTE(review): assumes the volume is mounted (fJournal != NULL) - confirm
// no caller reaches this on a failed or never-performed mount.
status_t
Volume::Sync()
{
	return fJournal->FlushLogAndBlocks();
}
426 
427 
/*!	Sanity-checks \a run: the allocation group index must be in range,
	the run must be non-empty, and it must lie completely within its
	allocation group. On failure, the volume is switched to read-only
	mode via Panic() and B_BAD_DATA is returned.
	NOTE(review): the group check uses ">" rather than ">=", so a run in
	group == AllocationGroups() passes - confirm this boundary is intended.
*/
status_t
Volume::ValidateBlockRun(block_run run)
{
	if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups()
		|| run.Start() > (1UL << AllocationGroupShift())
		|| run.length == 0
		|| uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) {
		Panic();
		FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(), run.Start(), run.Length()));
		return B_BAD_DATA;
	}
	return B_OK;
}
441 
442 
443 block_run
444 Volume::ToBlockRun(off_t block) const
445 {
446 	block_run run;
447 	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift());
448 	run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1));
449 	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
450 	return run;
451 }
452 
453 
/*!	Creates the indices root directory (a string-index directory owned by
	no parent) within \a transaction and records its location in the
	super block, which is then written back to disk.
*/
status_t
Volume::CreateIndicesRoot(Transaction &transaction)
{
	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, &id, &fIndicesNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.indices = ToBlockRun(id);
	return WriteSuperBlock();
}
466 
467 
// Allocates a block_run for a new inode of the given \a type, placed
// relative to its \a parent's block run; forwards to the block allocator.
status_t
Volume::AllocateForInode(Transaction &transaction, const Inode *parent, mode_t type, block_run &run)
{
	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run);
}
473 
474 
475 status_t
476 Volume::WriteSuperBlock()
477 {
478 	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block))
479 		return B_IO_ERROR;
480 
481 	return B_OK;
482 }
483 
484 
485 void
486 Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, const uint8 *oldKey,
487 	size_t oldLength, const uint8 *newKey, size_t newLength)
488 {
489 	if (fQueryLock.Lock() < B_OK)
490 		return;
491 
492 	Query *query = NULL;
493 	while ((query = fQueries.Next(query)) != NULL)
494 		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength);
495 
496 	fQueryLock.Unlock();
497 }
498 
499 
/** Checks if there is a live query whose results depend on the presence
 *	or value of the specified attribute.
 *	Don't use it if you already have all the data together to evaluate
 *	the queries - it wouldn't save you anything in this case.
 */

bool
Volume::CheckForLiveQuery(const char *attribute)
{
	// ToDo: check for a live query that depends on the specified attribute
	return true;
		// conservative placeholder: pretend some query always cares
}
512 
513 
// Registers \a query for live update notifications; the list is guarded
// by fQueryLock (the query is silently dropped if locking fails).
void
Volume::AddQuery(Query *query)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	fQueries.Add(query);

	fQueryLock.Unlock();
}
524 
525 
// Unregisters \a query from live update notifications; counterpart to
// AddQuery(), guarded by the same fQueryLock.
void
Volume::RemoveQuery(Query *query)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	fQueries.Remove(query);

	fQueryLock.Unlock();
}
536 
537 
538 //	#pragma mark -
539 //	Disk scanning and initialization
540 
541 
/*!	Reads the first kilobyte from \a fd and checks whether it contains a
	valid BFS super block, which is copied into \a superBlock.
	The super block is expected at offset 512; unless the file system is
	built BFS_LITTLE_ENDIAN_ONLY, offset 0 is tried as a fallback (the
	PowerPC layout mentioned in the note below).
*/
status_t
Volume::Identify(int fd, disk_super_block *superBlock)
{
	char buffer[1024];
	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
		return B_IO_ERROR;

	// Note: that does work only for x86, for PowerPC, the super block
	// may be located at offset 0!
	memcpy(superBlock, buffer + 512, sizeof(disk_super_block));
	if (!superBlock->IsValid()) {
#ifndef BFS_LITTLE_ENDIAN_ONLY
		// retry at offset 0 before giving up
		memcpy(superBlock, buffer, sizeof(disk_super_block));
		if (!superBlock->IsValid())
			return B_BAD_VALUE;
#else
		return B_BAD_VALUE;
#endif
	}

	return B_OK;
}
564 
565 
566 status_t
567 Volume::Initialize(const char *device, const char *name, uint32 blockSize,
568 	uint32 flags)
569 {
570 	// although there is no really good reason for it, we won't
571 	// accept '/' in disk names (mkbfs does this, too - and since
572 	// Tracker names mounted volumes like their name)
573 	if (strchr(name, '/') != NULL)
574 		return B_BAD_VALUE;
575 
576 	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 && blockSize != 8192)
577 		return B_BAD_VALUE;
578 
579 	DeviceOpener opener(device, O_RDWR);
580 	if (opener.Device() < B_OK)
581 		return B_BAD_VALUE;
582 
583 	fDevice = opener.Device();
584 
585 	uint32 deviceBlockSize;
586 	off_t deviceSize;
587 	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
588 		return B_ERROR;
589 
590 	off_t numBlocks = deviceSize / blockSize;
591 
592 	// create valid super block
593 
594 	fSuperBlock.Initialize(name, numBlocks, blockSize);
595 
596 	// initialize short hands to the super block (to save byte swapping)
597 	fBlockSize = fSuperBlock.BlockSize();
598 	fBlockShift = fSuperBlock.BlockShift();
599 	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();
600 
601 	// since the allocator has not been initialized yet, we
602 	// cannot use BlockAllocator::BitmapSize() here
603 	fSuperBlock.log_blocks = ToBlockRun(AllocationGroups()
604 		* fSuperBlock.BlocksPerAllocationGroup() + 1);
605 	fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048);
606 		// ToDo: set the log size depending on the disk size
607 	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(ToBlock(Log()));
608 
609 	// set the current log pointers, so that journaling will work correctly
610 	fLogStart = fSuperBlock.LogStart();
611 	fLogEnd = fSuperBlock.LogEnd();
612 
613 	if (!IsValidSuperBlock())
614 		RETURN_ERROR(B_ERROR);
615 
616 	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
617 		return B_ERROR;
618 
619 	fJournal = new Journal(this);
620 	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
621 		RETURN_ERROR(B_ERROR);
622 
623 	// ready to write data to disk
624 
625 	Transaction transaction(this, 0);
626 
627 	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
628 		RETURN_ERROR(B_ERROR);
629 
630 	off_t id;
631 	status_t status = Inode::Create(transaction, NULL, NULL,
632 		S_DIRECTORY | 0755, 0, 0, &id, &fRootNode);
633 	if (status < B_OK)
634 		RETURN_ERROR(status);
635 
636 	fSuperBlock.root_dir = ToBlockRun(id);
637 
638 	if ((flags & VOLUME_NO_INDICES) == 0) {
639 		// The indices root directory will be created automatically
640 		// when the standard indices are created (or any other).
641 		Index index(this);
642 		status = index.Create(transaction, "name", B_STRING_TYPE);
643 		if (status < B_OK)
644 			return status;
645 
646 		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
647 		if (status < B_OK)
648 			return status;
649 
650 		status = index.Create(transaction, "size", B_INT64_TYPE);
651 		if (status < B_OK)
652 			return status;
653 	}
654 
655 	WriteSuperBlock();
656 	transaction.Done();
657 
658 // 	put_vnode(ID(), fRootNode->ID());
659 // 	if (fIndicesNode != NULL)
660 // 		put_vnode(ID(), fIndicesNode->ID());
661 
662 	Sync();
663 	opener.RemoveCache(true);
664 	return B_OK;
665 }
666