/* Volume - BFS super block, mounting, etc.
 *
 * Copyright 2001-2006, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */


#include "Debug.h"
#include "Volume.h"
#include "Journal.h"
#include "Inode.h"
#include "Query.h"

#include <util/kernel_cpp.h>
#include <KernelExport.h>
#include <Drivers.h>
#include <fs_volume.h>

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups that will be tried
	// to be given for newly initialized disks.
	// That's only relevant for smaller disks, though, since any
	// of today's disk sizes already reaches the maximum length
	// of an allocation group (65536 blocks).
	// It seems to create appropriate numbers for smaller disks
	// with this setting, though (i.e. you can create a 400 MB
	// file on a 1 GB disk without the need for double indirect
	// blocks).
36 37 38 class DeviceOpener { 39 public: 40 DeviceOpener(const char *device, int mode); 41 ~DeviceOpener(); 42 43 int Open(const char *device, int mode); 44 void *InitCache(off_t numBlocks, uint32 blockSize); 45 void RemoveCache(bool allowWrites); 46 47 void Keep(); 48 49 int Device() const { return fDevice; } 50 int Mode() const { return fMode; } 51 52 status_t GetSize(off_t *_size, uint32 *_blockSize = NULL); 53 54 private: 55 int fDevice; 56 int fMode; 57 void *fBlockCache; 58 }; 59 60 61 DeviceOpener::DeviceOpener(const char *device, int mode) 62 : 63 fBlockCache(NULL) 64 { 65 Open(device, mode); 66 } 67 68 69 DeviceOpener::~DeviceOpener() 70 { 71 if (fDevice >= B_OK) { 72 RemoveCache(false); 73 close(fDevice); 74 } 75 } 76 77 78 int 79 DeviceOpener::Open(const char *device, int mode) 80 { 81 fDevice = open(device, mode); 82 if (fDevice < 0) 83 fDevice = errno; 84 85 if (fDevice < 0 && mode == O_RDWR) { 86 // try again to open read-only (don't rely on a specific error code) 87 return Open(device, O_RDONLY); 88 } 89 90 if (fDevice >= 0) { 91 // opening succeeded 92 fMode = mode; 93 if (mode == O_RDWR) { 94 // check out if the device really allows for read/write access 95 device_geometry geometry; 96 if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) { 97 if (geometry.read_only) { 98 // reopen device read-only 99 close(fDevice); 100 return Open(device, O_RDONLY); 101 } 102 } 103 } 104 } 105 106 return fDevice; 107 } 108 109 110 void * 111 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize) 112 { 113 return block_cache_create(fDevice, numBlocks, blockSize, fMode == O_RDONLY); 114 } 115 116 117 void 118 DeviceOpener::RemoveCache(bool allowWrites) 119 { 120 if (fBlockCache == NULL) 121 return; 122 123 block_cache_delete(fBlockCache, allowWrites); 124 fBlockCache = NULL; 125 } 126 127 128 void 129 DeviceOpener::Keep() 130 { 131 fDevice = -1; 132 } 133 134 135 /** Returns the size of the device in bytes. 
It uses B_GET_GEOMETRY 136 * to compute the size, or fstat() if that failed. 137 */ 138 139 status_t 140 DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize) 141 { 142 device_geometry geometry; 143 if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) { 144 // maybe it's just a file 145 struct stat stat; 146 if (fstat(fDevice, &stat) < 0) 147 return B_ERROR; 148 149 if (_size) 150 *_size = stat.st_size; 151 if (_blockSize) // that shouldn't cause us any problems 152 *_blockSize = 512; 153 154 return B_OK; 155 } 156 157 if (_size) { 158 *_size = 1LL * geometry.head_count * geometry.cylinder_count 159 * geometry.sectors_per_track * geometry.bytes_per_sector; 160 } 161 if (_blockSize) 162 *_blockSize = geometry.bytes_per_sector; 163 164 return B_OK; 165 } 166 167 168 // #pragma mark - 169 170 171 bool 172 disk_super_block::IsValid() 173 { 174 if (Magic1() != (int32)SUPER_BLOCK_MAGIC1 175 || Magic2() != (int32)SUPER_BLOCK_MAGIC2 176 || Magic3() != (int32)SUPER_BLOCK_MAGIC3 177 || (int32)block_size != inode_size 178 || ByteOrder() != SUPER_BLOCK_FS_LENDIAN 179 || (1UL << BlockShift()) != BlockSize() 180 || AllocationGroups() < 1 181 || AllocationGroupShift() < 1 182 || BlocksPerAllocationGroup() < 1 183 || NumBlocks() < 10 184 || AllocationGroups() != divide_roundup(NumBlocks(), 185 1L << AllocationGroupShift())) 186 return false; 187 188 return true; 189 } 190 191 192 void 193 disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize) 194 { 195 memset(this, 0, sizeof(disk_super_block)); 196 197 magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1); 198 magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2); 199 magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3); 200 fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN); 201 flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN); 202 203 strlcpy(name, diskName, sizeof(name)); 204 205 int32 blockShift = 9; 206 while ((1UL << blockShift) < blockSize) { 207 blockShift++; 208 } 209 
210 block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize); 211 block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift); 212 213 num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks); 214 used_blocks = 0; 215 216 // Get the minimum ag_shift (that's determined by the block size) 217 218 int32 bitsPerBlock = blockSize << 3; 219 off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock; 220 int32 blocksPerGroup = 1; 221 int32 groupShift = 13; 222 223 for (int32 i = 8192; i < bitsPerBlock; i *= 2) { 224 groupShift++; 225 } 226 227 // Many allocation groups help applying allocation policies, but if 228 // they are too small, we will need to many block_runs to cover large 229 // files (see above to get an explanation of the kDesiredAllocationGroups 230 // constant). 231 232 int32 numGroups; 233 234 while (true) { 235 numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup; 236 if (numGroups > kDesiredAllocationGroups) { 237 if (groupShift == 16) 238 break; 239 240 groupShift++; 241 blocksPerGroup *= 2; 242 } else 243 break; 244 } 245 246 num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups); 247 blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(1); 248 ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift); 249 } 250 251 252 // #pragma mark - 253 254 255 Volume::Volume(mount_id id) 256 : 257 fID(id), 258 fBlockAllocator(this), 259 fLock("bfs volume"), 260 fRootNode(NULL), 261 fIndicesNode(NULL), 262 fDirtyCachedBlocks(0), 263 fUniqueID(0), 264 fFlags(0) 265 { 266 } 267 268 269 Volume::~Volume() 270 { 271 } 272 273 274 bool 275 Volume::IsValidSuperBlock() 276 { 277 return fSuperBlock.IsValid(); 278 } 279 280 281 void 282 Volume::Panic() 283 { 284 FATAL(("we have to panic... 
switch to read-only mode!\n")); 285 fFlags |= VOLUME_READ_ONLY; 286 #ifdef USER 287 debugger("BFS panics!"); 288 #elif defined(DEBUG) 289 kernel_debugger("BFS panics!"); 290 #endif 291 } 292 293 294 status_t 295 Volume::Mount(const char *deviceName, uint32 flags) 296 { 297 // ToDo: validate the FS in write mode as well! 298 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \ 299 || (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY)) 300 // in big endian mode, we only mount read-only for now 301 flags |= B_MOUNT_READ_ONLY; 302 #endif 303 304 DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR); 305 fDevice = opener.Device(); 306 if (fDevice < B_OK) 307 RETURN_ERROR(fDevice); 308 309 if (opener.Mode() == O_RDONLY) 310 fFlags |= VOLUME_READ_ONLY; 311 312 // check if it's a regular file, and if so, disable the cache for the 313 // underlaying file system 314 struct stat stat; 315 if (fstat(fDevice, &stat) < 0) 316 RETURN_ERROR(B_ERROR); 317 318 // TODO: allow turning off caching of the underlying file (once O_NOCACHE works) 319 #if 0 320 #ifndef NO_FILE_UNCACHED_IO 321 if ((stat.st_mode & S_FILE) != 0 && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) { 322 // mount read-only if the cache couldn't be disabled 323 # ifdef DEBUG 324 FATAL(("couldn't disable cache for image file - system may dead-lock!\n")); 325 # else 326 FATAL(("couldn't disable cache for image file!\n")); 327 Panic(); 328 # endif 329 } 330 #endif 331 #endif 332 333 // read the super block 334 if (Identify(fDevice, &fSuperBlock) != B_OK) { 335 FATAL(("invalid super block!\n")); 336 return B_BAD_VALUE; 337 } 338 339 // initialize short hands to the super block (to save byte swapping) 340 fBlockSize = fSuperBlock.BlockSize(); 341 fBlockShift = fSuperBlock.BlockShift(); 342 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 343 344 // check if the device size is large enough to hold the file system 345 off_t diskSize; 346 if (opener.GetSize(&diskSize) < B_OK) 
347 RETURN_ERROR(B_ERROR); 348 if (diskSize < (NumBlocks() << BlockShift())) 349 RETURN_ERROR(B_BAD_VALUE); 350 351 // set the current log pointers, so that journaling will work correctly 352 fLogStart = fSuperBlock.LogStart(); 353 fLogEnd = fSuperBlock.LogEnd(); 354 355 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 356 return B_ERROR; 357 358 fJournal = new Journal(this); 359 // replaying the log is the first thing we will do on this disk 360 if (fJournal && fJournal->InitCheck() < B_OK 361 || fBlockAllocator.Initialize() < B_OK) { 362 // ToDo: improve error reporting for a bad journal 363 FATAL(("could not initialize journal/block bitmap allocator!\n")); 364 return B_NO_MEMORY; 365 } 366 367 status_t status = B_OK; 368 369 fRootNode = new Inode(this, ToVnode(Root())); 370 if (fRootNode && fRootNode->InitCheck() == B_OK) { 371 status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode); 372 if (status == B_OK) { 373 // try to get indices root dir 374 375 // question: why doesn't get_vnode() work here?? 376 // answer: we have not yet backpropagated the pointer to the 377 // volume in bfs_mount(), so bfs_read_vnode() can't get it. 378 // But it's not needed to do that anyway. 
379 380 if (!Indices().IsZero()) 381 fIndicesNode = new Inode(this, ToVnode(Indices())); 382 383 if (fIndicesNode == NULL 384 || fIndicesNode->InitCheck() < B_OK 385 || !fIndicesNode->IsContainer()) { 386 INFORM(("bfs: volume doesn't have indices!\n")); 387 388 if (fIndicesNode) { 389 // if this is the case, the index root node is gone bad, and 390 // BFS switch to read-only mode 391 fFlags |= VOLUME_READ_ONLY; 392 delete fIndicesNode; 393 fIndicesNode = NULL; 394 } 395 } 396 397 // all went fine 398 opener.Keep(); 399 return B_OK; 400 } else 401 FATAL(("could not create root node: publish_vnode() failed!\n")); 402 403 delete fRootNode; 404 } else { 405 status = B_BAD_VALUE; 406 FATAL(("could not create root node!\n")); 407 } 408 409 return status; 410 } 411 412 413 status_t 414 Volume::Unmount() 415 { 416 // Unlike in BeOS, we need to put the reference to our root node ourselves 417 put_vnode(fID, ToVnode(Root())); 418 419 // This will also flush the log & all blocks to disk 420 delete fJournal; 421 fJournal = NULL; 422 423 delete fIndicesNode; 424 425 block_cache_delete(fBlockCache, !IsReadOnly()); 426 close(fDevice); 427 428 return B_OK; 429 } 430 431 432 status_t 433 Volume::Sync() 434 { 435 return fJournal->FlushLogAndBlocks(); 436 } 437 438 439 status_t 440 Volume::ValidateBlockRun(block_run run) 441 { 442 if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups() 443 || run.Start() > (1UL << AllocationGroupShift()) 444 || run.length == 0 445 || uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) { 446 Panic(); 447 FATAL(("*** invalid run(%ld,%d,%d)\n", run.AllocationGroup(), run.Start(), run.Length())); 448 return B_BAD_DATA; 449 } 450 return B_OK; 451 } 452 453 454 block_run 455 Volume::ToBlockRun(off_t block) const 456 { 457 block_run run; 458 run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift()); 459 run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1)); 460 
run.length = HOST_ENDIAN_TO_BFS_INT16(1); 461 return run; 462 } 463 464 465 status_t 466 Volume::CreateIndicesRoot(Transaction &transaction) 467 { 468 off_t id; 469 status_t status = Inode::Create(transaction, NULL, NULL, 470 S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, &id, &fIndicesNode); 471 if (status < B_OK) 472 RETURN_ERROR(status); 473 474 fSuperBlock.indices = ToBlockRun(id); 475 return WriteSuperBlock(); 476 } 477 478 479 status_t 480 Volume::AllocateForInode(Transaction &transaction, const Inode *parent, mode_t type, block_run &run) 481 { 482 return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run); 483 } 484 485 486 status_t 487 Volume::WriteSuperBlock() 488 { 489 if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block)) 490 return B_IO_ERROR; 491 492 return B_OK; 493 } 494 495 496 void 497 Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, const uint8 *oldKey, 498 size_t oldLength, const uint8 *newKey, size_t newLength) 499 { 500 if (fQueryLock.Lock() < B_OK) 501 return; 502 503 Query *query = NULL; 504 while ((query = fQueries.Next(query)) != NULL) 505 query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength); 506 507 fQueryLock.Unlock(); 508 } 509 510 511 /** Checks if there is a live query whose results depend on the presence 512 * or value of the specified attribute. 513 * Don't use it if you already have all the data together to evaluate 514 * the queries - it wouldn't safe you anything in this case. 
515 */ 516 517 bool 518 Volume::CheckForLiveQuery(const char *attribute) 519 { 520 // ToDo: check for a live query that depends on the specified attribute 521 return true; 522 } 523 524 525 void 526 Volume::AddQuery(Query *query) 527 { 528 if (fQueryLock.Lock() < B_OK) 529 return; 530 531 fQueries.Add(query); 532 533 fQueryLock.Unlock(); 534 } 535 536 537 void 538 Volume::RemoveQuery(Query *query) 539 { 540 if (fQueryLock.Lock() < B_OK) 541 return; 542 543 fQueries.Remove(query); 544 545 fQueryLock.Unlock(); 546 } 547 548 549 // #pragma mark - 550 // Disk scanning and initialization 551 552 553 status_t 554 Volume::Identify(int fd, disk_super_block *superBlock) 555 { 556 char buffer[1024]; 557 if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer)) 558 return B_IO_ERROR; 559 560 // Note: that does work only for x86, for PowerPC, the super block 561 // may be located at offset 0! 562 memcpy(superBlock, buffer + 512, sizeof(disk_super_block)); 563 if (!superBlock->IsValid()) { 564 #ifndef BFS_LITTLE_ENDIAN_ONLY 565 memcpy(superBlock, buffer, sizeof(disk_super_block)); 566 if (!superBlock->IsValid()) 567 return B_BAD_VALUE; 568 #else 569 return B_BAD_VALUE; 570 #endif 571 } 572 573 return B_OK; 574 } 575 576 #ifdef USER 577 extern "C" void kill_device_vnodes(dev_t id); 578 // This call is only available in the userland fs_shell 579 580 status_t 581 Volume::Initialize(const char *device, const char *name, uint32 blockSize, uint32 flags) 582 { 583 // although there is no really good reason for it, we won't 584 // accept '/' in disk names (mkbfs does this, too - and since 585 // Tracker names mounted volumes like their name) 586 if (strchr(name, '/') != NULL) 587 return B_BAD_VALUE; 588 589 if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 && blockSize != 8192) 590 return B_BAD_VALUE; 591 592 DeviceOpener opener(device, O_RDWR); 593 if (opener.Device() < B_OK) 594 return B_BAD_VALUE; 595 596 fDevice = opener.Device(); 597 598 uint32 deviceBlockSize; 
599 off_t deviceSize; 600 if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK) 601 return B_ERROR; 602 603 off_t numBlocks = deviceSize / blockSize; 604 605 // create valid super block 606 607 fSuperBlock.Initialize(name, numBlocks, blockSize); 608 609 // initialize short hands to the super block (to save byte swapping) 610 fBlockSize = fSuperBlock.BlockSize(); 611 fBlockShift = fSuperBlock.BlockShift(); 612 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 613 614 // since the allocator has not been initialized yet, we 615 // cannot use BlockAllocator::BitmapSize() here 616 fSuperBlock.log_blocks = ToBlockRun(AllocationGroups() 617 * fSuperBlock.BlocksPerAllocationGroup() + 1); 618 fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048); 619 // ToDo: set the log size depending on the disk size 620 fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(ToBlock(Log())); 621 622 // set the current log pointers, so that journaling will work correctly 623 fLogStart = fSuperBlock.LogStart(); 624 fLogEnd = fSuperBlock.LogEnd(); 625 626 if (!IsValidSuperBlock()) 627 RETURN_ERROR(B_ERROR); 628 629 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 630 return B_ERROR; 631 632 fJournal = new Journal(this); 633 if (fJournal == NULL || fJournal->InitCheck() < B_OK) 634 RETURN_ERROR(B_ERROR); 635 636 // ready to write data to disk 637 638 Transaction transaction(this, 0); 639 640 if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK) 641 RETURN_ERROR(B_ERROR); 642 643 off_t id; 644 status_t status = Inode::Create(transaction, NULL, NULL, 645 S_DIRECTORY | 0755, 0, 0, &id, &fRootNode); 646 if (status < B_OK) 647 RETURN_ERROR(status); 648 649 fSuperBlock.root_dir = ToBlockRun(id); 650 651 if ((flags & VOLUME_NO_INDICES) == 0) { 652 // The indices root directory will be created automatically 653 // when the standard indices are created (or any other). 
654 Index index(this); 655 status = index.Create(transaction, "name", B_STRING_TYPE); 656 if (status < B_OK) 657 return status; 658 659 status = index.Create(transaction, "last_modified", B_INT64_TYPE); 660 if (status < B_OK) 661 return status; 662 663 status = index.Create(transaction, "size", B_INT64_TYPE); 664 if (status < B_OK) 665 return status; 666 } 667 668 WriteSuperBlock(); 669 transaction.Done(); 670 671 put_vnode(ID(), fRootNode->ID()); 672 if (fIndicesNode != NULL) 673 put_vnode(ID(), fIndicesNode->ID()); 674 675 kill_device_vnodes(ID()); 676 // This call is only available in the userland fs_shell 677 678 Sync(); 679 opener.RemoveCache(true); 680 return B_OK; 681 } 682 #endif 683