1 /* 2 * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 //! super block, mounting, etc. 7 8 9 #include "Debug.h" 10 #include "Volume.h" 11 #include "Journal.h" 12 #include "Inode.h" 13 #include "Query.h" 14 15 16 static const int32 kDesiredAllocationGroups = 56; 17 // This is the number of allocation groups that will be tried 18 // to be given for newly initialized disks. 19 // That's only relevant for smaller disks, though, since any 20 // of today's disk sizes already reach the maximum length 21 // of an allocation group (65536 blocks). 22 // It seems to create appropriate numbers for smaller disks 23 // with this setting, though (i.e. you can create a 400 MB 24 // file on a 1 GB disk without the need for double indirect 25 // blocks). 26 27 28 class DeviceOpener { 29 public: 30 DeviceOpener(int fd, int mode); 31 DeviceOpener(const char* device, int mode); 32 ~DeviceOpener(); 33 34 int Open(const char* device, int mode); 35 int Open(int fd, int mode); 36 void* InitCache(off_t numBlocks, uint32 blockSize); 37 void RemoveCache(bool allowWrites); 38 39 void Keep(); 40 41 int Device() const { return fDevice; } 42 int Mode() const { return fMode; } 43 bool IsReadOnly() const { return _IsReadOnly(fMode); } 44 45 status_t GetSize(off_t* _size, uint32* _blockSize = NULL); 46 47 private: 48 static bool _IsReadOnly(int mode) 49 { return (mode & O_RWMASK) == O_RDONLY;} 50 static bool _IsReadWrite(int mode) 51 { return (mode & O_RWMASK) == O_RDWR;} 52 53 int fDevice; 54 int fMode; 55 void* fBlockCache; 56 }; 57 58 59 DeviceOpener::DeviceOpener(const char* device, int mode) 60 : 61 fBlockCache(NULL) 62 { 63 Open(device, mode); 64 } 65 66 67 DeviceOpener::DeviceOpener(int fd, int mode) 68 : 69 fBlockCache(NULL) 70 { 71 Open(fd, mode); 72 } 73 74 75 DeviceOpener::~DeviceOpener() 76 { 77 if (fDevice >= 0) { 78 RemoveCache(false); 79 close(fDevice); 80 } 81 } 82 83 84 int 85 DeviceOpener::Open(const char* device, int mode) 86 { 87 fDevice = open(device, mode | O_NOCACHE); 88 if (fDevice < 0) 89 fDevice = errno; 90 91 if (fDevice < 0 && _IsReadWrite(mode)) { 92 // try again to open read-only (don't rely on a specific error code) 93 return Open(device, O_RDONLY | O_NOCACHE); 94 } 95 96 if (fDevice >= 0) { 97 // opening succeeded 98 fMode = mode; 99 if (_IsReadWrite(mode)) { 100 // check out if the device really allows for read/write access 101 device_geometry geometry; 102 if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) { 103 if (geometry.read_only) { 104 // reopen device read-only 105 close(fDevice); 106 return Open(device, O_RDONLY | O_NOCACHE); 107 } 108 } 109 } 110 } 111 112 return fDevice; 113 } 114 115 116 int 117 DeviceOpener::Open(int fd, int mode) 118 { 119 fDevice = dup(fd); 120 if (fDevice < 0) 121 return errno; 122 123 fMode = mode; 124 125 return fDevice; 126 } 127 128 129 void* 130 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize) 131 { 132 return fBlockCache = block_cache_create(fDevice, numBlocks, blockSize, 133 IsReadOnly()); 134 } 135 136 137 void 138 DeviceOpener::RemoveCache(bool allowWrites) 139 { 140 if (fBlockCache == NULL) 141 return; 142 143 block_cache_delete(fBlockCache, allowWrites); 144 fBlockCache = NULL; 145 } 146 147 148 void 149 DeviceOpener::Keep() 150 { 151 fDevice = -1; 152 } 153 154 155 /*! Returns the size of the device in bytes. It uses B_GET_GEOMETRY 156 to compute the size, or fstat() if that failed. 157 */ 158 status_t 159 DeviceOpener::GetSize(off_t* _size, uint32* _blockSize) 160 { 161 device_geometry geometry; 162 if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) { 163 // maybe it's just a file 164 struct stat stat; 165 if (fstat(fDevice, &stat) < 0) 166 return B_ERROR; 167 168 if (_size) 169 *_size = stat.st_size; 170 if (_blockSize) // that shouldn't cause us any problems 171 *_blockSize = 512; 172 173 return B_OK; 174 } 175 176 if (_size) { 177 *_size = 1LL * geometry.head_count * geometry.cylinder_count 178 * geometry.sectors_per_track * geometry.bytes_per_sector; 179 } 180 if (_blockSize) 181 *_blockSize = geometry.bytes_per_sector; 182 183 return B_OK; 184 } 185 186 187 // #pragma mark - 188 189 190 bool 191 disk_super_block::IsValid() 192 { 193 if (Magic1() != (int32)SUPER_BLOCK_MAGIC1 194 || Magic2() != (int32)SUPER_BLOCK_MAGIC2 195 || Magic3() != (int32)SUPER_BLOCK_MAGIC3 196 || (int32)block_size != inode_size 197 || ByteOrder() != SUPER_BLOCK_FS_LENDIAN 198 || (1UL << BlockShift()) != BlockSize() 199 || AllocationGroups() < 1 200 || AllocationGroupShift() < 1 201 || BlocksPerAllocationGroup() < 1 202 || NumBlocks() < 10 203 || AllocationGroups() != divide_roundup(NumBlocks(), 204 1L << AllocationGroupShift())) 205 return false; 206 207 return true; 208 } 209 210 211 void 212 disk_super_block::Initialize(const char* diskName, off_t numBlocks, 213 uint32 blockSize) 214 { 215 memset(this, 0, sizeof(disk_super_block)); 216 217 magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1); 218 magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2); 219 magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3); 220 fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN); 221 flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN); 222 223 strlcpy(name, diskName, sizeof(name)); 224 225 int32 blockShift = 9; 226 while ((1UL << blockShift) < blockSize) { 227 blockShift++; 228 } 229 230 block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize); 231 block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift); 232 233 num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks); 234 used_blocks = 0; 235 236 // Get the minimum ag_shift (that's determined by the block size) 237 238 int32 bitsPerBlock = blockSize << 3; 239 off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock; 240 int32 blocksPerGroup = 1; 241 int32 groupShift = 13; 242 243 for (int32 i = 8192; i < bitsPerBlock; i *= 2) { 244 groupShift++; 245 } 246 247 // Many allocation groups help applying allocation policies, but if 248 // they are too small, we will need to many block_runs to cover large 249 // files (see above to get an explanation of the kDesiredAllocationGroups 250 // constant). 251 252 int32 numGroups; 253 254 while (true) { 255 numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup; 256 if (numGroups > kDesiredAllocationGroups) { 257 if (groupShift == 16) 258 break; 259 260 groupShift++; 261 blocksPerGroup *= 2; 262 } else 263 break; 264 } 265 266 num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups); 267 blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup); 268 ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift); 269 } 270 271 272 // #pragma mark - 273 274 275 Volume::Volume(fs_volume* volume) 276 : 277 fVolume(volume), 278 fBlockAllocator(this), 279 fRootNode(NULL), 280 fIndicesNode(NULL), 281 fDirtyCachedBlocks(0), 282 fFlags(0), 283 fCheckingThread(-1) 284 { 285 mutex_init(&fLock, "bfs volume"); 286 mutex_init(&fQueryLock, "bfs queries"); 287 } 288 289 290 Volume::~Volume() 291 { 292 mutex_destroy(&fQueryLock); 293 mutex_destroy(&fLock); 294 } 295 296 297 bool 298 Volume::IsValidSuperBlock() 299 { 300 return fSuperBlock.IsValid(); 301 } 302 303 304 void 305 Volume::Panic() 306 { 307 FATAL(("Disk corrupted... switch to read-only mode!\n")); 308 fFlags |= VOLUME_READ_ONLY; 309 #if KDEBUG 310 kernel_debugger("BFS panics!"); 311 #endif 312 } 313 314 315 status_t 316 Volume::Mount(const char* deviceName, uint32 flags) 317 { 318 // TODO: validate the FS in write mode as well! 319 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \ 320 || (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY)) 321 // in big endian mode, we only mount read-only for now 322 flags |= B_MOUNT_READ_ONLY; 323 #endif 324 325 DeviceOpener opener(deviceName, (flags & B_MOUNT_READ_ONLY) != 0 326 ? O_RDONLY : O_RDWR); 327 fDevice = opener.Device(); 328 if (fDevice < B_OK) 329 RETURN_ERROR(fDevice); 330 331 if (opener.IsReadOnly()) 332 fFlags |= VOLUME_READ_ONLY; 333 334 // read the super block 335 if (Identify(fDevice, &fSuperBlock) != B_OK) { 336 FATAL(("invalid super block!\n")); 337 return B_BAD_VALUE; 338 } 339 340 // initialize short hands to the super block (to save byte swapping) 341 fBlockSize = fSuperBlock.BlockSize(); 342 fBlockShift = fSuperBlock.BlockShift(); 343 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 344 345 // check if the device size is large enough to hold the file system 346 off_t diskSize; 347 if (opener.GetSize(&diskSize, &fDeviceBlockSize) != B_OK) 348 RETURN_ERROR(B_ERROR); 349 if (diskSize < (NumBlocks() << BlockShift())) 350 RETURN_ERROR(B_BAD_VALUE); 351 352 // set the current log pointers, so that journaling will work correctly 353 fLogStart = fSuperBlock.LogStart(); 354 fLogEnd = fSuperBlock.LogEnd(); 355 356 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 357 return B_ERROR; 358 359 fJournal = new(std::nothrow) Journal(this); 360 if (fJournal == NULL) 361 return B_NO_MEMORY; 362 363 status_t status = fJournal->InitCheck(); 364 if (status < B_OK) { 365 FATAL(("could not initialize journal: %s!\n", strerror(status))); 366 return status; 367 } 368 369 // replaying the log is the first thing we will do on this disk 370 status = fJournal->ReplayLog(); 371 if (status != B_OK) { 372 FATAL(("Replaying log failed, data may be corrupted, volume " 373 "read-only.\n")); 374 fFlags |= VOLUME_READ_ONLY; 375 // TODO: if this is the boot volume, Bootscript will assume this 376 // is a CD... 377 // TODO: it would be nice to have a user visible alert instead 378 // of letting him just find this in the syslog. 379 } 380 381 status = fBlockAllocator.Initialize(); 382 if (status != B_OK) { 383 FATAL(("could not initialize block bitmap allocator!\n")); 384 return status; 385 } 386 387 fRootNode = new(std::nothrow) Inode(this, ToVnode(Root())); 388 if (fRootNode != NULL && fRootNode->InitCheck() == B_OK) { 389 status = publish_vnode(fVolume, ToVnode(Root()), (void*)fRootNode, 390 &gBFSVnodeOps, fRootNode->Mode(), 0); 391 if (status == B_OK) { 392 // try to get indices root dir 393 394 if (!Indices().IsZero()) { 395 fIndicesNode = new(std::nothrow) Inode(this, 396 ToVnode(Indices())); 397 } 398 399 if (fIndicesNode == NULL 400 || fIndicesNode->InitCheck() < B_OK 401 || !fIndicesNode->IsContainer()) { 402 INFORM(("bfs: volume doesn't have indices!\n")); 403 404 if (fIndicesNode) { 405 // if this is the case, the index root node is gone bad, 406 // and BFS switch to read-only mode 407 fFlags |= VOLUME_READ_ONLY; 408 delete fIndicesNode; 409 fIndicesNode = NULL; 410 } 411 } else { 412 // we don't use the vnode layer to access the indices node 413 } 414 415 // all went fine 416 opener.Keep(); 417 return B_OK; 418 } else 419 FATAL(("could not create root node: publish_vnode() failed!\n")); 420 421 delete fRootNode; 422 } else { 423 status = B_BAD_VALUE; 424 FATAL(("could not create root node!\n")); 425 } 426 427 return status; 428 } 429 430 431 status_t 432 Volume::Unmount() 433 { 434 put_vnode(fVolume, ToVnode(Root())); 435 436 fBlockAllocator.Uninitialize(); 437 438 // This will also flush the log & all blocks to disk 439 delete fJournal; 440 fJournal = NULL; 441 442 delete fIndicesNode; 443 444 block_cache_delete(fBlockCache, !IsReadOnly()); 445 close(fDevice); 446 447 return B_OK; 448 } 449 450 451 status_t 452 Volume::Sync() 453 { 454 return fJournal->FlushLogAndBlocks(); 455 } 456 457 458 status_t 459 Volume::ValidateBlockRun(block_run run) 460 { 461 if (run.AllocationGroup() < 0 462 || run.AllocationGroup() > (int32)AllocationGroups() 463 || run.Start() > (1UL << AllocationGroupShift()) 464 || run.length == 0 465 || uint32(run.Length() + run.Start()) 466 > (1UL << AllocationGroupShift())) { 467 Panic(); 468 FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(), 469 run.Start(), run.Length())); 470 return B_BAD_DATA; 471 } 472 return B_OK; 473 } 474 475 476 block_run 477 Volume::ToBlockRun(off_t block) const 478 { 479 block_run run; 480 run.allocation_group = HOST_ENDIAN_TO_BFS_INT32( 481 block >> AllocationGroupShift()); 482 run.start = HOST_ENDIAN_TO_BFS_INT16( 483 block & ((1LL << AllocationGroupShift()) - 1)); 484 run.length = HOST_ENDIAN_TO_BFS_INT16(1); 485 return run; 486 } 487 488 489 status_t 490 Volume::CreateIndicesRoot(Transaction& transaction) 491 { 492 off_t id; 493 status_t status = Inode::Create(transaction, NULL, NULL, 494 S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id, 495 &fIndicesNode); 496 if (status < B_OK) 497 RETURN_ERROR(status); 498 499 fSuperBlock.indices = ToBlockRun(id); 500 return WriteSuperBlock(); 501 } 502 503 504 status_t 505 Volume::AllocateForInode(Transaction& transaction, const Inode* parent, 506 mode_t type, block_run& run) 507 { 508 return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), 509 type, run); 510 } 511 512 513 status_t 514 Volume::WriteSuperBlock() 515 { 516 if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) 517 != sizeof(disk_super_block)) 518 return B_IO_ERROR; 519 520 return B_OK; 521 } 522 523 524 void 525 Volume::UpdateLiveQueries(Inode* inode, const char* attribute, int32 type, 526 const uint8* oldKey, size_t oldLength, const uint8* newKey, 527 size_t newLength) 528 { 529 MutexLocker _(fQueryLock); 530 531 SinglyLinkedList<Query>::Iterator iterator = fQueries.GetIterator(); 532 while (iterator.HasNext()) { 533 Query* query = iterator.Next(); 534 query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, 535 newLength); 536 } 537 } 538 539 540 void 541 Volume::UpdateLiveQueriesRenameMove(Inode* inode, ino_t oldDirectoryID, 542 const char* oldName, ino_t newDirectoryID, const char* newName) 543 { 544 MutexLocker _(fQueryLock); 545 546 size_t oldLength = strlen(oldName); 547 size_t newLength = strlen(newName); 548 549 SinglyLinkedList<Query>::Iterator iterator = fQueries.GetIterator(); 550 while (iterator.HasNext()) { 551 Query* query = iterator.Next(); 552 query->LiveUpdateRenameMove(inode, oldDirectoryID, oldName, oldLength, 553 newDirectoryID, newName, newLength); 554 } 555 } 556 557 558 /*! Checks if there is a live query whose results depend on the presence 559 or value of the specified attribute. 560 Don't use it if you already have all the data together to evaluate 561 the queries - it wouldn't safe you anything in this case. 562 */ 563 bool 564 Volume::CheckForLiveQuery(const char* attribute) 565 { 566 // TODO: check for a live query that depends on the specified attribute 567 return true; 568 } 569 570 571 void 572 Volume::AddQuery(Query* query) 573 { 574 MutexLocker _(fQueryLock); 575 fQueries.Add(query); 576 } 577 578 579 void 580 Volume::RemoveQuery(Query* query) 581 { 582 MutexLocker _(fQueryLock); 583 fQueries.Remove(query); 584 } 585 586 587 // #pragma mark - Disk scanning and initialization 588 589 590 /*static*/ status_t 591 Volume::CheckSuperBlock(const uint8* data, uint32* _offset) 592 { 593 disk_super_block* superBlock = (disk_super_block*)(data + 512); 594 if (superBlock->IsValid()) { 595 if (_offset != NULL) 596 *_offset = 512; 597 return B_OK; 598 } 599 600 #ifndef BFS_LITTLE_ENDIAN_ONLY 601 // For PPC, the super block might be located at offset 0 602 superBlock = (disk_super_block*)data; 603 if (superBlock->IsValid()) { 604 if (_offset != NULL) 605 *_offset = 0; 606 return B_OK; 607 } 608 #endif 609 610 return B_BAD_VALUE; 611 } 612 613 614 /*static*/ status_t 615 Volume::Identify(int fd, disk_super_block* superBlock) 616 { 617 uint8 buffer[1024]; 618 if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer)) 619 return B_IO_ERROR; 620 621 uint32 offset; 622 if (CheckSuperBlock(buffer, &offset) != B_OK) 623 return B_BAD_VALUE; 624 625 memcpy(superBlock, buffer + offset, sizeof(disk_super_block)); 626 return B_OK; 627 } 628 629 630 status_t 631 Volume::Initialize(int fd, const char* name, uint32 blockSize, 632 uint32 flags) 633 { 634 // although there is no really good reason for it, we won't 635 // accept '/' in disk names (mkbfs does this, too - and since 636 // Tracker names mounted volumes like their name) 637 if (strchr(name, '/') != NULL) 638 return B_BAD_VALUE; 639 640 if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 641 && blockSize != 8192) 642 return B_BAD_VALUE; 643 644 DeviceOpener opener(fd, O_RDWR); 645 if (opener.Device() < B_OK) 646 return B_BAD_VALUE; 647 648 if (opener.IsReadOnly()) 649 return B_READ_ONLY_DEVICE; 650 651 fDevice = opener.Device(); 652 653 uint32 deviceBlockSize; 654 off_t deviceSize; 655 if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK) 656 return B_ERROR; 657 658 off_t numBlocks = deviceSize / blockSize; 659 660 // create valid super block 661 662 fSuperBlock.Initialize(name, numBlocks, blockSize); 663 664 // initialize short hands to the super block (to save byte swapping) 665 fBlockSize = fSuperBlock.BlockSize(); 666 fBlockShift = fSuperBlock.BlockShift(); 667 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 668 669 // determine log size depending on the size of the volume 670 off_t logSize = 2048; 671 if (numBlocks <= 20480) 672 logSize = 512; 673 if (deviceSize > 1LL * 1024 * 1024 * 1024) 674 logSize = 4096; 675 676 // since the allocator has not been initialized yet, we 677 // cannot use BlockAllocator::BitmapSize() here 678 off_t bitmapBlocks = (numBlocks + blockSize * 8 - 1) / (blockSize * 8); 679 680 fSuperBlock.log_blocks = ToBlockRun(bitmapBlocks + 1); 681 fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(logSize); 682 fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64( 683 ToBlock(Log())); 684 685 // set the current log pointers, so that journaling will work correctly 686 fLogStart = fSuperBlock.LogStart(); 687 fLogEnd = fSuperBlock.LogEnd(); 688 689 if (!IsValidSuperBlock()) 690 RETURN_ERROR(B_ERROR); 691 692 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 693 return B_ERROR; 694 695 fJournal = new(std::nothrow) Journal(this); 696 if (fJournal == NULL || fJournal->InitCheck() < B_OK) 697 RETURN_ERROR(B_ERROR); 698 699 // ready to write data to disk 700 701 Transaction transaction(this, 0); 702 703 if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK) 704 RETURN_ERROR(B_ERROR); 705 706 off_t id; 707 status_t status = Inode::Create(transaction, NULL, NULL, 708 S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode); 709 if (status < B_OK) 710 RETURN_ERROR(status); 711 712 fSuperBlock.root_dir = ToBlockRun(id); 713 714 if ((flags & VOLUME_NO_INDICES) == 0) { 715 // The indices root directory will be created automatically 716 // when the standard indices are created (or any other). 717 Index index(this); 718 status = index.Create(transaction, "name", B_STRING_TYPE); 719 if (status < B_OK) 720 return status; 721 722 status = index.Create(transaction, "BEOS:APP_SIG", B_STRING_TYPE); 723 if (status < B_OK) 724 return status; 725 726 status = index.Create(transaction, "last_modified", B_INT64_TYPE); 727 if (status < B_OK) 728 return status; 729 730 status = index.Create(transaction, "size", B_INT64_TYPE); 731 if (status < B_OK) 732 return status; 733 } 734 735 WriteSuperBlock(); 736 transaction.Done(); 737 738 Sync(); 739 opener.RemoveCache(true); 740 return B_OK; 741 } 742