1 /* 2 * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 //! super block, mounting, etc. 7 8 9 #include "Attribute.h" 10 #include "Debug.h" 11 #include "Inode.h" 12 #include "Journal.h" 13 #include "Query.h" 14 #include "Volume.h" 15 16 17 static const int32 kDesiredAllocationGroups = 56; 18 // This is the number of allocation groups that will be tried 19 // to be given for newly initialized disks. 20 // That's only relevant for smaller disks, though, since any 21 // of today's disk sizes already reach the maximum length 22 // of an allocation group (65536 blocks). 23 // It seems to create appropriate numbers for smaller disks 24 // with this setting, though (i.e. you can create a 400 MB 25 // file on a 1 GB disk without the need for double indirect 26 // blocks). 27 28 29 class DeviceOpener { 30 public: 31 DeviceOpener(int fd, int mode); 32 DeviceOpener(const char* device, int mode); 33 ~DeviceOpener(); 34 35 int Open(const char* device, int mode); 36 int Open(int fd, int mode); 37 void* InitCache(off_t numBlocks, uint32 blockSize); 38 void RemoveCache(bool allowWrites); 39 40 void Keep(); 41 42 int Device() const { return fDevice; } 43 int Mode() const { return fMode; } 44 bool IsReadOnly() const { return _IsReadOnly(fMode); } 45 46 status_t GetSize(off_t* _size, uint32* _blockSize = NULL); 47 48 private: 49 static bool _IsReadOnly(int mode) 50 { return (mode & O_RWMASK) == O_RDONLY;} 51 static bool _IsReadWrite(int mode) 52 { return (mode & O_RWMASK) == O_RDWR;} 53 54 int fDevice; 55 int fMode; 56 void* fBlockCache; 57 }; 58 59 60 DeviceOpener::DeviceOpener(const char* device, int mode) 61 : 62 fBlockCache(NULL) 63 { 64 Open(device, mode); 65 } 66 67 68 DeviceOpener::DeviceOpener(int fd, int mode) 69 : 70 fBlockCache(NULL) 71 { 72 Open(fd, mode); 73 } 74 75 76 DeviceOpener::~DeviceOpener() 77 { 78 if (fDevice >= 0) { 79 RemoveCache(false); 80 close(fDevice); 81 } 82 } 83 84 85 int 86 DeviceOpener::Open(const char* device, int mode) 87 { 88 fDevice = open(device, mode | O_NOCACHE); 89 if (fDevice < 0) 90 fDevice = errno; 91 92 if (fDevice < 0 && _IsReadWrite(mode)) { 93 // try again to open read-only (don't rely on a specific error code) 94 return Open(device, O_RDONLY | O_NOCACHE); 95 } 96 97 if (fDevice >= 0) { 98 // opening succeeded 99 fMode = mode; 100 if (_IsReadWrite(mode)) { 101 // check out if the device really allows for read/write access 102 device_geometry geometry; 103 if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) { 104 if (geometry.read_only) { 105 // reopen device read-only 106 close(fDevice); 107 return Open(device, O_RDONLY | O_NOCACHE); 108 } 109 } 110 } 111 } 112 113 return fDevice; 114 } 115 116 117 int 118 DeviceOpener::Open(int fd, int mode) 119 { 120 fDevice = dup(fd); 121 if (fDevice < 0) 122 return errno; 123 124 fMode = mode; 125 126 return fDevice; 127 } 128 129 130 void* 131 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize) 132 { 133 return fBlockCache = block_cache_create(fDevice, numBlocks, blockSize, 134 IsReadOnly()); 135 } 136 137 138 void 139 DeviceOpener::RemoveCache(bool allowWrites) 140 { 141 if (fBlockCache == NULL) 142 return; 143 144 block_cache_delete(fBlockCache, allowWrites); 145 fBlockCache = NULL; 146 } 147 148 149 void 150 DeviceOpener::Keep() 151 { 152 fDevice = -1; 153 } 154 155 156 /*! Returns the size of the device in bytes. It uses B_GET_GEOMETRY 157 to compute the size, or fstat() if that failed. 158 */ 159 status_t 160 DeviceOpener::GetSize(off_t* _size, uint32* _blockSize) 161 { 162 device_geometry geometry; 163 if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) { 164 // maybe it's just a file 165 struct stat stat; 166 if (fstat(fDevice, &stat) < 0) 167 return B_ERROR; 168 169 if (_size) 170 *_size = stat.st_size; 171 if (_blockSize) // that shouldn't cause us any problems 172 *_blockSize = 512; 173 174 return B_OK; 175 } 176 177 if (_size) { 178 *_size = 1LL * geometry.head_count * geometry.cylinder_count 179 * geometry.sectors_per_track * geometry.bytes_per_sector; 180 } 181 if (_blockSize) 182 *_blockSize = geometry.bytes_per_sector; 183 184 return B_OK; 185 } 186 187 188 // #pragma mark - 189 190 191 bool 192 disk_super_block::IsValid() 193 { 194 if (Magic1() != (int32)SUPER_BLOCK_MAGIC1 195 || Magic2() != (int32)SUPER_BLOCK_MAGIC2 196 || Magic3() != (int32)SUPER_BLOCK_MAGIC3 197 || (int32)block_size != inode_size 198 || ByteOrder() != SUPER_BLOCK_FS_LENDIAN 199 || (1UL << BlockShift()) != BlockSize() 200 || AllocationGroups() < 1 201 || AllocationGroupShift() < 1 202 || BlocksPerAllocationGroup() < 1 203 || NumBlocks() < 10 204 || AllocationGroups() != divide_roundup(NumBlocks(), 205 1L << AllocationGroupShift())) 206 return false; 207 208 return true; 209 } 210 211 212 void 213 disk_super_block::Initialize(const char* diskName, off_t numBlocks, 214 uint32 blockSize) 215 { 216 memset(this, 0, sizeof(disk_super_block)); 217 218 magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1); 219 magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2); 220 magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3); 221 fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN); 222 flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN); 223 224 strlcpy(name, diskName, sizeof(name)); 225 226 int32 blockShift = 9; 227 while ((1UL << blockShift) < blockSize) { 228 blockShift++; 229 } 230 231 block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize); 232 block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift); 233 234 num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks); 235 used_blocks = 0; 236 237 // Get the minimum ag_shift (that's determined by the block size) 238 239 int32 bitsPerBlock = blockSize << 3; 240 off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock; 241 int32 blocksPerGroup = 1; 242 int32 groupShift = 13; 243 244 for (int32 i = 8192; i < bitsPerBlock; i *= 2) { 245 groupShift++; 246 } 247 248 // Many allocation groups help applying allocation policies, but if 249 // they are too small, we will need to many block_runs to cover large 250 // files (see above to get an explanation of the kDesiredAllocationGroups 251 // constant). 252 253 int32 numGroups; 254 255 while (true) { 256 numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup; 257 if (numGroups > kDesiredAllocationGroups) { 258 if (groupShift == 16) 259 break; 260 261 groupShift++; 262 blocksPerGroup *= 2; 263 } else 264 break; 265 } 266 267 num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups); 268 blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup); 269 ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift); 270 } 271 272 273 // #pragma mark - 274 275 276 Volume::Volume(fs_volume* volume) 277 : 278 fVolume(volume), 279 fBlockAllocator(this), 280 fRootNode(NULL), 281 fIndicesNode(NULL), 282 fDirtyCachedBlocks(0), 283 fFlags(0), 284 fCheckingThread(-1) 285 { 286 mutex_init(&fLock, "bfs volume"); 287 mutex_init(&fQueryLock, "bfs queries"); 288 } 289 290 291 Volume::~Volume() 292 { 293 mutex_destroy(&fQueryLock); 294 mutex_destroy(&fLock); 295 } 296 297 298 bool 299 Volume::IsValidSuperBlock() 300 { 301 return fSuperBlock.IsValid(); 302 } 303 304 305 void 306 Volume::Panic() 307 { 308 FATAL(("Disk corrupted... switch to read-only mode!\n")); 309 fFlags |= VOLUME_READ_ONLY; 310 #if KDEBUG 311 kernel_debugger("BFS panics!"); 312 #endif 313 } 314 315 316 status_t 317 Volume::Mount(const char* deviceName, uint32 flags) 318 { 319 // TODO: validate the FS in write mode as well! 320 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \ 321 || (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY)) 322 // in big endian mode, we only mount read-only for now 323 flags |= B_MOUNT_READ_ONLY; 324 #endif 325 326 DeviceOpener opener(deviceName, (flags & B_MOUNT_READ_ONLY) != 0 327 ? O_RDONLY : O_RDWR); 328 fDevice = opener.Device(); 329 if (fDevice < B_OK) 330 RETURN_ERROR(fDevice); 331 332 if (opener.IsReadOnly()) 333 fFlags |= VOLUME_READ_ONLY; 334 335 // read the super block 336 if (Identify(fDevice, &fSuperBlock) != B_OK) { 337 FATAL(("invalid super block!\n")); 338 return B_BAD_VALUE; 339 } 340 341 // initialize short hands to the super block (to save byte swapping) 342 fBlockSize = fSuperBlock.BlockSize(); 343 fBlockShift = fSuperBlock.BlockShift(); 344 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 345 346 // check if the device size is large enough to hold the file system 347 off_t diskSize; 348 if (opener.GetSize(&diskSize, &fDeviceBlockSize) != B_OK) 349 RETURN_ERROR(B_ERROR); 350 if (diskSize < (NumBlocks() << BlockShift())) 351 RETURN_ERROR(B_BAD_VALUE); 352 353 // set the current log pointers, so that journaling will work correctly 354 fLogStart = fSuperBlock.LogStart(); 355 fLogEnd = fSuperBlock.LogEnd(); 356 357 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 358 return B_ERROR; 359 360 fJournal = new(std::nothrow) Journal(this); 361 if (fJournal == NULL) 362 return B_NO_MEMORY; 363 364 status_t status = fJournal->InitCheck(); 365 if (status < B_OK) { 366 FATAL(("could not initialize journal: %s!\n", strerror(status))); 367 return status; 368 } 369 370 // replaying the log is the first thing we will do on this disk 371 status = fJournal->ReplayLog(); 372 if (status != B_OK) { 373 FATAL(("Replaying log failed, data may be corrupted, volume " 374 "read-only.\n")); 375 fFlags |= VOLUME_READ_ONLY; 376 // TODO: if this is the boot volume, Bootscript will assume this 377 // is a CD... 378 // TODO: it would be nice to have a user visible alert instead 379 // of letting him just find this in the syslog. 380 } 381 382 status = fBlockAllocator.Initialize(); 383 if (status != B_OK) { 384 FATAL(("could not initialize block bitmap allocator!\n")); 385 return status; 386 } 387 388 fRootNode = new(std::nothrow) Inode(this, ToVnode(Root())); 389 if (fRootNode != NULL && fRootNode->InitCheck() == B_OK) { 390 status = publish_vnode(fVolume, ToVnode(Root()), (void*)fRootNode, 391 &gBFSVnodeOps, fRootNode->Mode(), 0); 392 if (status == B_OK) { 393 // try to get indices root dir 394 395 if (!Indices().IsZero()) { 396 fIndicesNode = new(std::nothrow) Inode(this, 397 ToVnode(Indices())); 398 } 399 400 if (fIndicesNode == NULL 401 || fIndicesNode->InitCheck() < B_OK 402 || !fIndicesNode->IsContainer()) { 403 INFORM(("bfs: volume doesn't have indices!\n")); 404 405 if (fIndicesNode) { 406 // if this is the case, the index root node is gone bad, 407 // and BFS switch to read-only mode 408 fFlags |= VOLUME_READ_ONLY; 409 delete fIndicesNode; 410 fIndicesNode = NULL; 411 } 412 } else { 413 // we don't use the vnode layer to access the indices node 414 } 415 } else { 416 FATAL(("could not create root node: publish_vnode() failed!\n")); 417 delete fRootNode; 418 return status; 419 } 420 } else { 421 status = B_BAD_VALUE; 422 FATAL(("could not create root node!\n")); 423 return status; 424 } 425 426 // all went fine 427 opener.Keep(); 428 return B_OK; 429 } 430 431 432 status_t 433 Volume::Unmount() 434 { 435 put_vnode(fVolume, ToVnode(Root())); 436 437 fBlockAllocator.Uninitialize(); 438 439 // This will also flush the log & all blocks to disk 440 delete fJournal; 441 fJournal = NULL; 442 443 delete fIndicesNode; 444 445 block_cache_delete(fBlockCache, !IsReadOnly()); 446 close(fDevice); 447 448 return B_OK; 449 } 450 451 452 status_t 453 Volume::Sync() 454 { 455 return fJournal->FlushLogAndBlocks(); 456 } 457 458 459 status_t 460 Volume::ValidateBlockRun(block_run run) 461 { 462 if (run.AllocationGroup() < 0 463 || run.AllocationGroup() > (int32)AllocationGroups() 464 || run.Start() > (1UL << AllocationGroupShift()) 465 || run.length == 0 466 || uint32(run.Length() + run.Start()) 467 > (1UL << AllocationGroupShift())) { 468 Panic(); 469 FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(), 470 run.Start(), run.Length())); 471 return B_BAD_DATA; 472 } 473 return B_OK; 474 } 475 476 477 block_run 478 Volume::ToBlockRun(off_t block) const 479 { 480 block_run run; 481 run.allocation_group = HOST_ENDIAN_TO_BFS_INT32( 482 block >> AllocationGroupShift()); 483 run.start = HOST_ENDIAN_TO_BFS_INT16( 484 block & ((1LL << AllocationGroupShift()) - 1)); 485 run.length = HOST_ENDIAN_TO_BFS_INT16(1); 486 return run; 487 } 488 489 490 status_t 491 Volume::CreateIndicesRoot(Transaction& transaction) 492 { 493 off_t id; 494 status_t status = Inode::Create(transaction, NULL, NULL, 495 S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id, 496 &fIndicesNode); 497 if (status < B_OK) 498 RETURN_ERROR(status); 499 500 fSuperBlock.indices = ToBlockRun(id); 501 return WriteSuperBlock(); 502 } 503 504 505 status_t 506 Volume::CreateVolumeID(Transaction& transaction) 507 { 508 Attribute attr(fRootNode); 509 status_t status; 510 attr_cookie* cookie; 511 status = attr.Create("be:volume_id", B_UINT64_TYPE, O_RDWR, &cookie); 512 if (status == B_OK) { 513 static bool seeded = false; 514 if (!seeded) { 515 // seed the random number generator for the be:volume_id attribute. 516 srand(time(NULL)); 517 seeded = true; 518 } 519 uint64_t id; 520 size_t length = sizeof(id); 521 id = ((uint64_t)rand() << 32) | rand(); 522 attr.Write(transaction, cookie, 0, (uint8_t *)&id, &length, NULL); 523 } 524 return status; 525 } 526 527 528 529 status_t 530 Volume::AllocateForInode(Transaction& transaction, const Inode* parent, 531 mode_t type, block_run& run) 532 { 533 return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), 534 type, run); 535 } 536 537 538 status_t 539 Volume::WriteSuperBlock() 540 { 541 if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) 542 != sizeof(disk_super_block)) 543 return B_IO_ERROR; 544 545 return B_OK; 546 } 547 548 549 void 550 Volume::UpdateLiveQueries(Inode* inode, const char* attribute, int32 type, 551 const uint8* oldKey, size_t oldLength, const uint8* newKey, 552 size_t newLength) 553 { 554 MutexLocker _(fQueryLock); 555 556 SinglyLinkedList<Query>::Iterator iterator = fQueries.GetIterator(); 557 while (iterator.HasNext()) { 558 Query* query = iterator.Next(); 559 query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, 560 newLength); 561 } 562 } 563 564 565 void 566 Volume::UpdateLiveQueriesRenameMove(Inode* inode, ino_t oldDirectoryID, 567 const char* oldName, ino_t newDirectoryID, const char* newName) 568 { 569 MutexLocker _(fQueryLock); 570 571 size_t oldLength = strlen(oldName); 572 size_t newLength = strlen(newName); 573 574 SinglyLinkedList<Query>::Iterator iterator = fQueries.GetIterator(); 575 while (iterator.HasNext()) { 576 Query* query = iterator.Next(); 577 query->LiveUpdateRenameMove(inode, oldDirectoryID, oldName, oldLength, 578 newDirectoryID, newName, newLength); 579 } 580 } 581 582 583 /*! Checks if there is a live query whose results depend on the presence 584 or value of the specified attribute. 585 Don't use it if you already have all the data together to evaluate 586 the queries - it wouldn't safe you anything in this case. 587 */ 588 bool 589 Volume::CheckForLiveQuery(const char* attribute) 590 { 591 // TODO: check for a live query that depends on the specified attribute 592 return true; 593 } 594 595 596 void 597 Volume::AddQuery(Query* query) 598 { 599 MutexLocker _(fQueryLock); 600 fQueries.Add(query); 601 } 602 603 604 void 605 Volume::RemoveQuery(Query* query) 606 { 607 MutexLocker _(fQueryLock); 608 fQueries.Remove(query); 609 } 610 611 612 // #pragma mark - Disk scanning and initialization 613 614 615 /*static*/ status_t 616 Volume::CheckSuperBlock(const uint8* data, uint32* _offset) 617 { 618 disk_super_block* superBlock = (disk_super_block*)(data + 512); 619 if (superBlock->IsValid()) { 620 if (_offset != NULL) 621 *_offset = 512; 622 return B_OK; 623 } 624 625 #ifndef BFS_LITTLE_ENDIAN_ONLY 626 // For PPC, the super block might be located at offset 0 627 superBlock = (disk_super_block*)data; 628 if (superBlock->IsValid()) { 629 if (_offset != NULL) 630 *_offset = 0; 631 return B_OK; 632 } 633 #endif 634 635 return B_BAD_VALUE; 636 } 637 638 639 /*static*/ status_t 640 Volume::Identify(int fd, disk_super_block* superBlock) 641 { 642 uint8 buffer[1024]; 643 if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer)) 644 return B_IO_ERROR; 645 646 uint32 offset; 647 if (CheckSuperBlock(buffer, &offset) != B_OK) 648 return B_BAD_VALUE; 649 650 memcpy(superBlock, buffer + offset, sizeof(disk_super_block)); 651 return B_OK; 652 } 653 654 655 status_t 656 Volume::Initialize(int fd, const char* name, uint32 blockSize, 657 uint32 flags) 658 { 659 // although there is no really good reason for it, we won't 660 // accept '/' in disk names (mkbfs does this, too - and since 661 // Tracker names mounted volumes like their name) 662 if (strchr(name, '/') != NULL) 663 return B_BAD_VALUE; 664 665 if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 666 && blockSize != 8192) 667 return B_BAD_VALUE; 668 669 DeviceOpener opener(fd, O_RDWR); 670 if (opener.Device() < B_OK) 671 return B_BAD_VALUE; 672 673 if (opener.IsReadOnly()) 674 return B_READ_ONLY_DEVICE; 675 676 fDevice = opener.Device(); 677 678 uint32 deviceBlockSize; 679 off_t deviceSize; 680 if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK) 681 return B_ERROR; 682 683 off_t numBlocks = deviceSize / blockSize; 684 685 // create valid super block 686 687 fSuperBlock.Initialize(name, numBlocks, blockSize); 688 689 // initialize short hands to the super block (to save byte swapping) 690 fBlockSize = fSuperBlock.BlockSize(); 691 fBlockShift = fSuperBlock.BlockShift(); 692 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 693 694 // determine log size depending on the size of the volume 695 off_t logSize = 2048; 696 if (numBlocks <= 20480) 697 logSize = 512; 698 if (deviceSize > 1LL * 1024 * 1024 * 1024) 699 logSize = 4096; 700 701 // since the allocator has not been initialized yet, we 702 // cannot use BlockAllocator::BitmapSize() here 703 off_t bitmapBlocks = (numBlocks + blockSize * 8 - 1) / (blockSize * 8); 704 705 fSuperBlock.log_blocks = ToBlockRun(bitmapBlocks + 1); 706 fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(logSize); 707 fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64( 708 ToBlock(Log())); 709 710 // set the current log pointers, so that journaling will work correctly 711 fLogStart = fSuperBlock.LogStart(); 712 fLogEnd = fSuperBlock.LogEnd(); 713 714 if (!IsValidSuperBlock()) 715 RETURN_ERROR(B_ERROR); 716 717 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 718 return B_ERROR; 719 720 fJournal = new(std::nothrow) Journal(this); 721 if (fJournal == NULL || fJournal->InitCheck() < B_OK) 722 RETURN_ERROR(B_ERROR); 723 724 // ready to write data to disk 725 726 Transaction transaction(this, 0); 727 728 if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK) 729 RETURN_ERROR(B_ERROR); 730 731 off_t id; 732 status_t status = Inode::Create(transaction, NULL, NULL, 733 S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode); 734 if (status < B_OK) 735 RETURN_ERROR(status); 736 737 fSuperBlock.root_dir = ToBlockRun(id); 738 739 if ((flags & VOLUME_NO_INDICES) == 0) { 740 // The indices root directory will be created automatically 741 // when the standard indices are created (or any other). 742 Index index(this); 743 status = index.Create(transaction, "name", B_STRING_TYPE); 744 if (status < B_OK) 745 return status; 746 747 status = index.Create(transaction, "BEOS:APP_SIG", B_STRING_TYPE); 748 if (status < B_OK) 749 return status; 750 751 status = index.Create(transaction, "last_modified", B_INT64_TYPE); 752 if (status < B_OK) 753 return status; 754 755 status = index.Create(transaction, "size", B_INT64_TYPE); 756 if (status < B_OK) 757 return status; 758 } 759 760 CreateVolumeID(transaction); 761 762 WriteSuperBlock(); 763 transaction.Done(); 764 765 Sync(); 766 opener.RemoveCache(true); 767 return B_OK; 768 } 769