1 /* 2 * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 //! super block, mounting, etc. 7 8 9 #include "Debug.h" 10 #include "Volume.h" 11 #include "Journal.h" 12 #include "Inode.h" 13 #include "Query.h" 14 15 16 static const int32 kDesiredAllocationGroups = 56; 17 // This is the number of allocation groups that will be tried 18 // to be given for newly initialized disks. 19 // That's only relevant for smaller disks, though, since any 20 // of today's disk sizes already reach the maximum length 21 // of an allocation group (65536 blocks). 22 // It seems to create appropriate numbers for smaller disks 23 // with this setting, though (i.e. you can create a 400 MB 24 // file on a 1 GB disk without the need for double indirect 25 // blocks). 26 27 28 class DeviceOpener { 29 public: 30 DeviceOpener(int fd, int mode); 31 DeviceOpener(const char* device, int mode); 32 ~DeviceOpener(); 33 34 int Open(const char* device, int mode); 35 int Open(int fd, int mode); 36 void* InitCache(off_t numBlocks, uint32 blockSize); 37 void RemoveCache(bool allowWrites); 38 39 void Keep(); 40 41 int Device() const { return fDevice; } 42 int Mode() const { return fMode; } 43 bool IsReadOnly() const { return _IsReadOnly(fMode); } 44 45 status_t GetSize(off_t* _size, uint32* _blockSize = NULL); 46 47 private: 48 static bool _IsReadOnly(int mode) 49 { return (mode & O_RWMASK) == O_RDONLY;} 50 static bool _IsReadWrite(int mode) 51 { return (mode & O_RWMASK) == O_RDWR;} 52 53 int fDevice; 54 int fMode; 55 void* fBlockCache; 56 }; 57 58 59 DeviceOpener::DeviceOpener(const char* device, int mode) 60 : 61 fBlockCache(NULL) 62 { 63 Open(device, mode); 64 } 65 66 67 DeviceOpener::DeviceOpener(int fd, int mode) 68 : 69 fBlockCache(NULL) 70 { 71 Open(fd, mode); 72 } 73 74 75 DeviceOpener::~DeviceOpener() 76 { 77 if (fDevice >= 0) { 78 RemoveCache(false); 79 close(fDevice); 80 } 81 } 82 83 84 int 85 DeviceOpener::Open(const char* device, int mode) 86 { 87 fDevice = open(device, mode | O_NOCACHE); 88 if (fDevice < 0) 89 fDevice = errno; 90 91 if (fDevice < 0 && _IsReadWrite(mode)) { 92 // try again to open read-only (don't rely on a specific error code) 93 return Open(device, O_RDONLY | O_NOCACHE); 94 } 95 96 if (fDevice >= 0) { 97 // opening succeeded 98 fMode = mode; 99 if (_IsReadWrite(mode)) { 100 // check out if the device really allows for read/write access 101 device_geometry geometry; 102 if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) { 103 if (geometry.read_only) { 104 // reopen device read-only 105 close(fDevice); 106 return Open(device, O_RDONLY | O_NOCACHE); 107 } 108 } 109 } 110 } 111 112 return fDevice; 113 } 114 115 116 int 117 DeviceOpener::Open(int fd, int mode) 118 { 119 fDevice = dup(fd); 120 if (fDevice < 0) 121 return errno; 122 123 fMode = mode; 124 125 return fDevice; 126 } 127 128 129 void* 130 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize) 131 { 132 return fBlockCache = block_cache_create(fDevice, numBlocks, blockSize, 133 IsReadOnly()); 134 } 135 136 137 void 138 DeviceOpener::RemoveCache(bool allowWrites) 139 { 140 if (fBlockCache == NULL) 141 return; 142 143 block_cache_delete(fBlockCache, allowWrites); 144 fBlockCache = NULL; 145 } 146 147 148 void 149 DeviceOpener::Keep() 150 { 151 fDevice = -1; 152 } 153 154 155 /*! Returns the size of the device in bytes. It uses B_GET_GEOMETRY 156 to compute the size, or fstat() if that failed. 157 */ 158 status_t 159 DeviceOpener::GetSize(off_t* _size, uint32* _blockSize) 160 { 161 device_geometry geometry; 162 if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) { 163 // maybe it's just a file 164 struct stat stat; 165 if (fstat(fDevice, &stat) < 0) 166 return B_ERROR; 167 168 if (_size) 169 *_size = stat.st_size; 170 if (_blockSize) // that shouldn't cause us any problems 171 *_blockSize = 512; 172 173 return B_OK; 174 } 175 176 if (_size) { 177 *_size = 1LL * geometry.head_count * geometry.cylinder_count 178 * geometry.sectors_per_track * geometry.bytes_per_sector; 179 } 180 if (_blockSize) 181 *_blockSize = geometry.bytes_per_sector; 182 183 return B_OK; 184 } 185 186 187 // #pragma mark - 188 189 190 bool 191 disk_super_block::IsValid() 192 { 193 if (Magic1() != (int32)SUPER_BLOCK_MAGIC1 194 || Magic2() != (int32)SUPER_BLOCK_MAGIC2 195 || Magic3() != (int32)SUPER_BLOCK_MAGIC3 196 || (int32)block_size != inode_size 197 || ByteOrder() != SUPER_BLOCK_FS_LENDIAN 198 || (1UL << BlockShift()) != BlockSize() 199 || AllocationGroups() < 1 200 || AllocationGroupShift() < 1 201 || BlocksPerAllocationGroup() < 1 202 || NumBlocks() < 10 203 || AllocationGroups() != divide_roundup(NumBlocks(), 204 1L << AllocationGroupShift())) 205 return false; 206 207 return true; 208 } 209 210 211 void 212 disk_super_block::Initialize(const char* diskName, off_t numBlocks, 213 uint32 blockSize) 214 { 215 memset(this, 0, sizeof(disk_super_block)); 216 217 magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1); 218 magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2); 219 magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3); 220 fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN); 221 flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN); 222 223 strlcpy(name, diskName, sizeof(name)); 224 225 int32 blockShift = 9; 226 while ((1UL << blockShift) < blockSize) { 227 blockShift++; 228 } 229 230 block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize); 231 block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift); 232 233 num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks); 234 used_blocks = 0; 235 236 // Get the minimum ag_shift (that's determined by the block size) 237 238 int32 bitsPerBlock = blockSize << 3; 239 off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock; 240 int32 blocksPerGroup = 1; 241 int32 groupShift = 13; 242 243 for (int32 i = 8192; i < bitsPerBlock; i *= 2) { 244 groupShift++; 245 } 246 247 // Many allocation groups help applying allocation policies, but if 248 // they are too small, we will need to many block_runs to cover large 249 // files (see above to get an explanation of the kDesiredAllocationGroups 250 // constant). 251 252 int32 numGroups; 253 254 while (true) { 255 numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup; 256 if (numGroups > kDesiredAllocationGroups) { 257 if (groupShift == 16) 258 break; 259 260 groupShift++; 261 blocksPerGroup *= 2; 262 } else 263 break; 264 } 265 266 num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups); 267 blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup); 268 ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift); 269 } 270 271 272 // #pragma mark - 273 274 275 Volume::Volume(fs_volume* volume) 276 : 277 fVolume(volume), 278 fBlockAllocator(this), 279 fRootNode(NULL), 280 fIndicesNode(NULL), 281 fDirtyCachedBlocks(0), 282 fFlags(0), 283 fCheckingThread(-1) 284 { 285 mutex_init(&fLock, "bfs volume"); 286 mutex_init(&fQueryLock, "bfs queries"); 287 } 288 289 290 Volume::~Volume() 291 { 292 mutex_destroy(&fQueryLock); 293 mutex_destroy(&fLock); 294 } 295 296 297 bool 298 Volume::IsValidSuperBlock() 299 { 300 return fSuperBlock.IsValid(); 301 } 302 303 304 void 305 Volume::Panic() 306 { 307 FATAL(("we have to panic... switch to read-only mode!\n")); 308 fFlags |= VOLUME_READ_ONLY; 309 #ifdef DEBUG 310 kernel_debugger("BFS panics!"); 311 #endif 312 } 313 314 315 status_t 316 Volume::Mount(const char* deviceName, uint32 flags) 317 { 318 // TODO: validate the FS in write mode as well! 319 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \ 320 || (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY)) 321 // in big endian mode, we only mount read-only for now 322 flags |= B_MOUNT_READ_ONLY; 323 #endif 324 325 DeviceOpener opener(deviceName, (flags & B_MOUNT_READ_ONLY) != 0 326 ? O_RDONLY : O_RDWR); 327 fDevice = opener.Device(); 328 if (fDevice < B_OK) 329 RETURN_ERROR(fDevice); 330 331 if (opener.IsReadOnly()) 332 fFlags |= VOLUME_READ_ONLY; 333 334 // read the super block 335 if (Identify(fDevice, &fSuperBlock) != B_OK) { 336 FATAL(("invalid super block!\n")); 337 return B_BAD_VALUE; 338 } 339 340 // initialize short hands to the super block (to save byte swapping) 341 fBlockSize = fSuperBlock.BlockSize(); 342 fBlockShift = fSuperBlock.BlockShift(); 343 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 344 345 // check if the device size is large enough to hold the file system 346 off_t diskSize; 347 if (opener.GetSize(&diskSize) != B_OK) 348 RETURN_ERROR(B_ERROR); 349 if (diskSize < (NumBlocks() << BlockShift())) 350 RETURN_ERROR(B_BAD_VALUE); 351 352 // set the current log pointers, so that journaling will work correctly 353 fLogStart = fSuperBlock.LogStart(); 354 fLogEnd = fSuperBlock.LogEnd(); 355 356 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 357 return B_ERROR; 358 359 fJournal = new Journal(this); 360 if (fJournal == NULL) 361 return B_NO_MEMORY; 362 363 status_t status = fJournal->InitCheck(); 364 if (status < B_OK) { 365 FATAL(("could not initialize journal: %s!\n", strerror(status))); 366 return status; 367 } 368 369 // replaying the log is the first thing we will do on this disk 370 status = fJournal->ReplayLog(); 371 if (status != B_OK) { 372 FATAL(("Replaying log failed, data may be corrupted, volume " 373 "read-only.\n")); 374 fFlags |= VOLUME_READ_ONLY; 375 // TODO: if this is the boot volume, Bootscript will assume this 376 // is a CD... 377 // TODO: it would be nice to have a user visible alert instead 378 // of letting him just find this in the syslog. 379 } 380 381 status = fBlockAllocator.Initialize(); 382 if (status != B_OK) { 383 FATAL(("could not initialize block bitmap allocator!\n")); 384 return status; 385 } 386 387 fRootNode = new Inode(this, ToVnode(Root())); 388 if (fRootNode != NULL && fRootNode->InitCheck() == B_OK) { 389 status = publish_vnode(fVolume, ToVnode(Root()), (void*)fRootNode, 390 &gBFSVnodeOps, fRootNode->Mode(), 0); 391 if (status == B_OK) { 392 // try to get indices root dir 393 394 if (!Indices().IsZero()) 395 fIndicesNode = new Inode(this, ToVnode(Indices())); 396 397 if (fIndicesNode == NULL 398 || fIndicesNode->InitCheck() < B_OK 399 || !fIndicesNode->IsContainer()) { 400 INFORM(("bfs: volume doesn't have indices!\n")); 401 402 if (fIndicesNode) { 403 // if this is the case, the index root node is gone bad, 404 // and BFS switch to read-only mode 405 fFlags |= VOLUME_READ_ONLY; 406 delete fIndicesNode; 407 fIndicesNode = NULL; 408 } 409 } else { 410 // we don't use the vnode layer to access the indices node 411 } 412 413 // all went fine 414 opener.Keep(); 415 return B_OK; 416 } else 417 FATAL(("could not create root node: publish_vnode() failed!\n")); 418 419 delete fRootNode; 420 } else { 421 status = B_BAD_VALUE; 422 FATAL(("could not create root node!\n")); 423 } 424 425 return status; 426 } 427 428 429 status_t 430 Volume::Unmount() 431 { 432 put_vnode(fVolume, ToVnode(Root())); 433 434 fBlockAllocator.Uninitialize(); 435 436 // This will also flush the log & all blocks to disk 437 delete fJournal; 438 fJournal = NULL; 439 440 delete fIndicesNode; 441 442 block_cache_delete(fBlockCache, !IsReadOnly()); 443 close(fDevice); 444 445 return B_OK; 446 } 447 448 449 status_t 450 Volume::Sync() 451 { 452 return fJournal->FlushLogAndBlocks(); 453 } 454 455 456 status_t 457 Volume::ValidateBlockRun(block_run run) 458 { 459 if (run.AllocationGroup() < 0 460 || run.AllocationGroup() > (int32)AllocationGroups() 461 || run.Start() > (1UL << AllocationGroupShift()) 462 || run.length == 0 463 || uint32(run.Length() + run.Start()) 464 > (1UL << AllocationGroupShift())) { 465 Panic(); 466 FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(), 467 run.Start(), run.Length())); 468 return B_BAD_DATA; 469 } 470 return B_OK; 471 } 472 473 474 block_run 475 Volume::ToBlockRun(off_t block) const 476 { 477 block_run run; 478 run.allocation_group = HOST_ENDIAN_TO_BFS_INT32( 479 block >> AllocationGroupShift()); 480 run.start = HOST_ENDIAN_TO_BFS_INT16( 481 block & ((1LL << AllocationGroupShift()) - 1)); 482 run.length = HOST_ENDIAN_TO_BFS_INT16(1); 483 return run; 484 } 485 486 487 status_t 488 Volume::CreateIndicesRoot(Transaction& transaction) 489 { 490 off_t id; 491 status_t status = Inode::Create(transaction, NULL, NULL, 492 S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id, 493 &fIndicesNode); 494 if (status < B_OK) 495 RETURN_ERROR(status); 496 497 fSuperBlock.indices = ToBlockRun(id); 498 return WriteSuperBlock(); 499 } 500 501 502 status_t 503 Volume::AllocateForInode(Transaction& transaction, const Inode* parent, 504 mode_t type, block_run& run) 505 { 506 return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), 507 type, run); 508 } 509 510 511 status_t 512 Volume::WriteSuperBlock() 513 { 514 if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) 515 != sizeof(disk_super_block)) 516 return B_IO_ERROR; 517 518 return B_OK; 519 } 520 521 522 void 523 Volume::UpdateLiveQueries(Inode* inode, const char* attribute, int32 type, 524 const uint8* oldKey, size_t oldLength, const uint8* newKey, 525 size_t newLength) 526 { 527 MutexLocker _(fQueryLock); 528 529 SinglyLinkedList<Query>::Iterator iterator = fQueries.GetIterator(); 530 while (iterator.HasNext()) { 531 Query* query = iterator.Next(); 532 query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, 533 newLength); 534 } 535 } 536 537 538 /*! Checks if there is a live query whose results depend on the presence 539 or value of the specified attribute. 540 Don't use it if you already have all the data together to evaluate 541 the queries - it wouldn't safe you anything in this case. 542 */ 543 bool 544 Volume::CheckForLiveQuery(const char* attribute) 545 { 546 // TODO: check for a live query that depends on the specified attribute 547 return true; 548 } 549 550 551 void 552 Volume::AddQuery(Query* query) 553 { 554 MutexLocker _(fQueryLock); 555 fQueries.Add(query); 556 } 557 558 559 void 560 Volume::RemoveQuery(Query* query) 561 { 562 MutexLocker _(fQueryLock); 563 fQueries.Remove(query); 564 } 565 566 567 // #pragma mark - Disk scanning and initialization 568 569 570 /*static*/ status_t 571 Volume::CheckSuperBlock(const uint8* data, uint32* _offset) 572 { 573 disk_super_block* superBlock = (disk_super_block*)(data + 512); 574 if (superBlock->IsValid()) { 575 if (_offset != NULL) 576 *_offset = 512; 577 return B_OK; 578 } 579 580 #ifndef BFS_LITTLE_ENDIAN_ONLY 581 // For PPC, the super block might be located at offset 0 582 superBlock = (disk_super_block*)data; 583 if (superBlock->IsValid()) { 584 if (_offset != NULL) 585 *_offset = 0; 586 return B_OK; 587 } 588 #endif 589 590 return B_BAD_VALUE; 591 } 592 593 594 /*static*/ status_t 595 Volume::Identify(int fd, disk_super_block* superBlock) 596 { 597 uint8 buffer[1024]; 598 if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer)) 599 return B_IO_ERROR; 600 601 uint32 offset; 602 if (CheckSuperBlock(buffer, &offset) != B_OK) 603 return B_BAD_VALUE; 604 605 memcpy(superBlock, buffer + offset, sizeof(disk_super_block)); 606 return B_OK; 607 } 608 609 610 status_t 611 Volume::Initialize(int fd, const char* name, uint32 blockSize, 612 uint32 flags) 613 { 614 // although there is no really good reason for it, we won't 615 // accept '/' in disk names (mkbfs does this, too - and since 616 // Tracker names mounted volumes like their name) 617 if (strchr(name, '/') != NULL) 618 return B_BAD_VALUE; 619 620 if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 621 && blockSize != 8192) 622 return B_BAD_VALUE; 623 624 DeviceOpener opener(fd, O_RDWR); 625 if (opener.Device() < B_OK) 626 return B_BAD_VALUE; 627 628 if (opener.IsReadOnly()) 629 return B_READ_ONLY_DEVICE; 630 631 fDevice = opener.Device(); 632 633 uint32 deviceBlockSize; 634 off_t deviceSize; 635 if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK) 636 return B_ERROR; 637 638 off_t numBlocks = deviceSize / blockSize; 639 640 // create valid super block 641 642 fSuperBlock.Initialize(name, numBlocks, blockSize); 643 644 // initialize short hands to the super block (to save byte swapping) 645 fBlockSize = fSuperBlock.BlockSize(); 646 fBlockShift = fSuperBlock.BlockShift(); 647 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 648 649 // determine log size depending on the size of the volume 650 off_t logSize = 2048; 651 if (numBlocks <= 20480) 652 logSize = 512; 653 if (deviceSize > 1LL * 1024 * 1024 * 1024) 654 logSize = 4096; 655 656 // since the allocator has not been initialized yet, we 657 // cannot use BlockAllocator::BitmapSize() here 658 off_t bitmapBlocks = (numBlocks + blockSize * 8 - 1) / (blockSize * 8); 659 660 fSuperBlock.log_blocks = ToBlockRun(bitmapBlocks + 1); 661 fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(logSize); 662 fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64( 663 ToBlock(Log())); 664 665 // set the current log pointers, so that journaling will work correctly 666 fLogStart = fSuperBlock.LogStart(); 667 fLogEnd = fSuperBlock.LogEnd(); 668 669 if (!IsValidSuperBlock()) 670 RETURN_ERROR(B_ERROR); 671 672 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 673 return B_ERROR; 674 675 fJournal = new Journal(this); 676 if (fJournal == NULL || fJournal->InitCheck() < B_OK) 677 RETURN_ERROR(B_ERROR); 678 679 // ready to write data to disk 680 681 Transaction transaction(this, 0); 682 683 if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK) 684 RETURN_ERROR(B_ERROR); 685 686 off_t id; 687 status_t status = Inode::Create(transaction, NULL, NULL, 688 S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode); 689 if (status < B_OK) 690 RETURN_ERROR(status); 691 692 fSuperBlock.root_dir = ToBlockRun(id); 693 694 if ((flags & VOLUME_NO_INDICES) == 0) { 695 // The indices root directory will be created automatically 696 // when the standard indices are created (or any other). 697 Index index(this); 698 status = index.Create(transaction, "name", B_STRING_TYPE); 699 if (status < B_OK) 700 return status; 701 702 status = index.Create(transaction, "BEOS:APP_SIG", B_STRING_TYPE); 703 if (status < B_OK) 704 return status; 705 706 status = index.Create(transaction, "last_modified", B_INT64_TYPE); 707 if (status < B_OK) 708 return status; 709 710 status = index.Create(transaction, "size", B_INT64_TYPE); 711 if (status < B_OK) 712 return status; 713 } 714 715 WriteSuperBlock(); 716 transaction.Done(); 717 718 Sync(); 719 opener.RemoveCache(true); 720 return B_OK; 721 } 722