1 /* 2 * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 //! super block, mounting, etc. 7 8 9 #include "Debug.h" 10 #include "Volume.h" 11 #include "Journal.h" 12 #include "Inode.h" 13 #include "Query.h" 14 15 16 static const int32 kDesiredAllocationGroups = 56; 17 // This is the number of allocation groups that will be tried 18 // to be given for newly initialized disks. 19 // That's only relevant for smaller disks, though, since any 20 // of today's disk sizes already reach the maximum length 21 // of an allocation group (65536 blocks). 22 // It seems to create appropriate numbers for smaller disks 23 // with this setting, though (i.e. you can create a 400 MB 24 // file on a 1 GB disk without the need for double indirect 25 // blocks). 26 27 28 class DeviceOpener { 29 public: 30 DeviceOpener(int fd, int mode); 31 DeviceOpener(const char *device, int mode); 32 ~DeviceOpener(); 33 34 int Open(const char *device, int mode); 35 int Open(int fd, int mode); 36 void *InitCache(off_t numBlocks, uint32 blockSize); 37 void RemoveCache(bool allowWrites); 38 39 void Keep(); 40 41 int Device() const { return fDevice; } 42 int Mode() const { return fMode; } 43 44 status_t GetSize(off_t *_size, uint32 *_blockSize = NULL); 45 46 private: 47 int fDevice; 48 int fMode; 49 void *fBlockCache; 50 }; 51 52 53 DeviceOpener::DeviceOpener(const char *device, int mode) 54 : 55 fBlockCache(NULL) 56 { 57 Open(device, mode); 58 } 59 60 61 DeviceOpener::DeviceOpener(int fd, int mode) 62 : 63 fBlockCache(NULL) 64 { 65 Open(fd, mode); 66 } 67 68 69 DeviceOpener::~DeviceOpener() 70 { 71 if (fDevice >= B_OK) { 72 RemoveCache(false); 73 close(fDevice); 74 } 75 } 76 77 78 int 79 DeviceOpener::Open(const char *device, int mode) 80 { 81 fDevice = open(device, mode); 82 if (fDevice < 0) 83 fDevice = errno; 84 85 if (fDevice < 0 && mode == O_RDWR) { 86 // try again to open read-only (don't rely on a specific error code) 87 return Open(device, O_RDONLY); 88 } 89 90 if (fDevice >= 0) { 91 // opening succeeded 92 fMode = mode; 93 if (mode == O_RDWR) { 94 // check out if the device really allows for read/write access 95 device_geometry geometry; 96 if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) { 97 if (geometry.read_only) { 98 // reopen device read-only 99 close(fDevice); 100 return Open(device, O_RDONLY); 101 } 102 } 103 } 104 } 105 106 return fDevice; 107 } 108 109 110 int 111 DeviceOpener::Open(int fd, int mode) 112 { 113 fDevice = dup(fd); 114 if (fDevice < 0) 115 return errno; 116 117 fMode = mode; 118 119 return fDevice; 120 } 121 122 123 void * 124 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize) 125 { 126 return block_cache_create(fDevice, numBlocks, blockSize, fMode == O_RDONLY); 127 } 128 129 130 void 131 DeviceOpener::RemoveCache(bool allowWrites) 132 { 133 if (fBlockCache == NULL) 134 return; 135 136 block_cache_delete(fBlockCache, allowWrites); 137 fBlockCache = NULL; 138 } 139 140 141 void 142 DeviceOpener::Keep() 143 { 144 fDevice = -1; 145 } 146 147 148 /** Returns the size of the device in bytes. It uses B_GET_GEOMETRY 149 * to compute the size, or fstat() if that failed. 150 */ 151 152 status_t 153 DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize) 154 { 155 device_geometry geometry; 156 if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) { 157 // maybe it's just a file 158 struct stat stat; 159 if (fstat(fDevice, &stat) < 0) 160 return B_ERROR; 161 162 if (_size) 163 *_size = stat.st_size; 164 if (_blockSize) // that shouldn't cause us any problems 165 *_blockSize = 512; 166 167 return B_OK; 168 } 169 170 if (_size) { 171 *_size = 1LL * geometry.head_count * geometry.cylinder_count 172 * geometry.sectors_per_track * geometry.bytes_per_sector; 173 } 174 if (_blockSize) 175 *_blockSize = geometry.bytes_per_sector; 176 177 return B_OK; 178 } 179 180 181 // #pragma mark - 182 183 184 bool 185 disk_super_block::IsValid() 186 { 187 if (Magic1() != (int32)SUPER_BLOCK_MAGIC1 188 || Magic2() != (int32)SUPER_BLOCK_MAGIC2 189 || Magic3() != (int32)SUPER_BLOCK_MAGIC3 190 || (int32)block_size != inode_size 191 || ByteOrder() != SUPER_BLOCK_FS_LENDIAN 192 || (1UL << BlockShift()) != BlockSize() 193 || AllocationGroups() < 1 194 || AllocationGroupShift() < 1 195 || BlocksPerAllocationGroup() < 1 196 || NumBlocks() < 10 197 || AllocationGroups() != divide_roundup(NumBlocks(), 198 1L << AllocationGroupShift())) 199 return false; 200 201 return true; 202 } 203 204 205 void 206 disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize) 207 { 208 memset(this, 0, sizeof(disk_super_block)); 209 210 magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1); 211 magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2); 212 magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3); 213 fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN); 214 flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN); 215 216 strlcpy(name, diskName, sizeof(name)); 217 218 int32 blockShift = 9; 219 while ((1UL << blockShift) < blockSize) { 220 blockShift++; 221 } 222 223 block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize); 224 block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift); 225 226 num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks); 227 used_blocks = 0; 228 229 // Get the minimum ag_shift (that's determined by the block size) 230 231 int32 bitsPerBlock = blockSize << 3; 232 off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock; 233 int32 blocksPerGroup = 1; 234 int32 groupShift = 13; 235 236 for (int32 i = 8192; i < bitsPerBlock; i *= 2) { 237 groupShift++; 238 } 239 240 // Many allocation groups help applying allocation policies, but if 241 // they are too small, we will need to many block_runs to cover large 242 // files (see above to get an explanation of the kDesiredAllocationGroups 243 // constant). 244 245 int32 numGroups; 246 247 while (true) { 248 numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup; 249 if (numGroups > kDesiredAllocationGroups) { 250 if (groupShift == 16) 251 break; 252 253 groupShift++; 254 blocksPerGroup *= 2; 255 } else 256 break; 257 } 258 259 num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups); 260 blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup); 261 ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift); 262 } 263 264 265 // #pragma mark - 266 267 268 Volume::Volume(dev_t id) 269 : 270 fID(id), 271 fBlockAllocator(this), 272 fLock("bfs volume"), 273 fRootNode(NULL), 274 fIndicesNode(NULL), 275 fDirtyCachedBlocks(0), 276 fUniqueID(0), 277 fFlags(0) 278 { 279 } 280 281 282 Volume::~Volume() 283 { 284 } 285 286 287 bool 288 Volume::IsValidSuperBlock() 289 { 290 return fSuperBlock.IsValid(); 291 } 292 293 294 void 295 Volume::Panic() 296 { 297 FATAL(("we have to panic... switch to read-only mode!\n")); 298 fFlags |= VOLUME_READ_ONLY; 299 #ifdef DEBUG 300 kernel_debugger("BFS panics!"); 301 #endif 302 } 303 304 305 status_t 306 Volume::Mount(const char *deviceName, uint32 flags) 307 { 308 // ToDo: validate the FS in write mode as well! 309 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \ 310 || (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY)) 311 // in big endian mode, we only mount read-only for now 312 flags |= B_MOUNT_READ_ONLY; 313 #endif 314 315 DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR); 316 fDevice = opener.Device(); 317 if (fDevice < B_OK) 318 RETURN_ERROR(fDevice); 319 320 if (opener.Mode() == O_RDONLY) 321 fFlags |= VOLUME_READ_ONLY; 322 323 // check if it's a regular file, and if so, disable the cache for the 324 // underlaying file system 325 struct stat stat; 326 if (fstat(fDevice, &stat) < 0) 327 RETURN_ERROR(B_ERROR); 328 329 // TODO: allow turning off caching of the underlying file (once O_NOCACHE works) 330 #if 0 331 #ifndef NO_FILE_UNCACHED_IO 332 if ((stat.st_mode & S_FILE) != 0 && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) { 333 // mount read-only if the cache couldn't be disabled 334 # ifdef DEBUG 335 FATAL(("couldn't disable cache for image file - system may dead-lock!\n")); 336 # else 337 FATAL(("couldn't disable cache for image file!\n")); 338 Panic(); 339 # endif 340 } 341 #endif 342 #endif 343 344 // read the super block 345 if (Identify(fDevice, &fSuperBlock) != B_OK) { 346 FATAL(("invalid super block!\n")); 347 return B_BAD_VALUE; 348 } 349 350 // initialize short hands to the super block (to save byte swapping) 351 fBlockSize = fSuperBlock.BlockSize(); 352 fBlockShift = fSuperBlock.BlockShift(); 353 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 354 355 // check if the device size is large enough to hold the file system 356 off_t diskSize; 357 if (opener.GetSize(&diskSize) < B_OK) 358 RETURN_ERROR(B_ERROR); 359 if (diskSize < (NumBlocks() << BlockShift())) 360 RETURN_ERROR(B_BAD_VALUE); 361 362 // set the current log pointers, so that journaling will work correctly 363 fLogStart = fSuperBlock.LogStart(); 364 fLogEnd = fSuperBlock.LogEnd(); 365 366 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 367 return B_ERROR; 368 369 fJournal = new Journal(this); 370 // replaying the log is the first thing we will do on this disk 371 if (fJournal && fJournal->InitCheck() < B_OK 372 || fBlockAllocator.Initialize() < B_OK) { 373 // ToDo: improve error reporting for a bad journal 374 FATAL(("could not initialize journal/block bitmap allocator!\n")); 375 return B_NO_MEMORY; 376 } 377 378 status_t status = B_OK; 379 380 fRootNode = new Inode(this, ToVnode(Root())); 381 if (fRootNode && fRootNode->InitCheck() == B_OK) { 382 status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode); 383 if (status == B_OK) { 384 // try to get indices root dir 385 386 // question: why doesn't get_vnode() work here?? 387 // answer: we have not yet backpropagated the pointer to the 388 // volume in bfs_mount(), so bfs_read_vnode() can't get it. 389 // But it's not needed to do that anyway. 390 391 if (!Indices().IsZero()) 392 fIndicesNode = new Inode(this, ToVnode(Indices())); 393 394 if (fIndicesNode == NULL 395 || fIndicesNode->InitCheck() < B_OK 396 || !fIndicesNode->IsContainer()) { 397 INFORM(("bfs: volume doesn't have indices!\n")); 398 399 if (fIndicesNode) { 400 // if this is the case, the index root node is gone bad, and 401 // BFS switch to read-only mode 402 fFlags |= VOLUME_READ_ONLY; 403 delete fIndicesNode; 404 fIndicesNode = NULL; 405 } 406 } 407 408 // all went fine 409 opener.Keep(); 410 return B_OK; 411 } else 412 FATAL(("could not create root node: publish_vnode() failed!\n")); 413 414 delete fRootNode; 415 } else { 416 status = B_BAD_VALUE; 417 FATAL(("could not create root node!\n")); 418 } 419 420 return status; 421 } 422 423 424 status_t 425 Volume::Unmount() 426 { 427 // Unlike in BeOS, we need to put the reference to our root node ourselves 428 put_vnode(fID, ToVnode(Root())); 429 430 // This will also flush the log & all blocks to disk 431 delete fJournal; 432 fJournal = NULL; 433 434 delete fIndicesNode; 435 436 block_cache_delete(fBlockCache, !IsReadOnly()); 437 close(fDevice); 438 439 return B_OK; 440 } 441 442 443 status_t 444 Volume::Sync() 445 { 446 return fJournal->FlushLogAndBlocks(); 447 } 448 449 450 status_t 451 Volume::ValidateBlockRun(block_run run) 452 { 453 if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups() 454 || run.Start() > (1UL << AllocationGroupShift()) 455 || run.length == 0 456 || uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) { 457 Panic(); 458 FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(), run.Start(), run.Length())); 459 return B_BAD_DATA; 460 } 461 return B_OK; 462 } 463 464 465 block_run 466 Volume::ToBlockRun(off_t block) const 467 { 468 block_run run; 469 run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift()); 470 run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1)); 471 run.length = HOST_ENDIAN_TO_BFS_INT16(1); 472 return run; 473 } 474 475 476 status_t 477 Volume::CreateIndicesRoot(Transaction &transaction) 478 { 479 off_t id; 480 status_t status = Inode::Create(transaction, NULL, NULL, 481 S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id, 482 &fIndicesNode); 483 if (status < B_OK) 484 RETURN_ERROR(status); 485 486 fSuperBlock.indices = ToBlockRun(id); 487 return WriteSuperBlock(); 488 } 489 490 491 status_t 492 Volume::AllocateForInode(Transaction &transaction, const Inode *parent, mode_t type, block_run &run) 493 { 494 return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run); 495 } 496 497 498 status_t 499 Volume::WriteSuperBlock() 500 { 501 if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block)) 502 return B_IO_ERROR; 503 504 return B_OK; 505 } 506 507 508 void 509 Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, const uint8 *oldKey, 510 size_t oldLength, const uint8 *newKey, size_t newLength) 511 { 512 if (fQueryLock.Lock() < B_OK) 513 return; 514 515 Query *query = NULL; 516 while ((query = fQueries.Next(query)) != NULL) 517 query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength); 518 519 fQueryLock.Unlock(); 520 } 521 522 523 /*! 524 Checks if there is a live query whose results depend on the presence 525 or value of the specified attribute. 526 Don't use it if you already have all the data together to evaluate 527 the queries - it wouldn't safe you anything in this case. 528 */ 529 bool 530 Volume::CheckForLiveQuery(const char *attribute) 531 { 532 // ToDo: check for a live query that depends on the specified attribute 533 return true; 534 } 535 536 537 void 538 Volume::AddQuery(Query *query) 539 { 540 if (fQueryLock.Lock() < B_OK) 541 return; 542 543 fQueries.Add(query); 544 545 fQueryLock.Unlock(); 546 } 547 548 549 void 550 Volume::RemoveQuery(Query *query) 551 { 552 if (fQueryLock.Lock() < B_OK) 553 return; 554 555 fQueries.Remove(query); 556 557 fQueryLock.Unlock(); 558 } 559 560 561 // #pragma mark - Disk scanning and initialization 562 563 564 status_t 565 Volume::Identify(int fd, disk_super_block *superBlock) 566 { 567 char buffer[1024]; 568 if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer)) 569 return B_IO_ERROR; 570 571 // Note: that does work only for x86, for PowerPC, the super block 572 // may be located at offset 0! 573 memcpy(superBlock, buffer + 512, sizeof(disk_super_block)); 574 if (!superBlock->IsValid()) { 575 #ifndef BFS_LITTLE_ENDIAN_ONLY 576 memcpy(superBlock, buffer, sizeof(disk_super_block)); 577 if (!superBlock->IsValid()) 578 return B_BAD_VALUE; 579 #else 580 return B_BAD_VALUE; 581 #endif 582 } 583 584 return B_OK; 585 } 586 587 588 status_t 589 Volume::Initialize(int fd, const char *name, uint32 blockSize, 590 uint32 flags) 591 { 592 // although there is no really good reason for it, we won't 593 // accept '/' in disk names (mkbfs does this, too - and since 594 // Tracker names mounted volumes like their name) 595 if (strchr(name, '/') != NULL) 596 return B_BAD_VALUE; 597 598 if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 599 && blockSize != 8192) 600 return B_BAD_VALUE; 601 602 DeviceOpener opener(fd, O_RDWR); 603 if (opener.Device() < B_OK) 604 return B_BAD_VALUE; 605 606 fDevice = opener.Device(); 607 608 uint32 deviceBlockSize; 609 off_t deviceSize; 610 if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK) 611 return B_ERROR; 612 613 off_t numBlocks = deviceSize / blockSize; 614 615 // create valid super block 616 617 fSuperBlock.Initialize(name, numBlocks, blockSize); 618 619 // initialize short hands to the super block (to save byte swapping) 620 fBlockSize = fSuperBlock.BlockSize(); 621 fBlockShift = fSuperBlock.BlockShift(); 622 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 623 624 // since the allocator has not been initialized yet, we 625 // cannot use BlockAllocator::BitmapSize() here 626 fSuperBlock.log_blocks = ToBlockRun(AllocationGroups() 627 * fSuperBlock.BlocksPerAllocationGroup() + 1); 628 fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048); 629 // ToDo: set the log size depending on the disk size 630 fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64( 631 ToBlock(Log())); 632 633 // set the current log pointers, so that journaling will work correctly 634 fLogStart = fSuperBlock.LogStart(); 635 fLogEnd = fSuperBlock.LogEnd(); 636 637 if (!IsValidSuperBlock()) 638 RETURN_ERROR(B_ERROR); 639 640 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 641 return B_ERROR; 642 643 fJournal = new Journal(this); 644 if (fJournal == NULL || fJournal->InitCheck() < B_OK) 645 RETURN_ERROR(B_ERROR); 646 647 // ready to write data to disk 648 649 Transaction transaction(this, 0); 650 651 if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK) 652 RETURN_ERROR(B_ERROR); 653 654 off_t id; 655 status_t status = Inode::Create(transaction, NULL, NULL, 656 S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode); 657 if (status < B_OK) 658 RETURN_ERROR(status); 659 660 fSuperBlock.root_dir = ToBlockRun(id); 661 662 if ((flags & VOLUME_NO_INDICES) == 0) { 663 // The indices root directory will be created automatically 664 // when the standard indices are created (or any other). 665 Index index(this); 666 status = index.Create(transaction, "name", B_STRING_TYPE); 667 if (status < B_OK) 668 return status; 669 670 status = index.Create(transaction, "last_modified", B_INT64_TYPE); 671 if (status < B_OK) 672 return status; 673 674 status = index.Create(transaction, "size", B_INT64_TYPE); 675 if (status < B_OK) 676 return status; 677 } 678 679 WriteSuperBlock(); 680 transaction.Done(); 681 682 // put_vnode(ID(), fRootNode->ID()); 683 // if (fIndicesNode != NULL) 684 // put_vnode(ID(), fIndicesNode->ID()); 685 686 Sync(); 687 opener.RemoveCache(true); 688 return B_OK; 689 } 690