1 /* 2 * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 //! super block, mounting, etc. 7 8 9 #include "Debug.h" 10 #include "Volume.h" 11 #include "Journal.h" 12 #include "Inode.h" 13 #include "Query.h" 14 15 16 static const int32 kDesiredAllocationGroups = 56; 17 // This is the number of allocation groups that will be tried 18 // to be given for newly initialized disks. 19 // That's only relevant for smaller disks, though, since any 20 // of today's disk sizes already reach the maximum length 21 // of an allocation group (65536 blocks). 22 // It seems to create appropriate numbers for smaller disks 23 // with this setting, though (i.e. you can create a 400 MB 24 // file on a 1 GB disk without the need for double indirect 25 // blocks). 26 27 28 class DeviceOpener { 29 public: 30 DeviceOpener(int fd, int mode); 31 DeviceOpener(const char *device, int mode); 32 ~DeviceOpener(); 33 34 int Open(const char *device, int mode); 35 int Open(int fd, int mode); 36 void *InitCache(off_t numBlocks, uint32 blockSize); 37 void RemoveCache(bool allowWrites); 38 39 void Keep(); 40 41 int Device() const { return fDevice; } 42 int Mode() const { return fMode; } 43 44 status_t GetSize(off_t *_size, uint32 *_blockSize = NULL); 45 46 private: 47 int fDevice; 48 int fMode; 49 void *fBlockCache; 50 }; 51 52 53 DeviceOpener::DeviceOpener(const char *device, int mode) 54 : 55 fBlockCache(NULL) 56 { 57 Open(device, mode); 58 } 59 60 61 DeviceOpener::DeviceOpener(int fd, int mode) 62 : 63 fBlockCache(NULL) 64 { 65 Open(fd, mode); 66 } 67 68 69 DeviceOpener::~DeviceOpener() 70 { 71 if (fDevice >= B_OK) { 72 RemoveCache(false); 73 close(fDevice); 74 } 75 } 76 77 78 int 79 DeviceOpener::Open(const char *device, int mode) 80 { 81 fDevice = open(device, mode | O_NOCACHE); 82 if (fDevice < 0) 83 fDevice = errno; 84 85 if (fDevice < 0 && mode == O_RDWR) { 86 // try again to open read-only (don't rely on a specific error code) 87 return Open(device, O_RDONLY | O_NOCACHE); 88 } 89 90 if (fDevice >= 0) { 91 // opening succeeded 92 fMode = mode; 93 if (mode == O_RDWR) { 94 // check out if the device really allows for read/write access 95 device_geometry geometry; 96 if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) { 97 if (geometry.read_only) { 98 // reopen device read-only 99 close(fDevice); 100 return Open(device, O_RDONLY | O_NOCACHE); 101 } 102 } 103 } 104 } 105 106 return fDevice; 107 } 108 109 110 int 111 DeviceOpener::Open(int fd, int mode) 112 { 113 fDevice = dup(fd); 114 if (fDevice < 0) 115 return errno; 116 117 fMode = mode; 118 119 return fDevice; 120 } 121 122 123 void * 124 DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize) 125 { 126 return block_cache_create(fDevice, numBlocks, blockSize, fMode == O_RDONLY); 127 } 128 129 130 void 131 DeviceOpener::RemoveCache(bool allowWrites) 132 { 133 if (fBlockCache == NULL) 134 return; 135 136 block_cache_delete(fBlockCache, allowWrites); 137 fBlockCache = NULL; 138 } 139 140 141 void 142 DeviceOpener::Keep() 143 { 144 fDevice = -1; 145 } 146 147 148 /** Returns the size of the device in bytes. It uses B_GET_GEOMETRY 149 * to compute the size, or fstat() if that failed. 150 */ 151 152 status_t 153 DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize) 154 { 155 device_geometry geometry; 156 if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) { 157 // maybe it's just a file 158 struct stat stat; 159 if (fstat(fDevice, &stat) < 0) 160 return B_ERROR; 161 162 if (_size) 163 *_size = stat.st_size; 164 if (_blockSize) // that shouldn't cause us any problems 165 *_blockSize = 512; 166 167 return B_OK; 168 } 169 170 if (_size) { 171 *_size = 1LL * geometry.head_count * geometry.cylinder_count 172 * geometry.sectors_per_track * geometry.bytes_per_sector; 173 } 174 if (_blockSize) 175 *_blockSize = geometry.bytes_per_sector; 176 177 return B_OK; 178 } 179 180 181 // #pragma mark - 182 183 184 bool 185 disk_super_block::IsValid() 186 { 187 if (Magic1() != (int32)SUPER_BLOCK_MAGIC1 188 || Magic2() != (int32)SUPER_BLOCK_MAGIC2 189 || Magic3() != (int32)SUPER_BLOCK_MAGIC3 190 || (int32)block_size != inode_size 191 || ByteOrder() != SUPER_BLOCK_FS_LENDIAN 192 || (1UL << BlockShift()) != BlockSize() 193 || AllocationGroups() < 1 194 || AllocationGroupShift() < 1 195 || BlocksPerAllocationGroup() < 1 196 || NumBlocks() < 10 197 || AllocationGroups() != divide_roundup(NumBlocks(), 198 1L << AllocationGroupShift())) 199 return false; 200 201 return true; 202 } 203 204 205 void 206 disk_super_block::Initialize(const char *diskName, off_t numBlocks, 207 uint32 blockSize) 208 { 209 memset(this, 0, sizeof(disk_super_block)); 210 211 magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1); 212 magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2); 213 magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3); 214 fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN); 215 flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN); 216 217 strlcpy(name, diskName, sizeof(name)); 218 219 int32 blockShift = 9; 220 while ((1UL << blockShift) < blockSize) { 221 blockShift++; 222 } 223 224 block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize); 225 block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift); 226 227 num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks); 228 used_blocks = 0; 229 230 // Get the minimum ag_shift (that's determined by the block size) 231 232 int32 bitsPerBlock = blockSize << 3; 233 off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock; 234 int32 blocksPerGroup = 1; 235 int32 groupShift = 13; 236 237 for (int32 i = 8192; i < bitsPerBlock; i *= 2) { 238 groupShift++; 239 } 240 241 // Many allocation groups help applying allocation policies, but if 242 // they are too small, we will need to many block_runs to cover large 243 // files (see above to get an explanation of the kDesiredAllocationGroups 244 // constant). 245 246 int32 numGroups; 247 248 while (true) { 249 numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup; 250 if (numGroups > kDesiredAllocationGroups) { 251 if (groupShift == 16) 252 break; 253 254 groupShift++; 255 blocksPerGroup *= 2; 256 } else 257 break; 258 } 259 260 num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups); 261 blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup); 262 ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift); 263 } 264 265 266 // #pragma mark - 267 268 269 Volume::Volume(dev_t id) 270 : 271 fID(id), 272 fBlockAllocator(this), 273 fLock("bfs volume"), 274 fRootNode(NULL), 275 fIndicesNode(NULL), 276 fDirtyCachedBlocks(0), 277 fUniqueID(0), 278 fFlags(0) 279 { 280 } 281 282 283 Volume::~Volume() 284 { 285 } 286 287 288 bool 289 Volume::IsValidSuperBlock() 290 { 291 return fSuperBlock.IsValid(); 292 } 293 294 295 void 296 Volume::Panic() 297 { 298 FATAL(("we have to panic... switch to read-only mode!\n")); 299 fFlags |= VOLUME_READ_ONLY; 300 #ifdef DEBUG 301 kernel_debugger("BFS panics!"); 302 #endif 303 } 304 305 306 status_t 307 Volume::Mount(const char *deviceName, uint32 flags) 308 { 309 // ToDo: validate the FS in write mode as well! 310 #if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \ 311 || (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY)) 312 // in big endian mode, we only mount read-only for now 313 flags |= B_MOUNT_READ_ONLY; 314 #endif 315 316 DeviceOpener opener(deviceName, (flags & B_MOUNT_READ_ONLY) != 0 317 ? O_RDONLY : O_RDWR); 318 fDevice = opener.Device(); 319 if (fDevice < B_OK) 320 RETURN_ERROR(fDevice); 321 322 if (opener.Mode() == O_RDONLY) 323 fFlags |= VOLUME_READ_ONLY; 324 325 // check if it's a regular file, and if so, disable the cache for the 326 // underlaying file system 327 struct stat stat; 328 if (fstat(fDevice, &stat) < 0) 329 RETURN_ERROR(B_ERROR); 330 331 // read the super block 332 if (Identify(fDevice, &fSuperBlock) != B_OK) { 333 FATAL(("invalid super block!\n")); 334 return B_BAD_VALUE; 335 } 336 337 // initialize short hands to the super block (to save byte swapping) 338 fBlockSize = fSuperBlock.BlockSize(); 339 fBlockShift = fSuperBlock.BlockShift(); 340 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 341 342 // check if the device size is large enough to hold the file system 343 off_t diskSize; 344 if (opener.GetSize(&diskSize) < B_OK) 345 RETURN_ERROR(B_ERROR); 346 if (diskSize < (NumBlocks() << BlockShift())) 347 RETURN_ERROR(B_BAD_VALUE); 348 349 // set the current log pointers, so that journaling will work correctly 350 fLogStart = fSuperBlock.LogStart(); 351 fLogEnd = fSuperBlock.LogEnd(); 352 353 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 354 return B_ERROR; 355 356 fJournal = new Journal(this); 357 // replaying the log is the first thing we will do on this disk 358 if (fJournal && fJournal->InitCheck() < B_OK 359 || fBlockAllocator.Initialize() < B_OK) { 360 // ToDo: improve error reporting for a bad journal 361 FATAL(("could not initialize journal/block bitmap allocator!\n")); 362 return B_NO_MEMORY; 363 } 364 365 status_t status = B_OK; 366 367 fRootNode = new Inode(this, ToVnode(Root())); 368 if (fRootNode && fRootNode->InitCheck() == B_OK) { 369 status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode); 370 if (status == B_OK) { 371 // try to get indices root dir 372 373 if (!Indices().IsZero()) 374 fIndicesNode = new Inode(this, ToVnode(Indices())); 375 376 if (fIndicesNode == NULL 377 || fIndicesNode->InitCheck() < B_OK 378 || !fIndicesNode->IsContainer()) { 379 INFORM(("bfs: volume doesn't have indices!\n")); 380 381 if (fIndicesNode) { 382 // if this is the case, the index root node is gone bad, 383 // and BFS switch to read-only mode 384 fFlags |= VOLUME_READ_ONLY; 385 delete fIndicesNode; 386 fIndicesNode = NULL; 387 } 388 } 389 390 // all went fine 391 opener.Keep(); 392 return B_OK; 393 } else 394 FATAL(("could not create root node: publish_vnode() failed!\n")); 395 396 delete fRootNode; 397 } else { 398 status = B_BAD_VALUE; 399 FATAL(("could not create root node!\n")); 400 } 401 402 return status; 403 } 404 405 406 status_t 407 Volume::Unmount() 408 { 409 // Unlike in BeOS, we need to put the reference to our root node ourselves 410 put_vnode(fID, ToVnode(Root())); 411 412 // This will also flush the log & all blocks to disk 413 delete fJournal; 414 fJournal = NULL; 415 416 delete fIndicesNode; 417 418 block_cache_delete(fBlockCache, !IsReadOnly()); 419 close(fDevice); 420 421 return B_OK; 422 } 423 424 425 status_t 426 Volume::Sync() 427 { 428 return fJournal->FlushLogAndBlocks(); 429 } 430 431 432 status_t 433 Volume::ValidateBlockRun(block_run run) 434 { 435 if (run.AllocationGroup() < 0 436 || run.AllocationGroup() > (int32)AllocationGroups() 437 || run.Start() > (1UL << AllocationGroupShift()) 438 || run.length == 0 439 || uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) { 440 Panic(); 441 FATAL(("*** invalid run(%d,%d,%d)\n", (int)run.AllocationGroup(), 442 run.Start(), run.Length())); 443 return B_BAD_DATA; 444 } 445 return B_OK; 446 } 447 448 449 block_run 450 Volume::ToBlockRun(off_t block) const 451 { 452 block_run run; 453 run.allocation_group = HOST_ENDIAN_TO_BFS_INT32( 454 block >> AllocationGroupShift()); 455 run.start = HOST_ENDIAN_TO_BFS_INT16( 456 block & ((1LL << AllocationGroupShift()) - 1)); 457 run.length = HOST_ENDIAN_TO_BFS_INT16(1); 458 return run; 459 } 460 461 462 status_t 463 Volume::CreateIndicesRoot(Transaction &transaction) 464 { 465 off_t id; 466 status_t status = Inode::Create(transaction, NULL, NULL, 467 S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, NULL, &id, 468 &fIndicesNode); 469 if (status < B_OK) 470 RETURN_ERROR(status); 471 472 fSuperBlock.indices = ToBlockRun(id); 473 return WriteSuperBlock(); 474 } 475 476 477 status_t 478 Volume::AllocateForInode(Transaction &transaction, const Inode *parent, 479 mode_t type, block_run &run) 480 { 481 return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), 482 type, run); 483 } 484 485 486 status_t 487 Volume::WriteSuperBlock() 488 { 489 // TODO: this assumes a block size of 512 bytes of the underlying device 490 if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) 491 != sizeof(disk_super_block)) 492 return B_IO_ERROR; 493 494 return B_OK; 495 } 496 497 498 void 499 Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type, 500 const uint8 *oldKey, size_t oldLength, const uint8 *newKey, 501 size_t newLength) 502 { 503 if (fQueryLock.Lock() < B_OK) 504 return; 505 506 Query *query = NULL; 507 while ((query = fQueries.Next(query)) != NULL) { 508 query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, 509 newLength); 510 } 511 512 fQueryLock.Unlock(); 513 } 514 515 516 /*! 517 Checks if there is a live query whose results depend on the presence 518 or value of the specified attribute. 519 Don't use it if you already have all the data together to evaluate 520 the queries - it wouldn't safe you anything in this case. 521 */ 522 bool 523 Volume::CheckForLiveQuery(const char *attribute) 524 { 525 // ToDo: check for a live query that depends on the specified attribute 526 return true; 527 } 528 529 530 void 531 Volume::AddQuery(Query *query) 532 { 533 if (fQueryLock.Lock() < B_OK) 534 return; 535 536 fQueries.Add(query); 537 538 fQueryLock.Unlock(); 539 } 540 541 542 void 543 Volume::RemoveQuery(Query *query) 544 { 545 if (fQueryLock.Lock() < B_OK) 546 return; 547 548 fQueries.Remove(query); 549 550 fQueryLock.Unlock(); 551 } 552 553 554 // #pragma mark - Disk scanning and initialization 555 556 557 status_t 558 Volume::Identify(int fd, disk_super_block *superBlock) 559 { 560 char buffer[1024]; 561 if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer)) 562 return B_IO_ERROR; 563 564 memcpy(superBlock, buffer + 512, sizeof(disk_super_block)); 565 if (!superBlock->IsValid()) { 566 #ifndef BFS_LITTLE_ENDIAN_ONLY 567 // For PPC, the super block might be located at offset 0 568 memcpy(superBlock, buffer, sizeof(disk_super_block)); 569 if (!superBlock->IsValid()) 570 return B_BAD_VALUE; 571 #else 572 return B_BAD_VALUE; 573 #endif 574 } 575 576 return B_OK; 577 } 578 579 580 status_t 581 Volume::Initialize(int fd, const char *name, uint32 blockSize, 582 uint32 flags) 583 { 584 // although there is no really good reason for it, we won't 585 // accept '/' in disk names (mkbfs does this, too - and since 586 // Tracker names mounted volumes like their name) 587 if (strchr(name, '/') != NULL) 588 return B_BAD_VALUE; 589 590 if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 591 && blockSize != 8192) 592 return B_BAD_VALUE; 593 594 DeviceOpener opener(fd, O_RDWR); 595 if (opener.Device() < B_OK) 596 return B_BAD_VALUE; 597 598 fDevice = opener.Device(); 599 600 uint32 deviceBlockSize; 601 off_t deviceSize; 602 if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK) 603 return B_ERROR; 604 605 off_t numBlocks = deviceSize / blockSize; 606 607 // create valid super block 608 609 fSuperBlock.Initialize(name, numBlocks, blockSize); 610 611 // initialize short hands to the super block (to save byte swapping) 612 fBlockSize = fSuperBlock.BlockSize(); 613 fBlockShift = fSuperBlock.BlockShift(); 614 fAllocationGroupShift = fSuperBlock.AllocationGroupShift(); 615 616 // since the allocator has not been initialized yet, we 617 // cannot use BlockAllocator::BitmapSize() here 618 fSuperBlock.log_blocks = ToBlockRun(AllocationGroups() 619 * fSuperBlock.BlocksPerAllocationGroup() + 1); 620 fSuperBlock.log_blocks.length = HOST_ENDIAN_TO_BFS_INT16(2048); 621 // ToDo: set the log size depending on the disk size 622 fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64( 623 ToBlock(Log())); 624 625 // set the current log pointers, so that journaling will work correctly 626 fLogStart = fSuperBlock.LogStart(); 627 fLogEnd = fSuperBlock.LogEnd(); 628 629 if (!IsValidSuperBlock()) 630 RETURN_ERROR(B_ERROR); 631 632 if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL) 633 return B_ERROR; 634 635 fJournal = new Journal(this); 636 if (fJournal == NULL || fJournal->InitCheck() < B_OK) 637 RETURN_ERROR(B_ERROR); 638 639 // ready to write data to disk 640 641 Transaction transaction(this, 0); 642 643 if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK) 644 RETURN_ERROR(B_ERROR); 645 646 off_t id; 647 status_t status = Inode::Create(transaction, NULL, NULL, 648 S_DIRECTORY | 0755, 0, 0, NULL, &id, &fRootNode); 649 if (status < B_OK) 650 RETURN_ERROR(status); 651 652 fSuperBlock.root_dir = ToBlockRun(id); 653 654 if ((flags & VOLUME_NO_INDICES) == 0) { 655 // The indices root directory will be created automatically 656 // when the standard indices are created (or any other). 657 Index index(this); 658 status = index.Create(transaction, "name", B_STRING_TYPE); 659 if (status < B_OK) 660 return status; 661 662 status = index.Create(transaction, "last_modified", B_INT64_TYPE); 663 if (status < B_OK) 664 return status; 665 666 status = index.Create(transaction, "size", B_INT64_TYPE); 667 if (status < B_OK) 668 return status; 669 } 670 671 WriteSuperBlock(); 672 transaction.Done(); 673 674 // put_vnode(ID(), fRootNode->ID()); 675 // if (fIndicesNode != NULL) 676 // put_vnode(ID(), fIndicesNode->ID()); 677 678 Sync(); 679 opener.RemoveCache(true); 680 return B_OK; 681 } 682