/* Volume - BFS super block, mounting, etc.
 *
 * Copyright 2001-2006, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */


#include "Debug.h"
#include "Volume.h"
#include "Journal.h"
#include "Inode.h"
#include "Query.h"

#include <util/kernel_cpp.h>
#include <KernelExport.h>
#include <Drivers.h>
#include <fs_volume.h>

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>


static const int32 kDesiredAllocationGroups = 56;
	// This is the number of allocation groups we aim for when
	// initializing new disks.
	// That's only relevant for smaller disks, though, since any
	// of today's disk sizes already reach the maximum length
	// of an allocation group (65536 blocks).
	// It seems to create appropriate numbers for smaller disks
	// with this setting (i.e. you can create a 400 MB file on
	// a 1 GB disk without the need for double indirect blocks).


class DeviceOpener {
	public:
		DeviceOpener(const char *device, int mode);
		~DeviceOpener();

		int Open(const char *device, int mode);
		void *InitCache(off_t numBlocks, uint32 blockSize);
		void RemoveCache(bool allowWrites);

		void Keep();

		int Device() const { return fDevice; }
		int Mode() const { return fMode; }

		status_t GetSize(off_t *_size, uint32 *_blockSize = NULL);

	private:
		int		fDevice;
		int		fMode;
		void	*fBlockCache;
};


DeviceOpener::DeviceOpener(const char *device, int mode)
	:
	fBlockCache(NULL)
{
	Open(device, mode);
}


DeviceOpener::~DeviceOpener()
{
	if (fDevice >= B_OK) {
		RemoveCache(false);
		close(fDevice);
	}
}


int
DeviceOpener::Open(const char *device, int mode)
{
	fDevice = open(device, mode);
	if (fDevice < 0 && mode == O_RDWR) {
		// try again to open read-only (don't rely on a specific error code)
		return Open(device, O_RDONLY);
	}

	if (fDevice >= 0) {
		// opening succeeded
		fMode = mode;
		if (mode == O_RDWR) {
			// check whether the device really allows read/write access
			device_geometry geometry;
			if (!ioctl(fDevice, B_GET_GEOMETRY, &geometry)) {
				if (geometry.read_only) {
					// reopen device read-only
					close(fDevice);
					return Open(device, O_RDONLY);
				}
			}
		}
	}

	return fDevice;
}


void *
DeviceOpener::InitCache(off_t numBlocks, uint32 blockSize)
{
	// remember the cache, so that the destructor can clean it up again
	return fBlockCache = block_cache_create(fDevice, numBlocks, blockSize);
}


void
DeviceOpener::RemoveCache(bool allowWrites)
{
	if (fBlockCache == NULL)
		return;

	block_cache_delete(fBlockCache, allowWrites);
	fBlockCache = NULL;
}


void
DeviceOpener::Keep()
{
	// prevents the destructor from closing the device and removing the cache
	fDevice = -1;
}


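// DeviceOpener is a small helper class: the constructor opens the device,
// and the destructor closes it again (and removes the block cache), unless
// Keep() has been called. A minimal usage sketch (with a made-up device
// path, and error handling shortened) of the pattern that Volume::Mount()
// below follows:
//
//	DeviceOpener opener("/dev/disk/...", O_RDWR);
//	if (opener.Device() < B_OK)
//		return opener.Device();
//	void *cache = opener.InitCache(numBlocks, blockSize);
//	...
//	opener.Keep();
//		// from now on, the volume owns the device and the cache
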
/**	Returns the size of the device in bytes. It uses B_GET_GEOMETRY
 *	to compute the size, or fstat() if that fails.
 */

status_t
DeviceOpener::GetSize(off_t *_size, uint32 *_blockSize)
{
	device_geometry geometry;
	if (ioctl(fDevice, B_GET_GEOMETRY, &geometry) < 0) {
		// maybe it's just a file
		struct stat stat;
		if (fstat(fDevice, &stat) < 0)
			return B_ERROR;

		if (_size)
			*_size = stat.st_size;
		if (_blockSize)	// that shouldn't cause us any problems
			*_blockSize = 512;

		return B_OK;
	}

	if (_size) {
		*_size = 1LL * geometry.head_count * geometry.cylinder_count
			* geometry.sectors_per_track * geometry.bytes_per_sector;
	}
	if (_blockSize)
		*_blockSize = geometry.bytes_per_sector;

	return B_OK;
}


// #pragma mark -


bool
disk_super_block::IsValid()
{
	if (Magic1() != (int32)SUPER_BLOCK_MAGIC1
		|| Magic2() != (int32)SUPER_BLOCK_MAGIC2
		|| Magic3() != (int32)SUPER_BLOCK_MAGIC3
		|| (int32)block_size != inode_size
		|| ByteOrder() != SUPER_BLOCK_FS_LENDIAN
		|| (1UL << BlockShift()) != BlockSize()
		|| AllocationGroups() < 1
		|| AllocationGroupShift() < 1
		|| BlocksPerAllocationGroup() < 1
		|| NumBlocks() < 10
		|| AllocationGroups() != divide_roundup(NumBlocks(),
				1L << AllocationGroupShift()))
		return false;

	return true;
}


void
disk_super_block::Initialize(const char *diskName, off_t numBlocks, uint32 blockSize)
{
	memset(this, 0, sizeof(disk_super_block));

	magic1 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC1);
	magic2 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC2);
	magic3 = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_MAGIC3);
	fs_byte_order = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_FS_LENDIAN);
	flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

	strlcpy(name, diskName, sizeof(name));

	int32 blockShift = 9;
	while ((1UL << blockShift) < blockSize) {
		blockShift++;
	}

	block_size = inode_size = HOST_ENDIAN_TO_BFS_INT32(blockSize);
	block_shift = HOST_ENDIAN_TO_BFS_INT32(blockShift);

	num_blocks = HOST_ENDIAN_TO_BFS_INT64(numBlocks);
	used_blocks = 0;

	// Get the minimum ag_shift (that's determined by the block size)

	int32 bitsPerBlock = blockSize << 3;
	off_t bitmapBlocks = (numBlocks + bitsPerBlock - 1) / bitsPerBlock;
	int32 blocksPerGroup = 1;
	int32 groupShift = 13;

	for (int32 i = 8192; i < bitsPerBlock; i *= 2) {
		groupShift++;
	}

	// Many allocation groups help when applying allocation policies, but
	// if they are too small, we will need too many block_runs to cover
	// large files (see above for an explanation of the
	// kDesiredAllocationGroups constant).
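	// A hypothetical example of the loop below (the numbers are made up,
	// not taken from a real disk): for a 1 GB disk with 1024 byte blocks
	// we get numBlocks = 1048576, bitsPerBlock = 8192, and therefore
	// bitmapBlocks = 128 with an initial groupShift of 13. The loop then
	// doubles blocksPerGroup until the group count drops to
	// kDesiredAllocationGroups or below: 128 -> 64 -> 32 groups, ending
	// with groupShift = 15, i.e. 32 allocation groups of 32768 blocks
	// (32 MB) each, described by 4 bitmap blocks per group.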

	int32 numGroups;

	while (true) {
		numGroups = (bitmapBlocks + blocksPerGroup - 1) / blocksPerGroup;
		if (numGroups > kDesiredAllocationGroups) {
			if (groupShift == 16)
				break;

			groupShift++;
			blocksPerGroup *= 2;
		} else
			break;
	}

	num_ags = HOST_ENDIAN_TO_BFS_INT32(numGroups);
	blocks_per_ag = HOST_ENDIAN_TO_BFS_INT32(blocksPerGroup);
	ag_shift = HOST_ENDIAN_TO_BFS_INT32(groupShift);
}


// #pragma mark -


Volume::Volume(mount_id id)
	:
	fID(id),
	fBlockAllocator(this),
	fLock("bfs volume"),
	fRootNode(NULL),
	fIndicesNode(NULL),
	fDirtyCachedBlocks(0),
	fUniqueID(0),
	fFlags(0)
{
}


Volume::~Volume()
{
}


bool
Volume::IsValidSuperBlock()
{
	return fSuperBlock.IsValid();
}


void
Volume::Panic()
{
	FATAL(("we have to panic... switch to read-only mode!\n"));
	fFlags |= VOLUME_READ_ONLY;
#ifdef USER
	debugger("BFS panics!");
#elif defined(DEBUG)
	kernel_debugger("BFS panics!");
#endif
}


status_t
Volume::Mount(const char *deviceName, uint32 flags)
{
	// ToDo: validate the FS in write mode as well!
#if (B_HOST_IS_LENDIAN && defined(BFS_BIG_ENDIAN_ONLY)) \
	|| (B_HOST_IS_BENDIAN && defined(BFS_LITTLE_ENDIAN_ONLY))
	// in big endian mode, we only mount read-only for now
	flags |= B_MOUNT_READ_ONLY;
#endif

	DeviceOpener opener(deviceName, flags & B_MOUNT_READ_ONLY ? O_RDONLY : O_RDWR);
	fDevice = opener.Device();
	if (fDevice < B_OK)
		RETURN_ERROR(fDevice);

	if (opener.Mode() == O_RDONLY)
		fFlags |= VOLUME_READ_ONLY;

	// check if it's a regular file, and if so, disable the cache for the
	// underlying file system
	struct stat stat;
	if (fstat(fDevice, &stat) < 0)
		RETURN_ERROR(B_ERROR);

// TODO: allow turning off caching of the underlying file (once O_NOCACHE works)
#if 0
#ifndef NO_FILE_UNCACHED_IO
	if ((stat.st_mode & S_FILE) != 0 && ioctl(fDevice, IOCTL_FILE_UNCACHED_IO, NULL) < 0) {
		// mount read-only if the cache couldn't be disabled
#	ifdef DEBUG
		FATAL(("couldn't disable cache for image file - system may dead-lock!\n"));
#	else
		FATAL(("couldn't disable cache for image file!\n"));
		Panic();
#	endif
	}
#endif
#endif

	// read the super block
	if (Identify(fDevice, &fSuperBlock) != B_OK) {
		FATAL(("invalid super block!\n"));
		return B_BAD_VALUE;
	}

	// initialize short hands to the super block (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// check if the device size is large enough to hold the file system
	off_t diskSize;
	if (opener.GetSize(&diskSize) < B_OK)
		RETURN_ERROR(B_ERROR);
	if (diskSize < (NumBlocks() << BlockShift()))
		RETURN_ERROR(B_BAD_VALUE);

	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new Journal(this);
		// replaying the log is the first thing we will do on this disk
	if (fJournal == NULL || fJournal->InitCheck() < B_OK
		|| fBlockAllocator.Initialize() < B_OK) {
		// ToDo: improve error reporting for a bad journal
		FATAL(("could not initialize journal/block bitmap allocator!\n"));
		return B_NO_MEMORY;
	}

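	// The log has been replayed (if necessary) and the block allocator is
	// set up, so the on-disk structures are in a consistent state; now we
	// can look up and publish the root node (and, if present, the index
	// directory).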
	status_t status = B_OK;

	fRootNode = new Inode(this, ToVnode(Root()));
	if (fRootNode && fRootNode->InitCheck() == B_OK) {
		status = publish_vnode(fID, ToVnode(Root()), (void *)fRootNode);
		if (status == B_OK) {
			// try to get indices root dir

			// question: why doesn't get_vnode() work here??
			// answer: we have not yet backpropagated the pointer to the
			// volume in bfs_mount(), so bfs_read_vnode() can't get it.
			// But it's not needed to do that anyway.

			if (!Indices().IsZero())
				fIndicesNode = new Inode(this, ToVnode(Indices()));

			if (fIndicesNode == NULL
				|| fIndicesNode->InitCheck() < B_OK
				|| !fIndicesNode->IsContainer()) {
				INFORM(("bfs: volume doesn't have indices!\n"));

				if (fIndicesNode) {
					// in this case, the indices root node has gone bad,
					// and BFS switches to read-only mode
					fFlags |= VOLUME_READ_ONLY;
					delete fIndicesNode;
					fIndicesNode = NULL;
				}
			}

			// all went fine
			opener.Keep();
			return B_OK;
		} else
			FATAL(("could not create root node: publish_vnode() failed!\n"));

		delete fRootNode;
	} else {
		status = B_BAD_VALUE;
		FATAL(("could not create root node!\n"));
	}

	return status;
}


status_t
Volume::Unmount()
{
	// Unlike in BeOS, we need to put the reference to our root node ourselves
	put_vnode(fID, ToVnode(Root()));

	// This will also flush the log & all blocks to disk
	delete fJournal;
	fJournal = NULL;

	delete fIndicesNode;

	block_cache_delete(fBlockCache, !IsReadOnly());
	close(fDevice);

	return B_OK;
}


status_t
Volume::Sync()
{
	return fJournal->FlushLogAndBlocks();
}


status_t
Volume::ValidateBlockRun(block_run run)
{
	if (run.AllocationGroup() < 0 || run.AllocationGroup() > (int32)AllocationGroups()
		|| run.Start() > (1UL << AllocationGroupShift())
		|| run.length == 0
		|| uint32(run.Length() + run.Start()) > (1UL << AllocationGroupShift())) {
		Panic();
		FATAL(("*** invalid run(%ld,%d,%d)\n", run.AllocationGroup(), run.Start(), run.Length()));
		return B_BAD_DATA;
	}
	return B_OK;
}


block_run
Volume::ToBlockRun(off_t block) const
{
	block_run run;
	run.allocation_group = HOST_ENDIAN_TO_BFS_INT32(block >> AllocationGroupShift());
	run.start = HOST_ENDIAN_TO_BFS_INT16(block & ((1LL << AllocationGroupShift()) - 1));
	run.length = HOST_ENDIAN_TO_BFS_INT16(1);
	return run;
}


status_t
Volume::CreateIndicesRoot(Transaction &transaction)
{
	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_INDEX_DIR | S_STR_INDEX | S_DIRECTORY | 0700, 0, 0, &id, &fIndicesNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.indices = ToBlockRun(id);
	return WriteSuperBlock();
}


status_t
Volume::AllocateForInode(Transaction &transaction, const Inode *parent, mode_t type, block_run &run)
{
	return fBlockAllocator.AllocateForInode(transaction, &parent->BlockRun(), type, run);
}


status_t
Volume::WriteSuperBlock()
{
	if (write_pos(fDevice, 512, &fSuperBlock, sizeof(disk_super_block)) != sizeof(disk_super_block))
		return B_IO_ERROR;

	return B_OK;
}


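/**	Tells all registered live queries about a changed attribute value of
 *	the given inode, so that they can update their result sets accordingly
 *	(see Query::LiveUpdate()).
 */
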
void
Volume::UpdateLiveQueries(Inode *inode, const char *attribute, int32 type,
	const uint8 *oldKey, size_t oldLength, const uint8 *newKey, size_t newLength)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	Query *query = NULL;
	while ((query = fQueries.Next(query)) != NULL)
		query->LiveUpdate(inode, attribute, type, oldKey, oldLength, newKey, newLength);

	fQueryLock.Unlock();
}


/**	Checks if there is a live query whose results depend on the presence
 *	or value of the specified attribute.
 *	Don't use it if you already have all the data together to evaluate
 *	the queries - it wouldn't save you anything in this case.
 */

bool
Volume::CheckForLiveQuery(const char *attribute)
{
	// ToDo: check for a live query that depends on the specified attribute
	return true;
}


void
Volume::AddQuery(Query *query)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	fQueries.Add(query);

	fQueryLock.Unlock();
}


void
Volume::RemoveQuery(Query *query)
{
	if (fQueryLock.Lock() < B_OK)
		return;

	fQueries.Remove(query);

	fQueryLock.Unlock();
}


// #pragma mark -
// Disk scanning and initialization


status_t
Volume::Identify(int fd, disk_super_block *superBlock)
{
	char buffer[1024];
	if (read_pos(fd, 0, buffer, sizeof(buffer)) != sizeof(buffer))
		return B_IO_ERROR;

	// Note: this only works on x86; on PowerPC, the super block
	// may be located at offset 0!
	memcpy(superBlock, buffer + 512, sizeof(disk_super_block));
	if (!superBlock->IsValid()) {
#ifndef BFS_LITTLE_ENDIAN_ONLY
		memcpy(superBlock, buffer, sizeof(disk_super_block));
		if (!superBlock->IsValid())
			return B_BAD_VALUE;
#else
		return B_BAD_VALUE;
#endif
	}

	return B_OK;
}

#ifdef USER
extern "C" void kill_device_vnodes(dev_t id);
	// This call is only available in the userland fs_shell

status_t
Volume::Initialize(const char *device, const char *name, uint32 blockSize, uint32 flags)
{
	// although there is no really good reason for it, we won't
	// accept '/' in disk names (mkbfs does this, too - and since
	// Tracker names mounted volumes after their volume name, a '/'
	// would only cause confusion there)
	if (strchr(name, '/') != NULL)
		return B_BAD_VALUE;

	if (blockSize != 1024 && blockSize != 2048 && blockSize != 4096 && blockSize != 8192)
		return B_BAD_VALUE;

	DeviceOpener opener(device, O_RDWR);
	if (opener.Device() < B_OK)
		return B_BAD_VALUE;

	fDevice = opener.Device();

	uint32 deviceBlockSize;
	off_t deviceSize;
	if (opener.GetSize(&deviceSize, &deviceBlockSize) < B_OK)
		return B_ERROR;

	off_t numBlocks = deviceSize / blockSize;

	// create valid super block

	fSuperBlock.Initialize(name, numBlocks, blockSize);

	// initialize short hands to the super block (to save byte swapping)
	fBlockSize = fSuperBlock.BlockSize();
	fBlockShift = fSuperBlock.BlockShift();
	fAllocationGroupShift = fSuperBlock.AllocationGroupShift();

	// since the allocator has not been initialized yet, we
	// cannot use BlockAllocator::BitmapSize() here
	fSuperBlock.log_blocks = ToBlockRun(AllocationGroups()
		* fSuperBlock.BlocksPerAllocationGroup() + 1);
	fSuperBlock.log_blocks.length = 2048;
		// ToDo: set the log size depending on the disk size
	fSuperBlock.log_start = fSuperBlock.log_end = HOST_ENDIAN_TO_BFS_INT64(ToBlock(Log()));

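	// Note: the log is placed right behind the block bitmap, which starts
	// at block 1 and occupies AllocationGroups() * BlocksPerAllocationGroup()
	// blocks. For instance, with 32 allocation groups of 4 bitmap blocks
	// each (a made-up example), the log would start at block 129 and span
	// 2048 blocks; since log_start == log_end, it starts out empty.
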
	// set the current log pointers, so that journaling will work correctly
	fLogStart = fSuperBlock.LogStart();
	fLogEnd = fSuperBlock.LogEnd();

	if (!IsValidSuperBlock())
		RETURN_ERROR(B_ERROR);

	if ((fBlockCache = opener.InitCache(NumBlocks(), fBlockSize)) == NULL)
		return B_ERROR;

	fJournal = new Journal(this);
	if (fJournal == NULL || fJournal->InitCheck() < B_OK)
		RETURN_ERROR(B_ERROR);

	// ready to write data to disk

	Transaction transaction(this, 0);

	if (fBlockAllocator.InitializeAndClearBitmap(transaction) < B_OK)
		RETURN_ERROR(B_ERROR);

	off_t id;
	status_t status = Inode::Create(transaction, NULL, NULL,
		S_DIRECTORY | 0755, 0, 0, &id, &fRootNode);
	if (status < B_OK)
		RETURN_ERROR(status);

	fSuperBlock.root_dir = ToBlockRun(id);

	if ((flags & VOLUME_NO_INDICES) == 0) {
		// The indices root directory will be created automatically
		// when the standard indices are created (or any other).
		Index index(this);
		status = index.Create(transaction, "name", B_STRING_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "last_modified", B_INT64_TYPE);
		if (status < B_OK)
			return status;

		status = index.Create(transaction, "size", B_INT64_TYPE);
		if (status < B_OK)
			return status;
	}

	WriteSuperBlock();
	transaction.Done();

	put_vnode(ID(), fRootNode->ID());
	if (fIndicesNode != NULL)
		put_vnode(ID(), fIndicesNode->ID());

	kill_device_vnodes(ID());
		// This call is only available in the userland fs_shell

	Sync();
	opener.RemoveCache(true);
	return B_OK;
}
#endif