1 /* 2 * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 //! Transaction and logging 7 8 9 #include "Journal.h" 10 11 #include "Debug.h" 12 #include "Inode.h" 13 14 15 struct run_array { 16 int32 count; 17 int32 max_runs; 18 block_run runs[0]; 19 20 void Init(int32 blockSize); 21 void Insert(block_run& run); 22 23 int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); } 24 int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; } 25 // that -1 accounts for an off-by-one error in Be's BFS implementation 26 const block_run& RunAt(int32 i) const { return runs[i]; } 27 28 static int32 MaxRuns(int32 blockSize); 29 30 private: 31 static int _Compare(block_run& a, block_run& b); 32 int32 _FindInsertionIndex(block_run& run); 33 }; 34 35 class RunArrays { 36 public: 37 RunArrays(Journal* journal); 38 ~RunArrays(); 39 40 status_t Insert(off_t blockNumber); 41 42 run_array* ArrayAt(int32 i) { return fArrays.Array()[i]; } 43 int32 CountArrays() const { return fArrays.CountItems(); } 44 45 uint32 CountBlocks() const { return fBlockCount; } 46 uint32 LogEntryLength() const 47 { return CountBlocks() + CountArrays(); } 48 49 int32 MaxArrayLength(); 50 51 private: 52 status_t _AddArray(); 53 bool _ContainsRun(block_run& run); 54 bool _AddRun(block_run& run); 55 56 Journal* fJournal; 57 uint32 fBlockCount; 58 Stack<run_array*> fArrays; 59 run_array* fLastArray; 60 }; 61 62 class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> { 63 public: 64 LogEntry(Journal* journal, uint32 logStart, 65 uint32 length); 66 ~LogEntry(); 67 68 uint32 Start() const { return fStart; } 69 uint32 Length() const { return fLength; } 70 71 #ifdef BFS_DEBUGGER_COMMANDS 72 void SetTransactionID(int32 id) { fTransactionID = id; } 73 int32 TransactionID() const { return fTransactionID; } 74 #endif 75 76 Journal* GetJournal() { return fJournal; } 77 78 private: 79 Journal* fJournal; 80 uint32 fStart; 81 uint32 fLength; 82 #ifdef BFS_DEBUGGER_COMMANDS 83 int32 fTransactionID; 84 #endif 85 }; 86 87 88 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE) 89 namespace BFSJournalTracing { 90 91 class LogEntry : public AbstractTraceEntry { 92 public: 93 LogEntry(::LogEntry* entry, off_t logPosition, bool started) 94 : 95 fEntry(entry), 96 #ifdef BFS_DEBUGGER_COMMANDS 97 fTransactionID(entry->TransactionID()), 98 #endif 99 fStart(entry->Start()), 100 fLength(entry->Length()), 101 fLogPosition(logPosition), 102 fStarted(started) 103 { 104 Initialized(); 105 } 106 107 virtual void AddDump(TraceOutput& out) 108 { 109 #ifdef BFS_DEBUGGER_COMMANDS 110 out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s " 111 "%lu\n", fStarted ? "Started" : "Written", fEntry, 112 fTransactionID, fStart, fLength, 113 fStarted ? "end" : "start", fLogPosition); 114 #else 115 out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n", 116 fStarted ? "Started" : "Written", fEntry, fStart, fLength, 117 fStarted ? "end" : "start", fLogPosition); 118 #endif 119 } 120 121 private: 122 ::LogEntry* fEntry; 123 #ifdef BFS_DEBUGGER_COMMANDS 124 int32 fTransactionID; 125 #endif 126 uint32 fStart; 127 uint32 fLength; 128 uint32 fLogPosition; 129 bool fStarted; 130 }; 131 132 } // namespace BFSJournalTracing 133 134 # define T(x) new(std::nothrow) BFSJournalTracing::x; 135 #else 136 # define T(x) ; 137 #endif 138 139 140 // #pragma mark - 141 142 143 static void 144 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address, 145 size_t size) 146 { 147 if (index > 0 && (addr_t)vecs[index - 1].iov_base 148 + vecs[index - 1].iov_len == (addr_t)address) { 149 // the iovec can be combined with the previous one 150 vecs[index - 1].iov_len += size; 151 return; 152 } 153 154 if (index == max) 155 panic("no more space for iovecs!"); 156 157 // we need to start a new iovec 158 vecs[index].iov_base = const_cast<void*>(address); 159 vecs[index].iov_len = size; 160 index++; 161 } 162 163 164 // #pragma mark - LogEntry 165 166 167 LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length) 168 : 169 fJournal(journal), 170 fStart(start), 171 fLength(length) 172 { 173 } 174 175 176 LogEntry::~LogEntry() 177 { 178 } 179 180 181 // #pragma mark - run_array 182 183 184 /*! The run_array's size equals the block size of the BFS volume, so we 185 cannot use a (non-overridden) new. 186 This makes a freshly allocated run_array ready to run. 187 */ 188 void 189 run_array::Init(int32 blockSize) 190 { 191 memset(this, 0, blockSize); 192 count = 0; 193 max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize)); 194 } 195 196 197 /*! Inserts the block_run into the array. You will have to make sure the 198 array is large enough to contain the entry before calling this function. 199 */ 200 void 201 run_array::Insert(block_run& run) 202 { 203 int32 index = _FindInsertionIndex(run); 204 if (index == -1) { 205 // add to the end 206 runs[CountRuns()] = run; 207 } else { 208 // insert at index 209 memmove(&runs[index + 1], &runs[index], 210 (CountRuns() - index) * sizeof(off_t)); 211 runs[index] = run; 212 } 213 214 count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1); 215 } 216 217 218 /*static*/ int32 219 run_array::MaxRuns(int32 blockSize) 220 { 221 // For whatever reason, BFS restricts the maximum array size 222 uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run); 223 if (maxCount < 128) 224 return maxCount; 225 226 return 127; 227 } 228 229 230 /*static*/ int 231 run_array::_Compare(block_run& a, block_run& b) 232 { 233 int cmp = a.AllocationGroup() - b.AllocationGroup(); 234 if (cmp == 0) 235 return a.Start() - b.Start(); 236 237 return cmp; 238 } 239 240 241 int32 242 run_array::_FindInsertionIndex(block_run& run) 243 { 244 int32 min = 0, max = CountRuns() - 1; 245 int32 i = 0; 246 if (max >= 8) { 247 while (min <= max) { 248 i = (min + max) / 2; 249 250 int cmp = _Compare(runs[i], run); 251 if (cmp < 0) 252 min = i + 1; 253 else if (cmp > 0) 254 max = i - 1; 255 else 256 return -1; 257 } 258 259 if (_Compare(runs[i], run) < 0) 260 i++; 261 } else { 262 for (; i <= max; i++) { 263 if (_Compare(runs[i], run) > 0) 264 break; 265 } 266 if (i == count) 267 return -1; 268 } 269 270 return i; 271 } 272 273 274 // #pragma mark - RunArrays 275 276 277 RunArrays::RunArrays(Journal* journal) 278 : 279 fJournal(journal), 280 fBlockCount(0), 281 fArrays(), 282 fLastArray(NULL) 283 { 284 } 285 286 287 RunArrays::~RunArrays() 288 { 289 run_array* array; 290 while (fArrays.Pop(&array)) 291 free(array); 292 } 293 294 295 bool 296 RunArrays::_ContainsRun(block_run& run) 297 { 298 for (int32 i = 0; i < CountArrays(); i++) { 299 run_array* array = ArrayAt(i); 300 301 for (int32 j = 0; j < array->CountRuns(); j++) { 302 block_run& arrayRun = array->runs[j]; 303 if (run.AllocationGroup() != arrayRun.AllocationGroup()) 304 continue; 305 306 if (run.Start() >= arrayRun.Start() 307 && run.Start() + run.Length() 308 <= arrayRun.Start() + arrayRun.Length()) 309 return true; 310 } 311 } 312 313 return false; 314 } 315 316 317 /*! Adds the specified block_run into the array. 318 Note: it doesn't support overlapping - it must only be used 319 with block_runs of length 1! 320 */ 321 bool 322 RunArrays::_AddRun(block_run& run) 323 { 324 ASSERT(run.length == 1); 325 326 // Be's BFS log replay routine can only deal with block_runs of size 1 327 // A pity, isn't it? Too sad we have to be compatible. 328 329 if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns()) 330 return false; 331 332 fLastArray->Insert(run); 333 fBlockCount++; 334 return true; 335 } 336 337 338 status_t 339 RunArrays::_AddArray() 340 { 341 int32 blockSize = fJournal->GetVolume()->BlockSize(); 342 343 run_array* array = (run_array*)malloc(blockSize); 344 if (array == NULL) 345 return B_NO_MEMORY; 346 347 if (fArrays.Push(array) != B_OK) { 348 free(array); 349 return B_NO_MEMORY; 350 } 351 352 array->Init(blockSize); 353 fLastArray = array; 354 return B_OK; 355 } 356 357 358 status_t 359 RunArrays::Insert(off_t blockNumber) 360 { 361 Volume* volume = fJournal->GetVolume(); 362 block_run run = volume->ToBlockRun(blockNumber); 363 364 if (fLastArray != NULL) { 365 // check if the block is already in the array 366 if (_ContainsRun(run)) 367 return B_OK; 368 } 369 370 // insert block into array 371 372 if (!_AddRun(run)) { 373 // array is full 374 if (_AddArray() != B_OK || !_AddRun(run)) 375 return B_NO_MEMORY; 376 } 377 378 return B_OK; 379 } 380 381 382 int32 383 RunArrays::MaxArrayLength() 384 { 385 int32 max = 0; 386 for (int32 i = 0; i < CountArrays(); i++) { 387 if (ArrayAt(i)->CountRuns() > max) 388 max = ArrayAt(i)->CountRuns(); 389 } 390 391 return max; 392 } 393 394 395 // #pragma mark - Journal 396 397 398 Journal::Journal(Volume* volume) 399 : 400 fVolume(volume), 401 fOwner(NULL), 402 fLogSize(volume->Log().Length()), 403 fMaxTransactionSize(fLogSize / 2 - 5), 404 fUsed(0), 405 fUnwrittenTransactions(0), 406 fHasSubtransaction(false), 407 fSeparateSubTransactions(false) 408 { 409 recursive_lock_init(&fLock, "bfs journal"); 410 mutex_init(&fEntriesLock, "bfs journal entries"); 411 } 412 413 414 Journal::~Journal() 415 { 416 FlushLogAndBlocks(); 417 418 recursive_lock_destroy(&fLock); 419 mutex_destroy(&fEntriesLock); 420 } 421 422 423 status_t 424 Journal::InitCheck() 425 { 426 return B_OK; 427 } 428 429 430 /*! \brief Does a very basic consistency check of the run array. 431 It will check the maximum run count as well as if all of the runs fall 432 within a the volume. 433 */ 434 status_t 435 Journal::_CheckRunArray(const run_array* array) 436 { 437 int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1; 438 // the -1 works around an off-by-one bug in Be's BFS implementation, 439 // same as in run_array::MaxRuns() 440 if (array->MaxRuns() != maxRuns 441 || array->CountRuns() > maxRuns 442 || array->CountRuns() <= 0) { 443 dprintf("run count: %d, array max: %d, max runs: %d\n", 444 (int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns); 445 FATAL(("Log entry has broken header!\n")); 446 return B_ERROR; 447 } 448 449 for (int32 i = 0; i < array->CountRuns(); i++) { 450 if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK) 451 return B_ERROR; 452 } 453 454 PRINT(("Log entry has %ld entries\n", array->CountRuns())); 455 return B_OK; 456 } 457 458 459 /*! Replays an entry in the log. 460 \a _start points to the entry in the log, and will be bumped to the next 461 one if replaying succeeded. 462 */ 463 status_t 464 Journal::_ReplayRunArray(int32* _start) 465 { 466 PRINT(("ReplayRunArray(start = %ld)\n", *_start)); 467 468 off_t logOffset = fVolume->ToBlock(fVolume->Log()); 469 off_t firstBlockNumber = *_start % fLogSize; 470 471 CachedBlock cachedArray(fVolume); 472 473 const run_array* array = (const run_array*)cachedArray.SetTo(logOffset 474 + firstBlockNumber); 475 if (array == NULL) 476 return B_IO_ERROR; 477 478 if (_CheckRunArray(array) < B_OK) 479 return B_BAD_DATA; 480 481 // First pass: check integrity of the blocks in the run array 482 483 CachedBlock cached(fVolume); 484 485 firstBlockNumber = (firstBlockNumber + 1) % fLogSize; 486 off_t blockNumber = firstBlockNumber; 487 int32 blockSize = fVolume->BlockSize(); 488 489 for (int32 index = 0; index < array->CountRuns(); index++) { 490 const block_run& run = array->RunAt(index); 491 492 off_t offset = fVolume->ToOffset(run); 493 for (int32 i = 0; i < run.Length(); i++) { 494 const uint8* data = cached.SetTo(logOffset + blockNumber); 495 if (data == NULL) 496 RETURN_ERROR(B_IO_ERROR); 497 498 // TODO: eventually check other well known offsets, like the 499 // root and index dirs 500 if (offset == 0) { 501 // This log entry writes over the super block - check if 502 // it's valid! 503 if (Volume::CheckSuperBlock(data) != B_OK) { 504 FATAL(("Log contains invalid super block!\n")); 505 RETURN_ERROR(B_BAD_DATA); 506 } 507 } 508 509 blockNumber = (blockNumber + 1) % fLogSize; 510 offset += blockSize; 511 } 512 } 513 514 // Second pass: write back its blocks 515 516 blockNumber = firstBlockNumber; 517 int32 count = 1; 518 519 for (int32 index = 0; index < array->CountRuns(); index++) { 520 const block_run& run = array->RunAt(index); 521 INFORM(("replay block run %u:%u:%u in log at %Ld!\n", 522 (int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber)); 523 524 off_t offset = fVolume->ToOffset(run); 525 for (int32 i = 0; i < run.Length(); i++) { 526 const uint8* data = cached.SetTo(logOffset + blockNumber); 527 if (data == NULL) 528 RETURN_ERROR(B_IO_ERROR); 529 530 ssize_t written = write_pos(fVolume->Device(), offset, data, 531 blockSize); 532 if (written != blockSize) 533 RETURN_ERROR(B_IO_ERROR); 534 535 blockNumber = (blockNumber + 1) % fLogSize; 536 offset += blockSize; 537 count++; 538 } 539 } 540 541 *_start += count; 542 return B_OK; 543 } 544 545 546 /*! Replays all log entries - this will put the disk into a 547 consistent and clean state, if it was not correctly unmounted 548 before. 549 This method is called by Journal::InitCheck() if the log start 550 and end pointer don't match. 551 */ 552 status_t 553 Journal::ReplayLog() 554 { 555 // TODO: this logic won't work whenever the size of the pending transaction 556 // equals the size of the log (happens with the original BFS only) 557 if (fVolume->LogStart() == fVolume->LogEnd()) 558 return B_OK; 559 560 INFORM(("Replay log, disk was not correctly unmounted...\n")); 561 562 if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) { 563 INFORM(("log_start and log_end differ, but disk is marked clean - " 564 "trying to replay log...\n")); 565 } 566 567 if (fVolume->IsReadOnly()) 568 return B_READ_ONLY_DEVICE; 569 570 int32 start = fVolume->LogStart(); 571 int32 lastStart = -1; 572 while (true) { 573 // stop if the log is completely flushed 574 if (start == fVolume->LogEnd()) 575 break; 576 577 if (start == lastStart) { 578 // strange, flushing the log hasn't changed the log_start pointer 579 return B_ERROR; 580 } 581 lastStart = start; 582 583 status_t status = _ReplayRunArray(&start); 584 if (status != B_OK) { 585 FATAL(("replaying log entry from %d failed: %s\n", (int)start, 586 strerror(status))); 587 return B_ERROR; 588 } 589 start = start % fLogSize; 590 } 591 592 PRINT(("replaying worked fine!\n")); 593 fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64( 594 fVolume->LogEnd()); 595 fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd()); 596 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN; 597 598 return fVolume->WriteSuperBlock(); 599 } 600 601 602 /*! This is a callback function that is called by the cache, whenever 603 all blocks of a transaction have been flushed to disk. 604 This lets us keep track of completed transactions, and update 605 the log start pointer as needed. Note, the transactions may not be 606 completed in the order they were written. 607 */ 608 /*static*/ void 609 Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry) 610 { 611 LogEntry* logEntry = (LogEntry*)_logEntry; 612 613 PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry, 614 transactionID)); 615 616 Journal* journal = logEntry->GetJournal(); 617 disk_super_block& superBlock = journal->fVolume->SuperBlock(); 618 bool update = false; 619 620 // Set log_start pointer if possible... 621 622 mutex_lock(&journal->fEntriesLock); 623 624 if (logEntry == journal->fEntries.First()) { 625 LogEntry* next = journal->fEntries.GetNext(logEntry); 626 if (next != NULL) { 627 superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start() 628 % journal->fLogSize); 629 } else { 630 superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64( 631 journal->fVolume->LogEnd()); 632 } 633 634 update = true; 635 } 636 637 T(LogEntry(logEntry, superBlock.LogStart(), false)); 638 639 journal->fUsed -= logEntry->Length(); 640 journal->fEntries.Remove(logEntry); 641 mutex_unlock(&journal->fEntriesLock); 642 643 delete logEntry; 644 645 // update the super block, and change the disk's state, if necessary 646 647 if (update) { 648 if (superBlock.log_start == superBlock.log_end) 649 superBlock.flags = SUPER_BLOCK_DISK_CLEAN; 650 651 status_t status = journal->fVolume->WriteSuperBlock(); 652 if (status != B_OK) { 653 FATAL(("_TransactionWritten: could not write back super block: %s\n", 654 strerror(status))); 655 } 656 657 journal->fVolume->LogStart() = superBlock.LogStart(); 658 } 659 } 660 661 662 /*! Listens to TRANSACTION_IDLE events, and flushes the log when that happens */ 663 /*static*/ void 664 Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal) 665 { 666 // The current transaction seems to be idle - flush it 667 668 Journal* journal = (Journal*)_journal; 669 journal->_FlushLog(false, false); 670 } 671 672 673 /*! Writes the blocks that are part of current transaction into the log, 674 and ends the current transaction. 675 If the current transaction is too large to fit into the log, it will 676 try to detach an existing sub-transaction. 677 */ 678 status_t 679 Journal::_WriteTransactionToLog() 680 { 681 // TODO: in case of a failure, we need a backup plan like writing all 682 // changed blocks back to disk immediately (hello disk corruption!) 683 684 bool detached = false; 685 686 if (_TransactionSize() > fLogSize) { 687 // The current transaction won't fit into the log anymore, try to 688 // detach the current sub-transaction 689 if (_HasSubTransaction() && cache_blocks_in_main_transaction( 690 fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) { 691 detached = true; 692 } else { 693 // TODO: what are our options here? 694 // a) abort the transaction - bad, because all changes are lost 695 // b) carry out the changes, but don't use the log - even worse, 696 // as it potentially creates a corrupted disk. 697 dprintf("transaction too large (%d blocks, %d main, log size %d)!\n", 698 (int)_TransactionSize(), (int)cache_blocks_in_main_transaction( 699 fVolume->BlockCache(), fTransactionID), (int)fLogSize); 700 return B_BUFFER_OVERFLOW; 701 } 702 } 703 704 fHasSubtransaction = false; 705 706 int32 blockShift = fVolume->BlockShift(); 707 off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift; 708 off_t logStart = fVolume->LogEnd() % fLogSize; 709 off_t logPosition = logStart; 710 status_t status; 711 712 // create run_array structures for all changed blocks 713 714 RunArrays runArrays(this); 715 716 off_t blockNumber; 717 long cookie = 0; 718 while (cache_next_block_in_transaction(fVolume->BlockCache(), 719 fTransactionID, detached, &cookie, &blockNumber, NULL, 720 NULL) == B_OK) { 721 status = runArrays.Insert(blockNumber); 722 if (status < B_OK) { 723 FATAL(("filling log entry failed!")); 724 return status; 725 } 726 } 727 728 if (runArrays.CountBlocks() == 0) { 729 // nothing has changed during this transaction 730 if (detached) { 731 fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(), 732 fTransactionID, NULL, NULL); 733 fUnwrittenTransactions = 1; 734 } else { 735 cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL, 736 NULL); 737 fUnwrittenTransactions = 0; 738 } 739 return B_OK; 740 } 741 742 // If necessary, flush the log, so that we have enough space for this 743 // transaction 744 if (runArrays.LogEntryLength() > FreeLogBlocks()) { 745 cache_sync_transaction(fVolume->BlockCache(), fTransactionID); 746 if (runArrays.LogEntryLength() > FreeLogBlocks()) { 747 panic("no space in log after sync (%ld for %ld blocks)!", 748 (long)FreeLogBlocks(), (long)runArrays.LogEntryLength()); 749 } 750 } 751 752 // Write log entries to disk 753 754 int32 maxVecs = runArrays.MaxArrayLength() + 1; 755 // one extra for the index block 756 757 iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs); 758 if (vecs == NULL) { 759 // TODO: write back log entries directly? 760 return B_NO_MEMORY; 761 } 762 763 for (int32 k = 0; k < runArrays.CountArrays(); k++) { 764 run_array* array = runArrays.ArrayAt(k); 765 int32 index = 0, count = 1; 766 int32 wrap = fLogSize - logStart; 767 768 add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize()); 769 770 // add block runs 771 772 for (int32 i = 0; i < array->CountRuns(); i++) { 773 const block_run& run = array->RunAt(i); 774 off_t blockNumber = fVolume->ToBlock(run); 775 776 for (int32 j = 0; j < run.Length(); j++) { 777 if (count >= wrap) { 778 // We need to write back the first half of the entry 779 // directly as the log wraps around 780 if (writev_pos(fVolume->Device(), logOffset 781 + (logStart << blockShift), vecs, index) < 0) 782 FATAL(("could not write log area!\n")); 783 784 logPosition = logStart + count; 785 logStart = 0; 786 wrap = fLogSize; 787 count = 0; 788 index = 0; 789 } 790 791 // make blocks available in the cache 792 const void* data = block_cache_get(fVolume->BlockCache(), 793 blockNumber + j); 794 if (data == NULL) { 795 free(vecs); 796 return B_IO_ERROR; 797 } 798 799 add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize()); 800 count++; 801 } 802 } 803 804 // write back the rest of the log entry 805 if (count > 0) { 806 logPosition = logStart + count; 807 if (writev_pos(fVolume->Device(), logOffset 808 + (logStart << blockShift), vecs, index) < 0) 809 FATAL(("could not write log area: %s!\n", strerror(errno))); 810 } 811 812 // release blocks again 813 for (int32 i = 0; i < array->CountRuns(); i++) { 814 const block_run& run = array->RunAt(i); 815 off_t blockNumber = fVolume->ToBlock(run); 816 817 for (int32 j = 0; j < run.Length(); j++) { 818 block_cache_put(fVolume->BlockCache(), blockNumber + j); 819 } 820 } 821 822 logStart = logPosition % fLogSize; 823 } 824 825 free(vecs); 826 827 LogEntry* logEntry = new LogEntry(this, fVolume->LogEnd(), 828 runArrays.LogEntryLength()); 829 if (logEntry == NULL) { 830 FATAL(("no memory to allocate log entries!")); 831 return B_NO_MEMORY; 832 } 833 834 #ifdef BFS_DEBUGGER_COMMANDS 835 logEntry->SetTransactionID(fTransactionID); 836 #endif 837 838 // Update the log end pointer in the super block 839 840 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY; 841 fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition); 842 843 status = fVolume->WriteSuperBlock(); 844 845 fVolume->LogEnd() = logPosition; 846 T(LogEntry(logEntry, fVolume->LogEnd(), true)); 847 848 // We need to flush the drives own cache here to ensure 849 // disk consistency. 850 // If that call fails, we can't do anything about it anyway 851 ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE); 852 853 // at this point, we can finally end the transaction - we're in 854 // a guaranteed valid state 855 856 mutex_lock(&fEntriesLock); 857 fEntries.Add(logEntry); 858 fUsed += logEntry->Length(); 859 mutex_unlock(&fEntriesLock); 860 861 if (detached) { 862 fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(), 863 fTransactionID, _TransactionWritten, logEntry); 864 fUnwrittenTransactions = 1; 865 } else { 866 cache_end_transaction(fVolume->BlockCache(), fTransactionID, 867 _TransactionWritten, logEntry); 868 fUnwrittenTransactions = 0; 869 } 870 871 return status; 872 } 873 874 875 /*! Flushes the current log entry to disk. If \a flushBlocks is \c true it will 876 also write back all dirty blocks for this volume. 877 */ 878 status_t 879 Journal::_FlushLog(bool canWait, bool flushBlocks) 880 { 881 status_t status = canWait ? recursive_lock_lock(&fLock) 882 : recursive_lock_trylock(&fLock); 883 if (status != B_OK) 884 return status; 885 886 if (recursive_lock_get_recursion(&fLock) > 1) { 887 // whoa, FlushLogAndBlocks() was called from inside a transaction 888 recursive_lock_unlock(&fLock); 889 return B_OK; 890 } 891 892 // write the current log entry to disk 893 894 if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) { 895 status = _WriteTransactionToLog(); 896 if (status < B_OK) 897 FATAL(("writing current log entry failed: %s\n", strerror(status))); 898 } 899 900 if (flushBlocks) 901 status = fVolume->FlushDevice(); 902 903 recursive_lock_unlock(&fLock); 904 return status; 905 } 906 907 908 /*! Flushes the current log entry to disk, and also writes back all dirty 909 blocks for this volume (completing all open transactions). 910 */ 911 status_t 912 Journal::FlushLogAndBlocks() 913 { 914 return _FlushLog(true, true); 915 } 916 917 918 status_t 919 Journal::Lock(Transaction* owner, bool separateSubTransactions) 920 { 921 status_t status = recursive_lock_lock(&fLock); 922 if (status != B_OK) 923 return status; 924 925 if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) { 926 // we'll just use the current transaction again 927 return B_OK; 928 } 929 930 if (separateSubTransactions) 931 fSeparateSubTransactions = true; 932 933 fOwner = owner; 934 935 // TODO: we need a way to find out how big the current transaction is; 936 // we need to be able to either detach the latest sub transaction on 937 // demand, as well as having some kind of fall back plan in case the 938 // sub transaction itself grows bigger than the log. 939 // For that, it would be nice to have some call-back interface in the 940 // cache transaction API... 941 942 if (fOwner != NULL) { 943 if (fUnwrittenTransactions > 0) { 944 // start a sub transaction 945 cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID); 946 fHasSubtransaction = true; 947 } else 948 fTransactionID = cache_start_transaction(fVolume->BlockCache()); 949 950 if (fTransactionID < B_OK) { 951 recursive_lock_unlock(&fLock); 952 return fTransactionID; 953 } 954 955 cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID, 956 TRANSACTION_IDLE, _TransactionIdle, this); 957 } 958 return B_OK; 959 } 960 961 962 void 963 Journal::Unlock(Transaction* owner, bool success) 964 { 965 if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) { 966 // we only end the transaction if we would really unlock it 967 // TODO: what about failing transactions that do not unlock? 968 // (they must make the parent fail, too) 969 if (fOwner != NULL) 970 _TransactionDone(success); 971 972 fTimestamp = system_time(); 973 fOwner = NULL; 974 975 if (fSeparateSubTransactions 976 && recursive_lock_get_recursion(&fLock) == 1) 977 fSeparateSubTransactions = false; 978 } 979 980 recursive_lock_unlock(&fLock); 981 } 982 983 984 uint32 985 Journal::_TransactionSize() const 986 { 987 int32 count = cache_blocks_in_transaction(fVolume->BlockCache(), 988 fTransactionID); 989 if (count <= 0) 990 return 0; 991 992 // take the number of array blocks in this transaction into account 993 uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()); 994 uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns; 995 return count + arrayBlocks; 996 } 997 998 999 status_t 1000 Journal::_TransactionDone(bool success) 1001 { 1002 if (!success) { 1003 if (_HasSubTransaction()) 1004 cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID); 1005 else 1006 cache_abort_transaction(fVolume->BlockCache(), fTransactionID); 1007 1008 return B_OK; 1009 } 1010 1011 // Up to a maximum size, we will just batch several 1012 // transactions together to improve speed 1013 uint32 size = _TransactionSize(); 1014 if (size < fMaxTransactionSize) { 1015 // Flush the log from time to time, so that we have enough space 1016 // for this transaction 1017 if (size > FreeLogBlocks()) 1018 cache_sync_transaction(fVolume->BlockCache(), fTransactionID); 1019 1020 fUnwrittenTransactions++; 1021 return B_OK; 1022 } 1023 1024 return _WriteTransactionToLog(); 1025 } 1026 1027 1028 // #pragma mark - debugger commands 1029 1030 1031 #ifdef BFS_DEBUGGER_COMMANDS 1032 1033 void 1034 Journal::Dump() 1035 { 1036 kprintf("log start: %ld\n", fVolume->LogStart()); 1037 kprintf("log end: %ld\n", fVolume->LogEnd()); 1038 kprintf("entries:\n"); 1039 kprintf(" address id start length\n"); 1040 1041 LogEntryList::Iterator iterator = fEntries.GetIterator(); 1042 1043 while (iterator.HasNext()) { 1044 LogEntry* entry = iterator.Next(); 1045 1046 kprintf(" %p %6ld %6lu %6lu\n", entry, entry->TransactionID(), 1047 entry->Start(), entry->Length()); 1048 } 1049 } 1050 1051 1052 int 1053 dump_journal(int argc, char** argv) 1054 { 1055 if (argc != 2 || !strcmp(argv[1], "--help")) { 1056 kprintf("usage: %s <ptr-to-volume>\n", argv[0]); 1057 return 0; 1058 } 1059 1060 Volume* volume = (Volume*)parse_expression(argv[1]); 1061 Journal* journal = volume->GetJournal(0); 1062 1063 journal->Dump(); 1064 return 0; 1065 } 1066 1067 #endif // BFS_DEBUGGER_COMMANDS 1068 1069 1070 // #pragma mark - Transaction 1071 1072 1073 status_t 1074 Transaction::Start(Volume* volume, off_t refBlock) 1075 { 1076 // has it already been started? 1077 if (fJournal != NULL) 1078 return B_OK; 1079 1080 fJournal = volume->GetJournal(refBlock); 1081 if (fJournal != NULL && fJournal->Lock(this, false) == B_OK) 1082 return B_OK; 1083 1084 fJournal = NULL; 1085 return B_ERROR; 1086 } 1087 1088 1089 /*! Adds an inode to this transaction. This means that the inode will be write 1090 locked until the transaction ended. 1091 To ensure that the inode will stay valid until that point, an extra reference 1092 is acquired to it as long as this transaction stays active. 1093 */ 1094 void 1095 Transaction::AddInode(Inode* inode) 1096 { 1097 if (fJournal == NULL) 1098 panic("Transaction is not running!"); 1099 1100 // These flags can only change while holding the transaction lock 1101 if ((inode->Flags() & INODE_IN_TRANSACTION) != 0) 1102 return; 1103 1104 // We share the same list link with the removed list, so we have to remove 1105 // the inode from that list here (and add it back when we no longer need it) 1106 if ((inode->Flags() & INODE_DELETED) != 0) 1107 GetVolume()->RemovedInodes().Remove(inode); 1108 1109 if (!GetVolume()->IsInitializing()) 1110 acquire_vnode(GetVolume()->FSVolume(), inode->ID()); 1111 1112 rw_lock_write_lock(&inode->Lock()); 1113 fLockedInodes.Add(inode); 1114 inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION); 1115 } 1116 1117 1118 void 1119 Transaction::RemoveInode(Inode* inode) 1120 { 1121 if (fJournal == NULL) 1122 panic("Transaction is not running!"); 1123 1124 inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION); 1125 fLockedInodes.Remove(inode); 1126 rw_lock_write_unlock(&inode->Lock()); 1127 1128 // See AddInode() why we do this here 1129 if ((inode->Flags() & INODE_DELETED) != 0) 1130 GetVolume()->RemovedInodes().Add(inode); 1131 1132 if (!GetVolume()->IsInitializing()) 1133 put_vnode(GetVolume()->FSVolume(), inode->ID()); 1134 } 1135 1136 1137 void 1138 Transaction::_UnlockInodes() 1139 { 1140 while (Inode* inode = fLockedInodes.RemoveHead()) { 1141 inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION); 1142 rw_lock_write_unlock(&inode->Lock()); 1143 1144 // See AddInode() why we do this here 1145 if ((inode->Flags() & INODE_DELETED) != 0) 1146 GetVolume()->RemovedInodes().Add(inode); 1147 1148 if (!GetVolume()->IsInitializing()) 1149 put_vnode(GetVolume()->FSVolume(), inode->ID()); 1150 } 1151 } 1152 1153