1 /* 2 * Copyright 2001-2020, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 7 //! Transaction and logging 8 9 10 #include <StackOrHeapArray.h> 11 12 #include "Journal.h" 13 14 #include "Debug.h" 15 #include "Inode.h" 16 17 18 struct run_array { 19 int32 count; 20 int32 max_runs; 21 block_run runs[0]; 22 23 void Init(int32 blockSize); 24 void Insert(block_run& run); 25 26 int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); } 27 int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; } 28 // that -1 accounts for an off-by-one error in Be's BFS implementation 29 const block_run& RunAt(int32 i) const { return runs[i]; } 30 31 static int32 MaxRuns(int32 blockSize); 32 33 private: 34 static int _Compare(block_run& a, block_run& b); 35 int32 _FindInsertionIndex(block_run& run); 36 }; 37 38 class RunArrays { 39 public: 40 RunArrays(Journal* journal); 41 ~RunArrays(); 42 43 status_t Insert(off_t blockNumber); 44 45 run_array* ArrayAt(int32 i) { return fArrays.Array()[i]; } 46 int32 CountArrays() const { return fArrays.CountItems(); } 47 48 uint32 CountBlocks() const { return fBlockCount; } 49 uint32 LogEntryLength() const 50 { return CountBlocks() + CountArrays(); } 51 52 int32 MaxArrayLength(); 53 54 private: 55 status_t _AddArray(); 56 bool _ContainsRun(block_run& run); 57 bool _AddRun(block_run& run); 58 59 Journal* fJournal; 60 uint32 fBlockCount; 61 Stack<run_array*> fArrays; 62 run_array* fLastArray; 63 }; 64 65 class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> { 66 public: 67 LogEntry(Journal* journal, uint32 logStart, 68 uint32 length); 69 ~LogEntry(); 70 71 uint32 Start() const { return fStart; } 72 uint32 Length() const { return fLength; } 73 74 #ifdef BFS_DEBUGGER_COMMANDS 75 void SetTransactionID(int32 id) { fTransactionID = id; } 76 int32 TransactionID() const { return fTransactionID; } 77 #endif 78 79 Journal* GetJournal() { return fJournal; } 80 81 private: 82 Journal* fJournal; 83 uint32 fStart; 84 uint32 fLength; 85 #ifdef BFS_DEBUGGER_COMMANDS 86 int32 fTransactionID; 87 #endif 88 }; 89 90 91 #if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE) 92 namespace BFSJournalTracing { 93 94 class LogEntry : public AbstractTraceEntry { 95 public: 96 LogEntry(::LogEntry* entry, off_t logPosition, bool started) 97 : 98 fEntry(entry), 99 #ifdef BFS_DEBUGGER_COMMANDS 100 fTransactionID(entry->TransactionID()), 101 #endif 102 fStart(entry->Start()), 103 fLength(entry->Length()), 104 fLogPosition(logPosition), 105 fStarted(started) 106 { 107 Initialized(); 108 } 109 110 virtual void AddDump(TraceOutput& out) 111 { 112 #ifdef BFS_DEBUGGER_COMMANDS 113 out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s " 114 "%lu\n", fStarted ? "Started" : "Written", fEntry, 115 fTransactionID, fStart, fLength, 116 fStarted ? "end" : "start", fLogPosition); 117 #else 118 out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n", 119 fStarted ? "Started" : "Written", fEntry, fStart, fLength, 120 fStarted ? "end" : "start", fLogPosition); 121 #endif 122 } 123 124 private: 125 ::LogEntry* fEntry; 126 #ifdef BFS_DEBUGGER_COMMANDS 127 int32 fTransactionID; 128 #endif 129 uint32 fStart; 130 uint32 fLength; 131 uint32 fLogPosition; 132 bool fStarted; 133 }; 134 135 } // namespace BFSJournalTracing 136 137 # define T(x) new(std::nothrow) BFSJournalTracing::x; 138 #else 139 # define T(x) ; 140 #endif 141 142 143 // #pragma mark - 144 145 146 static void 147 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address, 148 size_t size) 149 { 150 if (index > 0 && (addr_t)vecs[index - 1].iov_base 151 + vecs[index - 1].iov_len == (addr_t)address) { 152 // the iovec can be combined with the previous one 153 vecs[index - 1].iov_len += size; 154 return; 155 } 156 157 if (index == max) 158 panic("no more space for iovecs!"); 159 160 // we need to start a new iovec 161 vecs[index].iov_base = const_cast<void*>(address); 162 vecs[index].iov_len = size; 163 index++; 164 } 165 166 167 // #pragma mark - LogEntry 168 169 170 LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length) 171 : 172 fJournal(journal), 173 fStart(start), 174 fLength(length) 175 { 176 } 177 178 179 LogEntry::~LogEntry() 180 { 181 } 182 183 184 // #pragma mark - run_array 185 186 187 /*! The run_array's size equals the block size of the BFS volume, so we 188 cannot use a (non-overridden) new. 189 This makes a freshly allocated run_array ready to run. 190 */ 191 void 192 run_array::Init(int32 blockSize) 193 { 194 memset(this, 0, blockSize); 195 count = 0; 196 max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize)); 197 } 198 199 200 /*! Inserts the block_run into the array. You will have to make sure the 201 array is large enough to contain the entry before calling this function. 202 */ 203 void 204 run_array::Insert(block_run& run) 205 { 206 int32 index = _FindInsertionIndex(run); 207 if (index == -1) { 208 // add to the end 209 runs[CountRuns()] = run; 210 } else { 211 // insert at index 212 memmove(&runs[index + 1], &runs[index], 213 (CountRuns() - index) * sizeof(off_t)); 214 runs[index] = run; 215 } 216 217 count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1); 218 } 219 220 221 /*static*/ int32 222 run_array::MaxRuns(int32 blockSize) 223 { 224 // For whatever reason, BFS restricts the maximum array size 225 uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run); 226 if (maxCount < 128) 227 return maxCount; 228 229 return 127; 230 } 231 232 233 /*static*/ int 234 run_array::_Compare(block_run& a, block_run& b) 235 { 236 int cmp = a.AllocationGroup() - b.AllocationGroup(); 237 if (cmp == 0) 238 return a.Start() - b.Start(); 239 240 return cmp; 241 } 242 243 244 int32 245 run_array::_FindInsertionIndex(block_run& run) 246 { 247 int32 min = 0, max = CountRuns() - 1; 248 int32 i = 0; 249 if (max >= 8) { 250 while (min <= max) { 251 i = (min + max) / 2; 252 253 int cmp = _Compare(runs[i], run); 254 if (cmp < 0) 255 min = i + 1; 256 else if (cmp > 0) 257 max = i - 1; 258 else 259 return -1; 260 } 261 262 if (_Compare(runs[i], run) < 0) 263 i++; 264 } else { 265 for (; i <= max; i++) { 266 if (_Compare(runs[i], run) > 0) 267 break; 268 } 269 if (i == count) 270 return -1; 271 } 272 273 return i; 274 } 275 276 277 // #pragma mark - RunArrays 278 279 280 RunArrays::RunArrays(Journal* journal) 281 : 282 fJournal(journal), 283 fBlockCount(0), 284 fArrays(), 285 fLastArray(NULL) 286 { 287 } 288 289 290 RunArrays::~RunArrays() 291 { 292 run_array* array; 293 while (fArrays.Pop(&array)) 294 free(array); 295 } 296 297 298 bool 299 RunArrays::_ContainsRun(block_run& run) 300 { 301 for (int32 i = 0; i < CountArrays(); i++) { 302 run_array* array = ArrayAt(i); 303 304 for (int32 j = 0; j < array->CountRuns(); j++) { 305 block_run& arrayRun = array->runs[j]; 306 if (run.AllocationGroup() != arrayRun.AllocationGroup()) 307 continue; 308 309 if (run.Start() >= arrayRun.Start() 310 && run.Start() + run.Length() 311 <= arrayRun.Start() + arrayRun.Length()) 312 return true; 313 } 314 } 315 316 return false; 317 } 318 319 320 /*! Adds the specified block_run into the array. 321 Note: it doesn't support overlapping - it must only be used 322 with block_runs of length 1! 323 */ 324 bool 325 RunArrays::_AddRun(block_run& run) 326 { 327 ASSERT(run.length == 1); 328 329 // Be's BFS log replay routine can only deal with block_runs of size 1 330 // A pity, isn't it? Too sad we have to be compatible. 331 332 if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns()) 333 return false; 334 335 fLastArray->Insert(run); 336 fBlockCount++; 337 return true; 338 } 339 340 341 status_t 342 RunArrays::_AddArray() 343 { 344 int32 blockSize = fJournal->GetVolume()->BlockSize(); 345 346 run_array* array = (run_array*)malloc(blockSize); 347 if (array == NULL) 348 return B_NO_MEMORY; 349 350 if (fArrays.Push(array) != B_OK) { 351 free(array); 352 return B_NO_MEMORY; 353 } 354 355 array->Init(blockSize); 356 fLastArray = array; 357 return B_OK; 358 } 359 360 361 status_t 362 RunArrays::Insert(off_t blockNumber) 363 { 364 Volume* volume = fJournal->GetVolume(); 365 block_run run = volume->ToBlockRun(blockNumber); 366 367 if (fLastArray != NULL) { 368 // check if the block is already in the array 369 if (_ContainsRun(run)) 370 return B_OK; 371 } 372 373 // insert block into array 374 375 if (!_AddRun(run)) { 376 // array is full 377 if (_AddArray() != B_OK || !_AddRun(run)) 378 return B_NO_MEMORY; 379 } 380 381 return B_OK; 382 } 383 384 385 int32 386 RunArrays::MaxArrayLength() 387 { 388 int32 max = 0; 389 for (int32 i = 0; i < CountArrays(); i++) { 390 if (ArrayAt(i)->CountRuns() > max) 391 max = ArrayAt(i)->CountRuns(); 392 } 393 394 return max; 395 } 396 397 398 // #pragma mark - Journal 399 400 401 Journal::Journal(Volume* volume) 402 : 403 fVolume(volume), 404 fOwner(NULL), 405 fLogSize(volume->Log().Length()), 406 fMaxTransactionSize(fLogSize / 2 - 5), 407 fUsed(0), 408 fUnwrittenTransactions(0), 409 fHasSubtransaction(false), 410 fSeparateSubTransactions(false) 411 { 412 recursive_lock_init(&fLock, "bfs journal"); 413 mutex_init(&fEntriesLock, "bfs journal entries"); 414 415 fLogFlusherSem = create_sem(0, "bfs log flusher"); 416 fLogFlusher = spawn_kernel_thread(&Journal::_LogFlusher, "bfs log flusher", 417 B_NORMAL_PRIORITY, this); 418 if (fLogFlusher > 0) 419 resume_thread(fLogFlusher); 420 } 421 422 423 Journal::~Journal() 424 { 425 FlushLogAndBlocks(); 426 427 recursive_lock_destroy(&fLock); 428 mutex_destroy(&fEntriesLock); 429 430 sem_id logFlusher = fLogFlusherSem; 431 fLogFlusherSem = -1; 432 delete_sem(logFlusher); 433 wait_for_thread(fLogFlusher, NULL); 434 } 435 436 437 status_t 438 Journal::InitCheck() 439 { 440 return B_OK; 441 } 442 443 444 /*! \brief Does a very basic consistency check of the run array. 445 It will check the maximum run count as well as if all of the runs fall 446 within a the volume. 447 */ 448 status_t 449 Journal::_CheckRunArray(const run_array* array) 450 { 451 int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1; 452 // the -1 works around an off-by-one bug in Be's BFS implementation, 453 // same as in run_array::MaxRuns() 454 if (array->MaxRuns() != maxRuns 455 || array->CountRuns() > maxRuns 456 || array->CountRuns() <= 0) { 457 dprintf("run count: %d, array max: %d, max runs: %d\n", 458 (int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns); 459 FATAL(("Log entry has broken header!\n")); 460 return B_ERROR; 461 } 462 463 for (int32 i = 0; i < array->CountRuns(); i++) { 464 if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK) 465 return B_ERROR; 466 } 467 468 PRINT(("Log entry has %" B_PRId32 " entries\n", array->CountRuns())); 469 return B_OK; 470 } 471 472 473 /*! Replays an entry in the log. 474 \a _start points to the entry in the log, and will be bumped to the next 475 one if replaying succeeded. 476 */ 477 status_t 478 Journal::_ReplayRunArray(int32* _start) 479 { 480 PRINT(("ReplayRunArray(start = %" B_PRId32 ")\n", *_start)); 481 482 off_t logOffset = fVolume->ToBlock(fVolume->Log()); 483 off_t firstBlockNumber = *_start % fLogSize; 484 485 CachedBlock cachedArray(fVolume); 486 487 status_t status = cachedArray.SetTo(logOffset + firstBlockNumber); 488 if (status != B_OK) 489 return status; 490 491 const run_array* array = (const run_array*)cachedArray.Block(); 492 if (_CheckRunArray(array) < B_OK) 493 return B_BAD_DATA; 494 495 // First pass: check integrity of the blocks in the run array 496 497 CachedBlock cached(fVolume); 498 499 firstBlockNumber = (firstBlockNumber + 1) % fLogSize; 500 off_t blockNumber = firstBlockNumber; 501 int32 blockSize = fVolume->BlockSize(); 502 503 for (int32 index = 0; index < array->CountRuns(); index++) { 504 const block_run& run = array->RunAt(index); 505 506 off_t offset = fVolume->ToOffset(run); 507 for (int32 i = 0; i < run.Length(); i++) { 508 status = cached.SetTo(logOffset + blockNumber); 509 if (status != B_OK) 510 RETURN_ERROR(status); 511 512 // TODO: eventually check other well known offsets, like the 513 // root and index dirs 514 if (offset == 0) { 515 // This log entry writes over the superblock - check if 516 // it's valid! 517 if (Volume::CheckSuperBlock(cached.Block()) != B_OK) { 518 FATAL(("Log contains invalid superblock!\n")); 519 RETURN_ERROR(B_BAD_DATA); 520 } 521 } 522 523 blockNumber = (blockNumber + 1) % fLogSize; 524 offset += blockSize; 525 } 526 } 527 528 // Second pass: write back its blocks 529 530 blockNumber = firstBlockNumber; 531 int32 count = 1; 532 533 for (int32 index = 0; index < array->CountRuns(); index++) { 534 const block_run& run = array->RunAt(index); 535 INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n", 536 (int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber)); 537 538 off_t offset = fVolume->ToOffset(run); 539 for (int32 i = 0; i < run.Length(); i++) { 540 status = cached.SetTo(logOffset + blockNumber); 541 if (status != B_OK) 542 RETURN_ERROR(status); 543 544 ssize_t written = write_pos(fVolume->Device(), offset, 545 cached.Block(), blockSize); 546 if (written != blockSize) 547 RETURN_ERROR(B_IO_ERROR); 548 549 blockNumber = (blockNumber + 1) % fLogSize; 550 offset += blockSize; 551 count++; 552 } 553 } 554 555 *_start += count; 556 return B_OK; 557 } 558 559 560 /*! Replays all log entries - this will put the disk into a 561 consistent and clean state, if it was not correctly unmounted 562 before. 563 This method is called by Journal::InitCheck() if the log start 564 and end pointer don't match. 565 */ 566 status_t 567 Journal::ReplayLog() 568 { 569 // TODO: this logic won't work whenever the size of the pending transaction 570 // equals the size of the log (happens with the original BFS only) 571 if (fVolume->LogStart() == fVolume->LogEnd()) 572 return B_OK; 573 574 INFORM(("Replay log, disk was not correctly unmounted...\n")); 575 576 if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) { 577 INFORM(("log_start and log_end differ, but disk is marked clean - " 578 "trying to replay log...\n")); 579 } 580 581 if (fVolume->IsReadOnly()) 582 return B_READ_ONLY_DEVICE; 583 584 int32 start = fVolume->LogStart(); 585 int32 lastStart = -1; 586 while (true) { 587 // stop if the log is completely flushed 588 if (start == fVolume->LogEnd()) 589 break; 590 591 if (start == lastStart) { 592 // strange, flushing the log hasn't changed the log_start pointer 593 return B_ERROR; 594 } 595 lastStart = start; 596 597 status_t status = _ReplayRunArray(&start); 598 if (status != B_OK) { 599 FATAL(("replaying log entry from %d failed: %s\n", (int)start, 600 strerror(status))); 601 return B_ERROR; 602 } 603 start = start % fLogSize; 604 } 605 606 PRINT(("replaying worked fine!\n")); 607 fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64( 608 fVolume->LogEnd()); 609 fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd()); 610 fVolume->SuperBlock().flags = HOST_ENDIAN_TO_BFS_INT32( 611 SUPER_BLOCK_DISK_CLEAN); 612 613 return fVolume->WriteSuperBlock(); 614 } 615 616 617 size_t 618 Journal::CurrentTransactionSize() const 619 { 620 if (_HasSubTransaction()) { 621 return cache_blocks_in_sub_transaction(fVolume->BlockCache(), 622 fTransactionID); 623 } 624 625 return cache_blocks_in_main_transaction(fVolume->BlockCache(), 626 fTransactionID); 627 } 628 629 630 bool 631 Journal::CurrentTransactionTooLarge() const 632 { 633 return CurrentTransactionSize() > fLogSize; 634 } 635 636 637 /*! This is a callback function that is called by the cache, whenever 638 all blocks of a transaction have been flushed to disk. 639 This lets us keep track of completed transactions, and update 640 the log start pointer as needed. Note, the transactions may not be 641 completed in the order they were written. 642 */ 643 /*static*/ void 644 Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry) 645 { 646 LogEntry* logEntry = (LogEntry*)_logEntry; 647 648 PRINT(("Log entry %p has been finished, transaction ID = %" B_PRId32 "\n", 649 logEntry, transactionID)); 650 651 Journal* journal = logEntry->GetJournal(); 652 disk_super_block& superBlock = journal->fVolume->SuperBlock(); 653 bool update = false; 654 655 // Set log_start pointer if possible... 656 657 mutex_lock(&journal->fEntriesLock); 658 659 if (logEntry == journal->fEntries.First()) { 660 LogEntry* next = journal->fEntries.GetNext(logEntry); 661 if (next != NULL) { 662 superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start() 663 % journal->fLogSize); 664 } else { 665 superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64( 666 journal->fVolume->LogEnd()); 667 } 668 669 update = true; 670 } 671 672 T(LogEntry(logEntry, superBlock.LogStart(), false)); 673 674 journal->fUsed -= logEntry->Length(); 675 journal->fEntries.Remove(logEntry); 676 mutex_unlock(&journal->fEntriesLock); 677 678 delete logEntry; 679 680 // update the superblock, and change the disk's state, if necessary 681 682 if (update) { 683 if (superBlock.log_start == superBlock.log_end) 684 superBlock.flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN); 685 686 status_t status = journal->fVolume->WriteSuperBlock(); 687 if (status != B_OK) { 688 FATAL(("_TransactionWritten: could not write back superblock: %s\n", 689 strerror(status))); 690 } 691 692 journal->fVolume->LogStart() = superBlock.LogStart(); 693 } 694 } 695 696 697 /*! Listens to TRANSACTION_IDLE events, and flushes the log when that happens */ 698 /*static*/ void 699 Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal) 700 { 701 // The current transaction seems to be idle - flush it. (We can't do this 702 // in this thread, as flushing the log can produce new transaction events.) 703 Journal* journal = (Journal*)_journal; 704 release_sem(journal->fLogFlusherSem); 705 } 706 707 708 /*static*/ status_t 709 Journal::_LogFlusher(void* _journal) 710 { 711 Journal* journal = (Journal*)_journal; 712 while (journal->fLogFlusherSem >= 0) { 713 if (acquire_sem(journal->fLogFlusherSem) != B_OK) 714 continue; 715 716 journal->_FlushLog(false, false); 717 } 718 return B_OK; 719 } 720 721 722 /*! Writes the blocks that are part of current transaction into the log, 723 and ends the current transaction. 724 If the current transaction is too large to fit into the log, it will 725 try to detach an existing sub-transaction. 726 */ 727 status_t 728 Journal::_WriteTransactionToLog() 729 { 730 // TODO: in case of a failure, we need a backup plan like writing all 731 // changed blocks back to disk immediately (hello disk corruption!) 732 733 bool detached = false; 734 735 if (_TransactionSize() > fLogSize) { 736 // The current transaction won't fit into the log anymore, try to 737 // detach the current sub-transaction 738 if (_HasSubTransaction() && cache_blocks_in_main_transaction( 739 fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) { 740 detached = true; 741 } else { 742 // We created a transaction larger than one we can write back to 743 // disk - the only option we have (besides risking disk corruption 744 // by writing it back anyway), is to let it fail. 745 dprintf("transaction too large (%d blocks, log size %d)!\n", 746 (int)_TransactionSize(), (int)fLogSize); 747 return B_BUFFER_OVERFLOW; 748 } 749 } 750 751 fHasSubtransaction = false; 752 753 int32 blockShift = fVolume->BlockShift(); 754 off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift; 755 off_t logStart = fVolume->LogEnd() % fLogSize; 756 off_t logPosition = logStart; 757 status_t status; 758 759 // create run_array structures for all changed blocks 760 761 RunArrays runArrays(this); 762 763 off_t blockNumber; 764 long cookie = 0; 765 while (cache_next_block_in_transaction(fVolume->BlockCache(), 766 fTransactionID, detached, &cookie, &blockNumber, NULL, 767 NULL) == B_OK) { 768 status = runArrays.Insert(blockNumber); 769 if (status < B_OK) { 770 FATAL(("filling log entry failed!")); 771 return status; 772 } 773 } 774 775 if (runArrays.CountBlocks() == 0) { 776 // nothing has changed during this transaction 777 if (detached) { 778 fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(), 779 fTransactionID, NULL, NULL); 780 fUnwrittenTransactions = 1; 781 } else { 782 cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL, 783 NULL); 784 fUnwrittenTransactions = 0; 785 } 786 return B_OK; 787 } 788 789 // If necessary, flush the log, so that we have enough space for this 790 // transaction 791 if (runArrays.LogEntryLength() > FreeLogBlocks()) { 792 cache_sync_transaction(fVolume->BlockCache(), fTransactionID); 793 if (runArrays.LogEntryLength() > FreeLogBlocks()) { 794 panic("no space in log after sync (%ld for %ld blocks)!", 795 (long)FreeLogBlocks(), (long)runArrays.LogEntryLength()); 796 } 797 } 798 799 // Write log entries to disk 800 801 int32 maxVecs = runArrays.MaxArrayLength() + 1; 802 // one extra for the index block 803 804 BStackOrHeapArray<iovec, 8> vecs(maxVecs); 805 if (!vecs.IsValid()) { 806 // TODO: write back log entries directly? 807 return B_NO_MEMORY; 808 } 809 810 for (int32 k = 0; k < runArrays.CountArrays(); k++) { 811 run_array* array = runArrays.ArrayAt(k); 812 int32 index = 0, count = 1; 813 int32 wrap = fLogSize - logStart; 814 815 add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize()); 816 817 // add block runs 818 819 for (int32 i = 0; i < array->CountRuns(); i++) { 820 const block_run& run = array->RunAt(i); 821 off_t blockNumber = fVolume->ToBlock(run); 822 823 for (int32 j = 0; j < run.Length(); j++) { 824 if (count >= wrap) { 825 // We need to write back the first half of the entry 826 // directly as the log wraps around 827 if (writev_pos(fVolume->Device(), logOffset 828 + (logStart << blockShift), vecs, index) < 0) 829 FATAL(("could not write log area!\n")); 830 831 logPosition = logStart + count; 832 logStart = 0; 833 wrap = fLogSize; 834 count = 0; 835 index = 0; 836 } 837 838 // make blocks available in the cache 839 const void* data = block_cache_get(fVolume->BlockCache(), 840 blockNumber + j); 841 if (data == NULL) 842 return B_IO_ERROR; 843 844 add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize()); 845 count++; 846 } 847 } 848 849 // write back the rest of the log entry 850 if (count > 0) { 851 logPosition = logStart + count; 852 if (writev_pos(fVolume->Device(), logOffset 853 + (logStart << blockShift), vecs, index) < 0) 854 FATAL(("could not write log area: %s!\n", strerror(errno))); 855 } 856 857 // release blocks again 858 for (int32 i = 0; i < array->CountRuns(); i++) { 859 const block_run& run = array->RunAt(i); 860 off_t blockNumber = fVolume->ToBlock(run); 861 862 for (int32 j = 0; j < run.Length(); j++) { 863 block_cache_put(fVolume->BlockCache(), blockNumber + j); 864 } 865 } 866 867 logStart = logPosition % fLogSize; 868 } 869 870 LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(), 871 runArrays.LogEntryLength()); 872 if (logEntry == NULL) { 873 FATAL(("no memory to allocate log entries!")); 874 return B_NO_MEMORY; 875 } 876 877 #ifdef BFS_DEBUGGER_COMMANDS 878 logEntry->SetTransactionID(fTransactionID); 879 #endif 880 881 // Update the log end pointer in the superblock 882 883 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY; 884 fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition); 885 886 status = fVolume->WriteSuperBlock(); 887 888 fVolume->LogEnd() = logPosition; 889 T(LogEntry(logEntry, fVolume->LogEnd(), true)); 890 891 // We need to flush the drives own cache here to ensure 892 // disk consistency. 893 // If that call fails, we can't do anything about it anyway 894 ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE); 895 896 // at this point, we can finally end the transaction - we're in 897 // a guaranteed valid state 898 899 mutex_lock(&fEntriesLock); 900 fEntries.Add(logEntry); 901 fUsed += logEntry->Length(); 902 mutex_unlock(&fEntriesLock); 903 904 if (detached) { 905 fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(), 906 fTransactionID, _TransactionWritten, logEntry); 907 fUnwrittenTransactions = 1; 908 909 if (status == B_OK && _TransactionSize() > fLogSize) { 910 // If the transaction is too large after writing, there is no way to 911 // recover, so let this transaction fail. 912 dprintf("transaction too large (%d blocks, log size %d)!\n", 913 (int)_TransactionSize(), (int)fLogSize); 914 return B_BUFFER_OVERFLOW; 915 } 916 } else { 917 cache_end_transaction(fVolume->BlockCache(), fTransactionID, 918 _TransactionWritten, logEntry); 919 fUnwrittenTransactions = 0; 920 } 921 922 return status; 923 } 924 925 926 /*! Flushes the current log entry to disk. If \a flushBlocks is \c true it will 927 also write back all dirty blocks for this volume. 928 */ 929 status_t 930 Journal::_FlushLog(bool canWait, bool flushBlocks) 931 { 932 status_t status = canWait ? recursive_lock_lock(&fLock) 933 : recursive_lock_trylock(&fLock); 934 if (status != B_OK) 935 return status; 936 937 if (recursive_lock_get_recursion(&fLock) > 1) { 938 // whoa, FlushLogAndBlocks() was called from inside a transaction 939 recursive_lock_unlock(&fLock); 940 return B_OK; 941 } 942 943 // write the current log entry to disk 944 945 if (fUnwrittenTransactions != 0) { 946 status = _WriteTransactionToLog(); 947 if (status < B_OK) 948 FATAL(("writing current log entry failed: %s\n", strerror(status))); 949 } 950 951 if (flushBlocks) 952 status = fVolume->FlushDevice(); 953 954 recursive_lock_unlock(&fLock); 955 return status; 956 } 957 958 959 /*! Flushes the current log entry to disk, and also writes back all dirty 960 blocks for this volume (completing all open transactions). 961 */ 962 status_t 963 Journal::FlushLogAndBlocks() 964 { 965 return _FlushLog(true, true); 966 } 967 968 969 status_t 970 Journal::Lock(Transaction* owner, bool separateSubTransactions) 971 { 972 status_t status = recursive_lock_lock(&fLock); 973 if (status != B_OK) 974 return status; 975 976 if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) { 977 // we'll just use the current transaction again 978 return B_OK; 979 } 980 981 if (separateSubTransactions) 982 fSeparateSubTransactions = true; 983 984 if (owner != NULL) 985 owner->SetParent(fOwner); 986 987 fOwner = owner; 988 989 // TODO: we need a way to find out how big the current transaction is; 990 // we need to be able to either detach the latest sub transaction on 991 // demand, as well as having some kind of fall back plan in case the 992 // sub transaction itself grows bigger than the log. 993 // For that, it would be nice to have some call-back interface in the 994 // cache transaction API... 995 996 if (fOwner != NULL) { 997 if (fUnwrittenTransactions > 0) { 998 // start a sub transaction 999 cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID); 1000 fHasSubtransaction = true; 1001 } else 1002 fTransactionID = cache_start_transaction(fVolume->BlockCache()); 1003 1004 if (fTransactionID < B_OK) { 1005 recursive_lock_unlock(&fLock); 1006 return fTransactionID; 1007 } 1008 1009 cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID, 1010 TRANSACTION_IDLE, _TransactionIdle, this); 1011 } 1012 return B_OK; 1013 } 1014 1015 1016 status_t 1017 Journal::Unlock(Transaction* owner, bool success) 1018 { 1019 if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) { 1020 // we only end the transaction if we would really unlock it 1021 // TODO: what about failing transactions that do not unlock? 1022 // (they must make the parent fail, too) 1023 if (owner != NULL) { 1024 status_t status = _TransactionDone(success); 1025 if (status != B_OK) 1026 return status; 1027 1028 // Unlocking the inodes might trigger new transactions, but we 1029 // cannot reuse the current one anymore, as this one is already 1030 // closed. 1031 bool separateSubTransactions = fSeparateSubTransactions; 1032 fSeparateSubTransactions = true; 1033 owner->NotifyListeners(success); 1034 fSeparateSubTransactions = separateSubTransactions; 1035 1036 fOwner = owner->Parent(); 1037 } else 1038 fOwner = NULL; 1039 1040 fTimestamp = system_time(); 1041 1042 if (fSeparateSubTransactions 1043 && recursive_lock_get_recursion(&fLock) == 1) 1044 fSeparateSubTransactions = false; 1045 } else 1046 owner->MoveListenersTo(fOwner); 1047 1048 recursive_lock_unlock(&fLock); 1049 return B_OK; 1050 } 1051 1052 1053 uint32 1054 Journal::_TransactionSize() const 1055 { 1056 int32 count = cache_blocks_in_transaction(fVolume->BlockCache(), 1057 fTransactionID); 1058 if (count <= 0) 1059 return 0; 1060 1061 // take the number of array blocks in this transaction into account 1062 uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()); 1063 uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns; 1064 return count + arrayBlocks; 1065 } 1066 1067 1068 status_t 1069 Journal::_TransactionDone(bool success) 1070 { 1071 if (!success) { 1072 if (_HasSubTransaction()) { 1073 cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID); 1074 // We can continue to use the parent transaction afterwards 1075 } else { 1076 cache_abort_transaction(fVolume->BlockCache(), fTransactionID); 1077 fUnwrittenTransactions = 0; 1078 } 1079 1080 return B_OK; 1081 } 1082 1083 // Up to a maximum size, we will just batch several 1084 // transactions together to improve speed 1085 uint32 size = _TransactionSize(); 1086 if (size < fMaxTransactionSize) { 1087 // Flush the log from time to time, so that we have enough space 1088 // for this transaction 1089 if (size > FreeLogBlocks()) 1090 cache_sync_transaction(fVolume->BlockCache(), fTransactionID); 1091 1092 fUnwrittenTransactions++; 1093 return B_OK; 1094 } 1095 1096 return _WriteTransactionToLog(); 1097 } 1098 1099 1100 // #pragma mark - debugger commands 1101 1102 1103 #ifdef BFS_DEBUGGER_COMMANDS 1104 1105 1106 void 1107 Journal::Dump() 1108 { 1109 kprintf("Journal %p\n", this); 1110 kprintf(" log start: %" B_PRId32 "\n", fVolume->LogStart()); 1111 kprintf(" log end: %" B_PRId32 "\n", fVolume->LogEnd()); 1112 kprintf(" owner: %p\n", fOwner); 1113 kprintf(" log size: %" B_PRIu32 "\n", fLogSize); 1114 kprintf(" max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize); 1115 kprintf(" used: %" B_PRIu32 "\n", fUsed); 1116 kprintf(" unwritten: %" B_PRId32 "\n", fUnwrittenTransactions); 1117 kprintf(" timestamp: %" B_PRId64 "\n", fTimestamp); 1118 kprintf(" transaction ID: %" B_PRId32 "\n", fTransactionID); 1119 kprintf(" has subtransaction: %d\n", fHasSubtransaction); 1120 kprintf(" separate sub-trans.: %d\n", fSeparateSubTransactions); 1121 kprintf("entries:\n"); 1122 kprintf(" address id start length\n"); 1123 1124 LogEntryList::Iterator iterator = fEntries.GetIterator(); 1125 1126 while (iterator.HasNext()) { 1127 LogEntry* entry = iterator.Next(); 1128 1129 kprintf(" %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry, 1130 entry->TransactionID(), entry->Start(), entry->Length()); 1131 } 1132 } 1133 1134 1135 int 1136 dump_journal(int argc, char** argv) 1137 { 1138 if (argc != 2 || !strcmp(argv[1], "--help")) { 1139 kprintf("usage: %s <ptr-to-volume>\n", argv[0]); 1140 return 0; 1141 } 1142 1143 Volume* volume = (Volume*)parse_expression(argv[1]); 1144 Journal* journal = volume->GetJournal(0); 1145 1146 journal->Dump(); 1147 return 0; 1148 } 1149 1150 1151 #endif // BFS_DEBUGGER_COMMANDS 1152 1153 1154 // #pragma mark - TransactionListener 1155 1156 1157 TransactionListener::TransactionListener() 1158 { 1159 } 1160 1161 1162 TransactionListener::~TransactionListener() 1163 { 1164 } 1165 1166 1167 // #pragma mark - Transaction 1168 1169 1170 status_t 1171 Transaction::Start(Volume* volume, off_t refBlock) 1172 { 1173 // has it already been started? 1174 if (fJournal != NULL) 1175 return B_OK; 1176 1177 fJournal = volume->GetJournal(refBlock); 1178 if (fJournal != NULL && fJournal->Lock(this, false) == B_OK) 1179 return B_OK; 1180 1181 fJournal = NULL; 1182 return B_ERROR; 1183 } 1184 1185 1186 void 1187 Transaction::AddListener(TransactionListener* listener) 1188 { 1189 if (fJournal == NULL) 1190 panic("Transaction is not running!"); 1191 1192 fListeners.Add(listener); 1193 } 1194 1195 1196 void 1197 Transaction::RemoveListener(TransactionListener* listener) 1198 { 1199 if (fJournal == NULL) 1200 panic("Transaction is not running!"); 1201 1202 fListeners.Remove(listener); 1203 listener->RemovedFromTransaction(); 1204 } 1205 1206 1207 void 1208 Transaction::NotifyListeners(bool success) 1209 { 1210 while (TransactionListener* listener = fListeners.RemoveHead()) { 1211 listener->TransactionDone(success); 1212 listener->RemovedFromTransaction(); 1213 } 1214 } 1215 1216 1217 /*! Move the inodes into the parent transaction. This is needed only to make 1218 sure they will still be reverted in case the transaction is aborted. 1219 */ 1220 void 1221 Transaction::MoveListenersTo(Transaction* transaction) 1222 { 1223 while (TransactionListener* listener = fListeners.RemoveHead()) { 1224 transaction->fListeners.Add(listener); 1225 } 1226 } 1227