1 /* 2 * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 7 //! Transaction and logging 8 9 10 #include "Journal.h" 11 12 #include "Debug.h" 13 #include "Inode.h" 14 15 16 struct run_array { 17 int32 count; 18 int32 max_runs; 19 block_run runs[0]; 20 21 void Init(int32 blockSize); 22 void Insert(block_run& run); 23 24 int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); } 25 int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; } 26 // that -1 accounts for an off-by-one error in Be's BFS implementation 27 const block_run& RunAt(int32 i) const { return runs[i]; } 28 29 static int32 MaxRuns(int32 blockSize); 30 31 private: 32 static int _Compare(block_run& a, block_run& b); 33 int32 _FindInsertionIndex(block_run& run); 34 }; 35 36 class RunArrays { 37 public: 38 RunArrays(Journal* journal); 39 ~RunArrays(); 40 41 status_t Insert(off_t blockNumber); 42 43 run_array* ArrayAt(int32 i) { return fArrays.Array()[i]; } 44 int32 CountArrays() const { return fArrays.CountItems(); } 45 46 uint32 CountBlocks() const { return fBlockCount; } 47 uint32 LogEntryLength() const 48 { return CountBlocks() + CountArrays(); } 49 50 int32 MaxArrayLength(); 51 52 private: 53 status_t _AddArray(); 54 bool _ContainsRun(block_run& run); 55 bool _AddRun(block_run& run); 56 57 Journal* fJournal; 58 uint32 fBlockCount; 59 Stack<run_array*> fArrays; 60 run_array* fLastArray; 61 }; 62 63 class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> { 64 public: 65 LogEntry(Journal* journal, uint32 logStart, 66 uint32 length); 67 ~LogEntry(); 68 69 uint32 Start() const { return fStart; } 70 uint32 Length() const { return fLength; } 71 72 #ifdef BFS_DEBUGGER_COMMANDS 73 void SetTransactionID(int32 id) { fTransactionID = id; } 74 int32 TransactionID() const { return fTransactionID; } 75 #endif 76 77 Journal* GetJournal() { return fJournal; } 78 79 private: 80 Journal* fJournal; 81 uint32 
fStart; 82 uint32 fLength; 83 #ifdef BFS_DEBUGGER_COMMANDS 84 int32 fTransactionID; 85 #endif 86 }; 87 88 89 #if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE) 90 namespace BFSJournalTracing { 91 92 class LogEntry : public AbstractTraceEntry { 93 public: 94 LogEntry(::LogEntry* entry, off_t logPosition, bool started) 95 : 96 fEntry(entry), 97 #ifdef BFS_DEBUGGER_COMMANDS 98 fTransactionID(entry->TransactionID()), 99 #endif 100 fStart(entry->Start()), 101 fLength(entry->Length()), 102 fLogPosition(logPosition), 103 fStarted(started) 104 { 105 Initialized(); 106 } 107 108 virtual void AddDump(TraceOutput& out) 109 { 110 #ifdef BFS_DEBUGGER_COMMANDS 111 out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s " 112 "%lu\n", fStarted ? "Started" : "Written", fEntry, 113 fTransactionID, fStart, fLength, 114 fStarted ? "end" : "start", fLogPosition); 115 #else 116 out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n", 117 fStarted ? "Started" : "Written", fEntry, fStart, fLength, 118 fStarted ? 
"end" : "start", fLogPosition); 119 #endif 120 } 121 122 private: 123 ::LogEntry* fEntry; 124 #ifdef BFS_DEBUGGER_COMMANDS 125 int32 fTransactionID; 126 #endif 127 uint32 fStart; 128 uint32 fLength; 129 uint32 fLogPosition; 130 bool fStarted; 131 }; 132 133 } // namespace BFSJournalTracing 134 135 # define T(x) new(std::nothrow) BFSJournalTracing::x; 136 #else 137 # define T(x) ; 138 #endif 139 140 141 // #pragma mark - 142 143 144 static void 145 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address, 146 size_t size) 147 { 148 if (index > 0 && (addr_t)vecs[index - 1].iov_base 149 + vecs[index - 1].iov_len == (addr_t)address) { 150 // the iovec can be combined with the previous one 151 vecs[index - 1].iov_len += size; 152 return; 153 } 154 155 if (index == max) 156 panic("no more space for iovecs!"); 157 158 // we need to start a new iovec 159 vecs[index].iov_base = const_cast<void*>(address); 160 vecs[index].iov_len = size; 161 index++; 162 } 163 164 165 // #pragma mark - LogEntry 166 167 168 LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length) 169 : 170 fJournal(journal), 171 fStart(start), 172 fLength(length) 173 { 174 } 175 176 177 LogEntry::~LogEntry() 178 { 179 } 180 181 182 // #pragma mark - run_array 183 184 185 /*! The run_array's size equals the block size of the BFS volume, so we 186 cannot use a (non-overridden) new. 187 This makes a freshly allocated run_array ready to run. 188 */ 189 void 190 run_array::Init(int32 blockSize) 191 { 192 memset(this, 0, blockSize); 193 count = 0; 194 max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize)); 195 } 196 197 198 /*! Inserts the block_run into the array. You will have to make sure the 199 array is large enough to contain the entry before calling this function. 
200 */ 201 void 202 run_array::Insert(block_run& run) 203 { 204 int32 index = _FindInsertionIndex(run); 205 if (index == -1) { 206 // add to the end 207 runs[CountRuns()] = run; 208 } else { 209 // insert at index 210 memmove(&runs[index + 1], &runs[index], 211 (CountRuns() - index) * sizeof(off_t)); 212 runs[index] = run; 213 } 214 215 count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1); 216 } 217 218 219 /*static*/ int32 220 run_array::MaxRuns(int32 blockSize) 221 { 222 // For whatever reason, BFS restricts the maximum array size 223 uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run); 224 if (maxCount < 128) 225 return maxCount; 226 227 return 127; 228 } 229 230 231 /*static*/ int 232 run_array::_Compare(block_run& a, block_run& b) 233 { 234 int cmp = a.AllocationGroup() - b.AllocationGroup(); 235 if (cmp == 0) 236 return a.Start() - b.Start(); 237 238 return cmp; 239 } 240 241 242 int32 243 run_array::_FindInsertionIndex(block_run& run) 244 { 245 int32 min = 0, max = CountRuns() - 1; 246 int32 i = 0; 247 if (max >= 8) { 248 while (min <= max) { 249 i = (min + max) / 2; 250 251 int cmp = _Compare(runs[i], run); 252 if (cmp < 0) 253 min = i + 1; 254 else if (cmp > 0) 255 max = i - 1; 256 else 257 return -1; 258 } 259 260 if (_Compare(runs[i], run) < 0) 261 i++; 262 } else { 263 for (; i <= max; i++) { 264 if (_Compare(runs[i], run) > 0) 265 break; 266 } 267 if (i == count) 268 return -1; 269 } 270 271 return i; 272 } 273 274 275 // #pragma mark - RunArrays 276 277 278 RunArrays::RunArrays(Journal* journal) 279 : 280 fJournal(journal), 281 fBlockCount(0), 282 fArrays(), 283 fLastArray(NULL) 284 { 285 } 286 287 288 RunArrays::~RunArrays() 289 { 290 run_array* array; 291 while (fArrays.Pop(&array)) 292 free(array); 293 } 294 295 296 bool 297 RunArrays::_ContainsRun(block_run& run) 298 { 299 for (int32 i = 0; i < CountArrays(); i++) { 300 run_array* array = ArrayAt(i); 301 302 for (int32 j = 0; j < array->CountRuns(); j++) { 303 block_run& arrayRun = 
array->runs[j]; 304 if (run.AllocationGroup() != arrayRun.AllocationGroup()) 305 continue; 306 307 if (run.Start() >= arrayRun.Start() 308 && run.Start() + run.Length() 309 <= arrayRun.Start() + arrayRun.Length()) 310 return true; 311 } 312 } 313 314 return false; 315 } 316 317 318 /*! Adds the specified block_run into the array. 319 Note: it doesn't support overlapping - it must only be used 320 with block_runs of length 1! 321 */ 322 bool 323 RunArrays::_AddRun(block_run& run) 324 { 325 ASSERT(run.length == 1); 326 327 // Be's BFS log replay routine can only deal with block_runs of size 1 328 // A pity, isn't it? Too sad we have to be compatible. 329 330 if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns()) 331 return false; 332 333 fLastArray->Insert(run); 334 fBlockCount++; 335 return true; 336 } 337 338 339 status_t 340 RunArrays::_AddArray() 341 { 342 int32 blockSize = fJournal->GetVolume()->BlockSize(); 343 344 run_array* array = (run_array*)malloc(blockSize); 345 if (array == NULL) 346 return B_NO_MEMORY; 347 348 if (fArrays.Push(array) != B_OK) { 349 free(array); 350 return B_NO_MEMORY; 351 } 352 353 array->Init(blockSize); 354 fLastArray = array; 355 return B_OK; 356 } 357 358 359 status_t 360 RunArrays::Insert(off_t blockNumber) 361 { 362 Volume* volume = fJournal->GetVolume(); 363 block_run run = volume->ToBlockRun(blockNumber); 364 365 if (fLastArray != NULL) { 366 // check if the block is already in the array 367 if (_ContainsRun(run)) 368 return B_OK; 369 } 370 371 // insert block into array 372 373 if (!_AddRun(run)) { 374 // array is full 375 if (_AddArray() != B_OK || !_AddRun(run)) 376 return B_NO_MEMORY; 377 } 378 379 return B_OK; 380 } 381 382 383 int32 384 RunArrays::MaxArrayLength() 385 { 386 int32 max = 0; 387 for (int32 i = 0; i < CountArrays(); i++) { 388 if (ArrayAt(i)->CountRuns() > max) 389 max = ArrayAt(i)->CountRuns(); 390 } 391 392 return max; 393 } 394 395 396 // #pragma mark - Journal 397 398 399 
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
		// batch transactions only up to half the log (minus some slack);
		// larger ones are written out immediately in _TransactionDone()
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");
}


Journal::~Journal()
{
	// write out any pending log entries and dirty blocks before tearing down
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);
}


status_t
Journal::InitCheck()
{
	// NOTE(review): this always succeeds; log replay is triggered
	// elsewhere (see ReplayLog()), despite what its comment says.
	return B_OK;
}


/*! \brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within a the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// every run must describe a valid region of the volume
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}


/*! Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	// the first block of the entry is the run_array index block
	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
		// skip the index block; data blocks follow it (log wraps around)
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the superblock - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid superblock!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 to account for the run_array index block itself,
		// so *_start is advanced past the whole entry below

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// copy the logged block to its final location on disk
			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
	return B_OK;
}


/*! Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointer don't match.
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	// equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		// _ReplayRunArray() bumps "start" past the entry it replayed
		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));

	// everything has been replayed: mark the log empty and the disk clean
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = HOST_ENDIAN_TO_BFS_INT32(
		SUPER_BLOCK_DISK_CLEAN);

	return fVolume->WriteSuperBlock();
}


/*! Returns the size (in blocks) of the current transaction: the
	sub-transaction if one is open, the main transaction otherwise. */
size_t
Journal::CurrentTransactionSize() const
{
	if (_HasSubTransaction()) {
		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
			fTransactionID);
	}

	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
		fTransactionID);
}


bool
Journal::CurrentTransactionTooLarge() const
{
	return CurrentTransactionSize() > fLogSize;
}


/*!
	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	if (logEntry == journal->fEntries.First()) {
		// this was the oldest entry - its log space can be reclaimed by
		// advancing log_start to the next entry (or to the log end)
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the superblock, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back superblock: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}


/*!
	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it. We can't do this
	// in this thread, as flushing the log can produce new transaction events.
	thread_id id = spawn_kernel_thread(&Journal::_FlushLog, "bfs log flusher",
		B_NORMAL_PRIORITY, _journal);
	if (id > 0)
		resume_thread(id);
}


/*! Kernel thread entry point spawned by _TransactionIdle(). */
/*static*/ status_t
Journal::_FlushLog(void* _journal)
{
	Journal* journal = (Journal*)_journal;
	return journal->_FlushLog(false, false);
}


/*! Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	// changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
		// byte offset of the log area on the device
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	// one writev per run_array (split in two if the log wraps around)
	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
		int32 wrap = fLogSize - logStart;

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
							+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the superblock

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		// keep the detached sub-transaction open as the new current one;
		// _TransactionWritten will be called once its blocks are flushed
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}


/*! Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}


/*! Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}


/*! Locks the journal for \a owner, starting a (sub-)transaction in the
	block cache when \a owner is not NULL.
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	// we need to be able to either detach the latest sub transaction on
	// demand, as well as having some kind of fall back plan in case the
	// sub transaction itself grows bigger than the log.
	// For that, it would be nice to have some call-back interface in the
	// cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}


/*! Unlocks the journal, ending (or batching) the transaction held by
	\a owner. \a success determines whether the transaction is committed
	or aborted.
*/
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else
		owner->MoveListenersTo(fOwner);

	recursive_lock_unlock(&fLock);
	return B_OK;
}


/*! Returns the number of log blocks the current transaction will need,
	including the run_array index blocks. */
uint32
Journal::_TransactionSize() const
{
	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
		fTransactionID);
	if (count <= 0)
		return 0;

	// take the number of array blocks in this transaction into account
	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
	return count + arrayBlocks;
}


/*! Ends the current transaction: aborts it on failure, otherwise either
	batches it with later ones or writes it to the log right away. */
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction()) {
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
			// We can continue to use the parent transaction afterwards
		} else {
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
			fUnwrittenTransactions = 0;
		}

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}


// #pragma mark - debugger commands


#ifdef BFS_DEBUGGER_COMMANDS


/*! Prints the journal's state and all pending log entries to the kernel
	debugger. */
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf(" log start: %" B_PRId32 "\n", fVolume->LogStart());
	kprintf(" log end: %" B_PRId32 "\n", fVolume->LogEnd());
	kprintf(" owner: %p\n", fOwner);
	kprintf(" log size: %" B_PRIu32 "\n", fLogSize);
	kprintf(" max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize);
	kprintf(" used: %" B_PRIu32 "\n", fUsed);
	kprintf(" unwritten: %" B_PRId32 "\n", fUnwrittenTransactions);
	kprintf(" timestamp: %" B_PRId64 "\n", fTimestamp);
	kprintf(" transaction ID: %" B_PRId32 "\n", fTransactionID);
	kprintf(" has subtransaction: %d\n", fHasSubtransaction);
	kprintf(" separate sub-trans.: %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf(" address id start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf(" %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry,
			entry->TransactionID(), entry->Start(), entry->Length());
	}
}


/*! Kernel debugger command: dumps the journal of the given volume. */
int
dump_journal(int argc, char** argv)
{
	if (argc != 2 || !strcmp(argv[1], "--help")) {
		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
		return 0;
	}

	Volume* volume = (Volume*)parse_expression(argv[1]);
	Journal* journal = volume->GetJournal(0);

	journal->Dump();
	return 0;
}


#endif	// BFS_DEBUGGER_COMMANDS


// #pragma mark - TransactionListener


TransactionListener::TransactionListener()
{
}


TransactionListener::~TransactionListener()
{
}


// #pragma mark - Transaction


/*! Starts a transaction on \a volume's journal; a no-op if this Transaction
	has already been started. Returns B_ERROR if the journal could not be
	locked. */
status_t
Transaction::Start(Volume* volume, off_t refBlock)
{
	// has it already been started?
	if (fJournal != NULL)
		return B_OK;

	fJournal = volume->GetJournal(refBlock);
	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
		return B_OK;

	fJournal = NULL;
	return B_ERROR;
}


void
Transaction::AddListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Add(listener);
}


void
Transaction::RemoveListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Remove(listener);
	listener->RemovedFromTransaction();
}


/*! Notifies all listeners of the transaction's outcome and removes them. */
void
Transaction::NotifyListeners(bool success)
{
	while (TransactionListener* listener = fListeners.RemoveHead()) {
		listener->TransactionDone(success);
		listener->RemovedFromTransaction();
	}
}


/*! Move the inodes into the parent transaction. This is needed only to make
	sure they will still be reverted in case the transaction is aborted.
*/
void
Transaction::MoveListenersTo(Transaction* transaction)
{
	while (TransactionListener* listener = fListeners.RemoveHead()) {
		transaction->fListeners.Add(listener);
	}
}