1 /* 2 * Copyright 2001-2013, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 7 //! Transaction and logging 8 9 10 #include "Journal.h" 11 12 #include "Debug.h" 13 #include "Inode.h" 14 15 16 struct run_array { 17 int32 count; 18 int32 max_runs; 19 block_run runs[0]; 20 21 void Init(int32 blockSize); 22 void Insert(block_run& run); 23 24 int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); } 25 int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; } 26 // that -1 accounts for an off-by-one error in Be's BFS implementation 27 const block_run& RunAt(int32 i) const { return runs[i]; } 28 29 static int32 MaxRuns(int32 blockSize); 30 31 private: 32 static int _Compare(block_run& a, block_run& b); 33 int32 _FindInsertionIndex(block_run& run); 34 }; 35 36 class RunArrays { 37 public: 38 RunArrays(Journal* journal); 39 ~RunArrays(); 40 41 status_t Insert(off_t blockNumber); 42 43 run_array* ArrayAt(int32 i) { return fArrays.Array()[i]; } 44 int32 CountArrays() const { return fArrays.CountItems(); } 45 46 uint32 CountBlocks() const { return fBlockCount; } 47 uint32 LogEntryLength() const 48 { return CountBlocks() + CountArrays(); } 49 50 int32 MaxArrayLength(); 51 52 private: 53 status_t _AddArray(); 54 bool _ContainsRun(block_run& run); 55 bool _AddRun(block_run& run); 56 57 Journal* fJournal; 58 uint32 fBlockCount; 59 Stack<run_array*> fArrays; 60 run_array* fLastArray; 61 }; 62 63 class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> { 64 public: 65 LogEntry(Journal* journal, uint32 logStart, 66 uint32 length); 67 ~LogEntry(); 68 69 uint32 Start() const { return fStart; } 70 uint32 Length() const { return fLength; } 71 72 #ifdef BFS_DEBUGGER_COMMANDS 73 void SetTransactionID(int32 id) { fTransactionID = id; } 74 int32 TransactionID() const { return fTransactionID; } 75 #endif 76 77 Journal* GetJournal() { return fJournal; } 78 79 private: 80 Journal* fJournal; 81 uint32 fStart; 82 uint32 fLength; 83 #ifdef BFS_DEBUGGER_COMMANDS 84 int32 fTransactionID; 85 #endif 86 }; 87 88 89 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE) 90 namespace BFSJournalTracing { 91 92 class LogEntry : public AbstractTraceEntry { 93 public: 94 LogEntry(::LogEntry* entry, off_t logPosition, bool started) 95 : 96 fEntry(entry), 97 #ifdef BFS_DEBUGGER_COMMANDS 98 fTransactionID(entry->TransactionID()), 99 #endif 100 fStart(entry->Start()), 101 fLength(entry->Length()), 102 fLogPosition(logPosition), 103 fStarted(started) 104 { 105 Initialized(); 106 } 107 108 virtual void AddDump(TraceOutput& out) 109 { 110 #ifdef BFS_DEBUGGER_COMMANDS 111 out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s " 112 "%lu\n", fStarted ? "Started" : "Written", fEntry, 113 fTransactionID, fStart, fLength, 114 fStarted ? "end" : "start", fLogPosition); 115 #else 116 out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n", 117 fStarted ? "Started" : "Written", fEntry, fStart, fLength, 118 fStarted ? "end" : "start", fLogPosition); 119 #endif 120 } 121 122 private: 123 ::LogEntry* fEntry; 124 #ifdef BFS_DEBUGGER_COMMANDS 125 int32 fTransactionID; 126 #endif 127 uint32 fStart; 128 uint32 fLength; 129 uint32 fLogPosition; 130 bool fStarted; 131 }; 132 133 } // namespace BFSJournalTracing 134 135 # define T(x) new(std::nothrow) BFSJournalTracing::x; 136 #else 137 # define T(x) ; 138 #endif 139 140 141 // #pragma mark - 142 143 144 static void 145 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address, 146 size_t size) 147 { 148 if (index > 0 && (addr_t)vecs[index - 1].iov_base 149 + vecs[index - 1].iov_len == (addr_t)address) { 150 // the iovec can be combined with the previous one 151 vecs[index - 1].iov_len += size; 152 return; 153 } 154 155 if (index == max) 156 panic("no more space for iovecs!"); 157 158 // we need to start a new iovec 159 vecs[index].iov_base = const_cast<void*>(address); 160 vecs[index].iov_len = size; 161 index++; 162 } 163 164 165 // #pragma mark - LogEntry 166 167 168 LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length) 169 : 170 fJournal(journal), 171 fStart(start), 172 fLength(length) 173 { 174 } 175 176 177 LogEntry::~LogEntry() 178 { 179 } 180 181 182 // #pragma mark - run_array 183 184 185 /*! The run_array's size equals the block size of the BFS volume, so we 186 cannot use a (non-overridden) new. 187 This makes a freshly allocated run_array ready to run. 188 */ 189 void 190 run_array::Init(int32 blockSize) 191 { 192 memset(this, 0, blockSize); 193 count = 0; 194 max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize)); 195 } 196 197 198 /*! Inserts the block_run into the array. You will have to make sure the 199 array is large enough to contain the entry before calling this function. 200 */ 201 void 202 run_array::Insert(block_run& run) 203 { 204 int32 index = _FindInsertionIndex(run); 205 if (index == -1) { 206 // add to the end 207 runs[CountRuns()] = run; 208 } else { 209 // insert at index 210 memmove(&runs[index + 1], &runs[index], 211 (CountRuns() - index) * sizeof(off_t)); 212 runs[index] = run; 213 } 214 215 count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1); 216 } 217 218 219 /*static*/ int32 220 run_array::MaxRuns(int32 blockSize) 221 { 222 // For whatever reason, BFS restricts the maximum array size 223 uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run); 224 if (maxCount < 128) 225 return maxCount; 226 227 return 127; 228 } 229 230 231 /*static*/ int 232 run_array::_Compare(block_run& a, block_run& b) 233 { 234 int cmp = a.AllocationGroup() - b.AllocationGroup(); 235 if (cmp == 0) 236 return a.Start() - b.Start(); 237 238 return cmp; 239 } 240 241 242 int32 243 run_array::_FindInsertionIndex(block_run& run) 244 { 245 int32 min = 0, max = CountRuns() - 1; 246 int32 i = 0; 247 if (max >= 8) { 248 while (min <= max) { 249 i = (min + max) / 2; 250 251 int cmp = _Compare(runs[i], run); 252 if (cmp < 0) 253 min = i + 1; 254 else if (cmp > 0) 255 max = i - 1; 256 else 257 return -1; 258 } 259 260 if (_Compare(runs[i], run) < 0) 261 i++; 262 } else { 263 for (; i <= max; i++) { 264 if (_Compare(runs[i], run) > 0) 265 break; 266 } 267 if (i == count) 268 return -1; 269 } 270 271 return i; 272 } 273 274 275 // #pragma mark - RunArrays 276 277 278 RunArrays::RunArrays(Journal* journal) 279 : 280 fJournal(journal), 281 fBlockCount(0), 282 fArrays(), 283 fLastArray(NULL) 284 { 285 } 286 287 288 RunArrays::~RunArrays() 289 { 290 run_array* array; 291 while (fArrays.Pop(&array)) 292 free(array); 293 } 294 295 296 bool 297 RunArrays::_ContainsRun(block_run& run) 298 { 299 for (int32 i = 0; i < CountArrays(); i++) { 300 run_array* array = ArrayAt(i); 301 302 for (int32 j = 0; j < array->CountRuns(); j++) { 303 block_run& arrayRun = array->runs[j]; 304 if (run.AllocationGroup() != arrayRun.AllocationGroup()) 305 continue; 306 307 if (run.Start() >= arrayRun.Start() 308 && run.Start() + run.Length() 309 <= arrayRun.Start() + arrayRun.Length()) 310 return true; 311 } 312 } 313 314 return false; 315 } 316 317 318 /*! Adds the specified block_run into the array. 319 Note: it doesn't support overlapping - it must only be used 320 with block_runs of length 1! 321 */ 322 bool 323 RunArrays::_AddRun(block_run& run) 324 { 325 ASSERT(run.length == 1); 326 327 // Be's BFS log replay routine can only deal with block_runs of size 1 328 // A pity, isn't it? Too sad we have to be compatible. 329 330 if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns()) 331 return false; 332 333 fLastArray->Insert(run); 334 fBlockCount++; 335 return true; 336 } 337 338 339 status_t 340 RunArrays::_AddArray() 341 { 342 int32 blockSize = fJournal->GetVolume()->BlockSize(); 343 344 run_array* array = (run_array*)malloc(blockSize); 345 if (array == NULL) 346 return B_NO_MEMORY; 347 348 if (fArrays.Push(array) != B_OK) { 349 free(array); 350 return B_NO_MEMORY; 351 } 352 353 array->Init(blockSize); 354 fLastArray = array; 355 return B_OK; 356 } 357 358 359 status_t 360 RunArrays::Insert(off_t blockNumber) 361 { 362 Volume* volume = fJournal->GetVolume(); 363 block_run run = volume->ToBlockRun(blockNumber); 364 365 if (fLastArray != NULL) { 366 // check if the block is already in the array 367 if (_ContainsRun(run)) 368 return B_OK; 369 } 370 371 // insert block into array 372 373 if (!_AddRun(run)) { 374 // array is full 375 if (_AddArray() != B_OK || !_AddRun(run)) 376 return B_NO_MEMORY; 377 } 378 379 return B_OK; 380 } 381 382 383 int32 384 RunArrays::MaxArrayLength() 385 { 386 int32 max = 0; 387 for (int32 i = 0; i < CountArrays(); i++) { 388 if (ArrayAt(i)->CountRuns() > max) 389 max = ArrayAt(i)->CountRuns(); 390 } 391 392 return max; 393 } 394 395 396 // #pragma mark - Journal 397 398 399 Journal::Journal(Volume* volume) 400 : 401 fVolume(volume), 402 fOwner(NULL), 403 fLogSize(volume->Log().Length()), 404 fMaxTransactionSize(fLogSize / 2 - 5), 405 fUsed(0), 406 fUnwrittenTransactions(0), 407 fHasSubtransaction(false), 408 fSeparateSubTransactions(false) 409 { 410 recursive_lock_init(&fLock, "bfs journal"); 411 mutex_init(&fEntriesLock, "bfs journal entries"); 412 } 413 414 415 Journal::~Journal() 416 { 417 FlushLogAndBlocks(); 418 419 recursive_lock_destroy(&fLock); 420 mutex_destroy(&fEntriesLock); 421 } 422 423 424 status_t 425 Journal::InitCheck() 426 { 427 return B_OK; 428 } 429 430 431 /*! \brief Does a very basic consistency check of the run array. 432 It will check the maximum run count as well as if all of the runs fall 433 within a the volume. 434 */ 435 status_t 436 Journal::_CheckRunArray(const run_array* array) 437 { 438 int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1; 439 // the -1 works around an off-by-one bug in Be's BFS implementation, 440 // same as in run_array::MaxRuns() 441 if (array->MaxRuns() != maxRuns 442 || array->CountRuns() > maxRuns 443 || array->CountRuns() <= 0) { 444 dprintf("run count: %d, array max: %d, max runs: %d\n", 445 (int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns); 446 FATAL(("Log entry has broken header!\n")); 447 return B_ERROR; 448 } 449 450 for (int32 i = 0; i < array->CountRuns(); i++) { 451 if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK) 452 return B_ERROR; 453 } 454 455 PRINT(("Log entry has %ld entries\n", array->CountRuns())); 456 return B_OK; 457 } 458 459 460 /*! Replays an entry in the log. 461 \a _start points to the entry in the log, and will be bumped to the next 462 one if replaying succeeded. 463 */ 464 status_t 465 Journal::_ReplayRunArray(int32* _start) 466 { 467 PRINT(("ReplayRunArray(start = %ld)\n", *_start)); 468 469 off_t logOffset = fVolume->ToBlock(fVolume->Log()); 470 off_t firstBlockNumber = *_start % fLogSize; 471 472 CachedBlock cachedArray(fVolume); 473 474 const run_array* array = (const run_array*)cachedArray.SetTo(logOffset 475 + firstBlockNumber); 476 if (array == NULL) 477 return B_IO_ERROR; 478 479 if (_CheckRunArray(array) < B_OK) 480 return B_BAD_DATA; 481 482 // First pass: check integrity of the blocks in the run array 483 484 CachedBlock cached(fVolume); 485 486 firstBlockNumber = (firstBlockNumber + 1) % fLogSize; 487 off_t blockNumber = firstBlockNumber; 488 int32 blockSize = fVolume->BlockSize(); 489 490 for (int32 index = 0; index < array->CountRuns(); index++) { 491 const block_run& run = array->RunAt(index); 492 493 off_t offset = fVolume->ToOffset(run); 494 for (int32 i = 0; i < run.Length(); i++) { 495 const uint8* data = cached.SetTo(logOffset + blockNumber); 496 if (data == NULL) 497 RETURN_ERROR(B_IO_ERROR); 498 499 // TODO: eventually check other well known offsets, like the 500 // root and index dirs 501 if (offset == 0) { 502 // This log entry writes over the superblock - check if 503 // it's valid! 504 if (Volume::CheckSuperBlock(data) != B_OK) { 505 FATAL(("Log contains invalid superblock!\n")); 506 RETURN_ERROR(B_BAD_DATA); 507 } 508 } 509 510 blockNumber = (blockNumber + 1) % fLogSize; 511 offset += blockSize; 512 } 513 } 514 515 // Second pass: write back its blocks 516 517 blockNumber = firstBlockNumber; 518 int32 count = 1; 519 520 for (int32 index = 0; index < array->CountRuns(); index++) { 521 const block_run& run = array->RunAt(index); 522 INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n", 523 (int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber)); 524 525 off_t offset = fVolume->ToOffset(run); 526 for (int32 i = 0; i < run.Length(); i++) { 527 const uint8* data = cached.SetTo(logOffset + blockNumber); 528 if (data == NULL) 529 RETURN_ERROR(B_IO_ERROR); 530 531 ssize_t written = write_pos(fVolume->Device(), offset, data, 532 blockSize); 533 if (written != blockSize) 534 RETURN_ERROR(B_IO_ERROR); 535 536 blockNumber = (blockNumber + 1) % fLogSize; 537 offset += blockSize; 538 count++; 539 } 540 } 541 542 *_start += count; 543 return B_OK; 544 } 545 546 547 /*! Replays all log entries - this will put the disk into a 548 consistent and clean state, if it was not correctly unmounted 549 before. 550 This method is called by Journal::InitCheck() if the log start 551 and end pointer don't match. 552 */ 553 status_t 554 Journal::ReplayLog() 555 { 556 // TODO: this logic won't work whenever the size of the pending transaction 557 // equals the size of the log (happens with the original BFS only) 558 if (fVolume->LogStart() == fVolume->LogEnd()) 559 return B_OK; 560 561 INFORM(("Replay log, disk was not correctly unmounted...\n")); 562 563 if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) { 564 INFORM(("log_start and log_end differ, but disk is marked clean - " 565 "trying to replay log...\n")); 566 } 567 568 if (fVolume->IsReadOnly()) 569 return B_READ_ONLY_DEVICE; 570 571 int32 start = fVolume->LogStart(); 572 int32 lastStart = -1; 573 while (true) { 574 // stop if the log is completely flushed 575 if (start == fVolume->LogEnd()) 576 break; 577 578 if (start == lastStart) { 579 // strange, flushing the log hasn't changed the log_start pointer 580 return B_ERROR; 581 } 582 lastStart = start; 583 584 status_t status = _ReplayRunArray(&start); 585 if (status != B_OK) { 586 FATAL(("replaying log entry from %d failed: %s\n", (int)start, 587 strerror(status))); 588 return B_ERROR; 589 } 590 start = start % fLogSize; 591 } 592 593 PRINT(("replaying worked fine!\n")); 594 fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64( 595 fVolume->LogEnd()); 596 fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd()); 597 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN; 598 599 return fVolume->WriteSuperBlock(); 600 } 601 602 603 size_t 604 Journal::CurrentTransactionSize() const 605 { 606 if (_HasSubTransaction()) { 607 return cache_blocks_in_sub_transaction(fVolume->BlockCache(), 608 fTransactionID); 609 } 610 611 return cache_blocks_in_main_transaction(fVolume->BlockCache(), 612 fTransactionID); 613 } 614 615 616 bool 617 Journal::CurrentTransactionTooLarge() const 618 { 619 return CurrentTransactionSize() > fLogSize; 620 } 621 622 623 /*! This is a callback function that is called by the cache, whenever 624 all blocks of a transaction have been flushed to disk. 625 This lets us keep track of completed transactions, and update 626 the log start pointer as needed. Note, the transactions may not be 627 completed in the order they were written. 628 */ 629 /*static*/ void 630 Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry) 631 { 632 LogEntry* logEntry = (LogEntry*)_logEntry; 633 634 PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry, 635 transactionID)); 636 637 Journal* journal = logEntry->GetJournal(); 638 disk_super_block& superBlock = journal->fVolume->SuperBlock(); 639 bool update = false; 640 641 // Set log_start pointer if possible... 642 643 mutex_lock(&journal->fEntriesLock); 644 645 if (logEntry == journal->fEntries.First()) { 646 LogEntry* next = journal->fEntries.GetNext(logEntry); 647 if (next != NULL) { 648 superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start() 649 % journal->fLogSize); 650 } else { 651 superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64( 652 journal->fVolume->LogEnd()); 653 } 654 655 update = true; 656 } 657 658 T(LogEntry(logEntry, superBlock.LogStart(), false)); 659 660 journal->fUsed -= logEntry->Length(); 661 journal->fEntries.Remove(logEntry); 662 mutex_unlock(&journal->fEntriesLock); 663 664 delete logEntry; 665 666 // update the superblock, and change the disk's state, if necessary 667 668 if (update) { 669 if (superBlock.log_start == superBlock.log_end) 670 superBlock.flags = SUPER_BLOCK_DISK_CLEAN; 671 672 status_t status = journal->fVolume->WriteSuperBlock(); 673 if (status != B_OK) { 674 FATAL(("_TransactionWritten: could not write back superblock: %s\n", 675 strerror(status))); 676 } 677 678 journal->fVolume->LogStart() = superBlock.LogStart(); 679 } 680 } 681 682 683 /*! Listens to TRANSACTION_IDLE events, and flushes the log when that happens */ 684 /*static*/ void 685 Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal) 686 { 687 // The current transaction seems to be idle - flush it. We can't do this 688 // in this thread, as flushing the log can produce new transaction events. 689 thread_id id = spawn_kernel_thread(&Journal::_FlushLog, "bfs log flusher", 690 B_NORMAL_PRIORITY, _journal); 691 if (id > 0) 692 resume_thread(id); 693 } 694 695 696 /*static*/ status_t 697 Journal::_FlushLog(void* _journal) 698 { 699 Journal* journal = (Journal*)_journal; 700 return journal->_FlushLog(false, false); 701 } 702 703 704 /*! Writes the blocks that are part of current transaction into the log, 705 and ends the current transaction. 706 If the current transaction is too large to fit into the log, it will 707 try to detach an existing sub-transaction. 708 */ 709 status_t 710 Journal::_WriteTransactionToLog() 711 { 712 // TODO: in case of a failure, we need a backup plan like writing all 713 // changed blocks back to disk immediately (hello disk corruption!) 714 715 bool detached = false; 716 717 if (_TransactionSize() > fLogSize) { 718 // The current transaction won't fit into the log anymore, try to 719 // detach the current sub-transaction 720 if (_HasSubTransaction() && cache_blocks_in_main_transaction( 721 fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) { 722 detached = true; 723 } else { 724 // We created a transaction larger than one we can write back to 725 // disk - the only option we have (besides risking disk corruption 726 // by writing it back anyway), is to let it fail. 727 dprintf("transaction too large (%d blocks, log size %d)!\n", 728 (int)_TransactionSize(), (int)fLogSize); 729 return B_BUFFER_OVERFLOW; 730 } 731 } 732 733 fHasSubtransaction = false; 734 735 int32 blockShift = fVolume->BlockShift(); 736 off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift; 737 off_t logStart = fVolume->LogEnd() % fLogSize; 738 off_t logPosition = logStart; 739 status_t status; 740 741 // create run_array structures for all changed blocks 742 743 RunArrays runArrays(this); 744 745 off_t blockNumber; 746 long cookie = 0; 747 while (cache_next_block_in_transaction(fVolume->BlockCache(), 748 fTransactionID, detached, &cookie, &blockNumber, NULL, 749 NULL) == B_OK) { 750 status = runArrays.Insert(blockNumber); 751 if (status < B_OK) { 752 FATAL(("filling log entry failed!")); 753 return status; 754 } 755 } 756 757 if (runArrays.CountBlocks() == 0) { 758 // nothing has changed during this transaction 759 if (detached) { 760 fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(), 761 fTransactionID, NULL, NULL); 762 fUnwrittenTransactions = 1; 763 } else { 764 cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL, 765 NULL); 766 fUnwrittenTransactions = 0; 767 } 768 return B_OK; 769 } 770 771 // If necessary, flush the log, so that we have enough space for this 772 // transaction 773 if (runArrays.LogEntryLength() > FreeLogBlocks()) { 774 cache_sync_transaction(fVolume->BlockCache(), fTransactionID); 775 if (runArrays.LogEntryLength() > FreeLogBlocks()) { 776 panic("no space in log after sync (%ld for %ld blocks)!", 777 (long)FreeLogBlocks(), (long)runArrays.LogEntryLength()); 778 } 779 } 780 781 // Write log entries to disk 782 783 int32 maxVecs = runArrays.MaxArrayLength() + 1; 784 // one extra for the index block 785 786 iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs); 787 if (vecs == NULL) { 788 // TODO: write back log entries directly? 789 return B_NO_MEMORY; 790 } 791 792 for (int32 k = 0; k < runArrays.CountArrays(); k++) { 793 run_array* array = runArrays.ArrayAt(k); 794 int32 index = 0, count = 1; 795 int32 wrap = fLogSize - logStart; 796 797 add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize()); 798 799 // add block runs 800 801 for (int32 i = 0; i < array->CountRuns(); i++) { 802 const block_run& run = array->RunAt(i); 803 off_t blockNumber = fVolume->ToBlock(run); 804 805 for (int32 j = 0; j < run.Length(); j++) { 806 if (count >= wrap) { 807 // We need to write back the first half of the entry 808 // directly as the log wraps around 809 if (writev_pos(fVolume->Device(), logOffset 810 + (logStart << blockShift), vecs, index) < 0) 811 FATAL(("could not write log area!\n")); 812 813 logPosition = logStart + count; 814 logStart = 0; 815 wrap = fLogSize; 816 count = 0; 817 index = 0; 818 } 819 820 // make blocks available in the cache 821 const void* data = block_cache_get(fVolume->BlockCache(), 822 blockNumber + j); 823 if (data == NULL) { 824 free(vecs); 825 return B_IO_ERROR; 826 } 827 828 add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize()); 829 count++; 830 } 831 } 832 833 // write back the rest of the log entry 834 if (count > 0) { 835 logPosition = logStart + count; 836 if (writev_pos(fVolume->Device(), logOffset 837 + (logStart << blockShift), vecs, index) < 0) 838 FATAL(("could not write log area: %s!\n", strerror(errno))); 839 } 840 841 // release blocks again 842 for (int32 i = 0; i < array->CountRuns(); i++) { 843 const block_run& run = array->RunAt(i); 844 off_t blockNumber = fVolume->ToBlock(run); 845 846 for (int32 j = 0; j < run.Length(); j++) { 847 block_cache_put(fVolume->BlockCache(), blockNumber + j); 848 } 849 } 850 851 logStart = logPosition % fLogSize; 852 } 853 854 free(vecs); 855 856 LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(), 857 runArrays.LogEntryLength()); 858 if (logEntry == NULL) { 859 FATAL(("no memory to allocate log entries!")); 860 return B_NO_MEMORY; 861 } 862 863 #ifdef BFS_DEBUGGER_COMMANDS 864 logEntry->SetTransactionID(fTransactionID); 865 #endif 866 867 // Update the log end pointer in the superblock 868 869 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY; 870 fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition); 871 872 status = fVolume->WriteSuperBlock(); 873 874 fVolume->LogEnd() = logPosition; 875 T(LogEntry(logEntry, fVolume->LogEnd(), true)); 876 877 // We need to flush the drives own cache here to ensure 878 // disk consistency. 879 // If that call fails, we can't do anything about it anyway 880 ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE); 881 882 // at this point, we can finally end the transaction - we're in 883 // a guaranteed valid state 884 885 mutex_lock(&fEntriesLock); 886 fEntries.Add(logEntry); 887 fUsed += logEntry->Length(); 888 mutex_unlock(&fEntriesLock); 889 890 if (detached) { 891 fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(), 892 fTransactionID, _TransactionWritten, logEntry); 893 fUnwrittenTransactions = 1; 894 895 if (status == B_OK && _TransactionSize() > fLogSize) { 896 // If the transaction is too large after writing, there is no way to 897 // recover, so let this transaction fail. 898 dprintf("transaction too large (%d blocks, log size %d)!\n", 899 (int)_TransactionSize(), (int)fLogSize); 900 return B_BUFFER_OVERFLOW; 901 } 902 } else { 903 cache_end_transaction(fVolume->BlockCache(), fTransactionID, 904 _TransactionWritten, logEntry); 905 fUnwrittenTransactions = 0; 906 } 907 908 return status; 909 } 910 911 912 /*! Flushes the current log entry to disk. If \a flushBlocks is \c true it will 913 also write back all dirty blocks for this volume. 914 */ 915 status_t 916 Journal::_FlushLog(bool canWait, bool flushBlocks) 917 { 918 status_t status = canWait ? recursive_lock_lock(&fLock) 919 : recursive_lock_trylock(&fLock); 920 if (status != B_OK) 921 return status; 922 923 if (recursive_lock_get_recursion(&fLock) > 1) { 924 // whoa, FlushLogAndBlocks() was called from inside a transaction 925 recursive_lock_unlock(&fLock); 926 return B_OK; 927 } 928 929 // write the current log entry to disk 930 931 if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) { 932 status = _WriteTransactionToLog(); 933 if (status < B_OK) 934 FATAL(("writing current log entry failed: %s\n", strerror(status))); 935 } 936 937 if (flushBlocks) 938 status = fVolume->FlushDevice(); 939 940 recursive_lock_unlock(&fLock); 941 return status; 942 } 943 944 945 /*! Flushes the current log entry to disk, and also writes back all dirty 946 blocks for this volume (completing all open transactions). 947 */ 948 status_t 949 Journal::FlushLogAndBlocks() 950 { 951 return _FlushLog(true, true); 952 } 953 954 955 status_t 956 Journal::Lock(Transaction* owner, bool separateSubTransactions) 957 { 958 status_t status = recursive_lock_lock(&fLock); 959 if (status != B_OK) 960 return status; 961 962 if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) { 963 // we'll just use the current transaction again 964 return B_OK; 965 } 966 967 if (separateSubTransactions) 968 fSeparateSubTransactions = true; 969 970 if (owner != NULL) 971 owner->SetParent(fOwner); 972 973 fOwner = owner; 974 975 // TODO: we need a way to find out how big the current transaction is; 976 // we need to be able to either detach the latest sub transaction on 977 // demand, as well as having some kind of fall back plan in case the 978 // sub transaction itself grows bigger than the log. 979 // For that, it would be nice to have some call-back interface in the 980 // cache transaction API... 981 982 if (fOwner != NULL) { 983 if (fUnwrittenTransactions > 0) { 984 // start a sub transaction 985 cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID); 986 fHasSubtransaction = true; 987 } else 988 fTransactionID = cache_start_transaction(fVolume->BlockCache()); 989 990 if (fTransactionID < B_OK) { 991 recursive_lock_unlock(&fLock); 992 return fTransactionID; 993 } 994 995 cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID, 996 TRANSACTION_IDLE, _TransactionIdle, this); 997 } 998 return B_OK; 999 } 1000 1001 1002 status_t 1003 Journal::Unlock(Transaction* owner, bool success) 1004 { 1005 if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) { 1006 // we only end the transaction if we would really unlock it 1007 // TODO: what about failing transactions that do not unlock? 1008 // (they must make the parent fail, too) 1009 if (owner != NULL) { 1010 status_t status = _TransactionDone(success); 1011 if (status != B_OK) 1012 return status; 1013 1014 // Unlocking the inodes might trigger new transactions, but we 1015 // cannot reuse the current one anymore, as this one is already 1016 // closed. 1017 bool separateSubTransactions = fSeparateSubTransactions; 1018 fSeparateSubTransactions = true; 1019 owner->NotifyListeners(success); 1020 fSeparateSubTransactions = separateSubTransactions; 1021 1022 fOwner = owner->Parent(); 1023 } else 1024 fOwner = NULL; 1025 1026 fTimestamp = system_time(); 1027 1028 if (fSeparateSubTransactions 1029 && recursive_lock_get_recursion(&fLock) == 1) 1030 fSeparateSubTransactions = false; 1031 } else 1032 owner->MoveListenersTo(fOwner); 1033 1034 recursive_lock_unlock(&fLock); 1035 return B_OK; 1036 } 1037 1038 1039 uint32 1040 Journal::_TransactionSize() const 1041 { 1042 int32 count = cache_blocks_in_transaction(fVolume->BlockCache(), 1043 fTransactionID); 1044 if (count <= 0) 1045 return 0; 1046 1047 // take the number of array blocks in this transaction into account 1048 uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()); 1049 uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns; 1050 return count + arrayBlocks; 1051 } 1052 1053 1054 status_t 1055 Journal::_TransactionDone(bool success) 1056 { 1057 if (!success) { 1058 if (_HasSubTransaction()) { 1059 cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID); 1060 // We can continue to use the parent transaction afterwards 1061 } else { 1062 cache_abort_transaction(fVolume->BlockCache(), fTransactionID); 1063 fUnwrittenTransactions = 0; 1064 } 1065 1066 return B_OK; 1067 } 1068 1069 // Up to a maximum size, we will just batch several 1070 // transactions together to improve speed 1071 uint32 size = _TransactionSize(); 1072 if (size < fMaxTransactionSize) { 1073 // Flush the log from time to time, so that we have enough space 1074 // for this transaction 1075 if (size > FreeLogBlocks()) 1076 cache_sync_transaction(fVolume->BlockCache(), fTransactionID); 1077 1078 fUnwrittenTransactions++; 1079 return B_OK; 1080 } 1081 1082 return _WriteTransactionToLog(); 1083 } 1084 1085 1086 // #pragma mark - debugger commands 1087 1088 1089 #ifdef BFS_DEBUGGER_COMMANDS 1090 1091 1092 void 1093 Journal::Dump() 1094 { 1095 kprintf("Journal %p\n", this); 1096 kprintf(" log start: %" B_PRId32 "\n", fVolume->LogStart()); 1097 kprintf(" log end: %" B_PRId32 "\n", fVolume->LogEnd()); 1098 kprintf(" owner: %p\n", fOwner); 1099 kprintf(" log size: %" B_PRIu32 "\n", fLogSize); 1100 kprintf(" max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize); 1101 kprintf(" used: %" B_PRIu32 "\n", fUsed); 1102 kprintf(" unwritten: %" B_PRId32 "\n", fUnwrittenTransactions); 1103 kprintf(" timestamp: %" B_PRId64 "\n", fTimestamp); 1104 kprintf(" transaction ID: %" B_PRId32 "\n", fTransactionID); 1105 kprintf(" has subtransaction: %d\n", fHasSubtransaction); 1106 kprintf(" separate sub-trans.: %d\n", fSeparateSubTransactions); 1107 kprintf("entries:\n"); 1108 kprintf(" address id start length\n"); 1109 1110 LogEntryList::Iterator iterator = fEntries.GetIterator(); 1111 1112 while (iterator.HasNext()) { 1113 LogEntry* entry = iterator.Next(); 1114 1115 kprintf(" %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry, 1116 entry->TransactionID(), entry->Start(), entry->Length()); 1117 } 1118 } 1119 1120 1121 int 1122 dump_journal(int argc, char** argv) 1123 { 1124 if (argc != 2 || !strcmp(argv[1], "--help")) { 1125 kprintf("usage: %s <ptr-to-volume>\n", argv[0]); 1126 return 0; 1127 } 1128 1129 Volume* volume = (Volume*)parse_expression(argv[1]); 1130 Journal* journal = volume->GetJournal(0); 1131 1132 journal->Dump(); 1133 return 0; 1134 } 1135 1136 1137 #endif // BFS_DEBUGGER_COMMANDS 1138 1139 1140 // #pragma mark - TransactionListener 1141 1142 1143 TransactionListener::TransactionListener() 1144 { 1145 } 1146 1147 1148 TransactionListener::~TransactionListener() 1149 { 1150 } 1151 1152 1153 // #pragma mark - Transaction 1154 1155 1156 status_t 1157 Transaction::Start(Volume* volume, off_t refBlock) 1158 { 1159 // has it already been started? 1160 if (fJournal != NULL) 1161 return B_OK; 1162 1163 fJournal = volume->GetJournal(refBlock); 1164 if (fJournal != NULL && fJournal->Lock(this, false) == B_OK) 1165 return B_OK; 1166 1167 fJournal = NULL; 1168 return B_ERROR; 1169 } 1170 1171 1172 void 1173 Transaction::AddListener(TransactionListener* listener) 1174 { 1175 if (fJournal == NULL) 1176 panic("Transaction is not running!"); 1177 1178 fListeners.Add(listener); 1179 } 1180 1181 1182 void 1183 Transaction::RemoveListener(TransactionListener* listener) 1184 { 1185 if (fJournal == NULL) 1186 panic("Transaction is not running!"); 1187 1188 fListeners.Remove(listener); 1189 listener->RemovedFromTransaction(); 1190 } 1191 1192 1193 void 1194 Transaction::NotifyListeners(bool success) 1195 { 1196 while (TransactionListener* listener = fListeners.RemoveHead()) { 1197 listener->TransactionDone(success); 1198 listener->RemovedFromTransaction(); 1199 } 1200 } 1201 1202 1203 /*! Move the inodes into the parent transaction. This is needed only to make 1204 sure they will still be reverted in case the transaction is aborted. 1205 */ 1206 void 1207 Transaction::MoveListenersTo(Transaction* transaction) 1208 { 1209 while (TransactionListener* listener = fListeners.RemoveHead()) { 1210 transaction->fListeners.Add(listener); 1211 } 1212 } 1213