/*
 * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */

//! Transaction and logging


#include "Journal.h"

#include "Debug.h"
#include "Inode.h"


struct run_array {
	int32		count;
	int32		max_runs;
	block_run	runs[0];

	void Init(int32 blockSize);
	void Insert(block_run& run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};

class RunArrays {
public:
	RunArrays(Journal* journal);
	~RunArrays();

	status_t Insert(off_t blockNumber);

	run_array* ArrayAt(int32 i) { return fArrays.Array()[i]; }
	int32 CountArrays() const { return fArrays.CountItems(); }

	uint32 CountBlocks() const { return fBlockCount; }
	uint32 LogEntryLength() const
		{ return CountBlocks() + CountArrays(); }

	int32 MaxArrayLength();

private:
	status_t _AddArray();
	bool _ContainsRun(block_run& run);
	bool _AddRun(block_run& run);

	Journal*			fJournal;
	uint32				fBlockCount;
	Stack<run_array*>	fArrays;
	run_array*			fLastArray;
};

class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
	LogEntry(Journal* journal, uint32 logStart, uint32 length);
	~LogEntry();

	uint32 Start() const { return fStart; }
	uint32 Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
	void SetTransactionID(int32 id) { fTransactionID = id; }
	int32 TransactionID() const { return fTransactionID; }
#endif

	Journal* GetJournal() { return fJournal; }

private:
	Journal*	fJournal;
	uint32		fStart;
	uint32		fLength;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
};


#if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
namespace BFSJournalTracing {

class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};

}	// namespace BFSJournalTracing

#	define T(x) new(std::nothrow) BFSJournalTracing::x;
#else
#	define T(x) ;
#endif


// #pragma mark -


static void
add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
	size_t size)
{
	if (index > 0 && (addr_t)vecs[index - 1].iov_base
			+ vecs[index - 1].iov_len == (addr_t)address) {
		// the iovec can be combined with the previous one
		vecs[index - 1].iov_len += size;
		return;
	}

	if (index == max)
		panic("no more space for iovecs!");

	// we need to start a new iovec
	vecs[index].iov_base = const_cast<void*>(address);
	vecs[index].iov_len = size;
	index++;
}


// #pragma mark - LogEntry


LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}


LogEntry::~LogEntry()
{
}


// #pragma mark - run_array


/*!	The run_array's size equals the block size of the BFS volume, so we
	cannot use a (non-overridden) new.
	This makes a freshly allocated run_array ready to run.
*/
void
run_array::Init(int32 blockSize)
{
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}


/*!	Inserts the block_run into the array. You will have to make sure the
	array is large enough to contain the entry before calling this function.
*/
void
run_array::Insert(block_run& run)
{
	int32 index = _FindInsertionIndex(run);
	if (index == -1) {
		// add to the end
		runs[CountRuns()] = run;
	} else {
		// insert at index
		memmove(&runs[index + 1], &runs[index],
			(CountRuns() - index) * sizeof(off_t));
		runs[index] = run;
	}

	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
}


/*static*/ int32
run_array::MaxRuns(int32 blockSize)
{
	// For whatever reason, BFS restricts the maximum array size
	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
	if (maxCount < 128)
		return maxCount;

	return 127;
}


/*static*/ int
run_array::_Compare(block_run& a, block_run& b)
{
	int cmp = a.AllocationGroup() - b.AllocationGroup();
	if (cmp == 0)
		return a.Start() - b.Start();

	return cmp;
}


int32
run_array::_FindInsertionIndex(block_run& run)
{
	int32 min = 0, max = CountRuns() - 1;
	int32 i = 0;
	if (max >= 8) {
		while (min <= max) {
			i = (min + max) / 2;

			int cmp = _Compare(runs[i], run);
			if (cmp < 0)
				min = i + 1;
			else if (cmp > 0)
				max = i - 1;
			else
				return -1;
		}

		if (_Compare(runs[i], run) < 0)
			i++;
	} else {
		for (; i <= max; i++) {
			if (_Compare(runs[i], run) > 0)
				break;
		}
		if (i == count)
			return -1;
	}

	return i;
}

// #pragma mark - RunArrays


RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}


RunArrays::~RunArrays()
{
	run_array* array;
	while (fArrays.Pop(&array))
		free(array);
}


bool
RunArrays::_ContainsRun(block_run& run)
{
	for (int32 i = 0; i < CountArrays(); i++) {
		run_array* array = ArrayAt(i);

		for (int32 j = 0; j < array->CountRuns(); j++) {
			block_run& arrayRun = array->runs[j];
			if (run.AllocationGroup() != arrayRun.AllocationGroup())
				continue;

			if (run.Start() >= arrayRun.Start()
				&& run.Start() + run.Length()
					<= arrayRun.Start() + arrayRun.Length())
				return true;
		}
	}

	return false;
}


/*!	Adds the specified block_run into the array.
	Note: it doesn't support overlapping - it must only be used
	with block_runs of length 1!
*/
bool
RunArrays::_AddRun(block_run& run)
{
	ASSERT(run.length == 1);

	// Be's BFS log replay routine can only deal with block_runs of size 1
	// A pity, isn't it? Too sad we have to be compatible.

	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
		return false;

	fLastArray->Insert(run);
	fBlockCount++;
	return true;
}


status_t
RunArrays::_AddArray()
{
	int32 blockSize = fJournal->GetVolume()->BlockSize();

	run_array* array = (run_array*)malloc(blockSize);
	if (array == NULL)
		return B_NO_MEMORY;

	if (fArrays.Push(array) != B_OK) {
		free(array);
		return B_NO_MEMORY;
	}

	array->Init(blockSize);
	fLastArray = array;
	return B_OK;
}


status_t
RunArrays::Insert(off_t blockNumber)
{
	Volume* volume = fJournal->GetVolume();
	block_run run = volume->ToBlockRun(blockNumber);

	if (fLastArray != NULL) {
		// check if the block is already in the array
		if (_ContainsRun(run))
			return B_OK;
	}

	// insert block into array

	if (!_AddRun(run)) {
		// array is full
		if (_AddArray() != B_OK || !_AddRun(run))
			return B_NO_MEMORY;
	}

	return B_OK;
}


int32
RunArrays::MaxArrayLength()
{
	int32 max = 0;
	for (int32 i = 0; i < CountArrays(); i++) {
		if (ArrayAt(i)->CountRuns() > max)
			max = ArrayAt(i)->CountRuns();
	}

	return max;
}


// #pragma mark - Journal


Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");
}


Journal::~Journal()
{
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);
}


status_t
Journal::InitCheck()
{
	return B_OK;
}
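
/*	Layout of the log area, as the replay code below reads it (and as
	_WriteTransactionToLog() writes it): the log is a circular area of
	fLogSize blocks, and all positions in it - log_start, log_end, and the
	*_start cursor of _ReplayRunArray() - are block offsets relative to the
	start of that area, wrapping modulo fLogSize. Each log entry consists of
	one run_array index block, immediately followed by the data blocks of
	its runs in array order.
*/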

/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}


/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the super block - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid super block!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %Ld!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(),
			blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
	return B_OK;
}
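
/*	A note on the accounting in _ReplayRunArray() above: "count" starts at 1
	because the entry's run_array index block occupies one log block of its
	own, so after a successful replay *_start has advanced by the index block
	plus one block per replayed run. The two passes ensure that nothing is
	written back to its final location until every block of the entry could
	be read from the log (and a logged super block, if any, passed
	validation).
*/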

/*!	Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointer don't match.
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending
	// transaction equals the size of the log (happens with the original
	// BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		status_t status = _ReplayRunArray(&start);
		if (status < B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;

	return fVolume->WriteSuperBlock();
}


/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	if (logEntry == journal->fEntries.First()) {
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the super block, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back super block: "
				"%s\n", strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
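
/*	Because the block cache may complete transactions out of order (see
	_TransactionWritten() above), only the completion of the oldest entry in
	fEntries moves log_start forward - it then points at the next entry still
	in flight, or at log_end if none is left. Younger entries that finish
	early are removed from the list right away, but the on-disk log_start
	does not advance past them until everything older has completed as well.
*/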

/*!	Listens to TRANSACTION_IDLE events, and flushes the log when
	that happens.
*/
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it

	Journal* journal = (Journal*)_journal;
	journal->_FlushLog(false, false);
}


/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	// changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// TODO: what are our options here?
			// a) abort the transaction - bad, because all changes are lost
			// b) carry out the changes, but don't use the log - even worse,
			//    as it potentially creates a corrupted disk.
			dprintf("transaction too large (%d blocks, %d main, log size "
				"%d)!\n", (int)_TransactionSize(),
				(int)cache_blocks_in_main_transaction(fVolume->BlockCache(),
					fTransactionID), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(
				fVolume->BlockCache(), fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}
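	// What follows gathers, for each run_array, one iovec for the index
	// block plus one per data block (adjacent blocks get merged by
	// add_to_iovec()), and writes them out with writev_pos(). If an entry
	// would cross the end of the log area, the part gathered so far is
	// written first and the remainder continues at the beginning of the
	// area - that is the wrap handling below.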
	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
		int32 wrap = fLogSize - logStart;

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
							+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the super block

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drive's own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
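
/*	When a log entry is actually written is decided elsewhere: small
	transactions are only counted in fUnwrittenTransactions by
	_TransactionDone() (anything below fMaxTransactionSize, i.e. half the
	log size minus a few blocks, is batched), and the batch is flushed
	either when the cache reports the transaction as idle
	(_TransactionIdle()), when a transaction grows too large, or explicitly
	via FlushLogAndBlocks().
*/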

/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it
	will also write back all dirty blocks for this volume.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n",
				strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}


/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}


status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	// we need to be able to detach the latest sub transaction on demand,
	// and we also need some kind of fallback plan in case the sub
	// transaction itself grows bigger than the log.
	// For that, it would be nice to have some call-back interface in the
	// cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
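
/*	The locking model in short, as implemented by Lock()/Unlock(): fLock is
	recursive, so nested Lock() calls from the same thread normally just join
	the transaction that is already running, and only the outermost
	Lock()/Unlock() pair starts and ends a cache transaction. If batched
	transactions are still unwritten, a sub transaction is started on top of
	them instead of a fresh one. With \a separateSubTransactions set, nested
	callers no longer share the current transaction but get their own
	(sub-)transaction handling.
*/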

void
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (fOwner != NULL)
			_TransactionDone(success);

		fTimestamp = system_time();
		fOwner = NULL;

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	}

	recursive_lock_unlock(&fLock);
}


uint32
Journal::_TransactionSize() const
{
	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
		fTransactionID);
	if (count <= 0)
		return 0;

	// take the number of array blocks in this transaction into account
	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
	return count + arrayBlocks;
}


status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction())
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
		else
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}


// #pragma mark - debugger commands


#ifdef BFS_DEBUGGER_COMMANDS

void
Journal::Dump()
{
	kprintf("log start: %ld\n", fVolume->LogStart());
	kprintf("log end:   %ld\n", fVolume->LogEnd());
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6ld %6lu %6lu\n", entry, entry->TransactionID(),
			entry->Start(), entry->Length());
	}
}


int
dump_journal(int argc, char** argv)
{
	if (argc != 2 || !strcmp(argv[1], "--help")) {
		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
		return 0;
	}

	Volume* volume = (Volume*)parse_expression(argv[1]);
	Journal* journal = volume->GetJournal(0);

	journal->Dump();
	return 0;
}

#endif	// BFS_DEBUGGER_COMMANDS


// #pragma mark - Transaction


status_t
Transaction::Start(Volume* volume, off_t refBlock)
{
	// has it already been started?
	if (fJournal != NULL)
		return B_OK;

	fJournal = volume->GetJournal(refBlock);
	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
		return B_OK;

	fJournal = NULL;
	return B_ERROR;
}
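
/*	A rough sketch of how a transaction interacts with inodes, based on the
	methods below: while a transaction is running, every inode it touches is
	registered via AddInode(), which write-locks the inode, takes an extra
	vnode reference, and tags it with INODE_IN_TRANSACTION. When the
	transaction ends, _UnlockInodes() (or an explicit RemoveInode()) drops
	the tag, the lock, and the extra reference again, and puts deleted
	inodes back onto the volume's removed-inodes list.
*/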

/*!	Adds an inode to this transaction. This means that the inode will be
	write locked until the transaction has ended.
	To ensure that the inode will stay valid until that point, an extra
	reference to it is acquired for as long as this transaction stays active.
*/
void
Transaction::AddInode(Inode* inode)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	// These flags can only change while holding the transaction lock
	if ((inode->Flags() & INODE_IN_TRANSACTION) != 0)
		return;

	// We share the same list link with the removed list, so we have to
	// remove the inode from that list here (and add it back when we no
	// longer need it)
	if ((inode->Flags() & INODE_DELETED) != 0)
		GetVolume()->RemovedInodes().Remove(inode);

	if (!GetVolume()->IsInitializing())
		acquire_vnode(GetVolume()->FSVolume(), inode->ID());

	rw_lock_write_lock(&inode->Lock());
	fLockedInodes.Add(inode);
	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
}


void
Transaction::RemoveInode(Inode* inode)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
	fLockedInodes.Remove(inode);
	rw_lock_write_unlock(&inode->Lock());

	// See AddInode() for why we do this here
	if ((inode->Flags() & INODE_DELETED) != 0)
		GetVolume()->RemovedInodes().Add(inode);

	if (!GetVolume()->IsInitializing())
		put_vnode(GetVolume()->FSVolume(), inode->ID());
}


void
Transaction::_UnlockInodes()
{
	while (Inode* inode = fLockedInodes.RemoveHead()) {
		inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
		rw_lock_write_unlock(&inode->Lock());

		// See AddInode() for why we do this here
		if ((inode->Flags() & INODE_DELETED) != 0)
			GetVolume()->RemovedInodes().Add(inode);

		if (!GetVolume()->IsInitializing())
			put_vnode(GetVolume()->FSVolume(), inode->ID());
	}
}