1 /* 2 * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de. 3 * This file may be used under the terms of the MIT License. 4 */ 5 6 7 //! Transaction and logging 8 9 10 #include "Journal.h" 11 12 #include "Debug.h" 13 #include "Inode.h" 14 15 16 struct run_array { 17 int32 count; 18 int32 max_runs; 19 block_run runs[0]; 20 21 void Init(int32 blockSize); 22 void Insert(block_run& run); 23 24 int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); } 25 int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; } 26 // that -1 accounts for an off-by-one error in Be's BFS implementation 27 const block_run& RunAt(int32 i) const { return runs[i]; } 28 29 static int32 MaxRuns(int32 blockSize); 30 31 private: 32 static int _Compare(block_run& a, block_run& b); 33 int32 _FindInsertionIndex(block_run& run); 34 }; 35 36 class RunArrays { 37 public: 38 RunArrays(Journal* journal); 39 ~RunArrays(); 40 41 status_t Insert(off_t blockNumber); 42 43 run_array* ArrayAt(int32 i) { return fArrays.Array()[i]; } 44 int32 CountArrays() const { return fArrays.CountItems(); } 45 46 uint32 CountBlocks() const { return fBlockCount; } 47 uint32 LogEntryLength() const 48 { return CountBlocks() + CountArrays(); } 49 50 int32 MaxArrayLength(); 51 52 private: 53 status_t _AddArray(); 54 bool _ContainsRun(block_run& run); 55 bool _AddRun(block_run& run); 56 57 Journal* fJournal; 58 uint32 fBlockCount; 59 Stack<run_array*> fArrays; 60 run_array* fLastArray; 61 }; 62 63 class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> { 64 public: 65 LogEntry(Journal* journal, uint32 logStart, 66 uint32 length); 67 ~LogEntry(); 68 69 uint32 Start() const { return fStart; } 70 uint32 Length() const { return fLength; } 71 72 #ifdef BFS_DEBUGGER_COMMANDS 73 void SetTransactionID(int32 id) { fTransactionID = id; } 74 int32 TransactionID() const { return fTransactionID; } 75 #endif 76 77 Journal* GetJournal() { return fJournal; } 78 79 private: 80 Journal* fJournal; 81 uint32 
fStart; 82 uint32 fLength; 83 #ifdef BFS_DEBUGGER_COMMANDS 84 int32 fTransactionID; 85 #endif 86 }; 87 88 89 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE) 90 namespace BFSJournalTracing { 91 92 class LogEntry : public AbstractTraceEntry { 93 public: 94 LogEntry(::LogEntry* entry, off_t logPosition, bool started) 95 : 96 fEntry(entry), 97 #ifdef BFS_DEBUGGER_COMMANDS 98 fTransactionID(entry->TransactionID()), 99 #endif 100 fStart(entry->Start()), 101 fLength(entry->Length()), 102 fLogPosition(logPosition), 103 fStarted(started) 104 { 105 Initialized(); 106 } 107 108 virtual void AddDump(TraceOutput& out) 109 { 110 #ifdef BFS_DEBUGGER_COMMANDS 111 out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s " 112 "%lu\n", fStarted ? "Started" : "Written", fEntry, 113 fTransactionID, fStart, fLength, 114 fStarted ? "end" : "start", fLogPosition); 115 #else 116 out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n", 117 fStarted ? "Started" : "Written", fEntry, fStart, fLength, 118 fStarted ? 
"end" : "start", fLogPosition); 119 #endif 120 } 121 122 private: 123 ::LogEntry* fEntry; 124 #ifdef BFS_DEBUGGER_COMMANDS 125 int32 fTransactionID; 126 #endif 127 uint32 fStart; 128 uint32 fLength; 129 uint32 fLogPosition; 130 bool fStarted; 131 }; 132 133 } // namespace BFSJournalTracing 134 135 # define T(x) new(std::nothrow) BFSJournalTracing::x; 136 #else 137 # define T(x) ; 138 #endif 139 140 141 // #pragma mark - 142 143 144 static void 145 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address, 146 size_t size) 147 { 148 if (index > 0 && (addr_t)vecs[index - 1].iov_base 149 + vecs[index - 1].iov_len == (addr_t)address) { 150 // the iovec can be combined with the previous one 151 vecs[index - 1].iov_len += size; 152 return; 153 } 154 155 if (index == max) 156 panic("no more space for iovecs!"); 157 158 // we need to start a new iovec 159 vecs[index].iov_base = const_cast<void*>(address); 160 vecs[index].iov_len = size; 161 index++; 162 } 163 164 165 // #pragma mark - LogEntry 166 167 168 LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length) 169 : 170 fJournal(journal), 171 fStart(start), 172 fLength(length) 173 { 174 } 175 176 177 LogEntry::~LogEntry() 178 { 179 } 180 181 182 // #pragma mark - run_array 183 184 185 /*! The run_array's size equals the block size of the BFS volume, so we 186 cannot use a (non-overridden) new. 187 This makes a freshly allocated run_array ready to run. 188 */ 189 void 190 run_array::Init(int32 blockSize) 191 { 192 memset(this, 0, blockSize); 193 count = 0; 194 max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize)); 195 } 196 197 198 /*! Inserts the block_run into the array. You will have to make sure the 199 array is large enough to contain the entry before calling this function. 
200 */ 201 void 202 run_array::Insert(block_run& run) 203 { 204 int32 index = _FindInsertionIndex(run); 205 if (index == -1) { 206 // add to the end 207 runs[CountRuns()] = run; 208 } else { 209 // insert at index 210 memmove(&runs[index + 1], &runs[index], 211 (CountRuns() - index) * sizeof(off_t)); 212 runs[index] = run; 213 } 214 215 count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1); 216 } 217 218 219 /*static*/ int32 220 run_array::MaxRuns(int32 blockSize) 221 { 222 // For whatever reason, BFS restricts the maximum array size 223 uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run); 224 if (maxCount < 128) 225 return maxCount; 226 227 return 127; 228 } 229 230 231 /*static*/ int 232 run_array::_Compare(block_run& a, block_run& b) 233 { 234 int cmp = a.AllocationGroup() - b.AllocationGroup(); 235 if (cmp == 0) 236 return a.Start() - b.Start(); 237 238 return cmp; 239 } 240 241 242 int32 243 run_array::_FindInsertionIndex(block_run& run) 244 { 245 int32 min = 0, max = CountRuns() - 1; 246 int32 i = 0; 247 if (max >= 8) { 248 while (min <= max) { 249 i = (min + max) / 2; 250 251 int cmp = _Compare(runs[i], run); 252 if (cmp < 0) 253 min = i + 1; 254 else if (cmp > 0) 255 max = i - 1; 256 else 257 return -1; 258 } 259 260 if (_Compare(runs[i], run) < 0) 261 i++; 262 } else { 263 for (; i <= max; i++) { 264 if (_Compare(runs[i], run) > 0) 265 break; 266 } 267 if (i == count) 268 return -1; 269 } 270 271 return i; 272 } 273 274 275 // #pragma mark - RunArrays 276 277 278 RunArrays::RunArrays(Journal* journal) 279 : 280 fJournal(journal), 281 fBlockCount(0), 282 fArrays(), 283 fLastArray(NULL) 284 { 285 } 286 287 288 RunArrays::~RunArrays() 289 { 290 run_array* array; 291 while (fArrays.Pop(&array)) 292 free(array); 293 } 294 295 296 bool 297 RunArrays::_ContainsRun(block_run& run) 298 { 299 for (int32 i = 0; i < CountArrays(); i++) { 300 run_array* array = ArrayAt(i); 301 302 for (int32 j = 0; j < array->CountRuns(); j++) { 303 block_run& arrayRun = 
array->runs[j]; 304 if (run.AllocationGroup() != arrayRun.AllocationGroup()) 305 continue; 306 307 if (run.Start() >= arrayRun.Start() 308 && run.Start() + run.Length() 309 <= arrayRun.Start() + arrayRun.Length()) 310 return true; 311 } 312 } 313 314 return false; 315 } 316 317 318 /*! Adds the specified block_run into the array. 319 Note: it doesn't support overlapping - it must only be used 320 with block_runs of length 1! 321 */ 322 bool 323 RunArrays::_AddRun(block_run& run) 324 { 325 ASSERT(run.length == 1); 326 327 // Be's BFS log replay routine can only deal with block_runs of size 1 328 // A pity, isn't it? Too sad we have to be compatible. 329 330 if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns()) 331 return false; 332 333 fLastArray->Insert(run); 334 fBlockCount++; 335 return true; 336 } 337 338 339 status_t 340 RunArrays::_AddArray() 341 { 342 int32 blockSize = fJournal->GetVolume()->BlockSize(); 343 344 run_array* array = (run_array*)malloc(blockSize); 345 if (array == NULL) 346 return B_NO_MEMORY; 347 348 if (fArrays.Push(array) != B_OK) { 349 free(array); 350 return B_NO_MEMORY; 351 } 352 353 array->Init(blockSize); 354 fLastArray = array; 355 return B_OK; 356 } 357 358 359 status_t 360 RunArrays::Insert(off_t blockNumber) 361 { 362 Volume* volume = fJournal->GetVolume(); 363 block_run run = volume->ToBlockRun(blockNumber); 364 365 if (fLastArray != NULL) { 366 // check if the block is already in the array 367 if (_ContainsRun(run)) 368 return B_OK; 369 } 370 371 // insert block into array 372 373 if (!_AddRun(run)) { 374 // array is full 375 if (_AddArray() != B_OK || !_AddRun(run)) 376 return B_NO_MEMORY; 377 } 378 379 return B_OK; 380 } 381 382 383 int32 384 RunArrays::MaxArrayLength() 385 { 386 int32 max = 0; 387 for (int32 i = 0; i < CountArrays(); i++) { 388 if (ArrayAt(i)->CountRuns() > max) 389 max = ArrayAt(i)->CountRuns(); 390 } 391 392 return max; 393 } 394 395 396 // #pragma mark - Journal 397 398 399 
/*!	Sets up the journal for \a volume. The log area itself lives on disk
	(described by volume->Log()); this only initializes the in-memory state
	and locks. fMaxTransactionSize is the batching threshold used by
	_TransactionDone(); the "/ 2 - 5" presumably leaves headroom for
	run_array index blocks and a second pending entry - TODO confirm the
	exact rationale.
*/
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");
}


/*!	Flushes any pending log entry and all dirty blocks before tearing the
	locks down, so the volume is left in a clean state.
*/
Journal::~Journal()
{
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);
}


/*!	Always reports success - construction cannot currently fail.
	NOTE(review): ReplayLog()'s comment claims it is called from InitCheck()
	when the log pointers differ, but this InitCheck() does nothing;
	presumably the replay is triggered by the Volume during mounting -
	confirm against the caller.
*/
status_t
Journal::InitCheck()
{
	return B_OK;
}


/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within a the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}


/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	// the first log block of an entry is its run_array index block
	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the super block - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid super block!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 to account for the run_array index block itself

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// copy the logged block to its final location on disk
			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
	return B_OK;
}


/*!	Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointer don't match.
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	// equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		// _ReplayRunArray() bumps start past the entry; wrap it back into
		// the circular log
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));
	// mark the log empty and the disk clean again
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;

	return fVolume->WriteSuperBlock();
}


/*!	Returns the size of the current transaction in blocks, counting the
	sub-transaction if one is open. Unlike _TransactionSize(), this does not
	include the run_array index blocks.
*/
size_t
Journal::CurrentTransactionSize() const
{
	if (_HasSubTransaction()) {
		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
			fTransactionID);
	}

	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
		fTransactionID);
}


/*!	Returns whether the current transaction can no longer fit into the
	on-disk log area.
*/
bool
Journal::CurrentTransactionTooLarge() const
{
	return CurrentTransactionSize() > fLogSize;
}


/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	// only the oldest entry can advance log_start; out-of-order completions
	// just remove their entry and wait for the head to finish
	if (logEntry == journal->fEntries.First()) {
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the super block, and change the disk's state, if necessary

	if (update) {
		// both fields are in BFS endianness, so they compare directly
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back super block: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}


/*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it
	// (non-blocking, without forcing the dirty blocks out)

	Journal* journal = (Journal*)_journal;
	journal->_FlushLog(false, false);
}


/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	// changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
		// byte offset of the log area on the device
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
			// count starts at 1: the index block is the first log block
		int32 wrap = fLogSize - logStart;

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
							+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
						// NOTE(review): blocks obtained in earlier
						// iterations are not block_cache_put() here -
						// presumably leaked on this error path; confirm
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry* logEntry = new LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the super block

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}


/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}


/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}


/*!	Locks the journal on behalf of \a owner and starts a (sub-)transaction
	if needed. When called recursively without \a separateSubTransactions,
	the already running transaction is reused.
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	// we need to be able to either detach the latest sub transaction on
	// demand, as well as having some kind of fall back plan in case the
	// sub transaction itself grows bigger than the log.
	// For that, it would be nice to have some call-back interface in the
	// cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}


/*!	Unlocks the journal; if this drops the outermost lock (or separate
	sub-transactions are in effect), the transaction is ended (committed or
	aborted depending on \a success) and the owner's listeners are notified.
	Otherwise the listeners are just moved to the enclosing transaction.
*/
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;
					// NOTE(review): this early return leaves fLock held -
					// presumably intentional since the transaction could not
					// be closed, but confirm against the callers

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else
		owner->MoveListenersTo(fOwner);

	recursive_lock_unlock(&fLock);
	return B_OK;
}


/*!	Returns the number of blocks the current transaction will occupy in the
	log, including the run_array index blocks (one per \c maxRuns data
	blocks) - in contrast to CurrentTransactionSize(), which counts data
	blocks only.
*/
uint32
Journal::_TransactionSize() const
{
	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
		fTransactionID);
	if (count <= 0)
		return 0;

	// take the number of array blocks in this transaction into account
	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
	return count + arrayBlocks;
}


/*!	Ends the current transaction: aborts it on failure, or either batches it
	with later transactions (below fMaxTransactionSize) or writes it out to
	the log right away.
*/
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction()) {
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
			// We can continue to use the parent transaction afterwards
		} else {
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
			fUnwrittenTransactions = 0;
		}

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}


// #pragma mark - debugger commands


#ifdef BFS_DEBUGGER_COMMANDS


/*!	Dumps the journal state, including all pending log entries, to the
	kernel debugger console.
*/
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf(" log start: %ld\n", fVolume->LogStart());
	kprintf(" log end: %ld\n", fVolume->LogEnd());
	kprintf(" owner: %p\n", fOwner);
	kprintf(" log size: %lu\n", fLogSize);
	kprintf(" max transaction size: %lu\n", fMaxTransactionSize);
	kprintf(" used: %lu\n", fUsed);
	kprintf(" unwritten: %ld\n", fUnwrittenTransactions);
	kprintf(" timestamp: %lld\n", fTimestamp);
	kprintf(" transaction ID: %ld\n", fTransactionID);
	kprintf(" has subtransaction: %d\n", fHasSubtransaction);
	kprintf(" separate sub-trans.: %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf(" address id start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf(" %p %6ld %6lu %6lu\n", entry, entry->TransactionID(),
			entry->Start(), entry->Length());
	}
}


/*!	Kernel debugger command: dumps the journal of the given Volume. */
int
dump_journal(int argc, char** argv)
{
	if (argc != 2 || !strcmp(argv[1], "--help")) {
		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
		return 0;
	}

	Volume* volume = (Volume*)parse_expression(argv[1]);
	Journal* journal = volume->GetJournal(0);

	journal->Dump();
	return 0;
}


#endif	// BFS_DEBUGGER_COMMANDS


// #pragma mark - TransactionListener


TransactionListener::TransactionListener()
{
}


TransactionListener::~TransactionListener()
{
}


// #pragma mark - Transaction


/*!	Attaches this transaction to the volume's journal and locks it.
	Calling Start() on an already started transaction is a no-op.
*/
status_t
Transaction::Start(Volume* volume, off_t refBlock)
{
	// has it already been started?
	if (fJournal != NULL)
		return B_OK;

	fJournal = volume->GetJournal(refBlock);
	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
		return B_OK;

	fJournal = NULL;
	return B_ERROR;
}


/*!	Registers \a listener to be notified when this transaction is done;
	must only be called while the transaction is running.
*/
void
Transaction::AddListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Add(listener);
}


/*!	Removes \a listener again and tells it so. */
void
Transaction::RemoveListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Remove(listener);
	listener->RemovedFromTransaction();
}


/*!	Notifies all listeners of the transaction's outcome and detaches them. */
void
Transaction::NotifyListeners(bool success)
{
	while (TransactionListener* listener = fListeners.RemoveHead()) {
		listener->TransactionDone(success);
		listener->RemovedFromTransaction();
	}
}


/*!	Move the inodes into the parent transaction. This is needed only to make
	sure they will still be reverted in case the transaction is aborted.
*/
void
Transaction::MoveListenersTo(Transaction* transaction)
{
	while (TransactionListener* listener = fListeners.RemoveHead()) {
		transaction->fListeners.Add(listener);
	}
}