1 /*
2 * Copyright 2001-2020, Axel Dörfler, axeld@pinc-software.de.
3 * This file may be used under the terms of the MIT License.
4 */
5
6
7 //! Transaction and logging
8
9
10 #include <StackOrHeapArray.h>
11
12 #include "Journal.h"
13
14 #include "Debug.h"
15 #include "Inode.h"
16
17
// On-disk header block of one log entry: a sorted array of block_runs that
// names the volume blocks the following log blocks will be written back to.
// The structure occupies exactly one volume block; "runs" extends to its end.
struct run_array {
	int32		count;		// number of valid entries in runs[], BFS endianness
	int32		max_runs;	// capacity of runs[], BFS endianness
	block_run	runs[0];	// flexible array member, fills the rest of the block

	void Init(int32 blockSize);
	void Insert(block_run& run);

	// Number of runs currently stored, converted to host endianness.
	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	// Usable capacity of this array (one less than the stored value).
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	// How many runs fit into an array of the given block size (capped).
	static int32 MaxRuns(int32 blockSize);

private:
	// Sort order: allocation group first, then start offset.
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};
37
// Collects the set of blocks changed by a transaction as a list of
// block-sized run_arrays, ready to be written to the on-disk log.
class RunArrays {
public:
	RunArrays(Journal* journal);
	~RunArrays();

	// Adds the given block to the arrays (no-op if already contained).
	status_t Insert(off_t blockNumber);

	run_array* ArrayAt(int32 i) { return fArrays.Array()[i]; }
	int32 CountArrays() const { return fArrays.CountItems(); }

	// Number of data blocks collected so far (excluding index blocks).
	uint32 CountBlocks() const { return fBlockCount; }
	// Total log space needed: data blocks plus one index block per array.
	uint32 LogEntryLength() const
	{ return CountBlocks() + CountArrays(); }

	// Largest run count over all arrays (used to size the iovec list).
	int32 MaxArrayLength();

private:
	status_t _AddArray();
	bool _ContainsRun(block_run& run);
	bool _AddRun(block_run& run);

	Journal* fJournal;
	uint32 fBlockCount;		// total data blocks over all arrays
	Stack<run_array*> fArrays;	// all arrays, in insertion order
	run_array* fLastArray;		// array currently being filled (or NULL)
};
64
// Describes one entry written to the log: its start position (in log
// blocks) and length. Entries are linked in write order on Journal's
// fEntries list and removed again in _TransactionWritten().
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
	LogEntry(Journal* journal, uint32 logStart,
		uint32 length);
	~LogEntry();

	// Start offset of this entry within the log (not yet wrapped).
	uint32 Start() const { return fStart; }
	// Number of log blocks this entry occupies (index + data blocks).
	uint32 Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
	void SetTransactionID(int32 id) { fTransactionID = id; }
	int32 TransactionID() const { return fTransactionID; }
#endif

	Journal* GetJournal() { return fJournal; }

private:
	Journal* fJournal;
	uint32 fStart;
	uint32 fLength;
#ifdef BFS_DEBUGGER_COMMANDS
	int32 fTransactionID;	// for the "jnl" debugger output only
#endif
};
89
90
#if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
namespace BFSJournalTracing {

// Trace entry recording when a log entry was started or written back;
// see the T() macro below for how these get created.
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry* fEntry;		// not dereferenced when dumping; address only
#ifdef BFS_DEBUGGER_COMMANDS
	int32 fTransactionID;
#endif
	uint32 fStart;
	uint32 fLength;
	uint32 fLogPosition;
	bool fStarted;
};

}	// namespace BFSJournalTracing

// T() creates a trace entry when tracing is enabled, and is a no-op otherwise.
#	define T(x) new(std::nothrow) BFSJournalTracing::x;
#else
#	define T(x) ;
#endif
141
142
143 // #pragma mark -
144
145
146 static void
add_to_iovec(iovec * vecs,int32 & index,int32 max,const void * address,size_t size)147 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
148 size_t size)
149 {
150 if (index > 0 && (addr_t)vecs[index - 1].iov_base
151 + vecs[index - 1].iov_len == (addr_t)address) {
152 // the iovec can be combined with the previous one
153 vecs[index - 1].iov_len += size;
154 return;
155 }
156
157 if (index == max)
158 panic("no more space for iovecs!");
159
160 // we need to start a new iovec
161 vecs[index].iov_base = const_cast<void*>(address);
162 vecs[index].iov_len = size;
163 index++;
164 }
165
166
167 // #pragma mark - LogEntry
168
169
LogEntry(Journal * journal,uint32 start,uint32 length)170 LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
171 :
172 fJournal(journal),
173 fStart(start),
174 fLength(length)
175 {
176 }
177
178
~LogEntry()179 LogEntry::~LogEntry()
180 {
181 }
182
183
184 // #pragma mark - run_array
185
186
/*! The run_array's size equals the block size of the BFS volume, so we
	cannot use a (non-overridden) new.
	This makes a freshly allocated run_array ready to run.
*/
void
run_array::Init(int32 blockSize)
{
	// zero the whole block, not just sizeof(run_array) - the flexible
	// runs[] member extends to the end of the volume block
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}
198
199
200 /*! Inserts the block_run into the array. You will have to make sure the
201 array is large enough to contain the entry before calling this function.
202 */
203 void
Insert(block_run & run)204 run_array::Insert(block_run& run)
205 {
206 int32 index = _FindInsertionIndex(run);
207 if (index == -1) {
208 // add to the end
209 runs[CountRuns()] = run;
210 } else {
211 // insert at index
212 memmove(&runs[index + 1], &runs[index],
213 (CountRuns() - index) * sizeof(off_t));
214 runs[index] = run;
215 }
216
217 count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
218 }
219
220
221 /*static*/ int32
MaxRuns(int32 blockSize)222 run_array::MaxRuns(int32 blockSize)
223 {
224 // For whatever reason, BFS restricts the maximum array size
225 uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
226 if (maxCount < 128)
227 return maxCount;
228
229 return 127;
230 }
231
232
233 /*static*/ int
_Compare(block_run & a,block_run & b)234 run_array::_Compare(block_run& a, block_run& b)
235 {
236 int cmp = a.AllocationGroup() - b.AllocationGroup();
237 if (cmp == 0)
238 return a.Start() - b.Start();
239
240 return cmp;
241 }
242
243
244 int32
_FindInsertionIndex(block_run & run)245 run_array::_FindInsertionIndex(block_run& run)
246 {
247 int32 min = 0, max = CountRuns() - 1;
248 int32 i = 0;
249 if (max >= 8) {
250 while (min <= max) {
251 i = (min + max) / 2;
252
253 int cmp = _Compare(runs[i], run);
254 if (cmp < 0)
255 min = i + 1;
256 else if (cmp > 0)
257 max = i - 1;
258 else
259 return -1;
260 }
261
262 if (_Compare(runs[i], run) < 0)
263 i++;
264 } else {
265 for (; i <= max; i++) {
266 if (_Compare(runs[i], run) > 0)
267 break;
268 }
269 if (i == count)
270 return -1;
271 }
272
273 return i;
274 }
275
276
277 // #pragma mark - RunArrays
278
279
RunArrays(Journal * journal)280 RunArrays::RunArrays(Journal* journal)
281 :
282 fJournal(journal),
283 fBlockCount(0),
284 fArrays(),
285 fLastArray(NULL)
286 {
287 }
288
289
~RunArrays()290 RunArrays::~RunArrays()
291 {
292 run_array* array;
293 while (fArrays.Pop(&array))
294 free(array);
295 }
296
297
298 bool
_ContainsRun(block_run & run)299 RunArrays::_ContainsRun(block_run& run)
300 {
301 for (int32 i = 0; i < CountArrays(); i++) {
302 run_array* array = ArrayAt(i);
303
304 for (int32 j = 0; j < array->CountRuns(); j++) {
305 block_run& arrayRun = array->runs[j];
306 if (run.AllocationGroup() != arrayRun.AllocationGroup())
307 continue;
308
309 if (run.Start() >= arrayRun.Start()
310 && run.Start() + run.Length()
311 <= arrayRun.Start() + arrayRun.Length())
312 return true;
313 }
314 }
315
316 return false;
317 }
318
319
320 /*! Adds the specified block_run into the array.
321 Note: it doesn't support overlapping - it must only be used
322 with block_runs of length 1!
323 */
324 bool
_AddRun(block_run & run)325 RunArrays::_AddRun(block_run& run)
326 {
327 ASSERT(run.length == 1);
328
329 // Be's BFS log replay routine can only deal with block_runs of size 1
330 // A pity, isn't it? Too sad we have to be compatible.
331
332 if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
333 return false;
334
335 fLastArray->Insert(run);
336 fBlockCount++;
337 return true;
338 }
339
340
341 status_t
_AddArray()342 RunArrays::_AddArray()
343 {
344 int32 blockSize = fJournal->GetVolume()->BlockSize();
345
346 run_array* array = (run_array*)malloc(blockSize);
347 if (array == NULL)
348 return B_NO_MEMORY;
349
350 if (fArrays.Push(array) != B_OK) {
351 free(array);
352 return B_NO_MEMORY;
353 }
354
355 array->Init(blockSize);
356 fLastArray = array;
357 return B_OK;
358 }
359
360
361 status_t
Insert(off_t blockNumber)362 RunArrays::Insert(off_t blockNumber)
363 {
364 Volume* volume = fJournal->GetVolume();
365 block_run run = volume->ToBlockRun(blockNumber);
366
367 if (fLastArray != NULL) {
368 // check if the block is already in the array
369 if (_ContainsRun(run))
370 return B_OK;
371 }
372
373 // insert block into array
374
375 if (!_AddRun(run)) {
376 // array is full
377 if (_AddArray() != B_OK || !_AddRun(run))
378 return B_NO_MEMORY;
379 }
380
381 return B_OK;
382 }
383
384
385 int32
MaxArrayLength()386 RunArrays::MaxArrayLength()
387 {
388 int32 max = 0;
389 for (int32 i = 0; i < CountArrays(); i++) {
390 if (ArrayAt(i)->CountRuns() > max)
391 max = ArrayAt(i)->CountRuns();
392 }
393
394 return max;
395 }
396
397
398 // #pragma mark - Journal
399
400
Journal(Volume * volume)401 Journal::Journal(Volume* volume)
402 :
403 fVolume(volume),
404 fOwner(NULL),
405 fLogSize(volume->Log().Length()),
406 fMaxTransactionSize(fLogSize / 2 - 5),
407 fUsed(0),
408 fUnwrittenTransactions(0),
409 fHasSubtransaction(false),
410 fSeparateSubTransactions(false)
411 {
412 recursive_lock_init(&fLock, "bfs journal");
413 mutex_init(&fEntriesLock, "bfs journal entries");
414
415 fLogFlusherSem = create_sem(0, "bfs log flusher");
416 fLogFlusher = spawn_kernel_thread(&Journal::_LogFlusher, "bfs log flusher",
417 B_NORMAL_PRIORITY, this);
418 if (fLogFlusher > 0)
419 resume_thread(fLogFlusher);
420 }
421
422
~Journal()423 Journal::~Journal()
424 {
425 FlushLogAndBlocks();
426
427 recursive_lock_destroy(&fLock);
428 mutex_destroy(&fEntriesLock);
429
430 sem_id logFlusher = fLogFlusherSem;
431 fLogFlusherSem = -1;
432 delete_sem(logFlusher);
433 wait_for_thread(fLogFlusher, NULL);
434 }
435
436
// Currently always succeeds; there is no state to validate here.
// NOTE(review): ReplayLog()'s comment claims it is called from InitCheck()
// when the log start/end pointers differ, but no such call exists here -
// verify where ReplayLog() is actually invoked.
status_t
Journal::InitCheck()
{
	return B_OK;
}
442
443
444 /*! \brief Does a very basic consistency check of the run array.
445 It will check the maximum run count as well as if all of the runs fall
446 within a the volume.
447 */
448 status_t
_CheckRunArray(const run_array * array)449 Journal::_CheckRunArray(const run_array* array)
450 {
451 int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
452 // the -1 works around an off-by-one bug in Be's BFS implementation,
453 // same as in run_array::MaxRuns()
454 if (array->MaxRuns() != maxRuns
455 || array->CountRuns() > maxRuns
456 || array->CountRuns() <= 0) {
457 dprintf("run count: %d, array max: %d, max runs: %d\n",
458 (int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
459 FATAL(("Log entry has broken header!\n"));
460 return B_ERROR;
461 }
462
463 for (int32 i = 0; i < array->CountRuns(); i++) {
464 if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
465 return B_ERROR;
466 }
467
468 PRINT(("Log entry has %" B_PRId32 " entries\n", array->CountRuns()));
469 return B_OK;
470 }
471
472
/*! Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %" B_PRId32 ")\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;
		// the log is a ring buffer, so wrap the position into it

	CachedBlock cachedArray(fVolume);

	// the first block of the entry is the run_array index block
	status_t status = cachedArray.SetTo(logOffset + firstBlockNumber);
	if (status != B_OK)
		return status;

	const run_array* array = (const run_array*)cachedArray.Block();
	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	// the data blocks follow directly after the index block
	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			status = cached.SetTo(logOffset + blockNumber);
			if (status != B_OK)
				RETURN_ERROR(status);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the superblock - check if
				// it's valid!
				if (Volume::CheckSuperBlock(cached.Block()) != B_OK) {
					FATAL(("Log contains invalid superblock!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 to account for the index block itself when
		// advancing *_start at the end

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			status = cached.SetTo(logOffset + blockNumber);
			if (status != B_OK)
				RETURN_ERROR(status);

			// copy the logged block to its real location on disk
			ssize_t written = write_pos(fVolume->Device(), offset,
				cached.Block(), blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
	return B_OK;
}
558
559
/*! Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointer don't match.
	(NOTE(review): InitCheck() currently just returns B_OK and does not
	call this - confirm the actual call site.)
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	//	equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		// replay one entry and advance "start" past it
		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));

	// the log is now empty: make both pointers equal and mark the disk clean
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = HOST_ENDIAN_TO_BFS_INT32(
		SUPER_BLOCK_DISK_CLEAN);

	return fVolume->WriteSuperBlock();
}
615
616
617 size_t
CurrentTransactionSize() const618 Journal::CurrentTransactionSize() const
619 {
620 if (_HasSubTransaction()) {
621 return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
622 fTransactionID);
623 }
624
625 return cache_blocks_in_main_transaction(fVolume->BlockCache(),
626 fTransactionID);
627 }
628
629
630 bool
CurrentTransactionTooLarge() const631 Journal::CurrentTransactionTooLarge() const
632 {
633 return CurrentTransactionSize() > fLogSize;
634 }
635
636
/*! This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %" B_PRId32 "\n",
		logEntry, transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	// the log start can only move forward when the oldest entry completes;
	// out-of-order completions just remove their entry from the list
	if (logEntry == journal->fEntries.First()) {
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			// no more entries - the log is fully flushed
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the superblock, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back superblock: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
695
696
/*! Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it. (We can't do this
	// in this thread, as flushing the log can produce new transaction events.)
	// Waking the dedicated flusher thread defers the work instead.
	Journal* journal = (Journal*)_journal;
	release_sem(journal->fLogFlusherSem);
}
706
707
/*! Entry point of the log flusher thread: flushes the log every time the
	flusher semaphore is released (by _TransactionIdle()). The loop ends
	once ~Journal() has reset fLogFlusherSem to -1 and deleted the
	semaphore, which makes acquire_sem() fail.
*/
/*static*/ status_t
Journal::_LogFlusher(void* _journal)
{
	Journal* journal = (Journal*)_journal;
	while (journal->fLogFlusherSem >= 0) {
		if (acquire_sem(journal->fLogFlusherSem) != B_OK)
			continue;

		journal->_FlushLog(false, false);
	}
	return B_OK;
}
720
721
/*! Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
		// byte offset of the log area on the device
	off_t logStart = fVolume->LogEnd() % fLogSize;
		// write position within the (ring-buffer) log, in blocks
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	BStackOrHeapArray<iovec, 8> vecs(maxVecs);
	if (!vecs.IsValid()) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	// Each array becomes one log entry: its index block followed by the
	// data blocks of all its runs. The entry may wrap around the end of
	// the log area, in which case it is written in two parts.
	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
			// count starts at 1 to include the index block
		int32 wrap = fLogSize - logStart;
			// blocks until the log wraps around

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
							+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL)
					return B_IO_ERROR;

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the superblock

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		// the detached sub-transaction lives on as the new current one
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
924
925
/*! Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
	If \a canWait is \c false the journal lock is only tried, and the call
	bails out instead of blocking.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
957
958
/*! Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
	Blocks until the journal lock could be acquired.
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}
967
968
/*! Locks the journal for the given transaction \a owner (which may be NULL)
	and starts a cache (sub-)transaction for it. Nested calls without
	separate sub-transactions simply reuse the current transaction.
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	// chain the new owner onto the previous one
	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		// get notified once the transaction goes idle, so we can flush it
		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
1014
1015
/*! Unlocks the journal for \a owner, ending (or batching) its transaction
	with the given \a success state; see _TransactionDone().
*/
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else {
		// the nested transaction lives on - hand its listeners to the
		// enclosing one so they still get notified (and reverted on abort)
		owner->MoveListenersTo(fOwner);
	}

	recursive_lock_unlock(&fLock);
	return B_OK;
}
1051
1052
1053 uint32
_TransactionSize() const1054 Journal::_TransactionSize() const
1055 {
1056 int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1057 fTransactionID);
1058 if (count <= 0)
1059 return 0;
1060
1061 // take the number of array blocks in this transaction into account
1062 uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1063 uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1064 return count + arrayBlocks;
1065 }
1066
1067
/*! Ends the current transaction: on failure the cache (sub-)transaction is
	aborted; on success the transaction is either batched with later ones
	(if still small enough) or written to the log right away.
*/
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction()) {
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
			// We can continue to use the parent transaction afterwards
		} else {
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
			fUnwrittenTransactions = 0;
		}

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}
1098
1099
1100 // #pragma mark - debugger commands
1101
1102
1103 #ifdef BFS_DEBUGGER_COMMANDS
1104
1105
// Kernel debugger helper: dumps the journal's state and its pending log
// entries (used by the dump_journal() command below).
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf("  log start:            %" B_PRId32 "\n", fVolume->LogStart());
	kprintf("  log end:              %" B_PRId32 "\n", fVolume->LogEnd());
	kprintf("  owner:                %p\n", fOwner);
	kprintf("  log size:             %" B_PRIu32 "\n", fLogSize);
	kprintf("  max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize);
	kprintf("  used:                 %" B_PRIu32 "\n", fUsed);
	kprintf("  unwritten:            %" B_PRId32 "\n", fUnwrittenTransactions);
	kprintf("  timestamp:            %" B_PRId64 "\n", fTimestamp);
	kprintf("  transaction ID:       %" B_PRId32 "\n", fTransactionID);
	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry,
			entry->TransactionID(), entry->Start(), entry->Length());
	}
}
1133
1134
1135 int
dump_journal(int argc,char ** argv)1136 dump_journal(int argc, char** argv)
1137 {
1138 if (argc != 2 || !strcmp(argv[1], "--help")) {
1139 kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
1140 return 0;
1141 }
1142
1143 Volume* volume = (Volume*)parse_expression(argv[1]);
1144 Journal* journal = volume->GetJournal(0);
1145
1146 journal->Dump();
1147 return 0;
1148 }
1149
1150
1151 #endif // BFS_DEBUGGER_COMMANDS
1152
1153
1154 // #pragma mark - TransactionListener
1155
1156
TransactionListener()1157 TransactionListener::TransactionListener()
1158 {
1159 }
1160
1161
~TransactionListener()1162 TransactionListener::~TransactionListener()
1163 {
1164 }
1165
1166
1167 // #pragma mark - Transaction
1168
1169
1170 status_t
Start(Volume * volume,off_t refBlock)1171 Transaction::Start(Volume* volume, off_t refBlock)
1172 {
1173 // has it already been started?
1174 if (fJournal != NULL)
1175 return B_OK;
1176
1177 fJournal = volume->GetJournal(refBlock);
1178 if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1179 return B_OK;
1180
1181 fJournal = NULL;
1182 return B_ERROR;
1183 }
1184
1185
// Registers \a listener to be notified when this transaction ends
// (see NotifyListeners()). The transaction must be running.
void
Transaction::AddListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Add(listener);
}
1194
1195
// Unregisters \a listener and tells it that it left the transaction.
// The transaction must be running.
void
Transaction::RemoveListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	// remove first, then notify - the callback may delete the listener
	// (NOTE(review): ordering kept as-is; confirm callback semantics)
	fListeners.Remove(listener);
	listener->RemovedFromTransaction();
}
1205
1206
1207 void
NotifyListeners(bool success)1208 Transaction::NotifyListeners(bool success)
1209 {
1210 while (TransactionListener* listener = fListeners.RemoveHead()) {
1211 listener->TransactionDone(success);
1212 listener->RemovedFromTransaction();
1213 }
1214 }
1215
1216
1217 /*! Move the inodes into the parent transaction. This is needed only to make
1218 sure they will still be reverted in case the transaction is aborted.
1219 */
1220 void
MoveListenersTo(Transaction * transaction)1221 Transaction::MoveListenersTo(Transaction* transaction)
1222 {
1223 while (TransactionListener* listener = fListeners.RemoveHead()) {
1224 transaction->fListeners.Add(listener);
1225 }
1226 }
1227