xref: /haiku/src/add-ons/kernel/file_systems/bfs/Journal.cpp (revision 68d37cfb3a755a7270d772b505ee15c8b18aa5e0)
1 /*
2  * Copyright 2001-2020, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Transaction and logging
8 
9 
10 #include <StackOrHeapArray.h>
11 
12 #include "Journal.h"
13 
14 #include "Debug.h"
15 #include "Inode.h"
16 
17 
// On-disk header of one log-entry index block: a sorted array of block_runs
// that lists the blocks a transaction touches. The structure occupies exactly
// one volume block; "runs" is a flexible array filling the remainder.
// All fields are stored in the volume's endianness (BFS_ENDIAN_* macros).
struct run_array {
	int32		count;
		// number of valid entries in runs[], disk endian
	int32		max_runs;
		// capacity as written to disk, disk endian
	block_run	runs[0];
		// sorted by (allocation group, start), see _Compare()

	void Init(int32 blockSize);
	void Insert(block_run& run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};
37 
// Builds up the set of run_array index blocks for one transaction that is
// about to be written to the log. Owns the (block-sized) arrays it allocates.
class RunArrays {
public:
							RunArrays(Journal* journal);
							~RunArrays();

			status_t		Insert(off_t blockNumber);

			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
			int32			CountArrays() const { return fArrays.CountItems(); }

			// number of data blocks collected so far (excluding index blocks)
			uint32			CountBlocks() const { return fBlockCount; }
			// total log blocks needed: data blocks plus one index block
			// per run_array
			uint32			LogEntryLength() const
								{ return CountBlocks() + CountArrays(); }

			int32			MaxArrayLength();

private:
			status_t		_AddArray();
			bool			_ContainsRun(block_run& run);
			bool			_AddRun(block_run& run);

			Journal*		fJournal;
			uint32			fBlockCount;
			Stack<run_array*> fArrays;
			run_array*		fLastArray;
				// array currently being filled (top of fArrays)
};
64 
// Describes one entry already written to the on-disk log that has not yet
// been completed (i.e. whose blocks the cache has not flushed to their final
// location). Kept in Journal::fEntries, guarded by Journal::fEntriesLock.
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
							LogEntry(Journal* journal, uint32 logStart,
								uint32 length);
							~LogEntry();

			// start block offset of the entry within the log
			uint32			Start() const { return fStart; }
			// length of the entry in blocks (index blocks included)
			uint32			Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
			void			SetTransactionID(int32 id) { fTransactionID = id; }
			int32			TransactionID() const { return fTransactionID; }
#endif

			Journal*		GetJournal() { return fJournal; }

private:
			Journal*		fJournal;
			uint32			fStart;
			uint32			fLength;
#ifdef BFS_DEBUGGER_COMMANDS
			int32			fTransactionID;
#endif
};
89 
90 
#if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
namespace BFSJournalTracing {

// Trace entry recording when a ::LogEntry is started or has been completely
// written back; snapshots the entry's state at that moment.
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};

}	// namespace BFSJournalTracing

// T() records a trace entry when tracing is enabled, and is a no-op otherwise
#	define T(x) new(std::nothrow) BFSJournalTracing::x;
#else
#	define T(x) ;
#endif
141 
142 
143 //	#pragma mark -
144 
145 
146 static void
147 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
148 	size_t size)
149 {
150 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
151 			+ vecs[index - 1].iov_len == (addr_t)address) {
152 		// the iovec can be combined with the previous one
153 		vecs[index - 1].iov_len += size;
154 		return;
155 	}
156 
157 	if (index == max)
158 		panic("no more space for iovecs!");
159 
160 	// we need to start a new iovec
161 	vecs[index].iov_base = const_cast<void*>(address);
162 	vecs[index].iov_len = size;
163 	index++;
164 }
165 
166 
167 //	#pragma mark - LogEntry
168 
169 
/*!	Creates a record for a log entry that occupies \a length blocks
	beginning at block \a start within \a journal's log area.
*/
LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}
177 
178 
// Nothing to release; the entry is unlinked from the journal's list before
// it is deleted (see Journal::_TransactionWritten()).
LogEntry::~LogEntry()
{
}
182 
183 
184 //	#pragma mark - run_array
185 
186 
/*!	The run_array's size equals the block size of the BFS volume, so we
	cannot use a (non-overridden) new.
	This makes a freshly allocated run_array ready to run: the whole block
	is zeroed, and max_runs is stored in disk endianness.
*/
void
run_array::Init(int32 blockSize)
{
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}
198 
199 
200 /*!	Inserts the block_run into the array. You will have to make sure the
201 	array is large enough to contain the entry before calling this function.
202 */
203 void
204 run_array::Insert(block_run& run)
205 {
206 	int32 index = _FindInsertionIndex(run);
207 	if (index == -1) {
208 		// add to the end
209 		runs[CountRuns()] = run;
210 	} else {
211 		// insert at index
212 		memmove(&runs[index + 1], &runs[index],
213 			(CountRuns() - index) * sizeof(off_t));
214 		runs[index] = run;
215 	}
216 
217 	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
218 }
219 
220 
221 /*static*/ int32
222 run_array::MaxRuns(int32 blockSize)
223 {
224 	// For whatever reason, BFS restricts the maximum array size
225 	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
226 	if (maxCount < 128)
227 		return maxCount;
228 
229 	return 127;
230 }
231 
232 
233 /*static*/ int
234 run_array::_Compare(block_run& a, block_run& b)
235 {
236 	int cmp = a.AllocationGroup() - b.AllocationGroup();
237 	if (cmp == 0)
238 		return a.Start() - b.Start();
239 
240 	return cmp;
241 }
242 
243 
244 int32
245 run_array::_FindInsertionIndex(block_run& run)
246 {
247 	int32 min = 0, max = CountRuns() - 1;
248 	int32 i = 0;
249 	if (max >= 8) {
250 		while (min <= max) {
251 			i = (min + max) / 2;
252 
253 			int cmp = _Compare(runs[i], run);
254 			if (cmp < 0)
255 				min = i + 1;
256 			else if (cmp > 0)
257 				max = i - 1;
258 			else
259 				return -1;
260 		}
261 
262 		if (_Compare(runs[i], run) < 0)
263 			i++;
264 	} else {
265 		for (; i <= max; i++) {
266 			if (_Compare(runs[i], run) > 0)
267 				break;
268 		}
269 		if (i == count)
270 			return -1;
271 	}
272 
273 	return i;
274 }
275 
276 
277 //	#pragma mark - RunArrays
278 
279 
/*!	Creates an empty collection of run_arrays for \a journal; arrays are
	allocated lazily on the first Insert().
*/
RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}
288 
289 
// Frees all run_arrays that were allocated by _AddArray().
RunArrays::~RunArrays()
{
	run_array* array;
	while (fArrays.Pop(&array))
		free(array);
}
296 
297 
298 bool
299 RunArrays::_ContainsRun(block_run& run)
300 {
301 	for (int32 i = 0; i < CountArrays(); i++) {
302 		run_array* array = ArrayAt(i);
303 
304 		for (int32 j = 0; j < array->CountRuns(); j++) {
305 			block_run& arrayRun = array->runs[j];
306 			if (run.AllocationGroup() != arrayRun.AllocationGroup())
307 				continue;
308 
309 			if (run.Start() >= arrayRun.Start()
310 				&& run.Start() + run.Length()
311 					<= arrayRun.Start() + arrayRun.Length())
312 				return true;
313 		}
314 	}
315 
316 	return false;
317 }
318 
319 
320 /*!	Adds the specified block_run into the array.
321 	Note: it doesn't support overlapping - it must only be used
322 	with block_runs of length 1!
323 */
324 bool
325 RunArrays::_AddRun(block_run& run)
326 {
327 	ASSERT(run.length == 1);
328 
329 	// Be's BFS log replay routine can only deal with block_runs of size 1
330 	// A pity, isn't it? Too sad we have to be compatible.
331 
332 	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
333 		return false;
334 
335 	fLastArray->Insert(run);
336 	fBlockCount++;
337 	return true;
338 }
339 
340 
341 status_t
342 RunArrays::_AddArray()
343 {
344 	int32 blockSize = fJournal->GetVolume()->BlockSize();
345 
346 	run_array* array = (run_array*)malloc(blockSize);
347 	if (array == NULL)
348 		return B_NO_MEMORY;
349 
350 	if (fArrays.Push(array) != B_OK) {
351 		free(array);
352 		return B_NO_MEMORY;
353 	}
354 
355 	array->Init(blockSize);
356 	fLastArray = array;
357 	return B_OK;
358 }
359 
360 
361 status_t
362 RunArrays::Insert(off_t blockNumber)
363 {
364 	Volume* volume = fJournal->GetVolume();
365 	block_run run = volume->ToBlockRun(blockNumber);
366 
367 	if (fLastArray != NULL) {
368 		// check if the block is already in the array
369 		if (_ContainsRun(run))
370 			return B_OK;
371 	}
372 
373 	// insert block into array
374 
375 	if (!_AddRun(run)) {
376 		// array is full
377 		if (_AddArray() != B_OK || !_AddRun(run))
378 			return B_NO_MEMORY;
379 	}
380 
381 	return B_OK;
382 }
383 
384 
385 int32
386 RunArrays::MaxArrayLength()
387 {
388 	int32 max = 0;
389 	for (int32 i = 0; i < CountArrays(); i++) {
390 		if (ArrayAt(i)->CountRuns() > max)
391 			max = ArrayAt(i)->CountRuns();
392 	}
393 
394 	return max;
395 }
396 
397 
398 //	#pragma mark - Journal
399 
400 
/*!	Creates the journal for \a volume and spawns the log flusher thread
	that writes back idle transactions in the background.
*/
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
		// transactions are batched up to roughly half the log size
		// (the origin of the extra -5 isn't documented here)
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");

	// the flusher thread blocks on this semaphore until _TransactionIdle()
	// asks it to write out the current transaction
	fLogFlusherSem = create_sem(0, "bfs log flusher");
	fLogFlusher = spawn_kernel_thread(&Journal::_LogFlusher, "bfs log flusher",
		B_NORMAL_PRIORITY, this);
	if (fLogFlusher > 0)
		resume_thread(fLogFlusher);
}
421 
422 
Journal::~Journal()
{
	// write back any pending transactions before tearing down
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);

	// Invalidate the semaphore first, then delete it: this wakes the
	// flusher thread, which notices fLogFlusherSem < 0 and exits
	// (see _LogFlusher()); wait for it to be gone before returning.
	sem_id logFlusher = fLogFlusherSem;
	fLogFlusherSem = -1;
	delete_sem(logFlusher);
	wait_for_thread(fLogFlusher, NULL);
}
435 
436 
// Currently there is nothing to check after construction; always reports
// success.
status_t
Journal::InitCheck()
{
	return B_OK;
}
442 
443 
/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// make sure every run points to a valid region of the volume
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %" B_PRId32 " entries\n", array->CountRuns()));
	return B_OK;
}
471 
472 
/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %" B_PRId32 ")\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	// the first block of the entry is its run_array index block
	status_t status = cachedArray.SetTo(logOffset + firstBlockNumber);
	if (status != B_OK)
		return status;

	const run_array* array = (const run_array*)cachedArray.Block();
	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			status = cached.SetTo(logOffset + blockNumber);
			if (status != B_OK)
				RETURN_ERROR(status);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the superblock - check if
				// it's valid!
				if (Volume::CheckSuperBlock(cached.Block()) != B_OK) {
					FATAL(("Log contains invalid superblock!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 to account for the run_array index block itself

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			status = cached.SetTo(logOffset + blockNumber);
			if (status != B_OK)
				RETURN_ERROR(status);

			// copy the logged block to its final location on disk
			ssize_t written = write_pos(fVolume->Device(), offset,
				cached.Block(), blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
	return B_OK;
}
558 
559 
/*!	Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	NOTE(review): the original comment claimed this is called by
	Journal::InitCheck() when the log start and end pointers don't match,
	but InitCheck() currently returns B_OK unconditionally - presumably
	the volume mounting code calls ReplayLog() directly; confirm.
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	//	equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	// replay one entry at a time until the whole log has been consumed
	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		// _ReplayRunArray() bumps start past the entry; wrap around the log
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));
	// the log is now fully applied: mark it empty and the disk clean
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = HOST_ENDIAN_TO_BFS_INT32(
		SUPER_BLOCK_DISK_CLEAN);

	return fVolume->WriteSuperBlock();
}
615 
616 
617 size_t
618 Journal::CurrentTransactionSize() const
619 {
620 	if (_HasSubTransaction()) {
621 		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
622 			fTransactionID);
623 	}
624 
625 	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
626 		fTransactionID);
627 }
628 
629 
// Returns whether the current transaction has outgrown the log as a whole.
bool
Journal::CurrentTransactionTooLarge() const
{
	return CurrentTransactionSize() > fLogSize;
}
635 
636 
/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %" B_PRId32 "\n",
		logEntry, transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	if (logEntry == journal->fEntries.First()) {
		// the oldest entry completed - the log can be advanced up to the
		// next still-pending entry (or to the end if there is none)
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	// the entry's space in the log is free again
	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the superblock, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back superblock: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
695 
696 
/*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it. (We can't do this
	// in this thread, as flushing the log can produce new transaction events.)
	Journal* journal = (Journal*)_journal;
	release_sem(journal->fLogFlusherSem);
}
706 
707 
/*!	Entry point of the log flusher thread: waits on the flusher semaphore
	and writes the current transaction to the log each time it is released.
	Exits once the semaphore has been invalidated by ~Journal().
*/
/*static*/ status_t
Journal::_LogFlusher(void* _journal)
{
	Journal* journal = (Journal*)_journal;
	while (journal->fLogFlusherSem >= 0) {
		if (acquire_sem(journal->fLogFlusherSem) != B_OK)
			continue;
			// the semaphore was deleted or the wait was interrupted;
			// re-check the exit condition

		journal->_FlushLog(false, false);
	}
	return B_OK;
}
720 
721 
/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	// collect every block number the (possibly detached) transaction touches
	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	BStackOrHeapArray<iovec, 8> vecs(maxVecs);
	if (!vecs.IsValid()) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	// Each run_array becomes one log entry segment: the index block itself
	// followed by the data blocks it describes, written with scatter/gather
	// I/O and wrapping around the circular log as needed.
	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
		int32 wrap = fLogSize - logStart;

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL)
					return B_IO_ERROR;
					// NOTE(review): this error path leaks the cache
					// references acquired earlier in this loop (they are
					// only put back further below) and leaves the
					// transaction open - confirm and fix separately

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the superblock

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		// keep the still-pending sub-transaction as the new current one;
		// _TransactionWritten() will free the entry once it hits disk
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
924 
925 
/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume. If \a canWait is
	\c false, the method returns immediately when the journal lock cannot
	be acquired without blocking.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
957 
958 
/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}
967 
968 
/*!	Acquires the journal for \a owner and starts (or joins) a cache
	transaction. With \a separateSubTransactions set, nested locks get
	their own sub-transactions instead of reusing the current one.
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	// chain the new owner below the previous one so nested transactions
	// can be unwound in Unlock()
	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		// get notified when the transaction goes idle, so the flusher
		// thread can write it out (see _TransactionIdle())
		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
1014 
1015 
/*!	Releases the journal lock held by \a owner, finishing the transaction
	(committing it on \a success, aborting it otherwise) when this is the
	outermost unlock.
*/
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else
		owner->MoveListenersTo(fOwner);
			// the nested transaction stays open; hand the listeners to the
			// enclosing one so they are still reverted if it aborts

	recursive_lock_unlock(&fLock);
	return B_OK;
}
1051 
1052 
1053 uint32
1054 Journal::_TransactionSize() const
1055 {
1056 	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1057 		fTransactionID);
1058 	if (count <= 0)
1059 		return 0;
1060 
1061 	// take the number of array blocks in this transaction into account
1062 	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1063 	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1064 	return count + arrayBlocks;
1065 }
1066 
1067 
/*!	Ends the current (sub-)transaction: aborts it if \a success is
	\c false, otherwise either batches it with later transactions or
	writes it to the log once it has grown large enough.
*/
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction()) {
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
			// We can continue to use the parent transaction afterwards
		} else {
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
			fUnwrittenTransactions = 0;
		}

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}
1098 
1099 
1100 //	#pragma mark - debugger commands
1101 
1102 
1103 #ifdef BFS_DEBUGGER_COMMANDS
1104 
1105 
/*!	Kernel debugger helper: prints the journal's state and all pending
	log entries. Runs lock-free, as the debugger has stopped the system.
*/
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf("  log start:            %" B_PRId32 "\n", fVolume->LogStart());
	kprintf("  log end:              %" B_PRId32 "\n", fVolume->LogEnd());
	kprintf("  owner:                %p\n", fOwner);
	kprintf("  log size:             %" B_PRIu32 "\n", fLogSize);
	kprintf("  max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize);
	kprintf("  used:                 %" B_PRIu32 "\n", fUsed);
	kprintf("  unwritten:            %" B_PRId32 "\n", fUnwrittenTransactions);
	kprintf("  timestamp:            %" B_PRId64 "\n", fTimestamp);
	kprintf("  transaction ID:       %" B_PRId32 "\n", fTransactionID);
	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry,
			entry->TransactionID(), entry->Start(), entry->Length());
	}
}
1133 
1134 
1135 int
1136 dump_journal(int argc, char** argv)
1137 {
1138 	if (argc != 2 || !strcmp(argv[1], "--help")) {
1139 		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
1140 		return 0;
1141 	}
1142 
1143 	Volume* volume = (Volume*)parse_expression(argv[1]);
1144 	Journal* journal = volume->GetJournal(0);
1145 
1146 	journal->Dump();
1147 	return 0;
1148 }
1149 
1150 
1151 #endif	// BFS_DEBUGGER_COMMANDS
1152 
1153 
1154 //	#pragma mark - TransactionListener
1155 
1156 
// Nothing to initialize; listeners are registered via
// Transaction::AddListener().
TransactionListener::TransactionListener()
{
}
1160 
1161 
// Virtual destructor anchor for the listener base class.
TransactionListener::~TransactionListener()
{
}
1165 
1166 
1167 //	#pragma mark - Transaction
1168 
1169 
1170 status_t
1171 Transaction::Start(Volume* volume, off_t refBlock)
1172 {
1173 	// has it already been started?
1174 	if (fJournal != NULL)
1175 		return B_OK;
1176 
1177 	fJournal = volume->GetJournal(refBlock);
1178 	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1179 		return B_OK;
1180 
1181 	fJournal = NULL;
1182 	return B_ERROR;
1183 }
1184 
1185 
/*!	Registers \a listener to be notified when this transaction ends;
	may only be called while the transaction is running.
*/
void
Transaction::AddListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Add(listener);
}
1194 
1195 
/*!	Detaches \a listener from this transaction and tells it so;
	may only be called while the transaction is running.
*/
void
Transaction::RemoveListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Remove(listener);
	listener->RemovedFromTransaction();
}
1205 
1206 
1207 void
1208 Transaction::NotifyListeners(bool success)
1209 {
1210 	while (TransactionListener* listener = fListeners.RemoveHead()) {
1211 		listener->TransactionDone(success);
1212 		listener->RemovedFromTransaction();
1213 	}
1214 }
1215 
1216 
1217 /*!	Move the inodes into the parent transaction. This is needed only to make
1218 	sure they will still be reverted in case the transaction is aborted.
1219 */
1220 void
1221 Transaction::MoveListenersTo(Transaction* transaction)
1222 {
1223 	while (TransactionListener* listener = fListeners.RemoveHead()) {
1224 		transaction->fListeners.Add(listener);
1225 	}
1226 }
1227