xref: /haiku/src/add-ons/kernel/file_systems/bfs/Journal.cpp (revision 5ac9b506412b11afb993bb52d161efe7666958a5)
1 /*
2  * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Transaction and logging
8 
9 
10 #include "Journal.h"
11 
12 #include "Debug.h"
13 #include "Inode.h"
14 
15 
// On-disk header of one log array block: a sorted list of the block_runs
// that make up a single log entry. The whole structure occupies exactly one
// volume block (see Init()); "runs" is a flexible array filling the rest.
struct run_array {
	int32		count;		// number of used entries (disk endian)
	int32		max_runs;	// capacity as stored on disk (disk endian)
	block_run	runs[0];	// entries, sorted by _Compare()

	void Init(int32 blockSize);
	void Insert(block_run& run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};
35 
// Collects the run_array blocks for one log entry while a transaction is
// being written to the log; grows one block-sized array at a time.
class RunArrays {
public:
							RunArrays(Journal* journal);
							~RunArrays();

			status_t		Insert(off_t blockNumber);

			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
			int32			CountArrays() const { return fArrays.CountItems(); }

			// number of data blocks over all arrays
			uint32			CountBlocks() const { return fBlockCount; }
			// log blocks needed: data blocks plus one header block per array
			uint32			LogEntryLength() const
								{ return CountBlocks() + CountArrays(); }

			int32			MaxArrayLength();

private:
			status_t		_AddArray();
			bool			_ContainsRun(block_run& run);
			bool			_AddRun(block_run& run);

			Journal*		fJournal;
			uint32			fBlockCount;	// total blocks inserted so far
			Stack<run_array*> fArrays;
			run_array*		fLastArray;		// array that new runs go into
};
62 
// In-memory record of one log entry that has been written to the on-disk
// log but whose transaction has not yet been fully flushed back; tracked in
// Journal::fEntries until _TransactionWritten() retires it.
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
							LogEntry(Journal* journal, uint32 logStart,
								uint32 length);
							~LogEntry();

			uint32			Start() const { return fStart; }
			uint32			Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
			void			SetTransactionID(int32 id) { fTransactionID = id; }
			int32			TransactionID() const { return fTransactionID; }
#endif

			Journal*		GetJournal() { return fJournal; }

private:
			Journal*		fJournal;
			uint32			fStart;		// start position within the log
			uint32			fLength;	// length in blocks
#ifdef BFS_DEBUGGER_COMMANDS
			int32			fTransactionID;
#endif
};
87 
88 
#if BFS_TRACING && !defined(FS_SHELL) && !defined(_BOOT_MODE)
namespace BFSJournalTracing {

// Trace entry recording when a ::LogEntry is started or has been written.
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};

}	// namespace BFSJournalTracing

#	define T(x) new(std::nothrow) BFSJournalTracing::x;
#else
#	define T(x) ;
#endif
139 
140 
141 //	#pragma mark -
142 
143 
144 static void
145 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
146 	size_t size)
147 {
148 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
149 			+ vecs[index - 1].iov_len == (addr_t)address) {
150 		// the iovec can be combined with the previous one
151 		vecs[index - 1].iov_len += size;
152 		return;
153 	}
154 
155 	if (index == max)
156 		panic("no more space for iovecs!");
157 
158 	// we need to start a new iovec
159 	vecs[index].iov_base = const_cast<void*>(address);
160 	vecs[index].iov_len = size;
161 	index++;
162 }
163 
164 
165 //	#pragma mark - LogEntry
166 
167 
// \a start and \a length are in log blocks; see Journal::_WriteTransactionToLog().
LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}
175 
176 
// Nothing to release; the entry does not own the journal it points to.
LogEntry::~LogEntry()
{
}
180 
181 
182 //	#pragma mark - run_array
183 
184 
185 /*!	The run_array's size equals the block size of the BFS volume, so we
186 	cannot use a (non-overridden) new.
187 	This makes a freshly allocated run_array ready to run.
188 */
void
run_array::Init(int32 blockSize)
{
	// clear the entire block first - the fields are assigned afterwards,
	// so the order of these statements must not change
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}
196 
197 
198 /*!	Inserts the block_run into the array. You will have to make sure the
199 	array is large enough to contain the entry before calling this function.
200 */
201 void
202 run_array::Insert(block_run& run)
203 {
204 	int32 index = _FindInsertionIndex(run);
205 	if (index == -1) {
206 		// add to the end
207 		runs[CountRuns()] = run;
208 	} else {
209 		// insert at index
210 		memmove(&runs[index + 1], &runs[index],
211 			(CountRuns() - index) * sizeof(off_t));
212 		runs[index] = run;
213 	}
214 
215 	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
216 }
217 
218 
219 /*static*/ int32
220 run_array::MaxRuns(int32 blockSize)
221 {
222 	// For whatever reason, BFS restricts the maximum array size
223 	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
224 	if (maxCount < 128)
225 		return maxCount;
226 
227 	return 127;
228 }
229 
230 
231 /*static*/ int
232 run_array::_Compare(block_run& a, block_run& b)
233 {
234 	int cmp = a.AllocationGroup() - b.AllocationGroup();
235 	if (cmp == 0)
236 		return a.Start() - b.Start();
237 
238 	return cmp;
239 }
240 
241 
/*!	Determines where \a run has to be inserted to keep the array sorted.
	Returns the insertion index, or -1 if the run should simply be appended
	at the end (which Insert() then does).
*/
int32
run_array::_FindInsertionIndex(block_run& run)
{
	int32 min = 0, max = CountRuns() - 1;
	int32 i = 0;
	if (max >= 8) {
		// binary search for larger arrays
		while (min <= max) {
			i = (min + max) / 2;

			int cmp = _Compare(runs[i], run);
			if (cmp < 0)
				min = i + 1;
			else if (cmp > 0)
				max = i - 1;
			else
				return -1;	// equal entry found - append at the end
		}

		// make sure we insert after the last smaller entry
		if (_Compare(runs[i], run) < 0)
			i++;
	} else {
		// linear scan for small arrays
		for (; i <= max; i++) {
			if (_Compare(runs[i], run) > 0)
				break;
		}
		// NOTE(review): this compares against the raw on-disk "count" field,
		// not CountRuns() - equivalent only while host and disk endianness
		// agree; verify on big-endian targets
		if (i == count)
			return -1;	// insertion point is the end of the array
	}

	return i;
}
273 
274 
275 //	#pragma mark - RunArrays
276 
277 
// Starts out empty; arrays are allocated on demand by _AddArray().
RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}
286 
287 
288 RunArrays::~RunArrays()
289 {
290 	run_array* array;
291 	while (fArrays.Pop(&array))
292 		free(array);
293 }
294 
295 
296 bool
297 RunArrays::_ContainsRun(block_run& run)
298 {
299 	for (int32 i = 0; i < CountArrays(); i++) {
300 		run_array* array = ArrayAt(i);
301 
302 		for (int32 j = 0; j < array->CountRuns(); j++) {
303 			block_run& arrayRun = array->runs[j];
304 			if (run.AllocationGroup() != arrayRun.AllocationGroup())
305 				continue;
306 
307 			if (run.Start() >= arrayRun.Start()
308 				&& run.Start() + run.Length()
309 					<= arrayRun.Start() + arrayRun.Length())
310 				return true;
311 		}
312 	}
313 
314 	return false;
315 }
316 
317 
/*!	Adds the specified block_run into the array.
	Note: it doesn't support overlapping - it must only be used
	with block_runs of length 1!
*/
bool
RunArrays::_AddRun(block_run& run)
{
	ASSERT(run.length == 1);

	// Be's BFS log replay routine can only deal with block_runs of size 1
	// A pity, isn't it? Too sad we have to be compatible.

	// fails when no array exists yet or the current one is full; the caller
	// is expected to call _AddArray() and retry
	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
		return false;

	fLastArray->Insert(run);
	fBlockCount++;
	return true;
}
337 
338 
339 status_t
340 RunArrays::_AddArray()
341 {
342 	int32 blockSize = fJournal->GetVolume()->BlockSize();
343 
344 	run_array* array = (run_array*)malloc(blockSize);
345 	if (array == NULL)
346 		return B_NO_MEMORY;
347 
348 	if (fArrays.Push(array) != B_OK) {
349 		free(array);
350 		return B_NO_MEMORY;
351 	}
352 
353 	array->Init(blockSize);
354 	fLastArray = array;
355 	return B_OK;
356 }
357 
358 
/*!	Inserts \a blockNumber into the arrays, skipping blocks that are
	already covered, and allocating a new array when the current one is
	full. Returns B_NO_MEMORY if no new array could be allocated.
*/
status_t
RunArrays::Insert(off_t blockNumber)
{
	Volume* volume = fJournal->GetVolume();
	block_run run = volume->ToBlockRun(blockNumber);

	if (fLastArray != NULL) {
		// check if the block is already in the array
		if (_ContainsRun(run))
			return B_OK;
	}

	// insert block into array

	if (!_AddRun(run)) {
		// array is full
		if (_AddArray() != B_OK || !_AddRun(run))
			return B_NO_MEMORY;
	}

	return B_OK;
}
381 
382 
383 int32
384 RunArrays::MaxArrayLength()
385 {
386 	int32 max = 0;
387 	for (int32 i = 0; i < CountArrays(); i++) {
388 		if (ArrayAt(i)->CountRuns() > max)
389 			max = ArrayAt(i)->CountRuns();
390 	}
391 
392 	return max;
393 }
394 
395 
396 //	#pragma mark - Journal
397 
398 
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
		// batch transactions only up to half the log size (minus some slack)
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");
}
413 
414 
Journal::~Journal()
{
	// write back any pending transactions before the locks are destroyed
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);
}
422 
423 
// Second phase of the two-step construction; currently nothing can fail.
status_t
Journal::InitCheck()
{
	return B_OK;
}
429 
430 
/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within a the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// every run must reference valid blocks on this volume
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}
458 
459 
/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	// the first block of the entry is the run_array header
	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the superblock - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid superblock!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 to account for the run_array header block itself

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// copy the logged block to its final location on the volume
			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
	return B_OK;
}
545 
546 
/*!	Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointer don't match.
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	//	equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));
	// everything was replayed - reset the log pointers and mark the volume
	// clean again
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = HOST_ENDIAN_TO_BFS_INT32(
		SUPER_BLOCK_DISK_CLEAN);

	return fVolume->WriteSuperBlock();
}
602 
603 
604 size_t
605 Journal::CurrentTransactionSize() const
606 {
607 	if (_HasSubTransaction()) {
608 		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
609 			fTransactionID);
610 	}
611 
612 	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
613 		fTransactionID);
614 }
615 
616 
// Returns whether the current transaction would no longer fit into the log.
bool
Journal::CurrentTransactionTooLarge() const
{
	return CurrentTransactionSize() > fLogSize;
}
622 
623 
/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	if (logEntry == journal->fEntries.First()) {
		// the oldest entry has completed - the log can be advanced either to
		// the next remaining entry, or all the way to the end when empty
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the superblock, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = HOST_ENDIAN_TO_BFS_INT32(SUPER_BLOCK_DISK_CLEAN);

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back superblock: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
682 
683 
/*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it. We can't do this
	// in this thread, as flushing the log can produce new transaction events.
	thread_id id = spawn_kernel_thread(&Journal::_FlushLog, "bfs log flusher",
		B_NORMAL_PRIORITY, _journal);
	if (id > 0)
		resume_thread(id);
	// if spawning failed, the flush is simply skipped until the next event
}
695 
696 
697 /*static*/ status_t
698 Journal::_FlushLog(void* _journal)
699 {
700 	Journal* journal = (Journal*)_journal;
701 	return journal->_FlushLog(false, false);
702 }
703 
704 
/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
		// byte offset of the log area on the device
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
			// count starts at 1 to include the array header block
		int32 wrap = fLogSize - logStart;
			// number of blocks until the circular log wraps around

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry* logEntry = new(std::nothrow) LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the superblock

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
911 
912 
/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	// with canWait == false we give up instead of blocking on the journal
	// lock (used from the flusher thread)
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
944 
945 
/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
*/
status_t
Journal::FlushLogAndBlocks()
{
	// blocking variant: waits for the journal lock and flushes the device
	return _FlushLog(true, true);
}
954 
955 
/*!	Acquires the journal lock on behalf of \a owner and, for the outermost
	holder, starts a new cache transaction (or a sub-transaction when
	another transaction is still unwritten).
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			// starting the transaction failed - propagate the error code
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
1001 
1002 
/*!	Releases the journal lock; when this is the outermost holder (or
	separate sub-transactions are requested), the transaction is ended or
	aborted depending on \a success.
*/
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else
		owner->MoveListenersTo(fOwner);

	recursive_lock_unlock(&fLock);
	return B_OK;
}
1038 
1039 
1040 uint32
1041 Journal::_TransactionSize() const
1042 {
1043 	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1044 		fTransactionID);
1045 	if (count <= 0)
1046 		return 0;
1047 
1048 	// take the number of array blocks in this transaction into account
1049 	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1050 	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1051 	return count + arrayBlocks;
1052 }
1053 
1054 
/*!	Finishes the current transaction: aborts it on failure, otherwise
	either batches it with later transactions or writes it to the log once
	the batched size reaches fMaxTransactionSize.
*/
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction()) {
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
			// We can continue to use the parent transaction afterwards
		} else {
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
			fUnwrittenTransactions = 0;
		}

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}
1085 
1086 
1087 //	#pragma mark - debugger commands
1088 
1089 
1090 #ifdef BFS_DEBUGGER_COMMANDS
1091 
1092 
// Kernel debugger helper: prints the journal state and all pending entries.
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf("  log start:            %" B_PRId32 "\n", fVolume->LogStart());
	kprintf("  log end:              %" B_PRId32 "\n", fVolume->LogEnd());
	kprintf("  owner:                %p\n", fOwner);
	kprintf("  log size:             %" B_PRIu32 "\n", fLogSize);
	kprintf("  max transaction size: %" B_PRIu32 "\n", fMaxTransactionSize);
	kprintf("  used:                 %" B_PRIu32 "\n", fUsed);
	kprintf("  unwritten:            %" B_PRId32 "\n", fUnwrittenTransactions);
	kprintf("  timestamp:            %" B_PRId64 "\n", fTimestamp);
	kprintf("  transaction ID:       %" B_PRId32 "\n", fTransactionID);
	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6" B_PRId32 " %6" B_PRIu32 " %6" B_PRIu32 "\n", entry,
			entry->TransactionID(), entry->Start(), entry->Length());
	}
}
1120 
1121 
// Kernel debugger command: "dump_journal <ptr-to-volume>".
int
dump_journal(int argc, char** argv)
{
	if (argc != 2 || !strcmp(argv[1], "--help")) {
		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
		return 0;
	}

	Volume* volume = (Volume*)parse_expression(argv[1]);
	Journal* journal = volume->GetJournal(0);

	journal->Dump();
	return 0;
}
1136 
1137 
1138 #endif	// BFS_DEBUGGER_COMMANDS
1139 
1140 
1141 //	#pragma mark - TransactionListener
1142 
1143 
// Out-of-line to anchor the vtable in this translation unit.
TransactionListener::TransactionListener()
{
}
1147 
1148 
TransactionListener::~TransactionListener()
{
}
1152 
1153 
1154 //	#pragma mark - Transaction
1155 
1156 
1157 status_t
1158 Transaction::Start(Volume* volume, off_t refBlock)
1159 {
1160 	// has it already been started?
1161 	if (fJournal != NULL)
1162 		return B_OK;
1163 
1164 	fJournal = volume->GetJournal(refBlock);
1165 	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1166 		return B_OK;
1167 
1168 	fJournal = NULL;
1169 	return B_ERROR;
1170 }
1171 
1172 
// Attaches \a listener to this (running) transaction; it will be notified
// via NotifyListeners() when the transaction completes or is aborted.
void
Transaction::AddListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Add(listener);
}
1181 
1182 
// Detaches \a listener from this (running) transaction and notifies it of
// its removal.
void
Transaction::RemoveListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Remove(listener);
	listener->RemovedFromTransaction();
}
1192 
1193 
1194 void
1195 Transaction::NotifyListeners(bool success)
1196 {
1197 	while (TransactionListener* listener = fListeners.RemoveHead()) {
1198 		listener->TransactionDone(success);
1199 		listener->RemovedFromTransaction();
1200 	}
1201 }
1202 
1203 
1204 /*!	Move the inodes into the parent transaction. This is needed only to make
1205 	sure they will still be reverted in case the transaction is aborted.
1206 */
1207 void
1208 Transaction::MoveListenersTo(Transaction* transaction)
1209 {
1210 	while (TransactionListener* listener = fListeners.RemoveHead()) {
1211 		transaction->fListeners.Add(listener);
1212 	}
1213 }
1214