xref: /haiku/src/add-ons/kernel/file_systems/bfs/Journal.cpp (revision 04b1c44b89086a9521bb478c8e99d024cef8f6a6)
1 /*
2  * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Transaction and logging
8 
9 
10 #include "Journal.h"
11 
12 #include "Debug.h"
13 #include "Inode.h"
14 
15 
// On-disk header of one log entry: a sorted array of the block_runs that
// make up the entry. The structure occupies a whole volume block; "runs"
// extends to the end of that block (see Init()/MaxRuns()).
// The integer fields are stored in BFS (on-disk) endianness.
struct run_array {
	int32		count;
	int32		max_runs;
	block_run	runs[0];
		// variable length tail, sized by the volume's block size

	void Init(int32 blockSize);
	void Insert(block_run& run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};
35 
// Collects the run_arrays needed to describe one transaction in the log.
// Arrays are allocated on demand as blocks are Insert()ed and freed again
// in the destructor.
class RunArrays {
public:
							RunArrays(Journal* journal);
							~RunArrays();

			status_t		Insert(off_t blockNumber);

			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
			int32			CountArrays() const { return fArrays.CountItems(); }

			uint32			CountBlocks() const { return fBlockCount; }
			uint32			LogEntryLength() const
								{ return CountBlocks() + CountArrays(); }
								// total log blocks needed: all data blocks
								// plus one header block per run_array

			int32			MaxArrayLength();

private:
			status_t		_AddArray();
			bool			_ContainsRun(block_run& run);
			bool			_AddRun(block_run& run);

			Journal*		fJournal;
			uint32			fBlockCount;	// number of data blocks inserted
			Stack<run_array*> fArrays;
			run_array*		fLastArray;		// array currently being filled
};
62 
// In-memory bookkeeping for a log entry that has been written to the
// on-disk log but whose blocks have not all been flushed to their final
// location yet. Entries live in the journal's fEntries list and are
// removed/deleted in Journal::_TransactionWritten().
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
							LogEntry(Journal* journal, uint32 logStart,
								uint32 length);
							~LogEntry();

			uint32			Start() const { return fStart; }
				// position of the entry in the log (log end at creation)
			uint32			Length() const { return fLength; }
				// length in blocks, including the run_array header blocks

#ifdef BFS_DEBUGGER_COMMANDS
			void			SetTransactionID(int32 id) { fTransactionID = id; }
			int32			TransactionID() const { return fTransactionID; }
#endif

			Journal*		GetJournal() { return fJournal; }

private:
			Journal*		fJournal;
			uint32			fStart;
			uint32			fLength;
#ifdef BFS_DEBUGGER_COMMANDS
			int32			fTransactionID;
#endif
};
87 
88 
#if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
namespace BFSJournalTracing {

// Trace entry recording when a ::LogEntry is started or completely
// written back; created via the T() macro defined below.
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};

}	// namespace BFSJournalTracing

#	define T(x) new(std::nothrow) BFSJournalTracing::x;
#else
#	define T(x) ;
#endif
139 
140 
141 //	#pragma mark -
142 
143 
144 static void
145 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
146 	size_t size)
147 {
148 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
149 			+ vecs[index - 1].iov_len == (addr_t)address) {
150 		// the iovec can be combined with the previous one
151 		vecs[index - 1].iov_len += size;
152 		return;
153 	}
154 
155 	if (index == max)
156 		panic("no more space for iovecs!");
157 
158 	// we need to start a new iovec
159 	vecs[index].iov_base = const_cast<void*>(address);
160 	vecs[index].iov_len = size;
161 	index++;
162 }
163 
164 
165 //	#pragma mark - LogEntry
166 
167 
/*!	\a start is the position of the entry in the log (the volume's log end
	at the time it was written), \a length its size in log blocks including
	the run_array header blocks (see RunArrays::LogEntryLength()).
*/
LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}
175 
176 
LogEntry::~LogEntry()
{
	// nothing to release; removal from the journal's entry list happens
	// in Journal::_TransactionWritten() before the entry is deleted
}
180 
181 
182 //	#pragma mark - run_array
183 
184 
185 /*!	The run_array's size equals the block size of the BFS volume, so we
186 	cannot use a (non-overridden) new.
187 	This makes a freshly allocated run_array ready to run.
188 */
189 void
190 run_array::Init(int32 blockSize)
191 {
192 	memset(this, 0, blockSize);
193 	count = 0;
194 	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
195 }
196 
197 
198 /*!	Inserts the block_run into the array. You will have to make sure the
199 	array is large enough to contain the entry before calling this function.
200 */
201 void
202 run_array::Insert(block_run& run)
203 {
204 	int32 index = _FindInsertionIndex(run);
205 	if (index == -1) {
206 		// add to the end
207 		runs[CountRuns()] = run;
208 	} else {
209 		// insert at index
210 		memmove(&runs[index + 1], &runs[index],
211 			(CountRuns() - index) * sizeof(off_t));
212 		runs[index] = run;
213 	}
214 
215 	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
216 }
217 
218 
219 /*static*/ int32
220 run_array::MaxRuns(int32 blockSize)
221 {
222 	// For whatever reason, BFS restricts the maximum array size
223 	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
224 	if (maxCount < 128)
225 		return maxCount;
226 
227 	return 127;
228 }
229 
230 
231 /*static*/ int
232 run_array::_Compare(block_run& a, block_run& b)
233 {
234 	int cmp = a.AllocationGroup() - b.AllocationGroup();
235 	if (cmp == 0)
236 		return a.Start() - b.Start();
237 
238 	return cmp;
239 }
240 
241 
/*!	Returns the index at which \a run should be inserted to keep the array
	sorted (see _Compare()), or -1 if it should simply be appended at the
	end. Note that the binary search branch also yields -1 when an equal
	run is found, so Insert() appends duplicates at the end.
*/
int32
run_array::_FindInsertionIndex(block_run& run)
{
	int32 min = 0, max = CountRuns() - 1;
	int32 i = 0;
	if (max >= 8) {
		// large enough to make a binary search worthwhile
		while (min <= max) {
			i = (min + max) / 2;

			int cmp = _Compare(runs[i], run);
			if (cmp < 0)
				min = i + 1;
			else if (cmp > 0)
				max = i - 1;
			else
				return -1;
		}

		// make sure we return the index after the last smaller element
		if (_Compare(runs[i], run) < 0)
			i++;
	} else {
		// small array - linear scan for the first larger element
		for (; i <= max; i++) {
			if (_Compare(runs[i], run) > 0)
				break;
		}
		if (i == count)
			return -1;
			// NOTE(review): this compares against the raw (BFS endian)
			// "count" field instead of CountRuns(); only equivalent when
			// host and BFS endianness match - confirm on big-endian
	}

	return i;
}
273 
274 
275 //	#pragma mark - RunArrays
276 
277 
RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
	// the first run_array is allocated lazily by Insert()/_AddArray()
}
286 
287 
288 RunArrays::~RunArrays()
289 {
290 	run_array* array;
291 	while (fArrays.Pop(&array))
292 		free(array);
293 }
294 
295 
296 bool
297 RunArrays::_ContainsRun(block_run& run)
298 {
299 	for (int32 i = 0; i < CountArrays(); i++) {
300 		run_array* array = ArrayAt(i);
301 
302 		for (int32 j = 0; j < array->CountRuns(); j++) {
303 			block_run& arrayRun = array->runs[j];
304 			if (run.AllocationGroup() != arrayRun.AllocationGroup())
305 				continue;
306 
307 			if (run.Start() >= arrayRun.Start()
308 				&& run.Start() + run.Length()
309 					<= arrayRun.Start() + arrayRun.Length())
310 				return true;
311 		}
312 	}
313 
314 	return false;
315 }
316 
317 
318 /*!	Adds the specified block_run into the array.
319 	Note: it doesn't support overlapping - it must only be used
320 	with block_runs of length 1!
321 */
322 bool
323 RunArrays::_AddRun(block_run& run)
324 {
325 	ASSERT(run.length == 1);
326 
327 	// Be's BFS log replay routine can only deal with block_runs of size 1
328 	// A pity, isn't it? Too sad we have to be compatible.
329 
330 	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
331 		return false;
332 
333 	fLastArray->Insert(run);
334 	fBlockCount++;
335 	return true;
336 }
337 
338 
339 status_t
340 RunArrays::_AddArray()
341 {
342 	int32 blockSize = fJournal->GetVolume()->BlockSize();
343 
344 	run_array* array = (run_array*)malloc(blockSize);
345 	if (array == NULL)
346 		return B_NO_MEMORY;
347 
348 	if (fArrays.Push(array) != B_OK) {
349 		free(array);
350 		return B_NO_MEMORY;
351 	}
352 
353 	array->Init(blockSize);
354 	fLastArray = array;
355 	return B_OK;
356 }
357 
358 
359 status_t
360 RunArrays::Insert(off_t blockNumber)
361 {
362 	Volume* volume = fJournal->GetVolume();
363 	block_run run = volume->ToBlockRun(blockNumber);
364 
365 	if (fLastArray != NULL) {
366 		// check if the block is already in the array
367 		if (_ContainsRun(run))
368 			return B_OK;
369 	}
370 
371 	// insert block into array
372 
373 	if (!_AddRun(run)) {
374 		// array is full
375 		if (_AddArray() != B_OK || !_AddRun(run))
376 			return B_NO_MEMORY;
377 	}
378 
379 	return B_OK;
380 }
381 
382 
383 int32
384 RunArrays::MaxArrayLength()
385 {
386 	int32 max = 0;
387 	for (int32 i = 0; i < CountArrays(); i++) {
388 		if (ArrayAt(i)->CountRuns() > max)
389 			max = ArrayAt(i)->CountRuns();
390 	}
391 
392 	return max;
393 }
394 
395 
396 //	#pragma mark - Journal
397 
398 
/*!	Creates the journal for \a volume and initializes its locks. */
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
		// NOTE(review): batching threshold used by _TransactionDone();
		// presumably the "/ 2 - 5" leaves headroom for run_array header
		// blocks - confirm the origin of the constant
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");
}
413 
414 
/*!	Flushes any pending log entry and all dirty blocks before tearing the
	journal down.
*/
Journal::~Journal()
{
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);
}
422 
423 
/*!	Nothing in the constructor can currently fail, so this always
	reports success.
*/
status_t
Journal::InitCheck()
{
	return B_OK;
}
429 
430 
/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// make sure every run points to a valid location on the volume
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}
458 
459 
/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
		// first block of the on-disk log area
	off_t firstBlockNumber = *_start % fLogSize;
		// position of this entry's run_array header within the log

	CachedBlock cachedArray(fVolume);

	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
		// the data blocks follow the header block, wrapping around the log
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the super block - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid super block!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// counts the header block plus every data block written back

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// copy the block from the log to its final location on disk
			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
		// advance the caller's log position past this entry
	return B_OK;
}
545 
546 
547 /*!	Replays all log entries - this will put the disk into a
548 	consistent and clean state, if it was not correctly unmounted
549 	before.
550 	This method is called by Journal::InitCheck() if the log start
551 	and end pointer don't match.
552 */
553 status_t
554 Journal::ReplayLog()
555 {
556 	// TODO: this logic won't work whenever the size of the pending transaction
557 	//	equals the size of the log (happens with the original BFS only)
558 	if (fVolume->LogStart() == fVolume->LogEnd())
559 		return B_OK;
560 
561 	INFORM(("Replay log, disk was not correctly unmounted...\n"));
562 
563 	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
564 		INFORM(("log_start and log_end differ, but disk is marked clean - "
565 			"trying to replay log...\n"));
566 	}
567 
568 	if (fVolume->IsReadOnly())
569 		return B_READ_ONLY_DEVICE;
570 
571 	int32 start = fVolume->LogStart();
572 	int32 lastStart = -1;
573 	while (true) {
574 		// stop if the log is completely flushed
575 		if (start == fVolume->LogEnd())
576 			break;
577 
578 		if (start == lastStart) {
579 			// strange, flushing the log hasn't changed the log_start pointer
580 			return B_ERROR;
581 		}
582 		lastStart = start;
583 
584 		status_t status = _ReplayRunArray(&start);
585 		if (status != B_OK) {
586 			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
587 				strerror(status)));
588 			return B_ERROR;
589 		}
590 		start = start % fLogSize;
591 	}
592 
593 	PRINT(("replaying worked fine!\n"));
594 	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
595 		fVolume->LogEnd());
596 	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
597 	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;
598 
599 	return fVolume->WriteSuperBlock();
600 }
601 
602 
603 size_t
604 Journal::CurrentTransactionSize() const
605 {
606 	if (_HasSubTransaction()) {
607 		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
608 			fTransactionID);
609 	}
610 
611 	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
612 		fTransactionID);
613 }
614 
615 
616 bool
617 Journal::CurrentTransactionTooLarge() const
618 {
619 	return CurrentTransactionSize() > fLogSize;
620 }
621 
622 
/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	if (logEntry == journal->fEntries.First()) {
		// the oldest entry has completed - the log space up to the next
		// oldest entry (or up to the log end, if none is left) is free
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the super block, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;
			// the log is fully flushed, the volume is clean again

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back super block: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
			// keep the in-memory (host endian) copy in sync
	}
}
681 
682 
683 /*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
684 /*static*/ void
685 Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
686 {
687 	// The current transaction seems to be idle - flush it. We can't do this
688 	// in this thread, as flushing the log can produce new transaction events.
689 	thread_id id = spawn_kernel_thread(&Journal::_FlushLog, "bfs log flusher",
690 		B_NORMAL_PRIORITY, _journal);
691 	if (id > 0)
692 		resume_thread(id);
693 }
694 
695 
696 /*static*/ status_t
697 Journal::_FlushLog(void* _journal)
698 {
699 	Journal* journal = (Journal*)_journal;
700 	return journal->_FlushLog(false, false);
701 }
702 
703 
/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
		// byte offset of the log area on the device
	off_t logStart = fVolume->LogEnd() % fLogSize;
		// block index within the log where this entry begins
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
			// count starts at 1 to account for the array header block
		int32 wrap = fLogSize - logStart;
			// blocks left before the log wraps around to its start

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
					// NOTE(review): blocks already added via
					// block_cache_get() are not put back on this path -
					// confirm whether that leaks cache references
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry* logEntry = new LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		// NOTE(review): the NULL check only helps if operator new is
		// non-throwing in this environment - confirm
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the super block

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
910 
911 
/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
	If \a canWait is \c false, the call returns immediately when the journal
	lock is contended instead of blocking on it.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
943 
944 
/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
	Called from the destructor as well, so this may block until the journal
	lock is available.
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}
953 
954 
/*!	Starts a transaction (or joins the already running one) on behalf of
	\a owner. With \a separateSubTransactions set, nested calls get their
	own sub-transactions instead of reusing the current one.
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			// a negative transaction ID doubles as the error code
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		// get notified when the transaction goes idle, so it can be flushed
		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
1000 
1001 
/*!	Ends the (sub-)transaction held by \a owner. If this call really
	unlocks the journal, the transaction is closed via _TransactionDone();
	otherwise the listeners are handed over to the enclosing transaction.
*/
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;
				// NOTE(review): this returns without releasing fLock -
				// verify that callers are prepared for the lock to stay
				// held on this error path

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else
		owner->MoveListenersTo(fOwner);
			// NOTE(review): assumes owner != NULL on this path - confirm

	recursive_lock_unlock(&fLock);
	return B_OK;
}
1037 
1038 
1039 uint32
1040 Journal::_TransactionSize() const
1041 {
1042 	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1043 		fTransactionID);
1044 	if (count <= 0)
1045 		return 0;
1046 
1047 	// take the number of array blocks in this transaction into account
1048 	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1049 	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1050 	return count + arrayBlocks;
1051 }
1052 
1053 
1054 status_t
1055 Journal::_TransactionDone(bool success)
1056 {
1057 	if (!success) {
1058 		if (_HasSubTransaction()) {
1059 			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
1060 			// We can continue to use the parent transaction afterwards
1061 		} else {
1062 			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
1063 			fUnwrittenTransactions = 0;
1064 		}
1065 
1066 		return B_OK;
1067 	}
1068 
1069 	// Up to a maximum size, we will just batch several
1070 	// transactions together to improve speed
1071 	uint32 size = _TransactionSize();
1072 	if (size < fMaxTransactionSize) {
1073 		// Flush the log from time to time, so that we have enough space
1074 		// for this transaction
1075 		if (size > FreeLogBlocks())
1076 			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
1077 
1078 		fUnwrittenTransactions++;
1079 		return B_OK;
1080 	}
1081 
1082 	return _WriteTransactionToLog();
1083 }
1084 
1085 
1086 //	#pragma mark - debugger commands
1087 
1088 
1089 #ifdef BFS_DEBUGGER_COMMANDS
1090 
1091 
/*!	Dumps the journal's state; runs in the kernel debugger context
	(see dump_journal() below).
*/
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf("  log start:            %ld\n", fVolume->LogStart());
	kprintf("  log end:              %ld\n", fVolume->LogEnd());
	kprintf("  owner:                %p\n", fOwner);
	kprintf("  log size:             %lu\n", fLogSize);
	kprintf("  max transaction size: %lu\n", fMaxTransactionSize);
	kprintf("  used:                 %lu\n", fUsed);
	kprintf("  unwritten:            %ld\n", fUnwrittenTransactions);
	kprintf("  timestamp:            %lld\n", fTimestamp);
	kprintf("  transaction ID:       %ld\n", fTransactionID);
	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	// list all log entries that have not been fully written back yet
	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6ld %6lu %6lu\n", entry, entry->TransactionID(),
			entry->Start(), entry->Length());
	}
}
1119 
1120 
1121 int
1122 dump_journal(int argc, char** argv)
1123 {
1124 	if (argc != 2 || !strcmp(argv[1], "--help")) {
1125 		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
1126 		return 0;
1127 	}
1128 
1129 	Volume* volume = (Volume*)parse_expression(argv[1]);
1130 	Journal* journal = volume->GetJournal(0);
1131 
1132 	journal->Dump();
1133 	return 0;
1134 }
1135 
1136 
1137 #endif	// BFS_DEBUGGER_COMMANDS
1138 
1139 
1140 //	#pragma mark - TransactionListener
1141 
1142 
TransactionListener::TransactionListener()
{
	// nothing to initialize; listeners are hooked up via
	// Transaction::AddListener()
}
1146 
1147 
TransactionListener::~TransactionListener()
{
	// nothing to clean up here
}
1151 
1152 
1153 //	#pragma mark - Transaction
1154 
1155 
1156 status_t
1157 Transaction::Start(Volume* volume, off_t refBlock)
1158 {
1159 	// has it already been started?
1160 	if (fJournal != NULL)
1161 		return B_OK;
1162 
1163 	fJournal = volume->GetJournal(refBlock);
1164 	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1165 		return B_OK;
1166 
1167 	fJournal = NULL;
1168 	return B_ERROR;
1169 }
1170 
1171 
1172 void
1173 Transaction::AddListener(TransactionListener* listener)
1174 {
1175 	if (fJournal == NULL)
1176 		panic("Transaction is not running!");
1177 
1178 	fListeners.Add(listener);
1179 }
1180 
1181 
1182 void
1183 Transaction::RemoveListener(TransactionListener* listener)
1184 {
1185 	if (fJournal == NULL)
1186 		panic("Transaction is not running!");
1187 
1188 	fListeners.Remove(listener);
1189 	listener->RemovedFromTransaction();
1190 }
1191 
1192 
1193 void
1194 Transaction::NotifyListeners(bool success)
1195 {
1196 	while (TransactionListener* listener = fListeners.RemoveHead()) {
1197 		listener->TransactionDone(success);
1198 		listener->RemovedFromTransaction();
1199 	}
1200 }
1201 
1202 
1203 /*!	Move the inodes into the parent transaction. This is needed only to make
1204 	sure they will still be reverted in case the transaction is aborted.
1205 */
1206 void
1207 Transaction::MoveListenersTo(Transaction* transaction)
1208 {
1209 	while (TransactionListener* listener = fListeners.RemoveHead()) {
1210 		transaction->fListeners.Add(listener);
1211 	}
1212 }
1213