xref: /haiku/src/add-ons/kernel/file_systems/bfs/Journal.cpp (revision 1b80286772b529a3d6de3bbeb0720c62e6a32fed)
1 /*
2  * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! Transaction and logging
7 
8 
9 #include "Journal.h"
10 
11 #include "Debug.h"
12 #include "Inode.h"
13 
14 
// On-disk header of a log entry: a sorted array of all block_runs touched
// by the transaction. It occupies exactly one volume block; "runs" extends
// to the end of that block. All fields are stored in BFS (disk) endianness.
struct run_array {
	int32		count;		// number of runs currently in the array
	int32		max_runs;	// capacity as written to disk
	block_run	runs[0];	// flexible array member filling the block

	void Init(int32 blockSize);
	void Insert(block_run& run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};
34 
// Collects the run_arrays describing one log entry while a transaction is
// being written out; owns the malloc'd arrays until destruction.
class RunArrays {
public:
							RunArrays(Journal* journal);
							~RunArrays();

			status_t		Insert(off_t blockNumber);

			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
			int32			CountArrays() const { return fArrays.CountItems(); }

			uint32			CountBlocks() const { return fBlockCount; }

			// Total log blocks needed: one per data block plus one header
			// block per run_array.
			uint32			LogEntryLength() const
								{ return CountBlocks() + CountArrays(); }

			int32			MaxArrayLength();

private:
			status_t		_AddArray();
			bool			_ContainsRun(block_run& run);
			bool			_AddRun(block_run& run);

			Journal*		fJournal;
			uint32			fBlockCount;	// data blocks collected so far
			Stack<run_array*> fArrays;		// all arrays, in log order
			run_array*		fLastArray;		// array currently being filled
};
61 
// In-memory bookkeeping for a log entry that has been written to the log,
// kept in Journal::fEntries until the cache reports the transaction as
// completely flushed to its final on-disk location.
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
							LogEntry(Journal* journal, uint32 logStart,
								uint32 length);
							~LogEntry();

			uint32			Start() const { return fStart; }
			uint32			Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
			void			SetTransactionID(int32 id) { fTransactionID = id; }
			int32			TransactionID() const { return fTransactionID; }
#endif

			Journal*		GetJournal() { return fJournal; }

private:
			Journal*		fJournal;
			uint32			fStart;		// start block index within the log
			uint32			fLength;	// length in blocks, incl. run_arrays
#ifdef BFS_DEBUGGER_COMMANDS
			int32			fTransactionID;
#endif
};
86 
87 
#if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
namespace BFSJournalTracing {

// Trace entry recording when a ::LogEntry is started (written to the log)
// or finished (its transaction flushed to disk).
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};

}	// namespace BFSJournalTracing

// T() creates a trace entry only when tracing is compiled in.
#	define T(x) new(std::nothrow) BFSJournalTracing::x;
#else
#	define T(x) ;
#endif
138 
139 
140 //	#pragma mark -
141 
142 
143 static void
144 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
145 	size_t size)
146 {
147 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
148 			+ vecs[index - 1].iov_len == (addr_t)address) {
149 		// the iovec can be combined with the previous one
150 		vecs[index - 1].iov_len += size;
151 		return;
152 	}
153 
154 	if (index == max)
155 		panic("no more space for iovecs!");
156 
157 	// we need to start a new iovec
158 	vecs[index].iov_base = const_cast<void*>(address);
159 	vecs[index].iov_len = size;
160 	index++;
161 }
162 
163 
164 //	#pragma mark - LogEntry
165 
166 
// Creates the bookkeeping entry for a transaction occupying \a length log
// blocks beginning at log block position \a start.
LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}
174 
175 
// Nothing to release - the entry does not own the journal it refers to.
LogEntry::~LogEntry()
{
}
179 
180 
181 //	#pragma mark - run_array
182 
183 
/*!	The run_array's size equals the block size of the BFS volume, so we
	cannot use a (non-overridden) new.
	This makes a freshly allocated run_array ready to run.
*/
void
run_array::Init(int32 blockSize)
{
	// the structure was raw-malloc'd to blockSize bytes; clear all of it,
	// including the trailing flexible "runs" area
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}
195 
196 
197 /*!	Inserts the block_run into the array. You will have to make sure the
198 	array is large enough to contain the entry before calling this function.
199 */
200 void
201 run_array::Insert(block_run& run)
202 {
203 	int32 index = _FindInsertionIndex(run);
204 	if (index == -1) {
205 		// add to the end
206 		runs[CountRuns()] = run;
207 	} else {
208 		// insert at index
209 		memmove(&runs[index + 1], &runs[index],
210 			(CountRuns() - index) * sizeof(off_t));
211 		runs[index] = run;
212 	}
213 
214 	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
215 }
216 
217 
218 /*static*/ int32
219 run_array::MaxRuns(int32 blockSize)
220 {
221 	// For whatever reason, BFS restricts the maximum array size
222 	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
223 	if (maxCount < 128)
224 		return maxCount;
225 
226 	return 127;
227 }
228 
229 
230 /*static*/ int
231 run_array::_Compare(block_run& a, block_run& b)
232 {
233 	int cmp = a.AllocationGroup() - b.AllocationGroup();
234 	if (cmp == 0)
235 		return a.Start() - b.Start();
236 
237 	return cmp;
238 }
239 
240 
241 int32
242 run_array::_FindInsertionIndex(block_run& run)
243 {
244 	int32 min = 0, max = CountRuns() - 1;
245 	int32 i = 0;
246 	if (max >= 8) {
247 		while (min <= max) {
248 			i = (min + max) / 2;
249 
250 			int cmp = _Compare(runs[i], run);
251 			if (cmp < 0)
252 				min = i + 1;
253 			else if (cmp > 0)
254 				max = i - 1;
255 			else
256 				return -1;
257 		}
258 
259 		if (_Compare(runs[i], run) < 0)
260 			i++;
261 	} else {
262 		for (; i <= max; i++) {
263 			if (_Compare(runs[i], run) > 0)
264 				break;
265 		}
266 		if (i == count)
267 			return -1;
268 	}
269 
270 	return i;
271 }
272 
273 
274 //	#pragma mark - RunArrays
275 
276 
// Starts out empty; arrays are allocated lazily via _AddArray() as blocks
// are inserted.
RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}
285 
286 
287 RunArrays::~RunArrays()
288 {
289 	run_array* array;
290 	while (fArrays.Pop(&array))
291 		free(array);
292 }
293 
294 
295 bool
296 RunArrays::_ContainsRun(block_run& run)
297 {
298 	for (int32 i = 0; i < CountArrays(); i++) {
299 		run_array* array = ArrayAt(i);
300 
301 		for (int32 j = 0; j < array->CountRuns(); j++) {
302 			block_run& arrayRun = array->runs[j];
303 			if (run.AllocationGroup() != arrayRun.AllocationGroup())
304 				continue;
305 
306 			if (run.Start() >= arrayRun.Start()
307 				&& run.Start() + run.Length()
308 					<= arrayRun.Start() + arrayRun.Length())
309 				return true;
310 		}
311 	}
312 
313 	return false;
314 }
315 
316 
317 /*!	Adds the specified block_run into the array.
318 	Note: it doesn't support overlapping - it must only be used
319 	with block_runs of length 1!
320 */
321 bool
322 RunArrays::_AddRun(block_run& run)
323 {
324 	ASSERT(run.length == 1);
325 
326 	// Be's BFS log replay routine can only deal with block_runs of size 1
327 	// A pity, isn't it? Too sad we have to be compatible.
328 
329 	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
330 		return false;
331 
332 	fLastArray->Insert(run);
333 	fBlockCount++;
334 	return true;
335 }
336 
337 
338 status_t
339 RunArrays::_AddArray()
340 {
341 	int32 blockSize = fJournal->GetVolume()->BlockSize();
342 
343 	run_array* array = (run_array*)malloc(blockSize);
344 	if (array == NULL)
345 		return B_NO_MEMORY;
346 
347 	if (fArrays.Push(array) != B_OK) {
348 		free(array);
349 		return B_NO_MEMORY;
350 	}
351 
352 	array->Init(blockSize);
353 	fLastArray = array;
354 	return B_OK;
355 }
356 
357 
// Records that the block at \a blockNumber belongs to the log entry being
// built; duplicate blocks are silently ignored. Allocates a new run_array
// when the current one is full.
status_t
RunArrays::Insert(off_t blockNumber)
{
	Volume* volume = fJournal->GetVolume();
	block_run run = volume->ToBlockRun(blockNumber);

	if (fLastArray != NULL) {
		// check if the block is already in the array
		if (_ContainsRun(run))
			return B_OK;
	}

	// insert block into array

	if (!_AddRun(run)) {
		// array is full
		if (_AddArray() != B_OK || !_AddRun(run))
			return B_NO_MEMORY;
	}

	return B_OK;
}
380 
381 
382 int32
383 RunArrays::MaxArrayLength()
384 {
385 	int32 max = 0;
386 	for (int32 i = 0; i < CountArrays(); i++) {
387 		if (ArrayAt(i)->CountRuns() > max)
388 			max = ArrayAt(i)->CountRuns();
389 	}
390 
391 	return max;
392 }
393 
394 
395 //	#pragma mark - Journal
396 
397 
// Sets up the journal for \a volume; the log area itself already exists on
// the volume (fLogSize is taken from the super block's log run).
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
		// cap batched transactions well below half the log's capacity;
		// the exact "-5" margin looks empirical - TODO confirm derivation
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");
}
412 
413 
// Flushes any pending log entries and dirty blocks before tearing down the
// locks - after this, the on-disk log should be clean.
Journal::~Journal()
{
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);
}
421 
422 
// Currently nothing can fail during construction, so this always succeeds.
// (The doc comment on ReplayLog() suggests this once triggered log replay -
// that now happens elsewhere; TODO confirm against the callers.)
status_t
Journal::InitCheck()
{
	return B_OK;
}
428 
429 
/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within a the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// make sure all runs point into the volume's valid block range
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}
457 
458 
/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	// the first block of the entry is its run_array header
	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	// data blocks follow directly after the array block, wrapping around
	// the circular log as needed
	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the super block - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid super block!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 to account for the run_array header block itself

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %Ld!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// copy the logged block to its final location on the volume
			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	// advance the caller's cursor past this entry
	*_start += count;
	return B_OK;
}
544 
545 
546 /*!	Replays all log entries - this will put the disk into a
547 	consistent and clean state, if it was not correctly unmounted
548 	before.
549 	This method is called by Journal::InitCheck() if the log start
550 	and end pointer don't match.
551 */
552 status_t
553 Journal::ReplayLog()
554 {
555 	// TODO: this logic won't work whenever the size of the pending transaction
556 	//	equals the size of the log (happens with the original BFS only)
557 	if (fVolume->LogStart() == fVolume->LogEnd())
558 		return B_OK;
559 
560 	INFORM(("Replay log, disk was not correctly unmounted...\n"));
561 
562 	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY)
563 		INFORM(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n"));
564 
565 	int32 start = fVolume->LogStart();
566 	int32 lastStart = -1;
567 	while (true) {
568 		// stop if the log is completely flushed
569 		if (start == fVolume->LogEnd())
570 			break;
571 
572 		if (start == lastStart) {
573 			// strange, flushing the log hasn't changed the log_start pointer
574 			return B_ERROR;
575 		}
576 		lastStart = start;
577 
578 		status_t status = _ReplayRunArray(&start);
579 		if (status < B_OK) {
580 			FATAL(("replaying log entry from %d failed: %s\n", (int)start, strerror(status)));
581 			return B_ERROR;
582 		}
583 		start = start % fLogSize;
584 	}
585 
586 	PRINT(("replaying worked fine!\n"));
587 	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
588 		fVolume->LogEnd());
589 	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
590 	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;
591 
592 	return fVolume->WriteSuperBlock();
593 }
594 
595 
/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	// only the oldest entry can move the log start pointer; out-of-order
	// completions just remove their entry from the list
	if (logEntry == journal->fEntries.First()) {
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			// no other entries - the log is now fully flushed
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the super block, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back super block: %s\n",
				strerror(status)));
		}

		// keep the host endian in-memory mirror in sync with the disk
		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
654 
655 
/*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
/*static*/ void
Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
{
	// The current transaction seems to be idle - flush it
	// (non-blocking, and without flushing the data blocks themselves)

	Journal* journal = (Journal*)_journal;
	journal->_FlushLog(false, false);
}
665 
666 
/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// TODO: what are our options here?
			// a) abort the transaction - bad, because all changes are lost
			// b) carry out the changes, but don't use the log - even worse,
			//    as it potentially creates a corrupted disk.
			dprintf("transaction too large (%d blocks, %d main, log size %d)!\n",
				(int)_TransactionSize(), (int)cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	// iterate over all blocks that belong to this transaction (or only the
	// main transaction, if we detached the sub-transaction above)
	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	// each run_array is written as: the array block itself, followed by the
	// current contents of every block it references, pulled from the cache
	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
		int32 wrap = fLogSize - logStart;
			// number of blocks until the circular log wraps around

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry* logEntry = new LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the super block

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		// the sub-transaction stays open as the new current transaction;
		// _TransactionWritten() will be notified once our blocks hit disk
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
867 
868 
/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
	If \a canWait is \c false, the function returns immediately when the
	journal lock cannot be acquired.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
900 
901 
/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
	Blocks until the journal lock could be acquired.
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}
910 
911 
// Locks the journal on behalf of \a owner and starts a (sub-)transaction in
// the block cache for it. Re-entrant calls reuse the running transaction
// unless separate sub-transactions were requested.
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction, batching it with the still unwritten
			// previous transaction(s)
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		// get notified when the transaction goes idle, so it can be flushed
		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
954 
955 
// Releases the journal lock; when this call actually unlocks the journal
// (recursion count 1, or separate sub-transactions), the owner's
// transaction is committed or aborted according to \a success.
void
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (fOwner != NULL)
			_TransactionDone(success);

		fTimestamp = system_time();
		fOwner = NULL;

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	}

	recursive_lock_unlock(&fLock);
}
976 
977 
// Returns the number of log blocks the current transaction would occupy,
// including the run_array header blocks needed to describe it.
uint32
Journal::_TransactionSize() const
{
	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
		fTransactionID);
	if (count <= 0)
		return 0;

	// take the number of array blocks in this transaction into account
	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
	return count + arrayBlocks;
}
991 
992 
// Ends the current transaction: aborts it on failure, otherwise either
// batches it with subsequent transactions (if still small enough) or writes
// it out to the log right away.
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction())
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
		else
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}
1020 
1021 
1022 //	#pragma mark - debugger commands
1023 
1024 
1025 #ifdef BFS_DEBUGGER_COMMANDS
1026 
// Kernel debugger helper: prints the log pointers and all in-flight log
// entries of this journal.
void
Journal::Dump()
{
	kprintf("log start: %ld\n", fVolume->LogStart());
	kprintf("log end:   %ld\n", fVolume->LogEnd());
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6ld %6lu %6lu\n", entry, entry->TransactionID(),
			entry->Start(), entry->Length());
	}
}
1044 
1045 
// Kernel debugger command "dump_journal <ptr-to-volume>": dumps the journal
// of the given Volume (no locking - debugger context only).
int
dump_journal(int argc, char** argv)
{
	if (argc != 2 || !strcmp(argv[1], "--help")) {
		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
		return 0;
	}

	Volume* volume = (Volume*)parse_expression(argv[1]);
	Journal* journal = volume->GetJournal(0);

	journal->Dump();
	return 0;
}
1060 
1061 #endif	// BFS_DEBUGGER_COMMANDS
1062 
1063 
1064 //	#pragma mark - Transaction
1065 
1066 
// Binds this Transaction to the journal responsible for \a refBlock and
// locks it; calling Start() on an already running transaction is a no-op.
status_t
Transaction::Start(Volume* volume, off_t refBlock)
{
	// has it already been started?
	if (fJournal != NULL)
		return B_OK;

	fJournal = volume->GetJournal(refBlock);
	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
		return B_OK;

	// could not get hold of a journal - leave the transaction unstarted
	fJournal = NULL;
	return B_ERROR;
}
1081 
1082 
/*!	Adds an inode to this transaction. This means that the inode will be write
	locked until the transaction ended.
	To ensure that the inode will stay valid until that point, an extra reference
	is acquired to it as long as this transaction stays active.
*/
void
Transaction::AddInode(Inode* inode)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	// These flags can only change while holding the transaction lock
	if ((inode->Flags() & INODE_IN_TRANSACTION) != 0)
		return;

	// We share the same list link with the removed list, so we have to remove
	// the inode from that list here (and add it back when we no longer need it)
	if ((inode->Flags() & INODE_DELETED) != 0)
		GetVolume()->RemovedInodes().Remove(inode);

	if (!GetVolume()->IsInitializing())
		acquire_vnode(GetVolume()->FSVolume(), inode->ID());

	// write lock the inode for the duration of the transaction
	rw_lock_write_lock(&inode->Lock());
	fLockedInodes.Add(inode);
	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
}
1110 
1111 
// Detaches \a inode from this transaction again, releasing its write lock
// and the extra vnode reference taken in AddInode().
void
Transaction::RemoveInode(Inode* inode)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
	fLockedInodes.Remove(inode);
	rw_lock_write_unlock(&inode->Lock());

	// See AddInode() why we do this here
	if ((inode->Flags() & INODE_DELETED) != 0)
		GetVolume()->RemovedInodes().Add(inode);

	if (!GetVolume()->IsInitializing())
		put_vnode(GetVolume()->FSVolume(), inode->ID());
}
1129 
1130 
// Releases all inodes still attached to the transaction - the bulk variant
// of RemoveInode(), used when the transaction ends.
void
Transaction::_UnlockInodes()
{
	while (Inode* inode = fLockedInodes.RemoveHead()) {
		inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
		rw_lock_write_unlock(&inode->Lock());

		// See AddInode() why we do this here
		if ((inode->Flags() & INODE_DELETED) != 0)
			GetVolume()->RemovedInodes().Add(inode);

		if (!GetVolume()->IsInitializing())
			put_vnode(GetVolume()->FSVolume(), inode->ID());
	}
}
1146 
1147