xref: /haiku/src/add-ons/kernel/file_systems/bfs/Journal.cpp (revision 16d5c24e533eb14b7b8a99ee9f3ec9ba66335b1e)
1 /*
2  * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 //! Transaction and logging
7 
8 
9 #include "Journal.h"
10 
11 #include "Debug.h"
12 #include "Inode.h"
13 
14 
// The header block of a log entry: it lists the destination block_runs of
// the data blocks that follow it in the log. The layout must match Be's
// BFS on-disk format, so the fields are stored in BFS endian byte order
// (see the BFS_ENDIAN_TO_HOST_INT32 conversions below).
struct run_array {
	int32		count;		// number of runs currently stored (BFS endian)
	int32		max_runs;	// capacity of the array (BFS endian)
	block_run	runs[0];	// variable length, fills the rest of the block

	void Init(int32 blockSize);
	void Insert(block_run& run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};
34 
// Collects the run_arrays that describe all blocks of a single transaction
// in the log. Arrays are allocated on demand, one volume block each.
class RunArrays {
public:
							RunArrays(Journal* journal);
							~RunArrays();

			status_t		Insert(off_t blockNumber);

			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
			int32			CountArrays() const { return fArrays.CountItems(); }

			// number of data blocks collected so far (run_array header
			// blocks not included)
			uint32			CountBlocks() const { return fBlockCount; }
			// total log space needed: the data blocks plus one header
			// block per array
			uint32			LogEntryLength() const
								{ return CountBlocks() + CountArrays(); }

			int32			MaxArrayLength();

private:
			status_t		_AddArray();
			bool			_ContainsRun(block_run& run);
			bool			_AddRun(block_run& run);

			Journal*		fJournal;
			uint32			fBlockCount;
			Stack<run_array*> fArrays;
			run_array*		fLastArray;	// array new runs are inserted into
};
61 
// Tracks one transaction that has been written to the on-disk log but whose
// blocks have not all reached their final location yet. Entries are kept in
// Journal::fEntries, guarded by Journal::fEntriesLock.
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
							LogEntry(Journal* journal, uint32 logStart,
								uint32 length);
							~LogEntry();

			// first log block of this entry
			uint32			Start() const { return fStart; }
			// number of log blocks this entry occupies
			uint32			Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
			void			SetTransactionID(int32 id) { fTransactionID = id; }
			int32			TransactionID() const { return fTransactionID; }
#endif

			Journal*		GetJournal() { return fJournal; }

private:
			Journal*		fJournal;
			uint32			fStart;
			uint32			fLength;
#ifdef BFS_DEBUGGER_COMMANDS
			int32			fTransactionID;
#endif
};
86 
87 
#if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
namespace BFSJournalTracing {

// Trace entry recording when a ::LogEntry was started or completely written
// back; it snapshots the entry's state at construction time.
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};

}	// namespace BFSJournalTracing

// T() records a trace entry when tracing is enabled, and expands to a no-op
// otherwise; allocation failures are deliberately ignored (std::nothrow).
#	define T(x) new(std::nothrow) BFSJournalTracing::x;
#else
#	define T(x) ;
#endif
138 
139 
140 //	#pragma mark -
141 
142 
143 static void
144 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
145 	size_t size)
146 {
147 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
148 			+ vecs[index - 1].iov_len == (addr_t)address) {
149 		// the iovec can be combined with the previous one
150 		vecs[index - 1].iov_len += size;
151 		return;
152 	}
153 
154 	if (index == max)
155 		panic("no more space for iovecs!");
156 
157 	// we need to start a new iovec
158 	vecs[index].iov_base = const_cast<void*>(address);
159 	vecs[index].iov_len = size;
160 	index++;
161 }
162 
163 
164 //	#pragma mark - LogEntry
165 
166 
/*!	Creates a tracking entry for a transaction occupying \a length log
	blocks, starting at log block \a start.
	Note: with BFS_DEBUGGER_COMMANDS defined, fTransactionID is left
	uninitialized here - presumably the caller always invokes
	SetTransactionID() afterwards; TODO(review): confirm.
*/
LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}
174 
175 
// The entry owns no resources; deletion is handled by the journal.
LogEntry::~LogEntry()
{
}
179 
180 
181 //	#pragma mark - run_array
182 
183 
/*!	The run_array's size equals the block size of the BFS volume, so we
	cannot use a (non-overridden) new.
	This makes a freshly allocated run_array ready to run.
*/
void
run_array::Init(int32 blockSize)
{
	// the array occupies a whole block - clear all of it, including the
	// space for the runs following this header
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}
195 
196 
197 /*!	Inserts the block_run into the array. You will have to make sure the
198 	array is large enough to contain the entry before calling this function.
199 */
200 void
201 run_array::Insert(block_run& run)
202 {
203 	int32 index = _FindInsertionIndex(run);
204 	if (index == -1) {
205 		// add to the end
206 		runs[CountRuns()] = run;
207 	} else {
208 		// insert at index
209 		memmove(&runs[index + 1], &runs[index],
210 			(CountRuns() - index) * sizeof(off_t));
211 		runs[index] = run;
212 	}
213 
214 	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
215 }
216 
217 
218 /*static*/ int32
219 run_array::MaxRuns(int32 blockSize)
220 {
221 	// For whatever reason, BFS restricts the maximum array size
222 	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
223 	if (maxCount < 128)
224 		return maxCount;
225 
226 	return 127;
227 }
228 
229 
230 /*static*/ int
231 run_array::_Compare(block_run& a, block_run& b)
232 {
233 	int cmp = a.AllocationGroup() - b.AllocationGroup();
234 	if (cmp == 0)
235 		return a.Start() - b.Start();
236 
237 	return cmp;
238 }
239 
240 
241 int32
242 run_array::_FindInsertionIndex(block_run& run)
243 {
244 	int32 min = 0, max = CountRuns() - 1;
245 	int32 i = 0;
246 	if (max >= 8) {
247 		while (min <= max) {
248 			i = (min + max) / 2;
249 
250 			int cmp = _Compare(runs[i], run);
251 			if (cmp < 0)
252 				min = i + 1;
253 			else if (cmp > 0)
254 				max = i - 1;
255 			else
256 				return -1;
257 		}
258 
259 		if (_Compare(runs[i], run) < 0)
260 			i++;
261 	} else {
262 		for (; i <= max; i++) {
263 			if (_Compare(runs[i], run) > 0)
264 				break;
265 		}
266 		if (i == count)
267 			return -1;
268 	}
269 
270 	return i;
271 }
272 
273 
274 //	#pragma mark - RunArrays
275 
276 
/*!	Creates an empty collection of run_arrays for one transaction of
	\a journal; arrays are only allocated on demand by Insert().
*/
RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}
285 
286 
/*!	Frees all run_array blocks that were allocated by _AddArray(). */
RunArrays::~RunArrays()
{
	run_array* array;
	while (fArrays.Pop(&array))
		free(array);
}
293 
294 
295 bool
296 RunArrays::_ContainsRun(block_run& run)
297 {
298 	for (int32 i = 0; i < CountArrays(); i++) {
299 		run_array* array = ArrayAt(i);
300 
301 		for (int32 j = 0; j < array->CountRuns(); j++) {
302 			block_run& arrayRun = array->runs[j];
303 			if (run.AllocationGroup() != arrayRun.AllocationGroup())
304 				continue;
305 
306 			if (run.Start() >= arrayRun.Start()
307 				&& run.Start() + run.Length()
308 					<= arrayRun.Start() + arrayRun.Length())
309 				return true;
310 		}
311 	}
312 
313 	return false;
314 }
315 
316 
317 /*!	Adds the specified block_run into the array.
318 	Note: it doesn't support overlapping - it must only be used
319 	with block_runs of length 1!
320 */
321 bool
322 RunArrays::_AddRun(block_run& run)
323 {
324 	ASSERT(run.length == 1);
325 
326 	// Be's BFS log replay routine can only deal with block_runs of size 1
327 	// A pity, isn't it? Too sad we have to be compatible.
328 
329 	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
330 		return false;
331 
332 	fLastArray->Insert(run);
333 	fBlockCount++;
334 	return true;
335 }
336 
337 
338 status_t
339 RunArrays::_AddArray()
340 {
341 	int32 blockSize = fJournal->GetVolume()->BlockSize();
342 
343 	run_array* array = (run_array*)malloc(blockSize);
344 	if (array == NULL)
345 		return B_NO_MEMORY;
346 
347 	if (fArrays.Push(array) != B_OK) {
348 		free(array);
349 		return B_NO_MEMORY;
350 	}
351 
352 	array->Init(blockSize);
353 	fLastArray = array;
354 	return B_OK;
355 }
356 
357 
358 status_t
359 RunArrays::Insert(off_t blockNumber)
360 {
361 	Volume* volume = fJournal->GetVolume();
362 	block_run run = volume->ToBlockRun(blockNumber);
363 
364 	if (fLastArray != NULL) {
365 		// check if the block is already in the array
366 		if (_ContainsRun(run))
367 			return B_OK;
368 	}
369 
370 	// insert block into array
371 
372 	if (!_AddRun(run)) {
373 		// array is full
374 		if (_AddArray() != B_OK || !_AddRun(run))
375 			return B_NO_MEMORY;
376 	}
377 
378 	return B_OK;
379 }
380 
381 
382 int32
383 RunArrays::MaxArrayLength()
384 {
385 	int32 max = 0;
386 	for (int32 i = 0; i < CountArrays(); i++) {
387 		if (ArrayAt(i)->CountRuns() > max)
388 			max = ArrayAt(i)->CountRuns();
389 	}
390 
391 	return max;
392 }
393 
394 
395 //	#pragma mark - Journal
396 
397 
/*!	Sets up the journal for the given \a volume. A single transaction is
	limited to less than half the log size, so that batched transactions
	can always still be written out.
	NOTE(review): fTransactionID and fTimestamp are not initialized here -
	presumably they are only read after the first transaction has been
	started/ended; confirm.
*/
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");
}
412 
413 
/*!	Flushes the log and all dirty blocks before tearing down the locks.
	Note: a failure of FlushLogAndBlocks() cannot be reported from here.
*/
Journal::~Journal()
{
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);
}
421 
422 
status_t
Journal::InitCheck()
{
	// NOTE(review): the documentation of ReplayLog() states it is invoked
	// from here when the log start/end pointers differ, but this method
	// does nothing - presumably the caller runs ReplayLog() itself; confirm.
	return B_OK;
}
428 
429 
/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within a the volume.
	Returns B_ERROR if the header or any of the runs are invalid.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// make sure every run refers to a valid location on the volume
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}
457 
458 
/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	// the first block of the entry is its run_array header
	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	// the data blocks follow the header, wrapping around the log as needed
	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the super block - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid super block!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// counts the log blocks this entry occupies; starts at 1 to
		// account for the run_array header block itself

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %Ld!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// copy the logged block to its final location on disk
			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	// let the caller continue with the entry following this one
	*_start += count;
	return B_OK;
}
544 
545 
/*!	Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointer don't match.
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	//	equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		// replay the next entry; on success "start" is bumped past it
		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
		// NOTE(review): the in-memory LogStart() gets the BFS endian value
		// here, while _TransactionWritten() assigns host endian values to
		// it - on big endian hosts this looks inconsistent; confirm
	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;

	return fVolume->WriteSuperBlock();
}
600 
601 
/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	if (logEntry == journal->fEntries.First()) {
		// this was the oldest entry still in the log - the log space up to
		// the next entry (or the log end, if this was the last one) is free
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the super block, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back super block: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
660 
661 
662 /*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
663 /*static*/ void
664 Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
665 {
666 	// The current transaction seems to be idle - flush it
667 
668 	Journal* journal = (Journal*)_journal;
669 	journal->_FlushLog(false, false);
670 }
671 
672 
/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// TODO: what are our options here?
			// a) abort the transaction - bad, because all changes are lost
			// b) carry out the changes, but don't use the log - even worse,
			//    as it potentially creates a corrupted disk.
			dprintf("transaction too large (%d blocks, %d main, log size %d)!\n",
				(int)_TransactionSize(), (int)cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
		// byte offset of the log area on the device
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	// each run_array makes up one log entry: the array block itself,
	// followed by the current contents of all blocks it refers to
	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
		int32 wrap = fLogSize - logStart;
			// number of blocks until the log area wraps around

		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry* logEntry = new LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the super block

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		// the remaining sub-transaction becomes the new open transaction;
		// _TransactionWritten() gets notified once the blocks are flushed
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
873 
874 
/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
	If \a canWait is \c false, the method returns immediately (doing nothing)
	when the journal lock cannot be acquired right away.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
906 
907 
/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
	This will block until the journal lock could be acquired.
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}
916 
917 
/*!	Enters the journal for a transaction (or re-enters the already running
	one). If \a separateSubTransactions is \c true, sub-transactions will be
	detached from their parent when the log is written.
	With \a owner == NULL, only the journal lock is acquired; no cache
	transaction is started.
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		// get notified when the transaction goes idle, so it can be flushed
		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
960 
961 
/*!	Leaves the journal. When the last recursion level is dropped (or when
	separate sub-transactions are in effect), the running transaction is
	ended - committed on \a success, aborted otherwise.
	NOTE(review): the \a owner argument is currently not used.
*/
void
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (fOwner != NULL)
			_TransactionDone(success);

		// remember when the last transaction ended
		fTimestamp = system_time();
		fOwner = NULL;

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	}

	recursive_lock_unlock(&fLock);
}
982 
983 
984 uint32
985 Journal::_TransactionSize() const
986 {
987 	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
988 		fTransactionID);
989 	if (count <= 0)
990 		return 0;
991 
992 	// take the number of array blocks in this transaction into account
993 	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
994 	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
995 	return count + arrayBlocks;
996 }
997 
998 
/*!	Ends the current transaction: aborts it when \a success is \c false,
	otherwise either batches it with subsequent transactions (while it stays
	below fMaxTransactionSize) or writes it out to the log right away.
*/
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		// roll back all changes of this (sub-)transaction
		if (_HasSubTransaction())
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
		else
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}
1026 
1027 
1028 //	#pragma mark - debugger commands
1029 
1030 
1031 #ifdef BFS_DEBUGGER_COMMANDS
1032 
/*!	kdebug helper: prints the log area pointers and all in-flight log
	entries of this journal.
	NOTE(review): fEntriesLock is not acquired here - presumably that is
	fine because this only runs inside the kernel debugger; confirm.
*/
void
Journal::Dump()
{
	kprintf("log start: %ld\n", fVolume->LogStart());
	kprintf("log end:   %ld\n", fVolume->LogEnd());
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6ld %6lu %6lu\n", entry, entry->TransactionID(),
			entry->Start(), entry->Length());
	}
}
1050 
1051 
1052 int
1053 dump_journal(int argc, char** argv)
1054 {
1055 	if (argc != 2 || !strcmp(argv[1], "--help")) {
1056 		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
1057 		return 0;
1058 	}
1059 
1060 	Volume* volume = (Volume*)parse_expression(argv[1]);
1061 	Journal* journal = volume->GetJournal(0);
1062 
1063 	journal->Dump();
1064 	return 0;
1065 }
1066 
1067 #endif	// BFS_DEBUGGER_COMMANDS
1068 
1069 
1070 //	#pragma mark - Transaction
1071 
1072 
1073 status_t
1074 Transaction::Start(Volume* volume, off_t refBlock)
1075 {
1076 	// has it already been started?
1077 	if (fJournal != NULL)
1078 		return B_OK;
1079 
1080 	fJournal = volume->GetJournal(refBlock);
1081 	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1082 		return B_OK;
1083 
1084 	fJournal = NULL;
1085 	return B_ERROR;
1086 }
1087 
1088 
/*!	Adds an inode to this transaction. This means that the inode will be write
	locked until the transaction ended.
	To ensure that the inode will stay valid until that point, an extra reference
	is acquired to it as long as this transaction stays active.
*/
void
Transaction::AddInode(Inode* inode)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	// These flags can only change while holding the transaction lock
	if ((inode->Flags() & INODE_IN_TRANSACTION) != 0)
		return;

	// We share the same list link with the removed list, so we have to remove
	// the inode from that list here (and add it back when we no longer need it)
	if ((inode->Flags() & INODE_DELETED) != 0)
		GetVolume()->RemovedInodes().Remove(inode);

	// keep the vnode alive while it is part of the transaction (no vnode
	// exists yet while the volume is being initialized)
	if (!GetVolume()->IsInitializing())
		acquire_vnode(GetVolume()->FSVolume(), inode->ID());

	rw_lock_write_lock(&inode->Lock());
	fLockedInodes.Add(inode);
	inode->Node().flags |= HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
}
1116 
1117 
/*!	Removes an inode from this transaction again, unlocking it, and giving
	back the extra vnode reference that AddInode() acquired.
*/
void
Transaction::RemoveInode(Inode* inode)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
	fLockedInodes.Remove(inode);
	rw_lock_write_unlock(&inode->Lock());

	// See AddInode() why we do this here
	if ((inode->Flags() & INODE_DELETED) != 0)
		GetVolume()->RemovedInodes().Add(inode);

	if (!GetVolume()->IsInitializing())
		put_vnode(GetVolume()->FSVolume(), inode->ID());
}
1135 
1136 
/*!	Unlocks and releases all inodes that are still part of this transaction;
	performs the same cleanup per inode as RemoveInode().
*/
void
Transaction::_UnlockInodes()
{
	while (Inode* inode = fLockedInodes.RemoveHead()) {
		inode->Node().flags &= ~HOST_ENDIAN_TO_BFS_INT32(INODE_IN_TRANSACTION);
		rw_lock_write_unlock(&inode->Lock());

		// See AddInode() why we do this here
		if ((inode->Flags() & INODE_DELETED) != 0)
			GetVolume()->RemovedInodes().Add(inode);

		if (!GetVolume()->IsInitializing())
			put_vnode(GetVolume()->FSVolume(), inode->ID());
	}
}
1152 
1153