xref: /haiku/src/add-ons/kernel/file_systems/bfs/Journal.cpp (revision 97901ec593ec4dd50ac115c1c35a6d72f6e489a5)
1 /*
2  * Copyright 2001-2010, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  */
5 
6 
7 //! Transaction and logging
8 
9 
10 #include "Journal.h"
11 
12 #include "Debug.h"
13 #include "Inode.h"
14 
15 
// On-disk layout of a log index block: a sorted array of the block_runs
// that make up one log entry. The structure occupies a full volume block;
// count and max_runs are stored in disk (BFS) endianness.
struct run_array {
	int32		count;
	int32		max_runs;
	block_run	runs[0];
		// flexible array; extends to the end of the volume block

	void Init(int32 blockSize);
	void Insert(block_run& run);

	// Accessors convert from disk endianness to host endianness.
	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run& RunAt(int32 i) const { return runs[i]; }

	// Computes how many block_runs fit into one block of the given size.
	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run& a, block_run& b);
	int32 _FindInsertionIndex(block_run& run);
};
35 
// Collects the blocks of a transaction into one or more run_arrays
// (each one volume block in size), ready to be written to the log.
class RunArrays {
public:
							RunArrays(Journal* journal);
							~RunArrays();

			// Adds the given block unless it is already contained.
			status_t		Insert(off_t blockNumber);

			run_array*		ArrayAt(int32 i) { return fArrays.Array()[i]; }
			int32			CountArrays() const { return fArrays.CountItems(); }

			// Number of data blocks inserted so far (index blocks excluded).
			uint32			CountBlocks() const { return fBlockCount; }
			// Total log space needed: data blocks plus one index block
			// per array.
			uint32			LogEntryLength() const
								{ return CountBlocks() + CountArrays(); }

			int32			MaxArrayLength();

private:
			status_t		_AddArray();
			bool			_ContainsRun(block_run& run);
			bool			_AddRun(block_run& run);

			Journal*		fJournal;
			uint32			fBlockCount;
			Stack<run_array*> fArrays;
			run_array*		fLastArray;
				// array new runs are currently inserted into
};
62 
// Describes one entry that has been written to the on-disk log but whose
// blocks have not yet been flushed to their final location. Kept in the
// journal's fEntries list until _TransactionWritten() removes it.
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
public:
							LogEntry(Journal* journal, uint32 logStart,
								uint32 length);
							~LogEntry();

			// Start position and length of the entry within the log,
			// in blocks.
			uint32			Start() const { return fStart; }
			uint32			Length() const { return fLength; }

#ifdef BFS_DEBUGGER_COMMANDS
			void			SetTransactionID(int32 id) { fTransactionID = id; }
			int32			TransactionID() const { return fTransactionID; }
#endif

			Journal*		GetJournal() { return fJournal; }

private:
			Journal*		fJournal;
			uint32			fStart;
			uint32			fLength;
#ifdef BFS_DEBUGGER_COMMANDS
			int32			fTransactionID;
#endif
};
87 
88 
89 #if BFS_TRACING && !defined(BFS_SHELL) && !defined(_BOOT_MODE)
90 namespace BFSJournalTracing {
91 
// Trace entry recording when a ::LogEntry is started or has been completely
// written back; created via the T() macro below when BFS_TRACING is on.
class LogEntry : public AbstractTraceEntry {
public:
	LogEntry(::LogEntry* entry, off_t logPosition, bool started)
		:
		fEntry(entry),
#ifdef BFS_DEBUGGER_COMMANDS
		fTransactionID(entry->TransactionID()),
#endif
		fStart(entry->Start()),
		fLength(entry->Length()),
		fLogPosition(logPosition),
		fStarted(started)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
#ifdef BFS_DEBUGGER_COMMANDS
		out.Print("bfs:j:%s entry %p id %ld, start %lu, length %lu, log %s "
			"%lu\n", fStarted ? "Started" : "Written", fEntry,
			fTransactionID, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#else
		out.Print("bfs:j:%s entry %p start %lu, length %lu, log %s %lu\n",
			fStarted ? "Started" : "Written", fEntry, fStart, fLength,
			fStarted ? "end" : "start", fLogPosition);
#endif
	}

private:
	::LogEntry*	fEntry;
#ifdef BFS_DEBUGGER_COMMANDS
	int32		fTransactionID;
#endif
	uint32		fStart;
	uint32		fLength;
	uint32		fLogPosition;
	bool		fStarted;
};
132 
133 }	// namespace BFSJournalTracing
134 
135 #	define T(x) new(std::nothrow) BFSJournalTracing::x;
136 #else
137 #	define T(x) ;
138 #endif
139 
140 
141 //	#pragma mark -
142 
143 
144 static void
145 add_to_iovec(iovec* vecs, int32& index, int32 max, const void* address,
146 	size_t size)
147 {
148 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
149 			+ vecs[index - 1].iov_len == (addr_t)address) {
150 		// the iovec can be combined with the previous one
151 		vecs[index - 1].iov_len += size;
152 		return;
153 	}
154 
155 	if (index == max)
156 		panic("no more space for iovecs!");
157 
158 	// we need to start a new iovec
159 	vecs[index].iov_base = const_cast<void*>(address);
160 	vecs[index].iov_len = size;
161 	index++;
162 }
163 
164 
165 //	#pragma mark - LogEntry
166 
167 
/*!	Creates a log entry covering \a length blocks beginning at log position
	\a start on behalf of \a journal.
*/
LogEntry::LogEntry(Journal* journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}
175 
176 
LogEntry::~LogEntry()
{
	// nothing to free; the entry only references the journal
}
180 
181 
182 //	#pragma mark - run_array
183 
184 
/*!	The run_array's size equals the block size of the BFS volume, so we
	cannot use a (non-overridden) new.
	This makes a freshly allocated run_array ready to run.
*/
void
run_array::Init(int32 blockSize)
{
	// clear the whole block, not just sizeof(run_array) - the flexible
	// runs[] array extends to the end of the block
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}
196 
197 
198 /*!	Inserts the block_run into the array. You will have to make sure the
199 	array is large enough to contain the entry before calling this function.
200 */
201 void
202 run_array::Insert(block_run& run)
203 {
204 	int32 index = _FindInsertionIndex(run);
205 	if (index == -1) {
206 		// add to the end
207 		runs[CountRuns()] = run;
208 	} else {
209 		// insert at index
210 		memmove(&runs[index + 1], &runs[index],
211 			(CountRuns() - index) * sizeof(off_t));
212 		runs[index] = run;
213 	}
214 
215 	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
216 }
217 
218 
219 /*static*/ int32
220 run_array::MaxRuns(int32 blockSize)
221 {
222 	// For whatever reason, BFS restricts the maximum array size
223 	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
224 	if (maxCount < 128)
225 		return maxCount;
226 
227 	return 127;
228 }
229 
230 
231 /*static*/ int
232 run_array::_Compare(block_run& a, block_run& b)
233 {
234 	int cmp = a.AllocationGroup() - b.AllocationGroup();
235 	if (cmp == 0)
236 		return a.Start() - b.Start();
237 
238 	return cmp;
239 }
240 
241 
242 int32
243 run_array::_FindInsertionIndex(block_run& run)
244 {
245 	int32 min = 0, max = CountRuns() - 1;
246 	int32 i = 0;
247 	if (max >= 8) {
248 		while (min <= max) {
249 			i = (min + max) / 2;
250 
251 			int cmp = _Compare(runs[i], run);
252 			if (cmp < 0)
253 				min = i + 1;
254 			else if (cmp > 0)
255 				max = i - 1;
256 			else
257 				return -1;
258 		}
259 
260 		if (_Compare(runs[i], run) < 0)
261 			i++;
262 	} else {
263 		for (; i <= max; i++) {
264 			if (_Compare(runs[i], run) > 0)
265 				break;
266 		}
267 		if (i == count)
268 			return -1;
269 	}
270 
271 	return i;
272 }
273 
274 
275 //	#pragma mark - RunArrays
276 
277 
/*!	Creates an empty collection of run_arrays for a transaction of
	\a journal; arrays are allocated on demand by Insert().
*/
RunArrays::RunArrays(Journal* journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}
286 
287 
288 RunArrays::~RunArrays()
289 {
290 	run_array* array;
291 	while (fArrays.Pop(&array))
292 		free(array);
293 }
294 
295 
296 bool
297 RunArrays::_ContainsRun(block_run& run)
298 {
299 	for (int32 i = 0; i < CountArrays(); i++) {
300 		run_array* array = ArrayAt(i);
301 
302 		for (int32 j = 0; j < array->CountRuns(); j++) {
303 			block_run& arrayRun = array->runs[j];
304 			if (run.AllocationGroup() != arrayRun.AllocationGroup())
305 				continue;
306 
307 			if (run.Start() >= arrayRun.Start()
308 				&& run.Start() + run.Length()
309 					<= arrayRun.Start() + arrayRun.Length())
310 				return true;
311 		}
312 	}
313 
314 	return false;
315 }
316 
317 
318 /*!	Adds the specified block_run into the array.
319 	Note: it doesn't support overlapping - it must only be used
320 	with block_runs of length 1!
321 */
322 bool
323 RunArrays::_AddRun(block_run& run)
324 {
325 	ASSERT(run.length == 1);
326 
327 	// Be's BFS log replay routine can only deal with block_runs of size 1
328 	// A pity, isn't it? Too sad we have to be compatible.
329 
330 	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
331 		return false;
332 
333 	fLastArray->Insert(run);
334 	fBlockCount++;
335 	return true;
336 }
337 
338 
339 status_t
340 RunArrays::_AddArray()
341 {
342 	int32 blockSize = fJournal->GetVolume()->BlockSize();
343 
344 	run_array* array = (run_array*)malloc(blockSize);
345 	if (array == NULL)
346 		return B_NO_MEMORY;
347 
348 	if (fArrays.Push(array) != B_OK) {
349 		free(array);
350 		return B_NO_MEMORY;
351 	}
352 
353 	array->Init(blockSize);
354 	fLastArray = array;
355 	return B_OK;
356 }
357 
358 
359 status_t
360 RunArrays::Insert(off_t blockNumber)
361 {
362 	Volume* volume = fJournal->GetVolume();
363 	block_run run = volume->ToBlockRun(blockNumber);
364 
365 	if (fLastArray != NULL) {
366 		// check if the block is already in the array
367 		if (_ContainsRun(run))
368 			return B_OK;
369 	}
370 
371 	// insert block into array
372 
373 	if (!_AddRun(run)) {
374 		// array is full
375 		if (_AddArray() != B_OK || !_AddRun(run))
376 			return B_NO_MEMORY;
377 	}
378 
379 	return B_OK;
380 }
381 
382 
383 int32
384 RunArrays::MaxArrayLength()
385 {
386 	int32 max = 0;
387 	for (int32 i = 0; i < CountArrays(); i++) {
388 		if (ArrayAt(i)->CountRuns() > max)
389 			max = ArrayAt(i)->CountRuns();
390 	}
391 
392 	return max;
393 }
394 
395 
396 //	#pragma mark - Journal
397 
398 
/*!	Sets up the journal for \a volume; the log area itself is described by
	the volume's super block.
*/
Journal::Journal(Volume* volume)
	:
	fVolume(volume),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
		// NOTE(review): the "/ 2 - 5" presumably leaves headroom for the
		// run_array index blocks and batching - confirm the exact rationale
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false),
	fSeparateSubTransactions(false)
{
	recursive_lock_init(&fLock, "bfs journal");
	mutex_init(&fEntriesLock, "bfs journal entries");
}
413 
414 
Journal::~Journal()
{
	// write back any pending log entries and dirty blocks before tearing
	// down the locks
	FlushLogAndBlocks();

	recursive_lock_destroy(&fLock);
	mutex_destroy(&fEntriesLock);
}
422 
423 
status_t
Journal::InitCheck()
{
	// Nothing to check at the moment; construction cannot fail in a way
	// this method could detect. (NOTE(review): ReplayLog()'s comment says
	// it is called from here when log start/end differ, but this method
	// currently does not do that - confirm which is intended.)
	return B_OK;
}
429 
430 
/*!	\brief Does a very basic consistency check of the run array.
	It will check the maximum run count as well as if all of the runs fall
	within the volume.
*/
status_t
Journal::_CheckRunArray(const run_array* array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	// make sure all runs point to valid locations on the volume
	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}
458 
459 
/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32* _start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;
		// log positions wrap around within the log area

	CachedBlock cachedArray(fVolume);

	// the first block of the entry is its run_array index block
	const run_array* array = (const run_array*)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	// data blocks follow the index block, wrapping around the log area
	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the super block - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid super block!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;
		// starts at 1 to account for the index block itself

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run& run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %" B_PRIdOFF "!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8* data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// copy the logged block to its final location on the volume
			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	// advance the caller's position past this entry
	*_start += count;
	return B_OK;
}
545 
546 
/*!	Replays all log entries - this will put the disk into a
	consistent and clean state, if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointer don't match.
*/
status_t
Journal::ReplayLog()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	//	equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() == fVolume->LogEnd())
		return B_OK;

	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) {
		INFORM(("log_start and log_end differ, but disk is marked clean - "
			"trying to replay log...\n"));
	}

	if (fVolume->IsReadOnly())
		return B_READ_ONLY_DEVICE;

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		// replay one entry; on success, "start" is bumped past it
		status_t status = _ReplayRunArray(&start);
		if (status != B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));
	// mark the log empty and the disk clean again
	fVolume->SuperBlock().log_start = HOST_ENDIAN_TO_BFS_INT64(
		fVolume->LogEnd());
	fVolume->LogStart() = HOST_ENDIAN_TO_BFS_INT64(fVolume->LogEnd());
	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;

	return fVolume->WriteSuperBlock();
}
601 
602 
603 size_t
604 Journal::CurrentTransactionSize() const
605 {
606 	if (_HasSubTransaction()) {
607 		return cache_blocks_in_sub_transaction(fVolume->BlockCache(),
608 			fTransactionID);
609 	}
610 
611 	return cache_blocks_in_main_transaction(fVolume->BlockCache(),
612 		fTransactionID);
613 }
614 
615 
616 bool
617 Journal::CurrentTransactionTooLarge() const
618 {
619 	return CurrentTransactionSize() > fLogSize;
620 }
621 
622 
/*!	This is a callback function that is called by the cache, whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note, the transactions may not be
	completed in the order they were written.
*/
/*static*/ void
Journal::_TransactionWritten(int32 transactionID, int32 event, void* _logEntry)
{
	LogEntry* logEntry = (LogEntry*)_logEntry;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal* journal = logEntry->GetJournal();
	disk_super_block& superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...

	mutex_lock(&journal->fEntriesLock);

	if (logEntry == journal->fEntries.First()) {
		// this was the oldest outstanding entry - the log can now start at
		// the next entry, or is empty if there is none
		LogEntry* next = journal->fEntries.GetNext(logEntry);
		if (next != NULL) {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(next->Start()
				% journal->fLogSize);
		} else {
			superBlock.log_start = HOST_ENDIAN_TO_BFS_INT64(
				journal->fVolume->LogEnd());
		}

		update = true;
	}

	T(LogEntry(logEntry, superBlock.LogStart(), false));

	// the entry's log space is free again
	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	mutex_unlock(&journal->fEntriesLock);

	delete logEntry;

	// update the super block, and change the disk's state, if necessary

	if (update) {
		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionWritten: could not write back super block: %s\n",
				strerror(status)));
		}

		journal->fVolume->LogStart() = superBlock.LogStart();
	}
}
681 
682 
683 /*!	Listens to TRANSACTION_IDLE events, and flushes the log when that happens */
684 /*static*/ void
685 Journal::_TransactionIdle(int32 transactionID, int32 event, void* _journal)
686 {
687 	// The current transaction seems to be idle - flush it
688 
689 	Journal* journal = (Journal*)_journal;
690 	journal->_FlushLog(false, false);
691 }
692 
693 
/*!	Writes the blocks that are part of current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// We created a transaction larger than one we can write back to
			// disk - the only option we have (besides risking disk corruption
			// by writing it back anyway), is to let it fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
		// byte offset of the log area on the device
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks()) {
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
		if (runArrays.LogEntryLength() > FreeLogBlocks()) {
			panic("no space in log after sync (%ld for %ld blocks)!",
				(long)FreeLogBlocks(), (long)runArrays.LogEntryLength());
		}
	}

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec* vecs = (iovec*)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array* array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
			// count starts at 1 for the index block
		int32 wrap = fLogSize - logStart;

		// the run_array index block leads the entry
		add_to_iovec(vecs, index, maxVecs, (void*)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void* data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					// NOTE(review): blocks obtained by block_cache_get() in
					// earlier iterations are not put back on this path -
					// confirm whether that reference leak is acceptable here
					free(vecs);
					return B_IO_ERROR;
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run& run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	// track the entry until the cache reports its blocks written back
	LogEntry* logEntry = new LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

#ifdef BFS_DEBUGGER_COMMANDS
	logEntry->SetTransactionID(fTransactionID);
#endif

	// Update the log end pointer in the super block

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = HOST_ENDIAN_TO_BFS_INT64(logPosition);

	status = fVolume->WriteSuperBlock();

	fVolume->LogEnd() = logPosition;
	T(LogEntry(logEntry, fVolume->LogEnd(), true));

	// We need to flush the drives own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	mutex_lock(&fEntriesLock);
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	mutex_unlock(&fEntriesLock);

	if (detached) {
		// _TransactionWritten will be called when the cache has flushed
		// the detached transaction's blocks
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionWritten, logEntry);
		fUnwrittenTransactions = 1;

		if (status == B_OK && _TransactionSize() > fLogSize) {
			// If the transaction is too large after writing, there is no way to
			// recover, so let this transaction fail.
			dprintf("transaction too large (%d blocks, log size %d)!\n",
				(int)_TransactionSize(), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionWritten, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}
900 
901 
/*!	Flushes the current log entry to disk. If \a flushBlocks is \c true it will
	also write back all dirty blocks for this volume.
	If \a canWait is \c false, the method returns immediately when the journal
	lock cannot be acquired without blocking.
*/
status_t
Journal::_FlushLog(bool canWait, bool flushBlocks)
{
	status_t status = canWait ? recursive_lock_lock(&fLock)
		: recursive_lock_trylock(&fLock);
	if (status != B_OK)
		return status;

	if (recursive_lock_get_recursion(&fLock) > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		recursive_lock_unlock(&fLock);
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	if (flushBlocks)
		status = fVolume->FlushDevice();

	recursive_lock_unlock(&fLock);
	return status;
}
933 
934 
/*!	Flushes the current log entry to disk, and also writes back all dirty
	blocks for this volume (completing all open transactions).
	Blocks until the journal lock could be acquired.
*/
status_t
Journal::FlushLogAndBlocks()
{
	return _FlushLog(true, true);
}
943 
944 
/*!	Locks the journal for the given transaction \a owner, starting a new
	cache transaction (or a sub-transaction, if one is already running and
	sub-transactions are not kept separate).
*/
status_t
Journal::Lock(Transaction* owner, bool separateSubTransactions)
{
	status_t status = recursive_lock_lock(&fLock);
	if (status != B_OK)
		return status;

	if (!fSeparateSubTransactions && recursive_lock_get_recursion(&fLock) > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	if (separateSubTransactions)
		fSeparateSubTransactions = true;

	// chain the new owner in front of the previous one
	if (owner != NULL)
		owner->SetParent(fOwner);

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to either detach the latest sub transaction on
	//	demand, as well as having some kind of fall back plan in case the
	//	sub transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fOwner != NULL) {
		if (fUnwrittenTransactions > 0) {
			// start a sub transaction
			cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
			fHasSubtransaction = true;
		} else
			fTransactionID = cache_start_transaction(fVolume->BlockCache());

		if (fTransactionID < B_OK) {
			recursive_lock_unlock(&fLock);
			return fTransactionID;
		}

		// have _TransactionIdle() flush the log when the transaction sits
		// unused for a while
		cache_add_transaction_listener(fVolume->BlockCache(), fTransactionID,
			TRANSACTION_IDLE, _TransactionIdle, this);
	}
	return B_OK;
}
990 
991 
/*!	Unlocks the journal; if this is the outermost lock holder (or if
	sub-transactions are kept separate), the current transaction is ended
	with the given \a success state first.
*/
status_t
Journal::Unlock(Transaction* owner, bool success)
{
	if (fSeparateSubTransactions || recursive_lock_get_recursion(&fLock) == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		// (they must make the parent fail, too)
		if (owner != NULL) {
			status_t status = _TransactionDone(success);
			if (status != B_OK)
				return status;

			// Unlocking the inodes might trigger new transactions, but we
			// cannot reuse the current one anymore, as this one is already
			// closed.
			bool separateSubTransactions = fSeparateSubTransactions;
			fSeparateSubTransactions = true;
			owner->NotifyListeners(success);
			fSeparateSubTransactions = separateSubTransactions;

			fOwner = owner->Parent();
		} else
			fOwner = NULL;

		// remember when the last transaction ended
		fTimestamp = system_time();

		if (fSeparateSubTransactions
			&& recursive_lock_get_recursion(&fLock) == 1)
			fSeparateSubTransactions = false;
	} else
		owner->MoveListenersTo(fOwner);

	recursive_lock_unlock(&fLock);
	return B_OK;
}
1027 
1028 
1029 uint32
1030 Journal::_TransactionSize() const
1031 {
1032 	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
1033 		fTransactionID);
1034 	if (count <= 0)
1035 		return 0;
1036 
1037 	// take the number of array blocks in this transaction into account
1038 	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
1039 	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
1040 	return count + arrayBlocks;
1041 }
1042 
1043 
1044 status_t
1045 Journal::_TransactionDone(bool success)
1046 {
1047 	if (!success) {
1048 		if (_HasSubTransaction()) {
1049 			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
1050 			// We can continue to use the parent transaction afterwards
1051 		} else {
1052 			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);
1053 			fUnwrittenTransactions = 0;
1054 		}
1055 
1056 		return B_OK;
1057 	}
1058 
1059 	// Up to a maximum size, we will just batch several
1060 	// transactions together to improve speed
1061 	uint32 size = _TransactionSize();
1062 	if (size < fMaxTransactionSize) {
1063 		// Flush the log from time to time, so that we have enough space
1064 		// for this transaction
1065 		if (size > FreeLogBlocks())
1066 			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);
1067 
1068 		fUnwrittenTransactions++;
1069 		return B_OK;
1070 	}
1071 
1072 	return _WriteTransactionToLog();
1073 }
1074 
1075 
1076 //	#pragma mark - debugger commands
1077 
1078 
1079 #ifdef BFS_DEBUGGER_COMMANDS
1080 
1081 
/*!	KDL helper: dumps the journal's state and its list of outstanding log
	entries to the kernel debugger.
*/
void
Journal::Dump()
{
	kprintf("Journal %p\n", this);
	kprintf("  log start:            %ld\n", fVolume->LogStart());
	kprintf("  log end:              %ld\n", fVolume->LogEnd());
	kprintf("  owner:                %p\n", fOwner);
	kprintf("  log size:             %lu\n", fLogSize);
	kprintf("  max transaction size: %lu\n", fMaxTransactionSize);
	kprintf("  used:                 %lu\n", fUsed);
	kprintf("  unwritten:            %ld\n", fUnwrittenTransactions);
	kprintf("  timestamp:            %lld\n", fTimestamp);
	kprintf("  transaction ID:       %ld\n", fTransactionID);
	kprintf("  has subtransaction:   %d\n", fHasSubtransaction);
	kprintf("  separate sub-trans.:  %d\n", fSeparateSubTransactions);
	kprintf("entries:\n");
	kprintf("  address        id  start length\n");

	LogEntryList::Iterator iterator = fEntries.GetIterator();

	while (iterator.HasNext()) {
		LogEntry* entry = iterator.Next();

		kprintf("  %p %6ld %6lu %6lu\n", entry, entry->TransactionID(),
			entry->Start(), entry->Length());
	}
}
1109 
1110 
1111 int
1112 dump_journal(int argc, char** argv)
1113 {
1114 	if (argc != 2 || !strcmp(argv[1], "--help")) {
1115 		kprintf("usage: %s <ptr-to-volume>\n", argv[0]);
1116 		return 0;
1117 	}
1118 
1119 	Volume* volume = (Volume*)parse_expression(argv[1]);
1120 	Journal* journal = volume->GetJournal(0);
1121 
1122 	journal->Dump();
1123 	return 0;
1124 }
1125 
1126 
1127 #endif	// BFS_DEBUGGER_COMMANDS
1128 
1129 
1130 //	#pragma mark - TransactionListener
1131 
1132 
// Trivial constructor/destructor pair; TransactionListener is an abstract
// interface whose behavior lives in its subclasses.
TransactionListener::TransactionListener()
{
}


TransactionListener::~TransactionListener()
{
}
1141 
1142 
1143 //	#pragma mark - Transaction
1144 
1145 
1146 status_t
1147 Transaction::Start(Volume* volume, off_t refBlock)
1148 {
1149 	// has it already been started?
1150 	if (fJournal != NULL)
1151 		return B_OK;
1152 
1153 	fJournal = volume->GetJournal(refBlock);
1154 	if (fJournal != NULL && fJournal->Lock(this, false) == B_OK)
1155 		return B_OK;
1156 
1157 	fJournal = NULL;
1158 	return B_ERROR;
1159 }
1160 
1161 
/*!	Registers \a listener to be notified when this transaction ends.
	Must only be called on a running transaction.
*/
void
Transaction::AddListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Add(listener);
}
1170 
1171 
/*!	Detaches \a listener from this transaction and notifies it about the
	removal. Must only be called on a running transaction.
*/
void
Transaction::RemoveListener(TransactionListener* listener)
{
	if (fJournal == NULL)
		panic("Transaction is not running!");

	fListeners.Remove(listener);
	listener->RemovedFromTransaction();
}
1181 
1182 
1183 void
1184 Transaction::NotifyListeners(bool success)
1185 {
1186 	while (TransactionListener* listener = fListeners.RemoveHead()) {
1187 		listener->TransactionDone(success);
1188 		listener->RemovedFromTransaction();
1189 	}
1190 }
1191 
1192 
1193 /*!	Move the inodes into the parent transaction. This is needed only to make
1194 	sure they will still be reverted in case the transaction is aborted.
1195 */
1196 void
1197 Transaction::MoveListenersTo(Transaction* transaction)
1198 {
1199 	while (TransactionListener* listener = fListeners.RemoveHead()) {
1200 		transaction->fListeners.Add(listener);
1201 	}
1202 }
1203