/*
 * Copyright 2001-2008, Axel Dörfler, axeld@pinc-software.de.
 * This file may be used under the terms of the MIT License.
 */

//! Transaction and logging


#include "Journal.h"
#include "Inode.h"
#include "Debug.h"

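/*!	The on-disk header block of a log entry: it lists the block_runs whose
	data blocks follow it in the log. The fields are stored in BFS
	endianness, hence the conversions in the accessors below.
*/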
struct run_array {
	int32		count;
	int32		max_runs;
	block_run	runs[0];

	void Init(int32 blockSize);
	void Insert(block_run &run);

	int32 CountRuns() const { return BFS_ENDIAN_TO_HOST_INT32(count); }
	int32 MaxRuns() const { return BFS_ENDIAN_TO_HOST_INT32(max_runs) - 1; }
		// that -1 accounts for an off-by-one error in Be's BFS implementation
	const block_run &RunAt(int32 i) const { return runs[i]; }

	static int32 MaxRuns(int32 blockSize);

private:
	static int _Compare(block_run &a, block_run &b);
	int32 _FindInsertionIndex(block_run &run);
};

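/*!	Collects the run_arrays of a single log entry. Blocks are added one at
	a time via Insert(); whenever the current array is full, a new one is
	allocated.
*/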
class RunArrays {
	public:
		RunArrays(Journal *journal);
		~RunArrays();

		status_t Insert(off_t blockNumber);

		run_array *ArrayAt(int32 i) { return fArrays.Array()[i]; }
		int32 CountArrays() const { return fArrays.CountItems(); }

		uint32 CountBlocks() const { return fBlockCount; }
		uint32 LogEntryLength() const { return CountBlocks() + CountArrays(); }

		int32 MaxArrayLength();

	private:
		status_t _AddArray();
		bool _ContainsRun(block_run &run);
		bool _AddRun(block_run &run);

		Journal		*fJournal;
		uint32		fBlockCount;
		Stack<run_array *> fArrays;
		run_array	*fLastArray;
};

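/*!	Describes a log entry that has been written to the log but whose blocks
	have not necessarily been flushed to their final location yet. It is
	used by _TransactionNotify() to advance the log start pointer once the
	transaction has been written back.
*/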
class LogEntry : public DoublyLinkedListLinkImpl<LogEntry> {
	public:
		LogEntry(Journal *journal, uint32 logStart, uint32 length);
		~LogEntry();

		uint32 Start() const { return fStart; }
		uint32 Length() const { return fLength; }

		Journal *GetJournal() { return fJournal; }

	private:
		Journal		*fJournal;
		uint32		fStart;
		uint32		fLength;
};


//	#pragma mark -


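/*!	Adds the buffer specified by \a address and \a size to the iovec array
	\a vecs. If the buffer is directly adjacent to the previous one, the two
	are merged instead of occupying a new slot.
*/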
static void
add_to_iovec(iovec *vecs, int32 &index, int32 max, const void *address,
	size_t size)
{
	if (index > 0 && (addr_t)vecs[index - 1].iov_base
			+ vecs[index - 1].iov_len == (addr_t)address) {
		// the iovec can be combined with the previous one
		vecs[index - 1].iov_len += size;
		return;
	}

	if (index == max)
		panic("no more space for iovecs!");

	// we need to start a new iovec
	vecs[index].iov_base = const_cast<void *>(address);
	vecs[index].iov_len = size;
	index++;
}


//	#pragma mark - LogEntry


LogEntry::LogEntry(Journal *journal, uint32 start, uint32 length)
	:
	fJournal(journal),
	fStart(start),
	fLength(length)
{
}


LogEntry::~LogEntry()
{
}


//	#pragma mark - run_array


/*!	The run_array's size equals the block size of the BFS volume, so we
	cannot use a (non-overridden) new.
	This makes a freshly allocated run_array ready to run.
*/
void
run_array::Init(int32 blockSize)
{
	memset(this, 0, blockSize);
	count = 0;
	max_runs = HOST_ENDIAN_TO_BFS_INT32(MaxRuns(blockSize));
}

/*!	Inserts the block_run into the array. You will have to make sure the
	array is large enough to contain the entry before calling this function.
*/
void
run_array::Insert(block_run &run)
{
	int32 index = _FindInsertionIndex(run);
	if (index == -1) {
		// add to the end
		runs[CountRuns()] = run;
	} else {
		// insert at index
		memmove(&runs[index + 1], &runs[index],
			(CountRuns() - index) * sizeof(block_run));
		runs[index] = run;
	}

	count = HOST_ENDIAN_TO_BFS_INT32(CountRuns() + 1);
}

/*static*/ int32
run_array::MaxRuns(int32 blockSize)
{
	// For whatever reason, BFS restricts the maximum array size
	uint32 maxCount = (blockSize - sizeof(run_array)) / sizeof(block_run);
	if (maxCount < 128)
		return maxCount;

	return 127;
}


/*static*/ int
run_array::_Compare(block_run &a, block_run &b)
{
	int cmp = a.AllocationGroup() - b.AllocationGroup();
	if (cmp == 0)
		return a.Start() - b.Start();

	return cmp;
}

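/*!	Returns the position at which \a run has to be inserted to keep the
	array sorted, or -1 if Insert() can simply append it at the end. A
	binary search is used once the array contains more than a few runs.
*/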
int32
run_array::_FindInsertionIndex(block_run &run)
{
	int32 min = 0, max = CountRuns() - 1;
	int32 i = 0;
	if (max >= 8) {
		while (min <= max) {
			i = (min + max) / 2;

			int cmp = _Compare(runs[i], run);
			if (cmp < 0)
				min = i + 1;
			else if (cmp > 0)
				max = i - 1;
			else
				return -1;
		}

		if (_Compare(runs[i], run) < 0)
			i++;
	} else {
		for (; i <= max; i++) {
			if (_Compare(runs[i], run) > 0)
				break;
		}
		if (i == CountRuns())
			return -1;
	}

	return i;
}


//	#pragma mark - RunArrays


RunArrays::RunArrays(Journal *journal)
	:
	fJournal(journal),
	fBlockCount(0),
	fArrays(),
	fLastArray(NULL)
{
}


RunArrays::~RunArrays()
{
	run_array *array;
	while (fArrays.Pop(&array))
		free(array);
}


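/*!	Returns whether the given block_run is already fully covered by one of
	the run_arrays collected so far.
*/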
bool
RunArrays::_ContainsRun(block_run &run)
{
	for (int32 i = 0; i < CountArrays(); i++) {
		run_array *array = ArrayAt(i);

		for (int32 j = 0; j < array->CountRuns(); j++) {
			block_run &arrayRun = array->runs[j];
			if (run.AllocationGroup() != arrayRun.AllocationGroup())
				continue;

			if (run.Start() >= arrayRun.Start()
				&& run.Start() + run.Length()
					<= arrayRun.Start() + arrayRun.Length())
				return true;
		}
	}

	return false;
}


/*!	Adds the specified block_run to the current array.
	Note: it doesn't support overlapping runs - it must only be used
	with block_runs of length 1!
*/
bool
RunArrays::_AddRun(block_run &run)
{
	ASSERT(run.Length() == 1);

	// Be's BFS log replay routine can only deal with block_runs of size 1
	// A pity, isn't it? Too sad we have to be compatible.

	if (fLastArray == NULL || fLastArray->CountRuns() == fLastArray->MaxRuns())
		return false;

	fLastArray->Insert(run);
	fBlockCount++;
	return true;
}


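/*!	Allocates and initializes a new run_array of exactly one block, and
	makes it the array that subsequent runs are inserted into.
*/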
status_t
RunArrays::_AddArray()
{
	int32 blockSize = fJournal->GetVolume()->BlockSize();

	run_array *array = (run_array *)malloc(blockSize);
	if (array == NULL)
		return B_NO_MEMORY;

	if (fArrays.Push(array) != B_OK) {
		free(array);
		return B_NO_MEMORY;
	}

	array->Init(blockSize);
	fLastArray = array;
	return B_OK;
}


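/*!	Adds the given block to the log entry's run_arrays. If the block is
	already covered by one of the arrays, nothing happens; if the current
	array is full, a new one is allocated first.
*/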
status_t
RunArrays::Insert(off_t blockNumber)
{
	Volume *volume = fJournal->GetVolume();
	block_run run = volume->ToBlockRun(blockNumber);

	if (fLastArray != NULL) {
		// check if the block is already in the array
		if (_ContainsRun(run))
			return B_OK;
	}

	// insert block into array

	if (!_AddRun(run)) {
		// array is full
		if (_AddArray() != B_OK || !_AddRun(run))
			return B_NO_MEMORY;
	}

	return B_OK;
}


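/*!	Returns the run count of the largest array; this determines how many
	iovecs are needed to write a single array and its blocks to disk.
*/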
int32
RunArrays::MaxArrayLength()
{
	int32 max = 0;
	for (int32 i = 0; i < CountArrays(); i++) {
		if (ArrayAt(i)->CountRuns() > max)
			max = ArrayAt(i)->CountRuns();
	}

	return max;
}


//	#pragma mark - Journal


Journal::Journal(Volume *volume)
	:
	fVolume(volume),
	fLock("bfs journal"),
	fOwner(NULL),
	fLogSize(volume->Log().Length()),
	fMaxTransactionSize(fLogSize / 2 - 5),
	fUsed(0),
	fUnwrittenTransactions(0),
	fHasSubtransaction(false)
{
}


Journal::~Journal()
{
	FlushLogAndBlocks();
}


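/*!	Checks whether the log has to be replayed (i.e. the log start and end
	pointers in the super block differ), and replays it if necessary.
*/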
status_t
Journal::InitCheck()
{
	// TODO: this logic won't work whenever the size of the pending transaction
	//	equals the size of the log (happens with the original BFS only)
	if (fVolume->LogStart() != fVolume->LogEnd()) {
		if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY)
			FATAL(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n"));

		return ReplayLog();
	}

	return B_OK;
}


/*!	\brief Does a very basic consistency check of the run array.
	It checks the maximum run count as well as whether all of the runs fall
	within the volume.
*/
status_t
Journal::_CheckRunArray(const run_array *array)
{
	int32 maxRuns = run_array::MaxRuns(fVolume->BlockSize()) - 1;
		// the -1 works around an off-by-one bug in Be's BFS implementation,
		// same as in run_array::MaxRuns()
	if (array->MaxRuns() != maxRuns
		|| array->CountRuns() > maxRuns
		|| array->CountRuns() <= 0) {
		dprintf("run count: %d, array max: %d, max runs: %d\n",
			(int)array->CountRuns(), (int)array->MaxRuns(), (int)maxRuns);
		FATAL(("Log entry has broken header!\n"));
		return B_ERROR;
	}

	for (int32 i = 0; i < array->CountRuns(); i++) {
		if (fVolume->ValidateBlockRun(array->RunAt(i)) != B_OK)
			return B_ERROR;
	}

	PRINT(("Log entry has %ld entries\n", array->CountRuns()));
	return B_OK;
}

/*!	Replays an entry in the log.
	\a _start points to the entry in the log, and will be bumped to the next
	one if replaying succeeded.
*/
status_t
Journal::_ReplayRunArray(int32 *_start)
{
	PRINT(("ReplayRunArray(start = %ld)\n", *_start));

	off_t logOffset = fVolume->ToBlock(fVolume->Log());
	off_t firstBlockNumber = *_start % fLogSize;

	CachedBlock cachedArray(fVolume);

	const run_array *array = (const run_array *)cachedArray.SetTo(logOffset
		+ firstBlockNumber);
	if (array == NULL)
		return B_IO_ERROR;

	if (_CheckRunArray(array) < B_OK)
		return B_BAD_DATA;

	// First pass: check integrity of the blocks in the run array

	CachedBlock cached(fVolume);

	firstBlockNumber = (firstBlockNumber + 1) % fLogSize;
	off_t blockNumber = firstBlockNumber;
	int32 blockSize = fVolume->BlockSize();

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run &run = array->RunAt(index);

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8 *data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			// TODO: eventually check other well known offsets, like the
			// root and index dirs
			if (offset == 0) {
				// This log entry writes over the super block - check if
				// it's valid!
				if (Volume::CheckSuperBlock(data) != B_OK) {
					FATAL(("Log contains invalid super block!\n"));
					RETURN_ERROR(B_BAD_DATA);
				}
			}

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
		}
	}

	// Second pass: write back its blocks

	blockNumber = firstBlockNumber;
	int32 count = 1;

	for (int32 index = 0; index < array->CountRuns(); index++) {
		const block_run &run = array->RunAt(index);
		INFORM(("replay block run %u:%u:%u in log at %Ld!\n",
			(int)run.AllocationGroup(), run.Start(), run.Length(), blockNumber));

		off_t offset = fVolume->ToOffset(run);
		for (int32 i = 0; i < run.Length(); i++) {
			const uint8 *data = cached.SetTo(logOffset + blockNumber);
			if (data == NULL)
				RETURN_ERROR(B_IO_ERROR);

			ssize_t written = write_pos(fVolume->Device(), offset, data,
				blockSize);
			if (written != blockSize)
				RETURN_ERROR(B_IO_ERROR);

			blockNumber = (blockNumber + 1) % fLogSize;
			offset += blockSize;
			count++;
		}
	}

	*_start += count;
	return B_OK;
}


/*!	Replays all log entries - this will put the disk into a
	consistent and clean state if it was not correctly unmounted
	before.
	This method is called by Journal::InitCheck() if the log start
	and end pointers don't match.
*/
status_t
Journal::ReplayLog()
{
	INFORM(("Replay log, disk was not correctly unmounted...\n"));

	int32 start = fVolume->LogStart();
	int32 lastStart = -1;
	while (true) {
		// stop if the log is completely flushed
		if (start == fVolume->LogEnd())
			break;

		if (start == lastStart) {
			// strange, flushing the log hasn't changed the log_start pointer
			return B_ERROR;
		}
		lastStart = start;

		status_t status = _ReplayRunArray(&start);
		if (status < B_OK) {
			FATAL(("replaying log entry from %d failed: %s\n", (int)start,
				strerror(status)));
			return B_ERROR;
		}
		start = start % fLogSize;
	}

	PRINT(("replaying worked fine!\n"));
	fVolume->SuperBlock().log_start = fVolume->LogEnd();
	fVolume->LogStart() = fVolume->LogEnd();
	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;

	return fVolume->WriteSuperBlock();
}


/*!	This is a callback function that is called by the cache whenever
	all blocks of a transaction have been flushed to disk.
	This lets us keep track of completed transactions, and update
	the log start pointer as needed. Note that transactions may not be
	completed in the order they were written.
*/
void
Journal::_TransactionNotify(int32 transactionID, int32 event, void *_logEntry)
{
	LogEntry *logEntry = (LogEntry *)_logEntry;

	if (event != TRANSACTION_WRITTEN)
		return;

	PRINT(("Log entry %p has been finished, transaction ID = %ld\n", logEntry,
		transactionID));

	Journal *journal = logEntry->GetJournal();
	disk_super_block &superBlock = journal->fVolume->SuperBlock();
	bool update = false;

	// Set log_start pointer if possible...
	// TODO: this is not endian safe!

	journal->fEntriesLock.Lock();

	if (logEntry == journal->fEntries.First()) {
		LogEntry *next = journal->fEntries.GetNext(logEntry);
		if (next != NULL)
			superBlock.log_start = next->Start() % journal->fLogSize;
		else
			superBlock.log_start = journal->fVolume->LogEnd();

		update = true;
	}

	journal->fUsed -= logEntry->Length();
	journal->fEntries.Remove(logEntry);
	journal->fEntriesLock.Unlock();

	delete logEntry;

	// update the super block, and change the disk's state, if necessary

	if (update) {
		journal->fVolume->LogStart() = superBlock.log_start;

		if (superBlock.log_start == superBlock.log_end)
			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;

		status_t status = journal->fVolume->WriteSuperBlock();
		if (status != B_OK) {
			FATAL(("_TransactionNotify: could not write back super block: %s\n",
				strerror(status)));
		}
	}
}


/*!	Writes the blocks that are part of the current transaction into the log,
	and ends the current transaction.
	If the current transaction is too large to fit into the log, it will
	try to detach an existing sub-transaction.
*/
status_t
Journal::_WriteTransactionToLog()
{
	// TODO: in case of a failure, we need a backup plan like writing all
	//	changed blocks back to disk immediately (hello disk corruption!)

	bool detached = false;

	if (_TransactionSize() > fLogSize) {
		// The current transaction won't fit into the log anymore, try to
		// detach the current sub-transaction
		if (_HasSubTransaction() && cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID) < (int32)fLogSize) {
			detached = true;
		} else {
			// TODO: what are our options here?
			// a) abort the transaction - bad, because all changes are lost
			// b) carry out the changes, but don't use the log - even worse,
			//    as it potentially creates a corrupted disk.
			dprintf("transaction too large (%d blocks, %d main, log size %d)!\n",
				(int)_TransactionSize(), (int)cache_blocks_in_main_transaction(
				fVolume->BlockCache(), fTransactionID), (int)fLogSize);
			return B_BUFFER_OVERFLOW;
		}
	}

	fHasSubtransaction = false;

	int32 blockShift = fVolume->BlockShift();
	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
	off_t logStart = fVolume->LogEnd() % fLogSize;
	off_t logPosition = logStart;
	status_t status;

	// create run_array structures for all changed blocks

	RunArrays runArrays(this);

	off_t blockNumber;
	long cookie = 0;
	while (cache_next_block_in_transaction(fVolume->BlockCache(),
			fTransactionID, detached, &cookie, &blockNumber, NULL,
			NULL) == B_OK) {
		status = runArrays.Insert(blockNumber);
		if (status < B_OK) {
			FATAL(("filling log entry failed!"));
			return status;
		}
	}

	if (runArrays.CountBlocks() == 0) {
		// nothing has changed during this transaction
		if (detached) {
			fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
				fTransactionID, NULL, NULL);
			fUnwrittenTransactions = 1;
		} else {
			cache_end_transaction(fVolume->BlockCache(), fTransactionID, NULL,
				NULL);
			fUnwrittenTransactions = 0;
		}
		return B_OK;
	}

	// If necessary, flush the log, so that we have enough space for this
	// transaction
	if (runArrays.LogEntryLength() > FreeLogBlocks())
		cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

	// Write log entries to disk

	int32 maxVecs = runArrays.MaxArrayLength() + 1;
		// one extra for the index block

	iovec *vecs = (iovec *)malloc(sizeof(iovec) * maxVecs);
	if (vecs == NULL) {
		// TODO: write back log entries directly?
		return B_NO_MEMORY;
	}

	for (int32 k = 0; k < runArrays.CountArrays(); k++) {
		run_array *array = runArrays.ArrayAt(k);
		int32 index = 0, count = 1;
		int32 wrap = fLogSize - logStart;

		add_to_iovec(vecs, index, maxVecs, (void *)array, fVolume->BlockSize());

		// add block runs

		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run &run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				if (count >= wrap) {
					// We need to write back the first half of the entry
					// directly as the log wraps around
					if (writev_pos(fVolume->Device(), logOffset
						+ (logStart << blockShift), vecs, index) < 0)
						FATAL(("could not write log area!\n"));

					logPosition = logStart + count;
					logStart = 0;
					wrap = fLogSize;
					count = 0;
					index = 0;
				}

				// make blocks available in the cache
				const void *data = block_cache_get(fVolume->BlockCache(),
					blockNumber + j);
				if (data == NULL) {
					free(vecs);
					return B_IO_ERROR;
				}

				add_to_iovec(vecs, index, maxVecs, data, fVolume->BlockSize());
				count++;
			}
		}

		// write back the rest of the log entry
		if (count > 0) {
			logPosition = logStart + count;
			if (writev_pos(fVolume->Device(), logOffset
					+ (logStart << blockShift), vecs, index) < 0)
				FATAL(("could not write log area: %s!\n", strerror(errno)));
		}

		// release blocks again
		for (int32 i = 0; i < array->CountRuns(); i++) {
			const block_run &run = array->RunAt(i);
			off_t blockNumber = fVolume->ToBlock(run);

			for (int32 j = 0; j < run.Length(); j++) {
				block_cache_put(fVolume->BlockCache(), blockNumber + j);
			}
		}

		logStart = logPosition % fLogSize;
	}

	free(vecs);

	LogEntry *logEntry = new LogEntry(this, fVolume->LogEnd(),
		runArrays.LogEntryLength());
	if (logEntry == NULL) {
		FATAL(("no memory to allocate log entries!"));
		return B_NO_MEMORY;
	}

	// Update the log end pointer in the super block

	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
	fVolume->SuperBlock().log_end = logPosition;
	fVolume->LogEnd() = logPosition;

	status = fVolume->WriteSuperBlock();

	// We need to flush the drive's own cache here to ensure
	// disk consistency.
	// If that call fails, we can't do anything about it anyway.
	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);

	// at this point, we can finally end the transaction - we're in
	// a guaranteed valid state

	fEntriesLock.Lock();
	fEntries.Add(logEntry);
	fUsed += logEntry->Length();
	fEntriesLock.Unlock();

	if (detached) {
		fTransactionID = cache_detach_sub_transaction(fVolume->BlockCache(),
			fTransactionID, _TransactionNotify, logEntry);
		fUnwrittenTransactions = 1;
	} else {
		cache_end_transaction(fVolume->BlockCache(), fTransactionID,
			_TransactionNotify, logEntry);
		fUnwrittenTransactions = 0;
	}

	return status;
}


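/*!	Writes the current transaction (if any) to the log and flushes all
	blocks of the volume to disk. Does nothing if it is called from within
	a running transaction.
*/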
status_t
Journal::FlushLogAndBlocks()
{
	status_t status = fLock.Lock();
	if (status != B_OK)
		return status;

	if (fLock.OwnerCount() > 1) {
		// whoa, FlushLogAndBlocks() was called from inside a transaction
		fLock.Unlock();
		return B_OK;
	}

	// write the current log entry to disk

	if (fUnwrittenTransactions != 0 && _TransactionSize() != 0) {
		status = _WriteTransactionToLog();
		if (status < B_OK)
			FATAL(("writing current log entry failed: %s\n", strerror(status)));
	}

	status = fVolume->FlushDevice();

	fLock.Unlock();
	return status;
}


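/*!	Locks the journal for the given transaction. If this is the outermost
	lock, a new cache transaction is started - or a sub-transaction, if
	there are still unwritten transactions being batched.
*/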
status_t
Journal::Lock(Transaction *owner)
{
	status_t status = fLock.Lock();
	if (status != B_OK)
		return status;

/*	TODO:
	// if the last transaction is older than 2 secs, start a new one
	if (fTransactionsInEntry != 0 && system_time() - fTimestamp > 2000000L)
		WriteLogEntry();
*/

	if (fLock.OwnerCount() > 1) {
		// we'll just use the current transaction again
		return B_OK;
	}

	fOwner = owner;

	// TODO: we need a way to find out how big the current transaction is;
	//	we need to be able to detach the latest sub-transaction on demand,
	//	as well as to have some kind of fallback plan in case the
	//	sub-transaction itself grows bigger than the log.
	//	For that, it would be nice to have some call-back interface in the
	//	cache transaction API...

	if (fUnwrittenTransactions > 0) {
		// start a sub transaction
		cache_start_sub_transaction(fVolume->BlockCache(), fTransactionID);
		fHasSubtransaction = true;
	} else
		fTransactionID = cache_start_transaction(fVolume->BlockCache());

	if (fTransactionID < B_OK) {
		fLock.Unlock();
		return fTransactionID;
	}

	return B_OK;
}


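/*!	Unlocks the journal. If this is the outermost unlock, the current
	transaction is ended: depending on \a success, it is either aborted or
	batched/written to the log.
*/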
void
Journal::Unlock(Transaction *owner, bool success)
{
	if (fLock.OwnerCount() == 1) {
		// we only end the transaction if we would really unlock it
		// TODO: what about failing transactions that do not unlock?
		_TransactionDone(success);

		fTimestamp = system_time();
		fOwner = NULL;
	}

	fLock.Unlock();
}


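/*!	Returns the number of log blocks the current transaction would occupy,
	including the run_array index blocks needed to describe it.
*/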
uint32
Journal::_TransactionSize() const
{
	int32 count = cache_blocks_in_transaction(fVolume->BlockCache(),
		fTransactionID);
	if (count < 0)
		return 0;

	// take the number of array blocks in this transaction into account
	uint32 maxRuns = run_array::MaxRuns(fVolume->BlockSize());
	uint32 arrayBlocks = (count + maxRuns - 1) / maxRuns;
	return count + arrayBlocks;
}


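/*!	Ends the current transaction: on failure, the (sub-)transaction is
	aborted; on success, small transactions are batched and only written to
	the log once they reach the maximum transaction size.
*/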
status_t
Journal::_TransactionDone(bool success)
{
	if (!success) {
		if (_HasSubTransaction())
			cache_abort_sub_transaction(fVolume->BlockCache(), fTransactionID);
		else
			cache_abort_transaction(fVolume->BlockCache(), fTransactionID);

		return B_OK;
	}

	// Up to a maximum size, we will just batch several
	// transactions together to improve speed
	uint32 size = _TransactionSize();
	if (size < fMaxTransactionSize) {
		// Flush the log from time to time, so that we have enough space
		// for this transaction
		if (size > FreeLogBlocks())
			cache_sync_transaction(fVolume->BlockCache(), fTransactionID);

		fUnwrittenTransactions++;
		return B_OK;
	}

	return _WriteTransactionToLog();
}


//	#pragma mark - Transaction


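/*!	Starts a transaction on the journal of the volume that contains
	\a refBlock - unless this Transaction object has already been started.
*/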
status_t
Transaction::Start(Volume *volume, off_t refBlock)
{
	// has it already been started?
	if (fJournal != NULL)
		return B_OK;

	fJournal = volume->GetJournal(refBlock);
	if (fJournal != NULL && fJournal->Lock(this) == B_OK)
		return B_OK;

	fJournal = NULL;
	return B_ERROR;
}
