xref: /haiku/src/add-ons/kernel/file_systems/bfs/Journal.cpp (revision d5cd5d63ff0ad395989db6cf4841a64d5b545d1d)
1 /* Journal - transaction and logging
2 **
3 ** Initial version by Axel Dörfler, axeld@pinc-software.de
4 ** This file may be used under the terms of the OpenBeOS License.
5 */
6 
7 
8 #include "cpp.h"
9 #include "Journal.h"
10 #include "Inode.h"
11 #include "Debug.h"
12 
13 
14 Journal::Journal(Volume *volume)
15 	:
16 	fVolume(volume),
17 	fLock("bfs journal"),
18 	fOwner(NULL),
19 	fOwningThread(-1),
20 	fArray(volume->BlockSize()),
21 	fLogSize(volume->Log().length),
22 	fMaxTransactionSize(fLogSize / 4 - 5),
23 	fUsed(0),
24 	fTransactionsInEntry(0)
25 {
26 	if (fMaxTransactionSize > fLogSize / 2)
27 		fMaxTransactionSize = fLogSize / 2 - 5;
28 }
29 
30 
31 Journal::~Journal()
32 {
33 	FlushLogAndBlocks();
34 }
35 
36 
37 status_t
38 Journal::InitCheck()
39 {
40 	if (fVolume->LogStart() != fVolume->LogEnd()) {
41 		if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY)
42 			FATAL(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n"));
43 
44 		return ReplayLog();
45 	}
46 
47 	return B_OK;
48 }
49 
50 
51 status_t
52 Journal::CheckLogEntry(int32 count, off_t *array)
53 {
54 	// ToDo: check log entry integrity (block numbers and entry size)
55 	PRINT(("Log entry has %ld entries (%Ld)\n", count, array[0]));
56 	return B_OK;
57 }
58 
59 
60 status_t
61 Journal::ReplayLogEntry(int32 *_start)
62 {
63 	PRINT(("ReplayLogEntry(start = %ld)\n", *_start));
64 
65 	off_t logOffset = fVolume->ToBlock(fVolume->Log());
66 	off_t arrayBlock = (*_start % fLogSize) + fVolume->ToBlock(fVolume->Log());
67 	int32 blockSize = fVolume->BlockSize();
68 	int32 count = 1, valuesInBlock = blockSize / sizeof(off_t);
69 	int32 numArrayBlocks;
70 	off_t blockNumber = 0;
71 	bool first = true;
72 
73 	CachedBlock cached(fVolume);
74 	while (count > 0) {
75 		off_t *array = (off_t *)cached.SetTo(arrayBlock);
76 		if (array == NULL)
77 			return B_IO_ERROR;
78 
79 		int32 index = 0;
80 		if (first) {
81 			if (array[0] < 1 || array[0] >= fLogSize)
82 				return B_BAD_DATA;
83 
84 			count = array[0];
85 			first = false;
86 
87 			numArrayBlocks = ((count + 1) * sizeof(off_t) + blockSize - 1) / blockSize;
88 			blockNumber = (*_start + numArrayBlocks) % fLogSize;
89 				// first real block in this log entry
90 			*_start += count;
91 			index++;
92 				// the first entry in the first block is the number
93 				// of blocks in that log entry
94 		}
95 		(*_start)++;
96 
97 		if (CheckLogEntry(count, array + 1) < B_OK)
98 			return B_BAD_DATA;
99 
100 		CachedBlock cachedCopy(fVolume);
101 		for (; index < valuesInBlock && count-- > 0; index++) {
102 			PRINT(("replay block %Ld in log at %Ld!\n", array[index], blockNumber));
103 
104 			uint8 *copy = cachedCopy.SetTo(logOffset + blockNumber);
105 			if (copy == NULL)
106 				RETURN_ERROR(B_IO_ERROR);
107 
108 			ssize_t written = write_pos(fVolume->Device(),
109 						array[index] << fVolume->BlockShift(), copy, blockSize);
110 			if (written != blockSize)
111 				RETURN_ERROR(B_IO_ERROR);
112 
113 			blockNumber = (blockNumber + 1) % fLogSize;
114 		}
115 		arrayBlock++;
116 		if (arrayBlock > fVolume->ToBlock(fVolume->Log()) + fLogSize)
117 			arrayBlock = fVolume->ToBlock(fVolume->Log());
118 	}
119 	return B_OK;
120 }
121 
122 
123 /**	Replays all log entries - this will put the disk into a
124  *	consistent and clean state, if it was not correctly unmounted
125  *	before.
126  *	This method is called by Journal::InitCheck() if the log start
127  *	and end pointer don't match.
128  */
129 
130 status_t
131 Journal::ReplayLog()
132 {
133 	INFORM(("Replay log, disk was not correctly unmounted...\n"));
134 
135 	int32 start = fVolume->LogStart();
136 	int32 lastStart = -1;
137 	while (true) {
138 		// stop if the log is completely flushed
139 		if (start == fVolume->LogEnd())
140 			break;
141 
142 		if (start == lastStart) {
143 			// strange, flushing the log hasn't changed the log_start pointer
144 			return B_ERROR;
145 		}
146 		lastStart = start;
147 
148 		status_t status = ReplayLogEntry(&start);
149 		if (status < B_OK) {
150 			FATAL(("replaying log entry from %ld failed: %s\n", start, strerror(status)));
151 			return B_ERROR;
152 		}
153 		start = start % fLogSize;
154 	}
155 
156 	PRINT(("replaying worked fine!\n"));
157 	fVolume->SuperBlock().log_start = fVolume->LogEnd();
158 	fVolume->LogStart() = fVolume->LogEnd();
159 	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN;
160 
161 	return fVolume->WriteSuperBlock();
162 }
163 
164 
165 /**	This is a callback function that is called by the cache, whenever
166  *	a block is flushed to disk that was updated as part of a transaction.
167  *	This is necessary to keep track of completed transactions, to be
168  *	able to update the log start pointer.
169  */
170 
171 void
172 Journal::blockNotify(off_t blockNumber, size_t numBlocks, void *arg)
173 {
174 	log_entry *logEntry = (log_entry *)arg;
175 
176 	logEntry->cached_blocks -= numBlocks;
177 	if (logEntry->cached_blocks > 0) {
178 		// nothing to do yet...
179 		return;
180 	}
181 
182 	Journal *journal = logEntry->journal;
183 	disk_super_block &superBlock = journal->fVolume->SuperBlock();
184 	bool update = false;
185 
186 	// Set log_start pointer if possible...
187 
188 	if (logEntry == journal->fEntries.head) {
189 		if (logEntry->Next() != NULL) {
190 			int32 length = logEntry->next->start - logEntry->start;
191 			superBlock.log_start = (superBlock.log_start + length) % journal->fLogSize;
192 		} else
193 			superBlock.log_start = journal->fVolume->LogEnd();
194 
195 		update = true;
196 	}
197 	journal->fUsed -= logEntry->length;
198 
199 	journal->fEntriesLock.Lock();
200 	logEntry->Remove();
201 	journal->fEntriesLock.Unlock();
202 
203 	free(logEntry);
204 
205 	// update the super block, and change the disk's state, if necessary
206 
207 	if (update) {
208 		journal->fVolume->LogStart() = superBlock.log_start;
209 
210 		if (superBlock.log_start == superBlock.log_end)
211 			superBlock.flags = SUPER_BLOCK_DISK_CLEAN;
212 
213 		journal->fVolume->WriteSuperBlock();
214 	}
215 }
216 
217 
218 status_t
219 Journal::WriteLogEntry()
220 {
221 	fTransactionsInEntry = 0;
222 	fHasChangedBlocks = false;
223 
224 	sorted_array *array = fArray.Array();
225 	if (array == NULL || array->count == 0)
226 		return B_OK;
227 
228 	// Make sure there is enough space in the log.
229 	// If that fails for whatever reason, panic!
230 	force_cache_flush(fVolume->Device(), false);
231 	int32 tries = fLogSize / 2 + 1;
232 	while (TransactionSize() > FreeLogBlocks() && tries-- > 0)
233 		force_cache_flush(fVolume->Device(), true);
234 
235 	if (tries <= 0) {
236 		fVolume->Panic();
237 		return B_BAD_DATA;
238 	}
239 
240 	int32 blockShift = fVolume->BlockShift();
241 	off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift;
242 	off_t logStart = fVolume->LogEnd();
243 	off_t logPosition = logStart % fLogSize;
244 
245 	// Write disk block array
246 
247 	uint8 *arrayBlock = (uint8 *)array;
248 
249 	for (int32 size = fArray.BlocksUsed(); size-- > 0;) {
250 		write_pos(fVolume->Device(), logOffset + (logPosition << blockShift),
251 			arrayBlock, fVolume->BlockSize());
252 
253 		logPosition = (logPosition + 1) % fLogSize;
254 		arrayBlock += fVolume->BlockSize();
255 	}
256 
257 	// Write logged blocks into the log
258 
259 	CachedBlock cached(fVolume);
260 	for (int32 i = 0;i < array->count;i++) {
261 		// ToDo: combine blocks if possible (using iovecs)!
262 
263 		uint8 *block = cached.SetTo(array->values[i]);
264 		if (block == NULL)
265 			return B_IO_ERROR;
266 
267 		write_pos(fVolume->Device(), logOffset + (logPosition << blockShift),
268 			block, fVolume->BlockSize());
269 		logPosition = (logPosition + 1) % fLogSize;
270 	}
271 
272 	log_entry *logEntry = (log_entry *)malloc(sizeof(log_entry));
273 	if (logEntry != NULL) {
274 		logEntry->start = logStart;
275 		logEntry->length = TransactionSize();
276 		logEntry->cached_blocks = array->count;
277 		logEntry->journal = this;
278 
279 		fEntriesLock.Lock();
280 		fEntries.Add(logEntry);
281 		fEntriesLock.Unlock();
282 
283 		fCurrent = logEntry;
284 		fUsed += logEntry->length;
285 
286 		set_blocks_info(fVolume->Device(), &array->values[0], array->count, blockNotify, logEntry);
287 	}
288 
289 	// If the log goes to the next round (the log is written as a
290 	// circular buffer), all blocks will be flushed out which is
291 	// possible because we don't have any locked blocks at this
292 	// point.
293 	if (logPosition < logStart)
294 		fVolume->FlushDevice();
295 
296 	// We need to flush the drives own cache here to ensure
297 	// disk consistency.
298 	// If that call fails, we can't do anything about it anyway
299 	ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE);
300 
301 	fArray.MakeEmpty();
302 
303 	// Update the log end pointer in the super block
304 	fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY;
305 	fVolume->SuperBlock().log_end = logPosition;
306 	fVolume->LogEnd() = logPosition;
307 
308 	return fVolume->WriteSuperBlock();
309 }
310 
311 
312 status_t
313 Journal::FlushLogAndBlocks()
314 {
315 	status_t status = Lock((Transaction *)this);
316 	if (status != B_OK)
317 		return status;
318 
319 	// write the current log entry to disk
320 
321 	if (TransactionSize() != 0) {
322 		status = WriteLogEntry();
323 		if (status < B_OK)
324 			FATAL(("writing current log entry failed: %s\n", strerror(status)));
325 	}
326 	status = fVolume->FlushDevice();
327 
328 	Unlock((Transaction *)this, true);
329 	return status;
330 }
331 
332 
333 status_t
334 Journal::Lock(Transaction *owner)
335 {
336 	if (owner == fOwner)
337 		return B_OK;
338 
339 	status_t status = fLock.Lock();
340 	if (status == B_OK) {
341 		fOwner = owner;
342 		fOwningThread = find_thread(NULL);
343 	}
344 
345 	// if the last transaction is older than 2 secs, start a new one
346 	if (fTransactionsInEntry != 0 && system_time() - fTimestamp > 2000000L)
347 		WriteLogEntry();
348 
349 	return B_OK;
350 }
351 
352 
353 void
354 Journal::Unlock(Transaction *owner, bool success)
355 {
356 	if (owner != fOwner)
357 		return;
358 
359 	TransactionDone(success);
360 
361 	fTimestamp = system_time();
362 	fOwner = NULL;
363 	fOwningThread = -1;
364 	fLock.Unlock();
365 }
366 
367 
368 status_t
369 Journal::TransactionDone(bool success)
370 {
371 	if (!success && fTransactionsInEntry == 0) {
372 		// we can safely abort the transaction
373 		sorted_array *array = fArray.Array();
374 		if (array != NULL) {
375 			// release the lock for all blocks in the array (we don't need
376 			// to be notified when they are actually written to disk)
377 			for (int32 i = 0; i < array->count; i++)
378 				release_block(fVolume->Device(), array->values[i]);
379 		}
380 
381 		return B_OK;
382 	}
383 
384 	// Up to a maximum size, we will just batch several
385 	// transactions together to improve speed
386 	if (TransactionSize() < fMaxTransactionSize) {
387 		fTransactionsInEntry++;
388 		fHasChangedBlocks = false;
389 
390 		return B_OK;
391 	}
392 
393 	return WriteLogEntry();
394 }
395 
396 
397 status_t
398 Journal::LogBlocks(off_t blockNumber, const uint8 *buffer, size_t numBlocks)
399 {
400 	// ToDo: that's for now - we should change the log file size here
401 	if (TransactionSize() + numBlocks + 1 > fLogSize)
402 		return B_DEVICE_FULL;
403 
404 	fHasChangedBlocks = true;
405 	int32 blockSize = fVolume->BlockSize();
406 
407 	for (;numBlocks-- > 0; blockNumber++, buffer += blockSize) {
408 		if (fArray.Find(blockNumber) >= 0) {
409 			// The block is already in the log, so just update its data
410 			// Note, this is only necessary if this method is called with a buffer
411 			// different from the cached block buffer - which is unlikely but
412 			// we'll make sure this way (costs one cache lookup, though).
413 			status_t status = cached_write(fVolume->Device(), blockNumber, buffer, 1, blockSize);
414 			if (status < B_OK)
415 				return status;
416 
417 			continue;
418 		}
419 
420 		// Insert the block into the transaction's array, and write the changes
421 		// back into the locked cache buffer
422 		fArray.Insert(blockNumber);
423 		status_t status = cached_write_locked(fVolume->Device(), blockNumber, buffer, 1, blockSize);
424 		if (status < B_OK)
425 			return status;
426 	}
427 
428 	// If necessary, flush the log, so that we have enough space for this transaction
429 	if (TransactionSize() > FreeLogBlocks())
430 		force_cache_flush(fVolume->Device(), true);
431 
432 	return B_OK;
433 }
434 
435 
436 //	#pragma mark -
437 
438 
439 status_t
440 Transaction::Start(Volume *volume, off_t refBlock)
441 {
442 	// has it already been started?
443 	if (fJournal != NULL)
444 		return B_OK;
445 
446 	fJournal = volume->GetJournal(refBlock);
447 	if (fJournal != NULL && fJournal->Lock(this) == B_OK)
448 		return B_OK;
449 
450 	fJournal = NULL;
451 	return B_ERROR;
452 }
453 
454