1 /* Journal - transaction and logging 2 ** 3 ** Initial version by Axel Dörfler, axeld@pinc-software.de 4 ** This file may be used under the terms of the OpenBeOS License. 5 */ 6 7 8 #include "Journal.h" 9 #include "Inode.h" 10 #include "Debug.h" 11 12 #include <kernel_cpp.h> 13 14 15 Journal::Journal(Volume *volume) 16 : 17 fVolume(volume), 18 fLock("bfs journal"), 19 fOwner(NULL), 20 fArray(volume->BlockSize()), 21 fLogSize(volume->Log().length), 22 fMaxTransactionSize(fLogSize / 4 - 5), 23 fUsed(0), 24 fTransactionsInEntry(0) 25 { 26 if (fMaxTransactionSize > fLogSize / 2) 27 fMaxTransactionSize = fLogSize / 2 - 5; 28 } 29 30 31 Journal::~Journal() 32 { 33 FlushLogAndBlocks(); 34 } 35 36 37 status_t 38 Journal::InitCheck() 39 { 40 if (fVolume->LogStart() != fVolume->LogEnd()) { 41 if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) 42 FATAL(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n")); 43 44 return ReplayLog(); 45 } 46 47 return B_OK; 48 } 49 50 51 status_t 52 Journal::CheckLogEntry(int32 count, off_t *array) 53 { 54 // ToDo: check log entry integrity (block numbers and entry size) 55 PRINT(("Log entry has %ld entries (%Ld)\n", count, array[0])); 56 return B_OK; 57 } 58 59 60 status_t 61 Journal::ReplayLogEntry(int32 *_start) 62 { 63 PRINT(("ReplayLogEntry(start = %ld)\n", *_start)); 64 65 off_t logOffset = fVolume->ToBlock(fVolume->Log()); 66 off_t arrayBlock = (*_start % fLogSize) + fVolume->ToBlock(fVolume->Log()); 67 int32 blockSize = fVolume->BlockSize(); 68 int32 count = 1, valuesInBlock = blockSize / sizeof(off_t); 69 int32 numArrayBlocks; 70 off_t blockNumber = 0; 71 bool first = true; 72 73 CachedBlock cached(fVolume); 74 while (count > 0) { 75 off_t *array = (off_t *)cached.SetTo(arrayBlock); 76 if (array == NULL) 77 return B_IO_ERROR; 78 79 int32 index = 0; 80 if (first) { 81 if (array[0] < 1 || array[0] >= fLogSize) 82 return B_BAD_DATA; 83 84 count = array[0]; 85 first = false; 86 87 numArrayBlocks = ((count + 1) * sizeof(off_t) + blockSize - 1) / blockSize; 88 blockNumber = (*_start + numArrayBlocks) % fLogSize; 89 // first real block in this log entry 90 *_start += count; 91 index++; 92 // the first entry in the first block is the number 93 // of blocks in that log entry 94 } 95 (*_start)++; 96 97 if (CheckLogEntry(count, array + 1) < B_OK) 98 return B_BAD_DATA; 99 100 CachedBlock cachedCopy(fVolume); 101 for (; index < valuesInBlock && count-- > 0; index++) { 102 PRINT(("replay block %Ld in log at %Ld!\n", array[index], blockNumber)); 103 104 uint8 *copy = cachedCopy.SetTo(logOffset + blockNumber); 105 if (copy == NULL) 106 RETURN_ERROR(B_IO_ERROR); 107 108 ssize_t written = write_pos(fVolume->Device(), 109 array[index] << fVolume->BlockShift(), copy, blockSize); 110 if (written != blockSize) 111 RETURN_ERROR(B_IO_ERROR); 112 113 blockNumber = (blockNumber + 1) % fLogSize; 114 } 115 arrayBlock++; 116 if (arrayBlock > fVolume->ToBlock(fVolume->Log()) + fLogSize) 117 arrayBlock = fVolume->ToBlock(fVolume->Log()); 118 } 119 return B_OK; 120 } 121 122 123 /** Replays all log entries - this will put the disk into a 124 * consistent and clean state, if it was not correctly unmounted 125 * before. 126 * This method is called by Journal::InitCheck() if the log start 127 * and end pointer don't match. 128 */ 129 130 status_t 131 Journal::ReplayLog() 132 { 133 INFORM(("Replay log, disk was not correctly unmounted...\n")); 134 135 int32 start = fVolume->LogStart(); 136 int32 lastStart = -1; 137 while (true) { 138 // stop if the log is completely flushed 139 if (start == fVolume->LogEnd()) 140 break; 141 142 if (start == lastStart) { 143 // strange, flushing the log hasn't changed the log_start pointer 144 return B_ERROR; 145 } 146 lastStart = start; 147 148 status_t status = ReplayLogEntry(&start); 149 if (status < B_OK) { 150 FATAL(("replaying log entry from %ld failed: %s\n", start, strerror(status))); 151 return B_ERROR; 152 } 153 start = start % fLogSize; 154 } 155 156 PRINT(("replaying worked fine!\n")); 157 fVolume->SuperBlock().log_start = fVolume->LogEnd(); 158 fVolume->LogStart() = fVolume->LogEnd(); 159 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN; 160 161 return fVolume->WriteSuperBlock(); 162 } 163 164 165 /** This is a callback function that is called by the cache, whenever 166 * a block is flushed to disk that was updated as part of a transaction. 167 * This is necessary to keep track of completed transactions, to be 168 * able to update the log start pointer. 169 */ 170 171 void 172 Journal::blockNotify(off_t blockNumber, size_t numBlocks, void *arg) 173 { 174 log_entry *logEntry = (log_entry *)arg; 175 176 logEntry->cached_blocks -= numBlocks; 177 if (logEntry->cached_blocks > 0) { 178 // nothing to do yet... 179 return; 180 } 181 182 Journal *journal = logEntry->journal; 183 disk_super_block &superBlock = journal->fVolume->SuperBlock(); 184 bool update = false; 185 186 // Set log_start pointer if possible... 187 188 if (logEntry == journal->fEntries.head) { 189 if (logEntry->Next() != NULL) { 190 int32 length = logEntry->next->start - logEntry->start; 191 superBlock.log_start = (superBlock.log_start + length) % journal->fLogSize; 192 } else 193 superBlock.log_start = journal->fVolume->LogEnd(); 194 195 update = true; 196 } 197 journal->fUsed -= logEntry->length; 198 199 journal->fEntriesLock.Lock(); 200 logEntry->Remove(); 201 journal->fEntriesLock.Unlock(); 202 203 free(logEntry); 204 205 // update the super block, and change the disk's state, if necessary 206 207 if (update) { 208 journal->fVolume->LogStart() = superBlock.log_start; 209 210 if (superBlock.log_start == superBlock.log_end) 211 superBlock.flags = SUPER_BLOCK_DISK_CLEAN; 212 213 journal->fVolume->WriteSuperBlock(); 214 } 215 } 216 217 218 status_t 219 Journal::WriteLogEntry() 220 { 221 fTransactionsInEntry = 0; 222 fHasChangedBlocks = false; 223 224 sorted_array *array = fArray.Array(); 225 if (array == NULL || array->count == 0) 226 return B_OK; 227 228 // Make sure there is enough space in the log. 229 // If that fails for whatever reason, panic! 230 force_cache_flush(fVolume->Device(), false); 231 int32 tries = fLogSize / 2 + 1; 232 while (TransactionSize() > FreeLogBlocks() && tries-- > 0) 233 force_cache_flush(fVolume->Device(), true); 234 235 if (tries <= 0) { 236 fVolume->Panic(); 237 return B_BAD_DATA; 238 } 239 240 int32 blockShift = fVolume->BlockShift(); 241 off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift; 242 off_t logStart = fVolume->LogEnd(); 243 off_t logPosition = logStart % fLogSize; 244 245 // Write disk block array 246 247 uint8 *arrayBlock = (uint8 *)array; 248 249 for (int32 size = fArray.BlocksUsed(); size-- > 0;) { 250 write_pos(fVolume->Device(), logOffset + (logPosition << blockShift), 251 arrayBlock, fVolume->BlockSize()); 252 253 logPosition = (logPosition + 1) % fLogSize; 254 arrayBlock += fVolume->BlockSize(); 255 } 256 257 // Write logged blocks into the log 258 259 CachedBlock cached(fVolume); 260 for (int32 i = 0;i < array->count;i++) { 261 // ToDo: combine blocks if possible (using iovecs)! 262 263 uint8 *block = cached.SetTo(array->values[i]); 264 if (block == NULL) 265 return B_IO_ERROR; 266 267 write_pos(fVolume->Device(), logOffset + (logPosition << blockShift), 268 block, fVolume->BlockSize()); 269 logPosition = (logPosition + 1) % fLogSize; 270 } 271 272 log_entry *logEntry = (log_entry *)malloc(sizeof(log_entry)); 273 if (logEntry != NULL) { 274 logEntry->start = logStart; 275 logEntry->length = TransactionSize(); 276 logEntry->cached_blocks = array->count; 277 logEntry->journal = this; 278 279 fEntriesLock.Lock(); 280 fEntries.Add(logEntry); 281 fEntriesLock.Unlock(); 282 283 fCurrent = logEntry; 284 fUsed += logEntry->length; 285 286 set_blocks_info(fVolume->Device(), &array->values[0], array->count, blockNotify, logEntry); 287 } 288 289 // If the log goes to the next round (the log is written as a 290 // circular buffer), all blocks will be flushed out which is 291 // possible because we don't have any locked blocks at this 292 // point. 293 if (logPosition < logStart) 294 fVolume->FlushDevice(); 295 296 // We need to flush the drives own cache here to ensure 297 // disk consistency. 298 // If that call fails, we can't do anything about it anyway 299 ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE); 300 301 fArray.MakeEmpty(); 302 303 // Update the log end pointer in the super block 304 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY; 305 fVolume->SuperBlock().log_end = logPosition; 306 fVolume->LogEnd() = logPosition; 307 308 return fVolume->WriteSuperBlock(); 309 } 310 311 312 status_t 313 Journal::FlushLogAndBlocks() 314 { 315 status_t status = Lock((Transaction *)this); 316 if (status != B_OK) 317 return status; 318 319 // write the current log entry to disk 320 321 if (TransactionSize() != 0) { 322 status = WriteLogEntry(); 323 if (status < B_OK) 324 FATAL(("writing current log entry failed: %s\n", strerror(status))); 325 } 326 status = fVolume->FlushDevice(); 327 328 Unlock((Transaction *)this, true); 329 return status; 330 } 331 332 333 status_t 334 Journal::Lock(Transaction *owner) 335 { 336 if (owner == fOwner) 337 return B_OK; 338 339 status_t status = fLock.Lock(); 340 if (status == B_OK) 341 fOwner = owner; 342 343 // if the last transaction is older than 2 secs, start a new one 344 if (fTransactionsInEntry != 0 && system_time() - fTimestamp > 2000000L) 345 WriteLogEntry(); 346 347 return B_OK; 348 } 349 350 351 void 352 Journal::Unlock(Transaction *owner, bool success) 353 { 354 if (owner != fOwner) 355 return; 356 357 TransactionDone(success); 358 359 fTimestamp = system_time(); 360 fOwner = NULL; 361 fLock.Unlock(); 362 } 363 364 365 /** If there is a current transaction that the current thread has 366 * started, this function will give you access to it. 367 */ 368 369 Transaction * 370 Journal::CurrentTransaction() 371 { 372 if (fLock.LockWithTimeout(0) != B_OK) 373 return NULL; 374 375 Transaction *owner = fOwner; 376 fLock.Unlock(); 377 378 return owner; 379 } 380 381 382 status_t 383 Journal::TransactionDone(bool success) 384 { 385 if (!success && fTransactionsInEntry == 0) { 386 // we can safely abort the transaction 387 sorted_array *array = fArray.Array(); 388 if (array != NULL) { 389 // release the lock for all blocks in the array (we don't need 390 // to be notified when they are actually written to disk) 391 for (int32 i = 0; i < array->count; i++) 392 release_block(fVolume->Device(), array->values[i]); 393 } 394 395 return B_OK; 396 } 397 398 // Up to a maximum size, we will just batch several 399 // transactions together to improve speed 400 if (TransactionSize() < fMaxTransactionSize) { 401 fTransactionsInEntry++; 402 fHasChangedBlocks = false; 403 404 return B_OK; 405 } 406 407 return WriteLogEntry(); 408 } 409 410 411 status_t 412 Journal::LogBlocks(off_t blockNumber, const uint8 *buffer, size_t numBlocks) 413 { 414 // ToDo: that's for now - we should change the log file size here 415 if (TransactionSize() + numBlocks + 1 > fLogSize) 416 return B_DEVICE_FULL; 417 418 fHasChangedBlocks = true; 419 int32 blockSize = fVolume->BlockSize(); 420 421 for (;numBlocks-- > 0; blockNumber++, buffer += blockSize) { 422 if (fArray.Find(blockNumber) >= 0) { 423 // The block is already in the log, so just update its data 424 // Note, this is only necessary if this method is called with a buffer 425 // different from the cached block buffer - which is unlikely but 426 // we'll make sure this way (costs one cache lookup, though). 427 status_t status = cached_write(fVolume->Device(), blockNumber, buffer, 1, blockSize); 428 if (status < B_OK) 429 return status; 430 431 continue; 432 } 433 434 // Insert the block into the transaction's array, and write the changes 435 // back into the locked cache buffer 436 fArray.Insert(blockNumber); 437 status_t status = cached_write_locked(fVolume->Device(), blockNumber, buffer, 1, blockSize); 438 if (status < B_OK) 439 return status; 440 } 441 442 // If necessary, flush the log, so that we have enough space for this transaction 443 if (TransactionSize() > FreeLogBlocks()) 444 force_cache_flush(fVolume->Device(), true); 445 446 return B_OK; 447 } 448 449 450 // #pragma mark - 451 452 453 status_t 454 Transaction::Start(Volume *volume, off_t refBlock) 455 { 456 // has it already been started? 457 if (fJournal != NULL) 458 return B_OK; 459 460 fJournal = volume->GetJournal(refBlock); 461 if (fJournal != NULL && fJournal->Lock(this) == B_OK) 462 return B_OK; 463 464 fJournal = NULL; 465 return B_ERROR; 466 } 467 468