1 /* Journal - transaction and logging 2 ** 3 ** Initial version by Axel Dörfler, axeld@pinc-software.de 4 ** This file may be used under the terms of the OpenBeOS License. 5 */ 6 7 8 #include "cpp.h" 9 #include "Journal.h" 10 #include "Inode.h" 11 #include "Debug.h" 12 13 14 Journal::Journal(Volume *volume) 15 : 16 fVolume(volume), 17 fLock("bfs journal"), 18 fOwner(NULL), 19 fOwningThread(-1), 20 fArray(volume->BlockSize()), 21 fLogSize(volume->Log().length), 22 fMaxTransactionSize(fLogSize / 4 - 5), 23 fUsed(0), 24 fTransactionsInEntry(0) 25 { 26 if (fMaxTransactionSize > fLogSize / 2) 27 fMaxTransactionSize = fLogSize / 2 - 5; 28 } 29 30 31 Journal::~Journal() 32 { 33 FlushLogAndBlocks(); 34 } 35 36 37 status_t 38 Journal::InitCheck() 39 { 40 if (fVolume->LogStart() != fVolume->LogEnd()) { 41 if (fVolume->SuperBlock().flags != SUPER_BLOCK_DISK_DIRTY) 42 FATAL(("log_start and log_end differ, but disk is marked clean - trying to replay log...\n")); 43 44 return ReplayLog(); 45 } 46 47 return B_OK; 48 } 49 50 51 status_t 52 Journal::CheckLogEntry(int32 count, off_t *array) 53 { 54 // ToDo: check log entry integrity (block numbers and entry size) 55 PRINT(("Log entry has %ld entries (%Ld)\n", count, array[0])); 56 return B_OK; 57 } 58 59 60 status_t 61 Journal::ReplayLogEntry(int32 *_start) 62 { 63 PRINT(("ReplayLogEntry(start = %ld)\n", *_start)); 64 65 off_t logOffset = fVolume->ToBlock(fVolume->Log()); 66 off_t arrayBlock = (*_start % fLogSize) + fVolume->ToBlock(fVolume->Log()); 67 int32 blockSize = fVolume->BlockSize(); 68 int32 count = 1, valuesInBlock = blockSize / sizeof(off_t); 69 int32 numArrayBlocks; 70 off_t blockNumber = 0; 71 bool first = true; 72 73 CachedBlock cached(fVolume); 74 while (count > 0) { 75 off_t *array = (off_t *)cached.SetTo(arrayBlock); 76 if (array == NULL) 77 return B_IO_ERROR; 78 79 int32 index = 0; 80 if (first) { 81 if (array[0] < 1 || array[0] >= fLogSize) 82 return B_BAD_DATA; 83 84 count = array[0]; 85 first = false; 86 87 numArrayBlocks = ((count + 1) * sizeof(off_t) + blockSize - 1) / blockSize; 88 blockNumber = (*_start + numArrayBlocks) % fLogSize; 89 // first real block in this log entry 90 *_start += count; 91 index++; 92 // the first entry in the first block is the number 93 // of blocks in that log entry 94 } 95 (*_start)++; 96 97 if (CheckLogEntry(count, array + 1) < B_OK) 98 return B_BAD_DATA; 99 100 CachedBlock cachedCopy(fVolume); 101 for (; index < valuesInBlock && count-- > 0; index++) { 102 PRINT(("replay block %Ld in log at %Ld!\n", array[index], blockNumber)); 103 104 uint8 *copy = cachedCopy.SetTo(logOffset + blockNumber); 105 if (copy == NULL) 106 RETURN_ERROR(B_IO_ERROR); 107 108 ssize_t written = write_pos(fVolume->Device(), 109 array[index] << fVolume->BlockShift(), copy, blockSize); 110 if (written != blockSize) 111 RETURN_ERROR(B_IO_ERROR); 112 113 blockNumber = (blockNumber + 1) % fLogSize; 114 } 115 arrayBlock++; 116 if (arrayBlock > fVolume->ToBlock(fVolume->Log()) + fLogSize) 117 arrayBlock = fVolume->ToBlock(fVolume->Log()); 118 } 119 return B_OK; 120 } 121 122 123 /** Replays all log entries - this will put the disk into a 124 * consistent and clean state, if it was not correctly unmounted 125 * before. 126 * This method is called by Journal::InitCheck() if the log start 127 * and end pointer don't match. 128 */ 129 130 status_t 131 Journal::ReplayLog() 132 { 133 INFORM(("Replay log, disk was not correctly unmounted...\n")); 134 135 int32 start = fVolume->LogStart(); 136 int32 lastStart = -1; 137 while (true) { 138 // stop if the log is completely flushed 139 if (start == fVolume->LogEnd()) 140 break; 141 142 if (start == lastStart) { 143 // strange, flushing the log hasn't changed the log_start pointer 144 return B_ERROR; 145 } 146 lastStart = start; 147 148 status_t status = ReplayLogEntry(&start); 149 if (status < B_OK) { 150 FATAL(("replaying log entry from %ld failed: %s\n", start, strerror(status))); 151 return B_ERROR; 152 } 153 start = start % fLogSize; 154 } 155 156 PRINT(("replaying worked fine!\n")); 157 fVolume->SuperBlock().log_start = fVolume->LogEnd(); 158 fVolume->LogStart() = fVolume->LogEnd(); 159 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_CLEAN; 160 161 return fVolume->WriteSuperBlock(); 162 } 163 164 165 /** This is a callback function that is called by the cache, whenever 166 * a block is flushed to disk that was updated as part of a transaction. 167 * This is necessary to keep track of completed transactions, to be 168 * able to update the log start pointer. 169 */ 170 171 void 172 Journal::blockNotify(off_t blockNumber, size_t numBlocks, void *arg) 173 { 174 log_entry *logEntry = (log_entry *)arg; 175 176 logEntry->cached_blocks -= numBlocks; 177 if (logEntry->cached_blocks > 0) { 178 // nothing to do yet... 179 return; 180 } 181 182 Journal *journal = logEntry->journal; 183 disk_super_block &superBlock = journal->fVolume->SuperBlock(); 184 bool update = false; 185 186 // Set log_start pointer if possible... 187 188 if (logEntry == journal->fEntries.head) { 189 if (logEntry->Next() != NULL) { 190 int32 length = logEntry->next->start - logEntry->start; 191 superBlock.log_start = (superBlock.log_start + length) % journal->fLogSize; 192 } else 193 superBlock.log_start = journal->fVolume->LogEnd(); 194 195 update = true; 196 } 197 journal->fUsed -= logEntry->length; 198 199 journal->fEntriesLock.Lock(); 200 logEntry->Remove(); 201 journal->fEntriesLock.Unlock(); 202 203 free(logEntry); 204 205 // update the super block, and change the disk's state, if necessary 206 207 if (update) { 208 journal->fVolume->LogStart() = superBlock.log_start; 209 210 if (superBlock.log_start == superBlock.log_end) 211 superBlock.flags = SUPER_BLOCK_DISK_CLEAN; 212 213 journal->fVolume->WriteSuperBlock(); 214 } 215 } 216 217 218 status_t 219 Journal::WriteLogEntry() 220 { 221 fTransactionsInEntry = 0; 222 fHasChangedBlocks = false; 223 224 sorted_array *array = fArray.Array(); 225 if (array == NULL || array->count == 0) 226 return B_OK; 227 228 // Make sure there is enough space in the log. 229 // If that fails for whatever reason, panic! 230 force_cache_flush(fVolume->Device(), false); 231 int32 tries = fLogSize / 2 + 1; 232 while (TransactionSize() > FreeLogBlocks() && tries-- > 0) 233 force_cache_flush(fVolume->Device(), true); 234 235 if (tries <= 0) { 236 fVolume->Panic(); 237 return B_BAD_DATA; 238 } 239 240 int32 blockShift = fVolume->BlockShift(); 241 off_t logOffset = fVolume->ToBlock(fVolume->Log()) << blockShift; 242 off_t logStart = fVolume->LogEnd(); 243 off_t logPosition = logStart % fLogSize; 244 245 // Write disk block array 246 247 uint8 *arrayBlock = (uint8 *)array; 248 249 for (int32 size = fArray.BlocksUsed(); size-- > 0;) { 250 write_pos(fVolume->Device(), logOffset + (logPosition << blockShift), 251 arrayBlock, fVolume->BlockSize()); 252 253 logPosition = (logPosition + 1) % fLogSize; 254 arrayBlock += fVolume->BlockSize(); 255 } 256 257 // Write logged blocks into the log 258 259 CachedBlock cached(fVolume); 260 for (int32 i = 0;i < array->count;i++) { 261 // ToDo: combine blocks if possible (using iovecs)! 262 263 uint8 *block = cached.SetTo(array->values[i]); 264 if (block == NULL) 265 return B_IO_ERROR; 266 267 write_pos(fVolume->Device(), logOffset + (logPosition << blockShift), 268 block, fVolume->BlockSize()); 269 logPosition = (logPosition + 1) % fLogSize; 270 } 271 272 log_entry *logEntry = (log_entry *)malloc(sizeof(log_entry)); 273 if (logEntry != NULL) { 274 logEntry->start = logStart; 275 logEntry->length = TransactionSize(); 276 logEntry->cached_blocks = array->count; 277 logEntry->journal = this; 278 279 fEntriesLock.Lock(); 280 fEntries.Add(logEntry); 281 fEntriesLock.Unlock(); 282 283 fCurrent = logEntry; 284 fUsed += logEntry->length; 285 286 set_blocks_info(fVolume->Device(), &array->values[0], array->count, blockNotify, logEntry); 287 } 288 289 // If the log goes to the next round (the log is written as a 290 // circular buffer), all blocks will be flushed out which is 291 // possible because we don't have any locked blocks at this 292 // point. 293 if (logPosition < logStart) 294 fVolume->FlushDevice(); 295 296 // We need to flush the drives own cache here to ensure 297 // disk consistency. 298 // If that call fails, we can't do anything about it anyway 299 ioctl(fVolume->Device(), B_FLUSH_DRIVE_CACHE); 300 301 fArray.MakeEmpty(); 302 303 // Update the log end pointer in the super block 304 fVolume->SuperBlock().flags = SUPER_BLOCK_DISK_DIRTY; 305 fVolume->SuperBlock().log_end = logPosition; 306 fVolume->LogEnd() = logPosition; 307 308 return fVolume->WriteSuperBlock(); 309 } 310 311 312 status_t 313 Journal::FlushLogAndBlocks() 314 { 315 status_t status = Lock((Transaction *)this); 316 if (status != B_OK) 317 return status; 318 319 // write the current log entry to disk 320 321 if (TransactionSize() != 0) { 322 status = WriteLogEntry(); 323 if (status < B_OK) 324 FATAL(("writing current log entry failed: %s\n", strerror(status))); 325 } 326 status = fVolume->FlushDevice(); 327 328 Unlock((Transaction *)this, true); 329 return status; 330 } 331 332 333 status_t 334 Journal::Lock(Transaction *owner) 335 { 336 if (owner == fOwner) 337 return B_OK; 338 339 status_t status = fLock.Lock(); 340 if (status == B_OK) { 341 fOwner = owner; 342 fOwningThread = find_thread(NULL); 343 } 344 345 // if the last transaction is older than 2 secs, start a new one 346 if (fTransactionsInEntry != 0 && system_time() - fTimestamp > 2000000L) 347 WriteLogEntry(); 348 349 return B_OK; 350 } 351 352 353 void 354 Journal::Unlock(Transaction *owner, bool success) 355 { 356 if (owner != fOwner) 357 return; 358 359 TransactionDone(success); 360 361 fTimestamp = system_time(); 362 fOwner = NULL; 363 fOwningThread = -1; 364 fLock.Unlock(); 365 } 366 367 368 status_t 369 Journal::TransactionDone(bool success) 370 { 371 if (!success && fTransactionsInEntry == 0) { 372 // we can safely abort the transaction 373 sorted_array *array = fArray.Array(); 374 if (array != NULL) { 375 // release the lock for all blocks in the array (we don't need 376 // to be notified when they are actually written to disk) 377 for (int32 i = 0; i < array->count; i++) 378 release_block(fVolume->Device(), array->values[i]); 379 } 380 381 return B_OK; 382 } 383 384 // Up to a maximum size, we will just batch several 385 // transactions together to improve speed 386 if (TransactionSize() < fMaxTransactionSize) { 387 fTransactionsInEntry++; 388 fHasChangedBlocks = false; 389 390 return B_OK; 391 } 392 393 return WriteLogEntry(); 394 } 395 396 397 status_t 398 Journal::LogBlocks(off_t blockNumber, const uint8 *buffer, size_t numBlocks) 399 { 400 // ToDo: that's for now - we should change the log file size here 401 if (TransactionSize() + numBlocks + 1 > fLogSize) 402 return B_DEVICE_FULL; 403 404 fHasChangedBlocks = true; 405 int32 blockSize = fVolume->BlockSize(); 406 407 for (;numBlocks-- > 0; blockNumber++, buffer += blockSize) { 408 if (fArray.Find(blockNumber) >= 0) { 409 // The block is already in the log, so just update its data 410 // Note, this is only necessary if this method is called with a buffer 411 // different from the cached block buffer - which is unlikely but 412 // we'll make sure this way (costs one cache lookup, though). 413 status_t status = cached_write(fVolume->Device(), blockNumber, buffer, 1, blockSize); 414 if (status < B_OK) 415 return status; 416 417 continue; 418 } 419 420 // Insert the block into the transaction's array, and write the changes 421 // back into the locked cache buffer 422 fArray.Insert(blockNumber); 423 status_t status = cached_write_locked(fVolume->Device(), blockNumber, buffer, 1, blockSize); 424 if (status < B_OK) 425 return status; 426 } 427 428 // If necessary, flush the log, so that we have enough space for this transaction 429 if (TransactionSize() > FreeLogBlocks()) 430 force_cache_flush(fVolume->Device(), true); 431 432 return B_OK; 433 } 434 435 436 // #pragma mark - 437 438 439 status_t 440 Transaction::Start(Volume *volume, off_t refBlock) 441 { 442 // has it already been started? 443 if (fJournal != NULL) 444 return B_OK; 445 446 fJournal = volume->GetJournal(refBlock); 447 if (fJournal != NULL && fJournal->Lock(this) == B_OK) 448 return B_OK; 449 450 fJournal = NULL; 451 return B_ERROR; 452 } 453 454