/*
 * Copyright 2004-2006, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
 * Distributed under the terms of the MIT License.
 */


#include "vnode_store.h"

#include <KernelExport.h>
#include <fs_cache.h>

#include <util/kernel_cpp.h>
#include <file_cache.h>
#include <vfs.h>
#include <vm.h>
#include <vm_page.h>
#include <vm_cache.h>
#include <generic_syscall.h>

#include <unistd.h>
#include <stdlib.h>
#include <string.h>


//#define TRACE_FILE_CACHE
#ifdef TRACE_FILE_CACHE
#	define TRACE(x) dprintf x
#else
#	define TRACE(x) ;
#endif

// maximum number of iovecs per request
#define MAX_IO_VECS 64 // 256 kB
#define MAX_FILE_IO_VECS 32
#define MAX_TEMP_IO_VECS 8

#define CACHED_FILE_EXTENTS	2
	// must be smaller than MAX_FILE_IO_VECS
	// ToDo: find out how many of these are typically used

struct file_extent {
	off_t			offset;
	file_io_vec		disk;
};

struct file_map {
	file_map();
	~file_map();

	file_extent *operator[](uint32 index);
	file_extent *ExtentAt(uint32 index);
	status_t Add(file_io_vec *vecs, size_t vecCount, off_t &lastOffset);
	void Free();

	union {
		file_extent	direct[CACHED_FILE_EXTENTS];
		file_extent	*array;
	};
	size_t			count;
};

struct file_cache_ref {
	vm_cache_ref	*cache;
	void			*vnode;
	void			*device;
	void			*cookie;
	file_map		map;
};


static struct cache_module_info *sCacheModule;


file_map::file_map()
{
	array = NULL;
	count = 0;
}


file_map::~file_map()
{
	Free();
}


file_extent *
file_map::operator[](uint32 index)
{
	return ExtentAt(index);
}


file_extent *
file_map::ExtentAt(uint32 index)
{
	if (index >= count)
		return NULL;

	if (count > CACHED_FILE_EXTENTS)
		return &array[index];

	return &direct[index];
}


status_t
file_map::Add(file_io_vec *vecs, size_t vecCount, off_t &lastOffset)
{
	TRACE(("file_map::Add(vecCount = %ld)\n", vecCount));

	off_t offset = 0;

	if (vecCount <= CACHED_FILE_EXTENTS && count == 0) {
		// just use the reserved area in the file_cache_ref structure
	} else {
		// TODO: once we can invalidate only parts of the file map,
		// we might need to copy the previously cached file extents
		// from the direct range
		file_extent *newMap = (file_extent *)realloc(array,
			(count + vecCount) * sizeof(file_extent));
		if (newMap == NULL)
			return B_NO_MEMORY;

		array = newMap;

		if (count != 0) {
			file_extent *extent = ExtentAt(count - 1);
			offset = extent->offset + extent->disk.length;
		}
	}

	int32 start = count;
	count += vecCount;

	for (uint32 i = 0; i < vecCount; i++) {
		file_extent *extent = ExtentAt(start + i);

		extent->offset = offset;
		extent->disk = vecs[i];

		offset += extent->disk.length;
	}

#ifdef TRACE_FILE_CACHE
	for (uint32 i = 0; i < count; i++) {
		file_extent *extent = ExtentAt(i);
		dprintf("[%ld] extent offset %Ld, disk offset %Ld, length %Ld\n",
			i, extent->offset, extent->disk.offset, extent->disk.length);
	}
#endif

	lastOffset = offset;
	return B_OK;
}


void
file_map::Free()
{
	if (count > CACHED_FILE_EXTENTS)
		free(array);

	array = NULL;
	count = 0;
}


//	#pragma mark -


static void
add_to_iovec(iovec *vecs, int32 &index, int32 max, addr_t address,
	size_t size)
{
	if (index > 0 && (addr_t)vecs[index - 1].iov_base + vecs[index - 1].iov_len == address) {
		// the iovec can be combined with the previous one
		vecs[index - 1].iov_len += size;
		return;
	}

	if (index == max)
		panic("no more space for iovecs!");

	// we need to start a new iovec
	vecs[index].iov_base = (void *)address;
	vecs[index].iov_len = size;
	index++;
}


static file_extent *
find_file_extent(file_cache_ref *ref, off_t offset, uint32 *_index)
{
	// TODO: do binary search

	for (uint32 index = 0; index < ref->map.count; index++) {
		file_extent *extent = ref->map[index];

		if (extent->offset <= offset
			&& extent->offset + extent->disk.length > offset) {
			if (_index)
				*_index = index;
			return extent;
		}
	}

	return NULL;
}


static status_t
get_file_map(file_cache_ref *ref, off_t offset, size_t size,
	file_io_vec *vecs, size_t *_count)
{
	size_t maxVecs = *_count;
	status_t status = B_OK;

	if (ref->map.count == 0) {
		// we don't yet have the map of this file, so let's grab it
		// (ordered by offset, so that we can do a binary search on them)

		mutex_lock(&ref->cache->lock);

		// the file map could have been requested in the mean time
		if (ref->map.count == 0) {
			size_t vecCount = maxVecs;
			off_t mapOffset = 0;

			while (true) {
				status = vfs_get_file_map(ref->vnode, mapOffset, ~0UL, vecs, &vecCount);
				if (status < B_OK && status != B_BUFFER_OVERFLOW) {
					mutex_unlock(&ref->cache->lock);
					return status;
				}

				status_t addStatus = ref->map.Add(vecs, vecCount, mapOffset);
				if (addStatus != B_OK) {
					// only clobber the status in case of failure
					status = addStatus;
				}

				if (status != B_BUFFER_OVERFLOW)
					break;

				// when we are here, the map has been stored in the array, and
				// the array size was still too small to cover the whole file
				vecCount = maxVecs;
			}
		}

		mutex_unlock(&ref->cache->lock);
	}

	if (status != B_OK) {
		// We must invalidate the (part of the) map we already
		// have, as we cannot know if it's complete or not
		ref->map.Free();
		return status;
	}

	// We now have cached the map of this file, we now need to
	// translate it for the requested access.

	uint32 index;
	file_extent *fileExtent = find_file_extent(ref, offset, &index);
	if (fileExtent == NULL) {
		// access outside file bounds? But that's not our problem
		*_count = 0;
		return B_OK;
	}

	offset -= fileExtent->offset;
	vecs[0].offset = fileExtent->disk.offset + offset;
	vecs[0].length = fileExtent->disk.length - offset;

	if (vecs[0].length >= size || index >= ref->map.count - 1) {
		*_count = 1;
		return B_OK;
	}

	// copy the rest of the vecs

	size -= vecs[0].length;

	for (index = 1; index < ref->map.count;) {
		fileExtent++;

		vecs[index] = fileExtent->disk;
		index++;

		if (size <= fileExtent->disk.length)
			break;

		if (index >= maxVecs) {
			*_count = index;
			return B_BUFFER_OVERFLOW;
		}

		size -= fileExtent->disk.length;
	}

	*_count = index;
	return B_OK;
}


/*!
	Does the dirty work of translating the request into actual disk offsets
	and reads to or writes from the supplied iovecs as specified by \a doWrite.
*/
static status_t
pages_io(file_cache_ref *ref, off_t offset, const iovec *vecs, size_t count,
	size_t *_numBytes, bool doWrite)
{
	TRACE(("pages_io: ref = %p, offset = %Ld, size = %lu, vecCount = %lu, %s\n",
		ref, offset, *_numBytes, count, doWrite ? "write" : "read"));

	// translate the iovecs into direct device accesses
	file_io_vec fileVecs[MAX_FILE_IO_VECS];
	size_t fileVecCount = MAX_FILE_IO_VECS;
	size_t numBytes = *_numBytes;

	status_t status = get_file_map(ref, offset, numBytes, fileVecs,
		&fileVecCount);
	if (status < B_OK && status != B_BUFFER_OVERFLOW) {
		TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed: %s\n",
			offset, numBytes, strerror(status)));
		return status;
	}

	bool bufferOverflow = status == B_BUFFER_OVERFLOW;

#ifdef TRACE_FILE_CACHE
	dprintf("got %lu file vecs for %Ld:%lu%s:\n", fileVecCount, offset,
		numBytes, bufferOverflow ? " (array too small)" : "");
	for (size_t i = 0; i < fileVecCount; i++) {
		dprintf("  [%lu] offset = %Ld, size = %Ld\n",
			i, fileVecs[i].offset, fileVecs[i].length);
	}
#endif

	if (fileVecCount == 0) {
		// There are no file vecs at this offset, so we're obviously trying
		// to access the file outside of its bounds
		TRACE(("pages_io: access outside of vnode %p at offset %Ld\n",
			ref->vnode, offset));
		return B_BAD_VALUE;
	}

	uint32 fileVecIndex;
	size_t size;

	if (!doWrite) {
		// now directly read the data from the device
		// the first file_io_vec can be read directly

		size = fileVecs[0].length;
		if (size > numBytes)
			size = numBytes;

		status = vfs_read_pages(ref->device, ref->cookie, fileVecs[0].offset,
			vecs, count, &size, false);
		if (status < B_OK)
			return status;

		// TODO: this is a work-around for buggy device drivers!
		//	When our own drivers honour the length, we can:
		//	a) also use this direct I/O for writes (otherwise, it would
		//	   overwrite precious data)
		//	b) panic if the term below is true (at least for writes)
		if (size > fileVecs[0].length) {
			//dprintf("warning: device driver %p doesn't respect total length in read_pages() call!\n", ref->device);
			size = fileVecs[0].length;
		}

		ASSERT(size <= fileVecs[0].length);

		// If the file portion was contiguous, we're already done now
		if (size == numBytes)
			return B_OK;

		// if we reached the end of the file, we can return as well
		if (size != fileVecs[0].length) {
			*_numBytes = size;
			return B_OK;
		}

		fileVecIndex = 1;
	} else {
		fileVecIndex = 0;
		size = 0;
	}

	// Too bad, let's process the rest of the file_io_vecs

	size_t totalSize = size;

	// first, find out where we have to continue in our iovecs
	uint32 i = 0;
	for (; i < count; i++) {
		if (size < vecs[i].iov_len)
			break;

		size -= vecs[i].iov_len;
	}

	size_t vecOffset = size;
	size_t bytesLeft = numBytes - size;

	while (true) {
		for (; fileVecIndex < fileVecCount; fileVecIndex++) {
			file_io_vec &fileVec = fileVecs[fileVecIndex];
			off_t fileOffset = fileVec.offset;
			off_t fileLeft = min_c(fileVec.length, bytesLeft);

			TRACE(("FILE VEC [%lu] length %Ld\n", fileVecIndex, fileLeft));

			// process the complete fileVec
			while (fileLeft > 0) {
				iovec tempVecs[MAX_TEMP_IO_VECS];
				uint32 tempCount = 0;

				// size tracks how much of what is left of the current fileVec
				// (fileLeft) has been assigned to tempVecs
				size = 0;

				// assign what is left of the current fileVec to the tempVecs
				for (size = 0; size < fileLeft && i < count
						&& tempCount < MAX_TEMP_IO_VECS;) {
					// try to satisfy one iovec per iteration (or as much as
					// possible)

					// bytes left of the current iovec
					size_t vecLeft = vecs[i].iov_len - vecOffset;
					if (vecLeft == 0) {
						vecOffset = 0;
						i++;
						continue;
					}

					TRACE(("fill vec %ld, offset = %lu, size = %lu\n",
						i, vecOffset, size));

					// actually available bytes
					size_t tempVecSize = min_c(vecLeft, fileLeft - size);

					tempVecs[tempCount].iov_base
						= (void *)((addr_t)vecs[i].iov_base + vecOffset);
					tempVecs[tempCount].iov_len = tempVecSize;
					tempCount++;

					size += tempVecSize;
					vecOffset += tempVecSize;
				}

				size_t bytes = size;
				if (doWrite) {
					status = vfs_write_pages(ref->device, ref->cookie,
						fileOffset, tempVecs, tempCount, &bytes, false);
				} else {
					status = vfs_read_pages(ref->device, ref->cookie,
						fileOffset, tempVecs, tempCount, &bytes, false);
				}
				if (status < B_OK)
					return status;

				totalSize += bytes;
				bytesLeft -= size;
				fileOffset += size;
				fileLeft -= size;
				//dprintf("-> file left = %Lu\n", fileLeft);

				if (size != bytes || i >= count) {
					// there are no more bytes or iovecs, let's bail out
					*_numBytes = totalSize;
					return B_OK;
				}
			}
		}

		if (bufferOverflow) {
			status = get_file_map(ref, offset + totalSize, bytesLeft, fileVecs,
				&fileVecCount);
			if (status < B_OK && status != B_BUFFER_OVERFLOW) {
				TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed: %s\n",
					offset, numBytes, strerror(status)));
				return status;
			}

			bufferOverflow = status == B_BUFFER_OVERFLOW;
			fileVecIndex = 0;

#ifdef TRACE_FILE_CACHE
			dprintf("got %lu file vecs for %Ld:%lu%s:\n", fileVecCount,
				offset + totalSize, numBytes,
				bufferOverflow ? " (array too small)" : "");
			for (size_t i = 0; i < fileVecCount; i++) {
				dprintf("  [%lu] offset = %Ld, size = %Ld\n",
					i, fileVecs[i].offset, fileVecs[i].length);
			}
#endif
		} else
			break;
	}

	*_numBytes = totalSize;
	return B_OK;
}


/*!
	This function is called by read_into_cache() (and from there only) - it
	can only handle a limited number of bytes (at most MAX_IO_VECS pages), and
	read_into_cache() makes sure that the request matches that criterion.
*/
static inline status_t
read_chunk_into_cache(file_cache_ref *ref, off_t offset, size_t numBytes,
	int32 pageOffset, addr_t buffer, size_t bufferSize)
{
	TRACE(("read_chunk(offset = %Ld, numBytes = %lu, pageOffset = %ld, buffer = %#lx, bufferSize = %lu)\n",
		offset, numBytes, pageOffset, buffer, bufferSize));

	vm_cache_ref *cache = ref->cache;

	iovec vecs[MAX_IO_VECS];
	int32 vecCount = 0;

	vm_page *pages[MAX_IO_VECS];
	int32 pageIndex = 0;

	// allocate pages for the cache and mark them busy
	for (size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
		if (page == NULL)
			panic("no more pages!");

		page->state = PAGE_STATE_BUSY;

		vm_cache_insert_page(cache, page, offset + pos);

		addr_t virtualAddress;
		if (vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
				&virtualAddress, PHYSICAL_PAGE_CAN_WAIT) < B_OK)
			panic("could not get physical page");

		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
			// TODO: check if the array is large enough (currently panics)!
	}

	mutex_unlock(&cache->lock);

	// read file into reserved pages
	status_t status = pages_io(ref, offset, vecs, vecCount, &numBytes, false);
	if (status < B_OK) {
		// reading failed, free allocated pages

		dprintf("file_cache: read pages failed: %s\n", strerror(status));

		for (int32 i = 0; i < vecCount; i++) {
			addr_t base = (addr_t)vecs[i].iov_base;
			size_t size = vecs[i].iov_len;

			for (size_t pos = 0; pos < size;
					pos += B_PAGE_SIZE, base += B_PAGE_SIZE) {
				vm_put_physical_page(base);
			}
		}

		mutex_lock(&cache->lock);

		for (int32 i = 0; i < pageIndex; i++) {
			vm_cache_remove_page(cache, pages[i]);
			vm_page_set_state(pages[i], PAGE_STATE_FREE);
		}

		return status;
	}

	// copy the pages and unmap them again

	for (int32 i = 0; i < vecCount; i++) {
		addr_t base = (addr_t)vecs[i].iov_base;
		size_t size = vecs[i].iov_len;

		// copy to user buffer if necessary
		if (bufferSize != 0) {
			size_t bytes = min_c(bufferSize, size - pageOffset);

			user_memcpy((void *)buffer, (void *)(base + pageOffset), bytes);
			buffer += bytes;
			bufferSize -= bytes;
			pageOffset = 0;
		}

		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
			vm_put_physical_page(base);
	}

	mutex_lock(&cache->lock);

	// make the pages accessible in the cache
	for (int32 i = pageIndex; i-- > 0;)
		pages[i]->state = PAGE_STATE_ACTIVE;

	return B_OK;
}


/*!
	This function reads \a size bytes directly from the file into the cache.
	If \a bufferSize does not equal zero, \a bufferSize bytes from the data
	read in are also copied to the provided \a buffer.
	This function always allocates all pages; it is the responsibility of the
	calling function to only ask for yet uncached ranges.
	The cache_ref lock must be held when calling this function.
*/
static status_t
read_into_cache(file_cache_ref *ref, off_t offset, size_t size, addr_t buffer, size_t bufferSize)
{
	TRACE(("read_into_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
		ref, offset, size, (void *)buffer, bufferSize));

	// do we have to read in anything at all?
	if (size == 0)
		return B_OK;

	// make sure "offset" is page aligned - but also remember the page offset
	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
	size = PAGE_ALIGN(size + pageOffset);
	offset -= pageOffset;

	while (true) {
		size_t chunkSize = size;
		if (chunkSize > (MAX_IO_VECS * B_PAGE_SIZE))
			chunkSize = MAX_IO_VECS * B_PAGE_SIZE;

		status_t status = read_chunk_into_cache(ref, offset, chunkSize, pageOffset,
			buffer, bufferSize);
		if (status != B_OK)
			return status;

		if ((size -= chunkSize) == 0)
			return B_OK;

		if (chunkSize >= bufferSize) {
			bufferSize = 0;
			buffer = NULL;
		} else {
			bufferSize -= chunkSize - pageOffset;
			buffer += chunkSize - pageOffset;
		}

		offset += chunkSize;
		pageOffset = 0;
	}

	return B_OK;
}


/** Like read_chunk_into_cache() but writes data into the cache */

static inline status_t
write_chunk_to_cache(file_cache_ref *ref, off_t offset, size_t numBytes,
	int32 pageOffset, addr_t buffer, size_t bufferSize)
{
	iovec vecs[MAX_IO_VECS];
	int32 vecCount = 0;
	vm_page *pages[MAX_IO_VECS];
	int32 pageIndex = 0;
	status_t status = B_OK;

	// ToDo: this should be settable somewhere
	bool writeThrough = false;

	// allocate pages for the cache and mark them busy
	for (size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
		// ToDo: if space is becoming tight, and this cache has already grown
		//	big - shouldn't we better steal the pages directly in that case?
		//	(a working set like approach for the file cache)
		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
		page->state = PAGE_STATE_BUSY;

		vm_cache_insert_page(ref->cache, page, offset + pos);

		addr_t virtualAddress;
		vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE, &virtualAddress,
			PHYSICAL_PAGE_CAN_WAIT);

		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
			// ToDo: check if the array is large enough!
	}

	mutex_unlock(&ref->cache->lock);

	// copy contents (and read in partially written pages first)

	if (pageOffset != 0) {
		// This is only a partial write, so we have to read the rest of the page
		// from the file to have consistent data in the cache
		iovec readVec = { vecs[0].iov_base, B_PAGE_SIZE };
		size_t bytesRead = B_PAGE_SIZE;

		status = pages_io(ref, offset, &readVec, 1, &bytesRead, false);
		// ToDo: handle errors for real!
		if (status < B_OK)
			panic("1. pages_io() failed: %s!\n", strerror(status));
	}

	addr_t lastPageOffset = (pageOffset + bufferSize) & (B_PAGE_SIZE - 1);
	if (lastPageOffset != 0) {
		// get the last page in the I/O vectors
		addr_t last = (addr_t)vecs[vecCount - 1].iov_base
			+ vecs[vecCount - 1].iov_len - B_PAGE_SIZE;

		if (offset + pageOffset + bufferSize == ref->cache->cache->virtual_size) {
			// the space in the page after this write action needs to be cleaned
			memset((void *)(last + lastPageOffset), 0, B_PAGE_SIZE - lastPageOffset);
		} else if (vecCount > 1) {
			// the end of this write does not happen on a page boundary, so we
			// need to fetch the last page before we can update it
			iovec readVec = { (void *)last, B_PAGE_SIZE };
			size_t bytesRead = B_PAGE_SIZE;

			status = pages_io(ref, offset + numBytes - B_PAGE_SIZE, &readVec, 1,
				&bytesRead, false);
			// ToDo: handle errors for real!
			if (status < B_OK)
				panic("pages_io() failed: %s!\n", strerror(status));
		}
	}

	for (int32 i = 0; i < vecCount; i++) {
		addr_t base = (addr_t)vecs[i].iov_base;
		size_t bytes = min_c(bufferSize, size_t(vecs[i].iov_len - pageOffset));

		// copy data from user buffer
		user_memcpy((void *)(base + pageOffset), (void *)buffer, bytes);

		bufferSize -= bytes;
		if (bufferSize == 0)
			break;

		buffer += bytes;
		pageOffset = 0;
	}

	if (writeThrough) {
		// write cached pages back to the file if we were asked to do that
		status_t status = pages_io(ref, offset, vecs, vecCount, &numBytes, true);
		if (status < B_OK) {
			// ToDo: remove allocated pages, ...?
			panic("file_cache: remove allocated pages! write pages failed: %s\n",
				strerror(status));
		}
	}

	mutex_lock(&ref->cache->lock);

	// unmap the pages again

	for (int32 i = 0; i < vecCount; i++) {
		addr_t base = (addr_t)vecs[i].iov_base;
		size_t size = vecs[i].iov_len;
		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
			vm_put_physical_page(base);
	}

	// make the pages accessible in the cache
	for (int32 i = pageIndex; i-- > 0;) {
		if (writeThrough)
			pages[i]->state = PAGE_STATE_ACTIVE;
		else
			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
	}

	return status;
}


/**	Like read_into_cache() but writes data into the cache. To preserve data consistency,
 *	it might also read pages into the cache, though, if only a partial page gets written.
 *	The cache_ref lock must be held when calling this function.
 */

static status_t
write_to_cache(file_cache_ref *ref, off_t offset, size_t size, addr_t buffer, size_t bufferSize)
{
	TRACE(("write_to_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
		ref, offset, size, (void *)buffer, bufferSize));

	// make sure "offset" is page aligned - but also remember the page offset
	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
	size = PAGE_ALIGN(size + pageOffset);
	offset -= pageOffset;

	while (true) {
		size_t chunkSize = size;
		if (chunkSize > (MAX_IO_VECS * B_PAGE_SIZE))
			chunkSize = MAX_IO_VECS * B_PAGE_SIZE;

		status_t status = write_chunk_to_cache(ref, offset, chunkSize, pageOffset, buffer, bufferSize);
		if (status != B_OK)
			return status;

		if ((size -= chunkSize) == 0)
			return B_OK;

		if (chunkSize >= bufferSize) {
			bufferSize = 0;
			buffer = NULL;
		} else {
			bufferSize -= chunkSize - pageOffset;
			buffer += chunkSize - pageOffset;
		}

		offset += chunkSize;
		pageOffset = 0;
	}

	return B_OK;
}


static status_t
satisfy_cache_io(file_cache_ref *ref, off_t offset, addr_t buffer, addr_t lastBuffer,
	bool doWrite)
{
	size_t requestSize = buffer - lastBuffer;

	if (doWrite)
		return write_to_cache(ref, offset, requestSize, lastBuffer, requestSize);

	return read_into_cache(ref, offset, requestSize, lastBuffer, requestSize);
}


static status_t
cache_io(void *_cacheRef, off_t offset, addr_t buffer, size_t *_size, bool doWrite)
{
	if (_cacheRef == NULL)
		panic("cache_io() called with NULL ref!\n");

	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
	vm_cache_ref *cache = ref->cache;
	off_t fileSize = cache->cache->virtual_size;

	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
		ref, offset, (void *)buffer, *_size, doWrite ? "write" : "read"));

	// out of bounds access?
	if (offset >= fileSize || offset < 0) {
		*_size = 0;
		return B_OK;
	}

	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
	size_t size = *_size;
	offset -= pageOffset;

	if (offset + pageOffset + size > fileSize) {
		// adapt size to be within the file's offsets
		size = fileSize - pageOffset - offset;
		*_size = size;
	}

	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE,
	// the "last*" variables always point to the end of the last
	// satisfied request part

	size_t bytesLeft = size, lastLeft = size;
	int32 lastPageOffset = pageOffset;
	addr_t lastBuffer = buffer;
	off_t lastOffset = offset;

	mutex_lock(&cache->lock);

	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
		// check if this page is already in memory
	restart:
		vm_page *page = vm_cache_lookup_page(cache, offset);
		vm_page *dummyPage = NULL;
		if (page != NULL) {
			// The page is busy - since we need to unlock the cache sometime
			// in the near future, we need to satisfy the request of the pages
			// we didn't get yet (to make sure no one else interferes in the
			// mean time).
			status_t status = B_OK;

			if (lastBuffer != buffer) {
				status = satisfy_cache_io(ref, lastOffset + lastPageOffset,
					buffer, lastBuffer, doWrite);
				if (status == B_OK) {
					lastBuffer = buffer;
					lastLeft = bytesLeft;
					lastOffset = offset;
					lastPageOffset = 0;
					pageOffset = 0;
				}
			}

			if (status != B_OK) {
				mutex_unlock(&cache->lock);
				return status;
			}

			if (page->state == PAGE_STATE_BUSY) {
				if (page->type == PAGE_TYPE_DUMMY) {
					dummyPage = page;
					page = vm_page_allocate_page(PAGE_STATE_FREE);
					if (page == NULL) {
						mutex_unlock(&cache->lock);
						return B_NO_MEMORY;
					}
				} else {
					mutex_unlock(&cache->lock);
					// ToDo: don't wait forever!
					snooze(20000);
					mutex_lock(&cache->lock);
					goto restart;
				}
			}
		}

		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);
		addr_t virtualAddress;

		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset = %lu\n", offset, page, bytesLeft, pageOffset));
		if (page != NULL) {
			vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
				&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);

			if (dummyPage != NULL && (!doWrite || bytesInPage != B_PAGE_SIZE)) {
				// This page is currently in-use by someone else - since we cannot
				// know if this someone does what we want, and if it even can do
				// what we want (we may own a lock that blocks the other request),
				// we need to handle this case specifically
				iovec vec;
				vec.iov_base = (void *)virtualAddress;
				vec.iov_len = B_PAGE_SIZE;

				size_t size = B_PAGE_SIZE;
				status_t status = pages_io(ref, offset, &vec, 1, &size, false);
				if (status != B_OK) {
					vm_put_physical_page(virtualAddress);
					mutex_unlock(&cache->lock);
					return status;
				}
			}

			// and copy the contents of the page already in memory
			if (doWrite) {
				user_memcpy((void *)(virtualAddress + pageOffset), (void *)buffer, bytesInPage);

				// make sure the page is in the modified list
				if (page->state != PAGE_STATE_MODIFIED)
					vm_page_set_state(page, PAGE_STATE_MODIFIED);
			} else
				user_memcpy((void *)buffer, (void *)(virtualAddress + pageOffset), bytesInPage);

			vm_put_physical_page(virtualAddress);

			if (dummyPage != NULL) {
				// check if the dummy page is still in place
			restart_dummy_lookup:
				vm_page *currentPage = vm_cache_lookup_page(cache, offset);
				if (currentPage == NULL) {
					// there is no page in place anymore, we'll put ours
					// into it
					vm_cache_insert_page(cache, page, offset);
				} else if (currentPage->state == PAGE_STATE_BUSY) {
					if (currentPage->type == PAGE_TYPE_DUMMY) {
						// we let the other party add our page
						currentPage->queue_next = page;
					} else {
						mutex_unlock(&cache->lock);
						// ToDo: don't wait forever!
						snooze(20000);
						mutex_lock(&cache->lock);
						goto restart_dummy_lookup;
					}
				} else {
					// we need to copy our new page into the old one
					addr_t destinationAddress;
					vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
						&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
					vm_get_physical_page(currentPage->physical_page_number * B_PAGE_SIZE,
						&destinationAddress, PHYSICAL_PAGE_CAN_WAIT);

					memcpy((void *)destinationAddress, (void *)virtualAddress, B_PAGE_SIZE);

					vm_put_physical_page(destinationAddress);
					vm_put_physical_page(virtualAddress);

					vm_page_set_state(page, PAGE_STATE_FREE);
				}
			}

			if (bytesLeft <= bytesInPage) {
				// we've read the last page, so we're done!
				mutex_unlock(&cache->lock);
				return B_OK;
			}

			// prepare a potential gap request
			lastBuffer = buffer + bytesInPage;
			lastLeft = bytesLeft - bytesInPage;
			lastOffset = offset + B_PAGE_SIZE;
			lastPageOffset = 0;
		}

		if (bytesLeft <= bytesInPage)
			break;

		buffer += bytesInPage;
		bytesLeft -= bytesInPage;
		pageOffset = 0;
	}

	// fill the last remaining bytes of the request (either write or read)

	status_t status;
	if (doWrite)
		status = write_to_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
	else
		status = read_into_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);

	mutex_unlock(&cache->lock);
	return status;
}


static status_t
file_cache_control(const char *subsystem, uint32 function, void *buffer, size_t bufferSize)
{
	switch (function) {
		case CACHE_CLEAR:
			// ToDo: clear the cache
			dprintf("cache_control: clear cache!\n");
			return B_OK;

		case CACHE_SET_MODULE:
		{
			cache_module_info *module = sCacheModule;

			// unset previous module

			if (sCacheModule != NULL) {
				sCacheModule = NULL;
				snooze(100000);	// 0.1 secs
				put_module(module->info.name);
			}

			// get new module, if any

			if (buffer == NULL)
				return B_OK;

			char name[B_FILE_NAME_LENGTH];
			if (!IS_USER_ADDRESS(buffer)
				|| user_strlcpy(name, (char *)buffer, B_FILE_NAME_LENGTH) < B_OK)
				return B_BAD_ADDRESS;

			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
				return B_BAD_VALUE;

			dprintf("cache_control: set module %s!\n", name);

			status_t status = get_module(name, (module_info **)&module);
			if (status == B_OK)
				sCacheModule = module;

			return status;
		}
	}

	return B_BAD_HANDLER;
}


//	#pragma mark - private kernel API


extern "C" void
cache_prefetch_vnode(void *vnode, off_t offset, size_t size)
{
	vm_cache_ref *cache;
	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
		return;

	file_cache_ref *ref = (struct file_cache_ref *)((vnode_store *)cache->cache->store)->file_cache_ref;
	off_t fileSize = cache->cache->virtual_size;

	if (size > fileSize)
		size = fileSize;

	// we never fetch more than 4 MB at once
	if (size > 4 * 1024 * 1024)
		size = 4 * 1024 * 1024;

	size_t bytesLeft = size, lastLeft = size;
	off_t lastOffset = offset;
	size_t lastSize = 0;

	mutex_lock(&cache->lock);

	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
		// check if this page is already in memory
		addr_t virtualAddress;
	restart:
		vm_page *page = vm_cache_lookup_page(cache, offset);
		if (page != NULL) {
			// it is, so let's satisfy the first part of the request
			if (lastOffset < offset) {
				size_t requestSize = offset - lastOffset;
				read_into_cache(ref, lastOffset, requestSize, NULL, 0);
			}

			if (bytesLeft <= B_PAGE_SIZE) {
				// we've read the last page, so we're done!
				goto out;
			}

			// prepare a potential gap request
			lastOffset = offset + B_PAGE_SIZE;
			lastLeft = bytesLeft - B_PAGE_SIZE;
		}

		if (bytesLeft <= B_PAGE_SIZE)
			break;

		bytesLeft -= B_PAGE_SIZE;
	}

	// read in the last part
	read_into_cache(ref, lastOffset, lastLeft, NULL, 0);

out:
	mutex_unlock(&cache->lock);
	vm_cache_release_ref(cache);
}


extern "C" void
cache_prefetch(mount_id mountID, vnode_id vnodeID, off_t offset, size_t size)
{
	void *vnode;

	// ToDo: schedule prefetch

	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));

	// get the vnode for the object, this also grabs a ref to it
	if (vfs_get_vnode(mountID, vnodeID, &vnode) != B_OK)
		return;

	cache_prefetch_vnode(vnode, offset, size);
	vfs_put_vnode(vnode);
}


extern "C" void
cache_node_opened(void *vnode, int32 fdType, vm_cache_ref *cache, mount_id mountID,
	vnode_id parentID, vnode_id vnodeID, const char *name)
{
	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
		return;

	off_t size = -1;
	if (cache != NULL) {
		file_cache_ref *ref = (file_cache_ref *)((vnode_store *)cache->cache->store)->file_cache_ref;
		if (ref != NULL)
			size = ref->cache->cache->virtual_size;
	}

	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name, size);
}


extern "C" void
cache_node_closed(void *vnode, int32 fdType, vm_cache_ref *cache,
	mount_id mountID, vnode_id vnodeID)
{
	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
		return;

	int32 accessType = 0;
	if (cache != NULL) {
		// ToDo: set accessType
	}

	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
}


extern "C" void
cache_node_launched(size_t argCount, char * const *args)
{
	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
		return;

	sCacheModule->node_launched(argCount, args);
}


extern "C" status_t
file_cache_init_post_boot_device(void)
{
	// ToDo: get cache module out of driver settings

	if (get_module("file_cache/launch_speedup/v1", (module_info **)&sCacheModule) == B_OK) {
		dprintf("** opened launch speedup: %Ld\n", system_time());
	} else
		dprintf("** could not open launch speedup!\n");

	return B_OK;
}


extern "C" status_t
file_cache_init(void)
{
	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
	return B_OK;
}


//	#pragma mark - public FS API


extern "C" void *
file_cache_create(mount_id mountID, vnode_id vnodeID, off_t size, int fd)
{
	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld, fd = %d)\n", mountID, vnodeID, size, fd));

	file_cache_ref *ref = new file_cache_ref;
	if (ref == NULL)
		return NULL;

	// TODO: delay
	//	vm_cache/vm_cache_ref creation until data is
	//	requested/written for the first time? Listing lots of
	//	files in Tracker (and elsewhere) could be slowed down.
	//	Since the file_cache_ref itself doesn't have a lock,
	//	we would need to "rent" one during construction, possibly
	//	the vnode lock, maybe a dedicated one.
	//	As there shouldn't be too much contention, we could also
	//	use atomic_test_and_set(), and free the resources again
	//	when that fails...

	// Get the vnode of the underlying device
	if (vfs_get_vnode_from_fd(fd, true, &ref->device) != B_OK)
		goto err1;

	// We also need the cookie of the underlying device to properly access it
	if (vfs_get_cookie_from_fd(fd, &ref->cookie) != B_OK)
		goto err2;

	// Get the vnode for the object (note, this does not grab a reference to the node)
	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
		goto err2;

	// Gets (usually creates) the cache for the node - note, this does grab a
	// reference to the node...
	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
		goto err2;

	// ... that we don't need, and therefore release it again.
	// Our caller already holds a reference to the vnode; it will destroy us
	// when the last one goes away (which, of course, can only ever happen if
	// we don't grab an extra reference).
	vfs_put_vnode(ref->vnode);

	ref->cache->cache->virtual_size = size;
	((vnode_store *)ref->cache->cache->store)->file_cache_ref = ref;
	return ref;

err2:
	vfs_put_vnode(ref->device);
err1:
	delete ref;
	return NULL;
}


extern "C" void
file_cache_delete(void *_cacheRef)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	if (ref == NULL)
		return;

	TRACE(("file_cache_delete(ref = %p)\n", ref));

	vfs_put_vnode(ref->device);
	delete ref;
}


extern "C" status_t
file_cache_set_size(void *_cacheRef, off_t size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, size));

	if (ref == NULL)
		return B_OK;

	file_cache_invalidate_file_map(_cacheRef, 0, size);
		// ToDo: make this better (we would only need to extend or shrink the map)

	mutex_lock(&ref->cache->lock);
	status_t status = vm_cache_resize(ref->cache, size);
	mutex_unlock(&ref->cache->lock);

	return status;
}


extern "C" status_t
file_cache_sync(void *_cacheRef)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
	if (ref == NULL)
		return B_BAD_VALUE;

	return vm_cache_write_modified(ref->cache, true);
}


extern "C" status_t
file_cache_read_pages(void *_cacheRef, off_t offset, const iovec *vecs, size_t count, size_t *_numBytes)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	return pages_io(ref, offset, vecs, count, _numBytes, false);
}


extern "C" status_t
file_cache_write_pages(void *_cacheRef, off_t offset, const iovec *vecs, size_t count, size_t *_numBytes)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	status_t status = pages_io(ref, offset, vecs, count, _numBytes, true);
	TRACE(("file_cache_write_pages(ref = %p, offset = %Ld, vecs = %p, count = %lu, bytes = %lu) = %ld\n",
		ref, offset, vecs, count,
		*_numBytes, status));

	return status;
}


extern "C" status_t
file_cache_read(void *_cacheRef, off_t offset, void *bufferBase, size_t *_size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
		ref, offset, bufferBase, *_size));

	return cache_io(ref, offset, (addr_t)bufferBase, _size, false);
}


extern "C" status_t
file_cache_write(void *_cacheRef, off_t offset, const void *buffer, size_t *_size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	status_t status = cache_io(ref, offset, (addr_t)const_cast<void *>(buffer), _size, true);
	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu) = %ld\n",
		ref, offset, buffer, *_size, status));

	return status;
}


extern "C" status_t
file_cache_invalidate_file_map(void *_cacheRef, off_t offset, off_t size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	// ToDo: honour offset/size parameters

	TRACE(("file_cache_invalidate_file_map(offset = %Ld, size = %Ld)\n", offset, size));
	mutex_lock(&ref->cache->lock);
	ref->map.Free();
	mutex_unlock(&ref->cache->lock);
	return B_OK;
}
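

// The following is a usage sketch only (guarded by #if 0, not compiled): it
// illustrates how a file system might drive the public FS API above. The
// my_fs_* names and the inode layout are hypothetical and exist only for this
// example; the file_cache_*() calls match the functions defined in this file.
#if 0
struct my_fs_inode {
	mount_id	mountID;
	vnode_id	id;
	off_t		size;
	void		*file_cache;	// as returned by file_cache_create()
};


static status_t
my_fs_init_cache(my_fs_inode *inode, int deviceFD)
{
	// create the cache once per vnode; "deviceFD" refers to the underlying
	// device and is what the cache uses for the actual disk I/O
	inode->file_cache = file_cache_create(inode->mountID, inode->id,
		inode->size, deviceFD);
	return inode->file_cache != NULL ? B_OK : B_NO_MEMORY;
}


static status_t
my_fs_read(my_fs_inode *inode, off_t pos, void *buffer, size_t *_length)
{
	// cached reads go through cache_io() via file_cache_read(); *_length is
	// clipped to the file size and updated to the number of bytes read
	return file_cache_read(inode->file_cache, pos, buffer, _length);
}
#endif	// usage sketch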