xref: /haiku/src/system/kernel/cache/file_cache.cpp (revision 4f2fd49bdc6078128b1391191e4edac647044c3d)
1 /*
2  * Copyright 2004-2008, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "vnode_store.h"
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 
13 #include <KernelExport.h>
14 #include <fs_cache.h>
15 
16 #include <condition_variable.h>
17 #include <file_cache.h>
18 #include <generic_syscall.h>
19 #include <low_resource_manager.h>
20 #include <thread.h>
21 #include <util/AutoLock.h>
22 #include <util/kernel_cpp.h>
23 #include <vfs.h>
24 #include <vm.h>
25 #include <vm_page.h>
26 #include <vm_cache.h>
27 
28 #include "io_requests.h"
29 
30 
31 //#define TRACE_FILE_CACHE
32 #ifdef TRACE_FILE_CACHE
33 #	define TRACE(x) dprintf x
34 #else
35 #	define TRACE(x) ;
36 #endif
37 
38 // maximum number of iovecs per request
39 #define MAX_IO_VECS			32	// 128 kB
40 #define MAX_FILE_IO_VECS	32
41 
42 #define BYPASS_IO_SIZE		65536
43 #define LAST_ACCESSES		3
44 
45 struct file_cache_ref {
46 	vm_cache		*cache;
47 	struct vnode	*vnode;
48 	off_t			last_access[LAST_ACCESSES];
49 		// TODO: it would probably be enough to only store the least
50 		//	significant 31 bits, and make this uint32 (one bit for
51 		//	write vs. read)
52 	int32			last_access_index;
53 	uint16			disabled_count;
54 	bool			last_access_was_write;
55 };
56 
57 typedef status_t (*cache_func)(file_cache_ref *ref, void *cookie, off_t offset,
58 	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
59 	size_t lastReservedPages, size_t reservePages);
60 
61 
62 static struct cache_module_info *sCacheModule;
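// used by file_cache_write() to write zeroes when the cache is disabled and
// the caller passed a NULL buffer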
63 static const uint8 kZeroBuffer[4096] = {};
64 
65 
66 //	#pragma mark -
67 
68 
69 static void
70 add_to_iovec(iovec *vecs, int32 &index, int32 max, addr_t address, size_t size)
71 {
72 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
73 			+ vecs[index - 1].iov_len == address) {
74 		// the iovec can be combined with the previous one
75 		vecs[index - 1].iov_len += size;
76 		return;
77 	}
78 
79 	if (index == max)
80 		panic("no more space for iovecs!");
81 
82 	// we need to start a new iovec
83 	vecs[index].iov_base = (void *)address;
84 	vecs[index].iov_len = size;
85 	index++;
86 }
87 
88 
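/*!	Returns whether the requests tracked via push_access() appear sequential,
	i.e. whether the oldest recorded end offset has not been invalidated by a
	non-contiguous follow-up request.
*/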
89 static inline bool
90 access_is_sequential(file_cache_ref *ref)
91 {
92 	return ref->last_access[ref->last_access_index] != 0;
93 }
94 
95 
96 static inline void
97 push_access(file_cache_ref *ref, off_t offset, size_t bytes, bool isWrite)
98 {
99 	TRACE(("%p: push %Ld, %ld, %s\n", ref, offset, bytes,
100 		isWrite ? "write" : "read"));
101 
102 	int32 index = ref->last_access_index;
103 	int32 previous = index - 1;
104 	if (previous < 0)
105 		previous = LAST_ACCESSES - 1;
106 
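	// invalidate the previous entry if this request does not start at the
	// end offset recorded for it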
107 	if (offset != ref->last_access[previous])
108 		ref->last_access[previous] = 0;
109 
110 	// we remember writes as negative offsets
111 	if (isWrite)
112 		ref->last_access[index] = -offset - bytes;
113 	else
114 		ref->last_access[index] = offset + bytes;
115 
116 	if (++index >= LAST_ACCESSES)
117 		index = 0;
118 	ref->last_access_index = index;
119 }
120 
121 
122 static void
123 reserve_pages(file_cache_ref *ref, size_t reservePages, bool isWrite)
124 {
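	// If memory is tight and this cache is not mapped, has no consumers, and
	// is accessed sequentially, try to recycle its own pages first: schedule
	// a modified page for write-back (writes), or free clean pages (reads).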
125 	if (low_resource_state(B_KERNEL_RESOURCE_PAGES) != B_NO_LOW_RESOURCE) {
126 		vm_cache *cache = ref->cache;
127 		cache->Lock();
128 
129 		if (list_is_empty(&cache->consumers) && cache->areas == NULL
130 			&& access_is_sequential(ref)) {
131 			// the cache is not mapped anywhere and is accessed sequentially
132 
133 			if (isWrite) {
134 				// just schedule some pages to be written back
135 				for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
136 						vm_page* page = it.Next();) {
137 					if (page->state == PAGE_STATE_MODIFIED) {
138 						// TODO: for now, we only schedule one
139 						vm_page_schedule_write_page(page);
140 						break;
141 					}
142 				}
143 			} else {
144 				// free some pages from our cache
145 				// TODO: start with oldest
146 				uint32 left = reservePages;
147 				vm_page *page;
148 				for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
149 						(page = it.Next()) != NULL && left > 0;) {
150 					if (page->state != PAGE_STATE_MODIFIED
151 						&& page->state != PAGE_STATE_BUSY) {
152 						cache->RemovePage(page);
153 						vm_page_set_state(page, PAGE_STATE_FREE);
154 						left--;
155 					}
156 				}
157 			}
158 		}
159 		cache->Unlock();
160 	}
161 
162 	vm_page_reserve_pages(reservePages);
163 }
164 
165 
166 /*!	Reads the requested amount of data into the cache, and allocates
167 	pages needed to fulfill that request. This function is called by cache_io().
168 	It can only handle a limited number of bytes at a time; the caller must
169 	make sure the request stays within that limit.
170 	The cache_ref lock must be held when calling this function; during
171 	operation it will temporarily unlock the cache, though.
172 */
173 static status_t
174 read_into_cache(file_cache_ref *ref, void *cookie, off_t offset,
175 	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
176 	size_t lastReservedPages, size_t reservePages)
177 {
178 	TRACE(("read_into_cache(offset = %Ld, pageOffset = %ld, buffer = %#lx, "
179 		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));
180 
181 	vm_cache *cache = ref->cache;
182 
183 	// TODO: We're using way too much stack! Rather allocate a sufficiently
184 	// large chunk on the heap.
185 	iovec vecs[MAX_IO_VECS];
186 	int32 vecCount = 0;
187 
188 	size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
189 	vm_page *pages[MAX_IO_VECS];
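	// one condition variable per page, published while the page is busy so
	// that other threads can wait for the read to complete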
190 	ConditionVariable busyConditions[MAX_IO_VECS];
191 	int32 pageIndex = 0;
192 
193 	// allocate pages for the cache and mark them busy
194 	for (size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
195 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(
196 			PAGE_STATE_FREE, true);
197 		if (page == NULL)
198 			panic("no more pages!");
199 
200 		busyConditions[pageIndex - 1].Publish(page, "page");
201 
202 		cache->InsertPage(page, offset + pos);
203 
204 		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
205 			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
206 			// TODO: check if the array is large enough (currently panics)!
207 	}
208 
209 	push_access(ref, offset, bufferSize, false);
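	// the new pages are marked busy, so the cache can safely be unlocked (and
	// the previous reservation released) while the disk read is in progress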
210 	cache->Unlock();
211 	vm_page_unreserve_pages(lastReservedPages);
212 
213 	// read file into reserved pages
214 	status_t status = vfs_read_pages(ref->vnode, cookie, offset, vecs,
215 		vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
216 	if (status < B_OK) {
217 		// reading failed, free allocated pages
218 
219 		dprintf("file_cache: read pages failed: %s\n", strerror(status));
220 
221 		cache->Lock();
222 
223 		for (int32 i = 0; i < pageIndex; i++) {
224 			busyConditions[i].Unpublish();
225 			cache->RemovePage(pages[i]);
226 			vm_page_set_state(pages[i], PAGE_STATE_FREE);
227 		}
228 
229 		return status;
230 	}
231 
232 	// copy the pages if needed and unmap them again
233 
234 	for (int32 i = 0; i < pageIndex; i++) {
235 		if (useBuffer && bufferSize != 0) {
236 			size_t bytes = min_c(bufferSize, (size_t)B_PAGE_SIZE - pageOffset);
237 
238 			vm_memcpy_from_physical((void*)buffer,
239 				pages[i]->physical_page_number * B_PAGE_SIZE + pageOffset,
240 				bytes, true);
241 
242 			buffer += bytes;
243 			bufferSize -= bytes;
244 			pageOffset = 0;
245 		}
246 	}
247 
248 	reserve_pages(ref, reservePages, false);
249 	cache->Lock();
250 
251 	// make the pages accessible in the cache
252 	for (int32 i = pageIndex; i-- > 0;) {
253 		pages[i]->state = PAGE_STATE_ACTIVE;
254 
255 		busyConditions[i].Unpublish();
256 	}
257 
258 	return B_OK;
259 }
260 
261 
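/*!	Cache-bypassing counterpart of read_into_cache(): used for large requests
	under memory pressure, it reads directly from the file into the caller's
	buffer without inserting any pages into the cache.
*/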
262 static status_t
263 read_from_file(file_cache_ref *ref, void *cookie, off_t offset,
264 	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
265 	size_t lastReservedPages, size_t reservePages)
266 {
267 	TRACE(("read_from_file(offset = %Ld, pageOffset = %ld, buffer = %#lx, "
268 		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));
269 
270 	if (!useBuffer)
271 		return B_OK;
272 
273 	iovec vec;
274 	vec.iov_base = (void *)buffer;
275 	vec.iov_len = bufferSize;
276 
277 	push_access(ref, offset, bufferSize, false);
278 	ref->cache->Unlock();
279 	vm_page_unreserve_pages(lastReservedPages);
280 
281 	status_t status = vfs_read_pages(ref->vnode, cookie, offset + pageOffset,
282 		&vec, 1, 0, &bufferSize);
283 
284 	if (status == B_OK)
285 		reserve_pages(ref, reservePages, false);
286 
287 	ref->cache->Lock();
288 
289 	return status;
290 }
291 
292 
293 /*!	Like read_into_cache() but writes data into the cache.
294 	To preserve data consistency, it might also read pages into the cache,
295 	To preserve data consistency, it might also read pages into the cache
296 	if only a partial page gets written.
297 */
298 static status_t
299 write_to_cache(file_cache_ref *ref, void *cookie, off_t offset,
300 	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
301 	size_t lastReservedPages, size_t reservePages)
302 {
303 	// TODO: We're using way too much stack! Rather allocate a sufficiently
304 	// large chunk on the heap.
305 	iovec vecs[MAX_IO_VECS];
306 	int32 vecCount = 0;
307 	size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
308 	vm_page *pages[MAX_IO_VECS];
309 	int32 pageIndex = 0;
310 	status_t status = B_OK;
311 	ConditionVariable busyConditions[MAX_IO_VECS];
312 
313 	// ToDo: this should be settable somewhere
314 	bool writeThrough = false;
315 
316 	// allocate pages for the cache and mark them busy
317 	for (size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
318 		// TODO: if space is becoming tight, and this cache has already grown
319 		//	big - wouldn't it be better to steal the pages directly in that
320 		//	case? (a working set like approach for the file cache)
321 		// TODO: the pages we allocate here should have been reserved upfront
322 		//	in cache_io()
323 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(
324 			PAGE_STATE_FREE, true);
325 		busyConditions[pageIndex - 1].Publish(page, "page");
326 
327 		ref->cache->InsertPage(page, offset + pos);
328 
329 		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
330 			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
331 	}
332 
333 	push_access(ref, offset, bufferSize, true);
334 	ref->cache->Unlock();
335 	vm_page_unreserve_pages(lastReservedPages);
336 
337 	// copy contents (and read in partially written pages first)
338 
339 	if (pageOffset != 0) {
340 		// This is only a partial write, so we have to read the rest of the page
341 		// from the file to have consistent data in the cache
342 		iovec readVec = { vecs[0].iov_base, B_PAGE_SIZE };
343 		size_t bytesRead = B_PAGE_SIZE;
344 
345 		status = vfs_read_pages(ref->vnode, cookie, offset, &readVec, 1,
346 			B_PHYSICAL_IO_REQUEST, &bytesRead);
347 		// ToDo: handle errors for real!
348 		if (status < B_OK)
349 			panic("1. vfs_read_pages() failed: %s!\n", strerror(status));
350 	}
351 
352 	addr_t lastPageOffset = (pageOffset + bufferSize) & (B_PAGE_SIZE - 1);
353 	if (lastPageOffset != 0) {
354 		// get the last page in the I/O vectors
355 		addr_t last = (addr_t)vecs[vecCount - 1].iov_base
356 			+ vecs[vecCount - 1].iov_len - B_PAGE_SIZE;
357 
358 		if (offset + pageOffset + bufferSize == ref->cache->virtual_end) {
359 			// the space in the page after this write needs to be zeroed
360 			vm_memset_physical(last + lastPageOffset, 0,
361 				B_PAGE_SIZE - lastPageOffset);
362 		} else {
363 			// the end of this write does not happen on a page boundary, so we
364 			// need to fetch the last page before we can update it
365 			iovec readVec = { (void *)last, B_PAGE_SIZE };
366 			size_t bytesRead = B_PAGE_SIZE;
367 
368 			status = vfs_read_pages(ref->vnode, cookie,
369 				PAGE_ALIGN(offset + pageOffset + bufferSize) - B_PAGE_SIZE,
370 				&readVec, 1, B_PHYSICAL_IO_REQUEST, &bytesRead);
371 			// ToDo: handle errors for real!
372 			if (status < B_OK)
373 				panic("vfs_read_pages() failed: %s!\n", strerror(status));
374 
375 			if (bytesRead < B_PAGE_SIZE) {
376 				// the space beyond the file size needs to be zeroed
377 				vm_memset_physical(last + bytesRead, 0,
378 					B_PAGE_SIZE - bytesRead);
379 			}
380 		}
381 	}
382 
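	// now copy the caller's data (or zeroes, if no buffer was given) into the
	// still busy pages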
383 	for (int32 i = 0; i < vecCount; i++) {
384 		addr_t base = (addr_t)vecs[i].iov_base;
385 		size_t bytes = min_c(bufferSize,
386 			size_t(vecs[i].iov_len - pageOffset));
387 
388 		if (useBuffer) {
389 			// copy data from user buffer
390 			vm_memcpy_to_physical(base + pageOffset, (void *)buffer, bytes,
391 				true);
392 		} else {
393 			// clear buffer instead
394 			vm_memset_physical(base + pageOffset, 0, bytes);
395 		}
396 
397 		bufferSize -= bytes;
398 		if (bufferSize == 0)
399 			break;
400 
401 		buffer += bytes;
402 		pageOffset = 0;
403 	}
404 
405 	if (writeThrough) {
406 		// write cached pages back to the file if we were asked to do that
407 		status_t status = vfs_write_pages(ref->vnode, cookie, offset, vecs,
408 			vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
409 		if (status < B_OK) {
410 			// ToDo: remove allocated pages, ...?
411 			panic("file_cache: remove allocated pages! write pages failed: %s\n",
412 				strerror(status));
413 		}
414 	}
415 
416 	if (status == B_OK)
417 		reserve_pages(ref, reservePages, true);
418 
419 	ref->cache->Lock();
420 
421 	// make the pages accessible in the cache
422 	for (int32 i = pageIndex; i-- > 0;) {
423 		busyConditions[i].Unpublish();
424 
425 		if (writeThrough)
426 			pages[i]->state = PAGE_STATE_ACTIVE;
427 		else
428 			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
429 	}
430 
431 	return status;
432 }
433 
434 
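/*!	Cache-bypassing counterpart of write_to_cache(): writes the data (or
	zeroes, if no buffer is given) directly to the file without going through
	the cache.
*/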
435 static status_t
436 write_to_file(file_cache_ref *ref, void *cookie, off_t offset, int32 pageOffset,
437 	addr_t buffer, size_t bufferSize, bool useBuffer, size_t lastReservedPages,
438 	size_t reservePages)
439 {
440 	size_t chunkSize = 0;
441 	if (!useBuffer) {
442 		// we need to allocate a zero buffer
443 		// TODO: use smaller buffers if this fails
444 		chunkSize = min_c(bufferSize, B_PAGE_SIZE);
445 		buffer = (addr_t)malloc(chunkSize);
446 		if (buffer == 0)
447 			return B_NO_MEMORY;
448 
449 		memset((void *)buffer, 0, chunkSize);
450 	}
451 
452 	iovec vec;
453 	vec.iov_base = (void *)buffer;
454 	vec.iov_len = bufferSize;
455 
456 	push_access(ref, offset, bufferSize, true);
457 	ref->cache->Unlock();
458 	vm_page_unreserve_pages(lastReservedPages);
459 
460 	status_t status = B_OK;
461 
462 	if (!useBuffer) {
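		// no source buffer was given - write the zeroed chunk repeatedly
		// until the requested range is covered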
463 		while (bufferSize > 0) {
464 			if (bufferSize < chunkSize)
465 				chunkSize = bufferSize;
466 
467 			status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
468 				&vec, 1, 0, &chunkSize);
469 			if (status < B_OK)
470 				break;
471 
472 			bufferSize -= chunkSize;
473 			pageOffset += chunkSize;
474 		}
475 
476 		free((void*)buffer);
477 	} else {
478 		status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
479 			&vec, 1, 0, &bufferSize);
480 	}
481 
482 	if (status == B_OK)
483 		reserve_pages(ref, reservePages, true);
484 
485 	ref->cache->Lock();
486 
487 	return status;
488 }
489 
490 
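/*!	Issues the pending, not yet satisfied part of the request (the range
	between lastBuffer and buffer) via the given cache function, and advances
	the "last*" bookkeeping on success. cache_io() calls this before it has to
	unlock the cache, and whenever the accumulated chunk reaches its maximum
	size.
*/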
491 static inline status_t
492 satisfy_cache_io(file_cache_ref *ref, void *cookie, cache_func function,
493 	off_t offset, addr_t buffer, bool useBuffer, int32 &pageOffset,
494 	size_t bytesLeft, size_t &reservePages, off_t &lastOffset,
495 	addr_t &lastBuffer, int32 &lastPageOffset, size_t &lastLeft,
496 	size_t &lastReservedPages)
497 {
498 	if (lastBuffer == buffer)
499 		return B_OK;
500 
501 	size_t requestSize = buffer - lastBuffer;
502 	reservePages = min_c(MAX_IO_VECS, (lastLeft - requestSize
503 		+ lastPageOffset + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
504 
505 	status_t status = function(ref, cookie, lastOffset, lastPageOffset,
506 		lastBuffer, requestSize, useBuffer, lastReservedPages, reservePages);
507 	if (status == B_OK) {
508 		lastReservedPages = reservePages;
509 		lastBuffer = buffer;
510 		lastLeft = bytesLeft;
511 		lastOffset = offset;
512 		lastPageOffset = 0;
513 		pageOffset = 0;
514 	}
515 	return status;
516 }
517 
518 
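/*!	Performs the actual file cache I/O: walks the request page-wise, copies
	data for pages that are already in the cache directly, and hands
	contiguous runs of missing pages to the selected cache function
	(read_into_cache(), write_to_cache(), or their cache-bypassing
	counterparts read_from_file() and write_to_file()).
*/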
519 static status_t
520 cache_io(void *_cacheRef, void *cookie, off_t offset, addr_t buffer,
521 	size_t *_size, bool doWrite)
522 {
523 	if (_cacheRef == NULL)
524 		panic("cache_io() called with NULL ref!\n");
525 
526 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
527 	vm_cache *cache = ref->cache;
528 	off_t fileSize = cache->virtual_end;
529 	bool useBuffer = buffer != 0;
530 
531 	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
532 		ref, offset, (void *)buffer, *_size, doWrite ? "write" : "read"));
533 
534 	// out of bounds access?
535 	if (offset >= fileSize || offset < 0) {
536 		*_size = 0;
537 		return B_OK;
538 	}
539 
540 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
541 	size_t size = *_size;
542 	offset -= pageOffset;
543 
544 	if (offset + pageOffset + size > fileSize) {
545 		// adapt size to stay within the file's bounds
546 		size = fileSize - pageOffset - offset;
547 		*_size = size;
548 	}
549 	if (size == 0)
550 		return B_OK;
551 
552 	cache_func function;
553 	if (doWrite) {
554 		// in low memory situations, we bypass the cache beyond a
555 		// certain I/O size
556 		if (size >= BYPASS_IO_SIZE
557 			&& low_resource_state(B_KERNEL_RESOURCE_PAGES)
558 				!= B_NO_LOW_RESOURCE) {
559 			function = write_to_file;
560 		} else
561 			function = write_to_cache;
562 	} else {
563 		if (size >= BYPASS_IO_SIZE
564 			&& low_resource_state(B_KERNEL_RESOURCE_PAGES)
565 				!= B_NO_LOW_RESOURCE) {
566 			function = read_from_file;
567 		} else
568 			function = read_into_cache;
569 	}
570 
571 	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE;
572 	// the "last*" variables always describe the start of the not yet
573 	// satisfied part of the request
574 
575 	const uint32 kMaxChunkSize = MAX_IO_VECS * B_PAGE_SIZE;
576 	size_t bytesLeft = size, lastLeft = size;
577 	int32 lastPageOffset = pageOffset;
578 	addr_t lastBuffer = buffer;
579 	off_t lastOffset = offset;
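	// reserve enough pages for the first chunk up front, capped at
	// MAX_IO_VECS pages - the most a single cache function call will handle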
580 	size_t lastReservedPages = min_c(MAX_IO_VECS, (pageOffset + bytesLeft
581 		+ B_PAGE_SIZE - 1) >> PAGE_SHIFT);
582 	size_t reservePages = 0;
583 
584 	reserve_pages(ref, lastReservedPages, doWrite);
585 	AutoLocker<VMCache> locker(cache);
586 
587 	while (bytesLeft > 0) {
588 		// check if this page is already in memory
589 		vm_page *page = cache->LookupPage(offset);
590 		if (page != NULL) {
591 			// The page may be busy - since we need to unlock the cache sometime
592 			// in the near future, we first satisfy the request for the pages we
593 			// haven't found in the cache yet (to make sure no one else interferes
594 			// in the meantime).
595 			status_t status = satisfy_cache_io(ref, cookie, function, offset,
596 				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
597 				lastOffset, lastBuffer, lastPageOffset, lastLeft,
598 				lastReservedPages);
599 			if (status != B_OK)
600 				return status;
601 
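			// the page may be busy, e.g. because another thread is currently
			// reading it in or writing it back - in that case wait for it and
			// then reevaluate the situation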
602 			if (page->state == PAGE_STATE_BUSY) {
603 				ConditionVariableEntry entry;
604 				entry.Add(page);
605 				locker.Unlock();
606 				entry.Wait();
607 				locker.Lock();
608 				continue;
609 			}
610 		}
611 
612 		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);
613 
614 		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset "
615 			"= %lu\n", offset, page, bytesLeft, pageOffset));
616 
617 		if (page != NULL) {
618 			// Since we don't actually map pages as part of an area, we have
619 			// to manually maintain their usage_count
620 			page->usage_count = 2;
621 
622 			if (doWrite || useBuffer) {
623 				// Since the following user_mem{cpy,set}() might cause a page
624 				// fault, which in turn might cause pages to be reserved, we
625 				// need to unlock the cache temporarily to avoid a potential
626 				// deadlock. To make sure that our page doesn't go away, we mark
627 				// it busy for the time being.
628 				uint8 oldPageState = page->state;
629 				page->state = PAGE_STATE_BUSY;
630 				locker.Unlock();
631 
632 				// copy the contents of the page already in memory
633 				addr_t pageAddress = page->physical_page_number * B_PAGE_SIZE
634 					+ pageOffset;
635 				if (doWrite) {
636 					if (useBuffer) {
637 						vm_memcpy_to_physical(pageAddress, (void*)buffer,
638 							bytesInPage, true);
639 					} else {
640 						vm_memset_physical(pageAddress, 0, bytesInPage);
641 					}
642 				} else if (useBuffer) {
643 					vm_memcpy_from_physical((void*)buffer, pageAddress,
644 						bytesInPage, true);
645 				}
646 
647 				locker.Lock();
648 
649 				page->state = oldPageState;
650 				if (doWrite && page->state != PAGE_STATE_MODIFIED)
651 					vm_page_set_state(page, PAGE_STATE_MODIFIED);
652 			}
653 
654 			if (bytesLeft <= bytesInPage) {
655 				// we've handled the last page, so we're done!
656 				locker.Unlock();
657 				vm_page_unreserve_pages(lastReservedPages);
658 				return B_OK;
659 			}
660 
661 			// prepare a potential gap request
662 			lastBuffer = buffer + bytesInPage;
663 			lastLeft = bytesLeft - bytesInPage;
664 			lastOffset = offset + B_PAGE_SIZE;
665 			lastPageOffset = 0;
666 		}
667 
668 		if (bytesLeft <= bytesInPage)
669 			break;
670 
671 		buffer += bytesInPage;
672 		bytesLeft -= bytesInPage;
673 		pageOffset = 0;
674 		offset += B_PAGE_SIZE;
675 
676 		if (buffer - lastBuffer + lastPageOffset >= kMaxChunkSize) {
677 			status_t status = satisfy_cache_io(ref, cookie, function, offset,
678 				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
679 				lastOffset, lastBuffer, lastPageOffset, lastLeft,
680 				lastReservedPages);
681 			if (status != B_OK)
682 				return status;
683 		}
684 	}
685 
686 	// fill the last remaining bytes of the request (either write or read)
687 
688 	return function(ref, cookie, lastOffset, lastPageOffset, lastBuffer,
689 		lastLeft, useBuffer, lastReservedPages, 0);
690 }
691 
692 
693 static status_t
694 file_cache_control(const char *subsystem, uint32 function, void *buffer,
695 	size_t bufferSize)
696 {
697 	switch (function) {
698 		case CACHE_CLEAR:
699 			// ToDo: clear the cache
700 			dprintf("cache_control: clear cache!\n");
701 			return B_OK;
702 
703 		case CACHE_SET_MODULE:
704 		{
705 			cache_module_info *module = sCacheModule;
706 
707 			// unset previous module
708 
709 			if (sCacheModule != NULL) {
710 				sCacheModule = NULL;
711 				snooze(100000);	// 0.1 secs
712 				put_module(module->info.name);
713 			}
714 
715 			// get new module, if any
716 
717 			if (buffer == NULL)
718 				return B_OK;
719 
720 			char name[B_FILE_NAME_LENGTH];
721 			if (!IS_USER_ADDRESS(buffer)
722 				|| user_strlcpy(name, (char *)buffer,
723 						B_FILE_NAME_LENGTH) < B_OK)
724 				return B_BAD_ADDRESS;
725 
726 			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
727 				return B_BAD_VALUE;
728 
729 			dprintf("cache_control: set module %s!\n", name);
730 
731 			status_t status = get_module(name, (module_info **)&module);
732 			if (status == B_OK)
733 				sCacheModule = module;
734 
735 			return status;
736 		}
737 	}
738 
739 	return B_BAD_HANDLER;
740 }
741 
742 
743 //	#pragma mark - private kernel API
744 
745 
746 extern "C" void
747 cache_prefetch_vnode(struct vnode *vnode, off_t offset, size_t size)
748 {
749 	vm_cache *cache;
750 	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
751 		return;
752 
753 	file_cache_ref *ref = ((VMVnodeCache*)cache)->FileCacheRef();
754 	off_t fileSize = cache->virtual_end;
755 
756 	if (size > fileSize)
757 		size = fileSize;
758 
759 	// we never fetch more than 4 MB at once
760 	if (size > 4 * 1024 * 1024)
761 		size = 4 * 1024 * 1024;
762 
763 	cache_io(ref, NULL, offset, 0, &size, false);
764 	cache->Lock();
765 	cache->ReleaseRefAndUnlock();
766 }
767 
768 
769 extern "C" void
770 cache_prefetch(dev_t mountID, ino_t vnodeID, off_t offset, size_t size)
771 {
772 	// ToDo: schedule prefetch
773 
774 	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));
775 
776 	// get the vnode for the object, this also grabs a ref to it
777 	struct vnode *vnode;
778 	if (vfs_get_vnode(mountID, vnodeID, true, &vnode) != B_OK)
779 		return;
780 
781 	cache_prefetch_vnode(vnode, offset, size);
782 	vfs_put_vnode(vnode);
783 }
784 
785 
786 extern "C" void
787 cache_node_opened(struct vnode *vnode, int32 fdType, vm_cache *cache,
788 	dev_t mountID, ino_t parentID, ino_t vnodeID, const char *name)
789 {
790 	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
791 		return;
792 
793 	off_t size = -1;
794 	if (cache != NULL) {
795 		file_cache_ref *ref = ((VMVnodeCache*)cache)->FileCacheRef();
796 		if (ref != NULL)
797 			size = cache->virtual_end;
798 	}
799 
800 	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name,
801 		size);
802 }
803 
804 
805 extern "C" void
806 cache_node_closed(struct vnode *vnode, int32 fdType, vm_cache *cache,
807 	dev_t mountID, ino_t vnodeID)
808 {
809 	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
810 		return;
811 
812 	int32 accessType = 0;
813 	if (cache != NULL) {
814 		// ToDo: set accessType
815 	}
816 
817 	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
818 }
819 
820 
821 extern "C" void
822 cache_node_launched(size_t argCount, char * const *args)
823 {
824 	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
825 		return;
826 
827 	sCacheModule->node_launched(argCount, args);
828 }
829 
830 
831 extern "C" status_t
832 file_cache_init_post_boot_device(void)
833 {
834 	// ToDo: get cache module out of driver settings
835 
836 	if (get_module("file_cache/launch_speedup/v1",
837 			(module_info **)&sCacheModule) == B_OK) {
838 		dprintf("** opened launch speedup: %Ld\n", system_time());
839 	}
840 	return B_OK;
841 }
842 
843 
844 extern "C" status_t
845 file_cache_init(void)
846 {
847 	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
848 	return B_OK;
849 }
850 
851 
852 //	#pragma mark - public FS API
853 
854 
855 extern "C" void *
856 file_cache_create(dev_t mountID, ino_t vnodeID, off_t size)
857 {
858 	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld)\n",
859 		mountID, vnodeID, size));
860 
861 	file_cache_ref *ref = new file_cache_ref;
862 	if (ref == NULL)
863 		return NULL;
864 
865 	memset(ref->last_access, 0, sizeof(ref->last_access));
866 	ref->last_access_index = 0;
867 	ref->disabled_count = 0;
868 
869 	// TODO: delay vm_cache creation until data is
870 	//	requested/written for the first time? Listing lots of
871 	//	files in Tracker (and elsewhere) could be slowed down.
872 	//	Since the file_cache_ref itself doesn't have a lock,
873 	//	we would need to "rent" one during construction, possibly
874 	//	the vnode lock, maybe a dedicated one.
875 	//	As there shouldn't be too much contention, we could also
876 	//	use atomic_test_and_set(), and free the resources again
877 	//	when that fails...
878 
879 	// Get the vnode for the object
880 	// (note, this does not grab a reference to the node)
881 	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
882 		goto err1;
883 
884 	// Gets (usually creates) the cache for the node
885 	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
886 		goto err1;
887 
888 	ref->cache->virtual_end = size;
889 	((VMVnodeCache*)ref->cache)->SetFileCacheRef(ref);
890 	return ref;
891 
892 err1:
893 	delete ref;
894 	return NULL;
895 }
896 
897 
898 extern "C" void
899 file_cache_delete(void *_cacheRef)
900 {
901 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
902 
903 	if (ref == NULL)
904 		return;
905 
906 	TRACE(("file_cache_delete(ref = %p)\n", ref));
907 
908 	ref->cache->ReleaseRef();
909 	delete ref;
910 }
911 
912 
913 extern "C" void
914 file_cache_enable(void *_cacheRef)
915 {
916 	file_cache_ref *ref = (file_cache_ref*)_cacheRef;
917 
918 	AutoLocker<VMCache> _(ref->cache);
919 
920 	if (ref->disabled_count == 0) {
921 		panic("Unbalanced file_cache_enable()!");
922 		return;
923 	}
924 
925 	ref->disabled_count--;
926 }
927 
928 
929 extern "C" status_t
930 file_cache_disable(void *_cacheRef)
931 {
932 	// TODO: This function only removes all pages from the cache and prevents
933 	// the file cache functions from adding any new ones until re-enabled. The
934 	// VM (on page fault) can still add pages, if the file is mmap()ed. We
935 	// should mark the cache to prevent shared mappings of the file and fix
936 	// the page fault code to deal correctly with private mappings (i.e. only
937 	// insert pages in consumer caches).
938 
939 	file_cache_ref *ref = (file_cache_ref*)_cacheRef;
940 
941 	AutoLocker<VMCache> _(ref->cache);
942 
943 	// If already disabled, there's nothing to do for us.
944 	if (ref->disabled_count > 0) {
945 		ref->disabled_count++;
946 		return B_OK;
947 	}
948 
949 	// The file cache is not yet disabled. We need to evict all cached pages.
950 	status_t error = ref->cache->FlushAndRemoveAllPages();
951 	if (error != B_OK)
952 		return error;
953 
954 	ref->disabled_count++;
955 	return B_OK;
956 }
957 
958 
959 extern "C" bool
960 file_cache_is_enabled(void *_cacheRef)
961 {
962 	file_cache_ref *ref = (file_cache_ref*)_cacheRef;
963 	AutoLocker<VMCache> _(ref->cache);
964 
965 	return ref->disabled_count == 0;
966 }
967 
968 
969 extern "C" status_t
970 file_cache_set_size(void *_cacheRef, off_t newSize)
971 {
972 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
973 
974 	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, newSize));
975 
976 	if (ref == NULL)
977 		return B_OK;
978 
979 	AutoLocker<VMCache> _(ref->cache);
980 
981 	off_t offset = ref->cache->virtual_end;
982 	off_t size = newSize;
983 	if (offset > newSize) {
984 		size = offset - newSize;
985 		offset = newSize;
986 	} else
987 		size = newSize - offset;
988 
989 	return ref->cache->Resize(newSize);
990 }
991 
992 
993 extern "C" status_t
994 file_cache_sync(void *_cacheRef)
995 {
996 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
997 	if (ref == NULL)
998 		return B_BAD_VALUE;
999 
1000 	return ref->cache->WriteModified();
1001 }
1002 
1003 
1004 extern "C" status_t
1005 file_cache_read(void *_cacheRef, void *cookie, off_t offset, void *buffer,
1006 	size_t *_size)
1007 {
1008 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1009 
1010 	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
1011 		ref, offset, buffer, *_size));
1012 
1013 	if (ref->disabled_count > 0) {
1014 		// Caching is disabled -- read directly from the file.
1015 		iovec vec;
1016 		vec.iov_base = buffer;
1017 		vec.iov_len = *_size;
1018 		return vfs_read_pages(ref->vnode, cookie, offset, &vec, 1, 0, _size);
1019 	}
1020 
1021 	return cache_io(ref, cookie, offset, (addr_t)buffer, _size, false);
1022 }
1023 
1024 
1025 extern "C" status_t
1026 file_cache_write(void *_cacheRef, void *cookie, off_t offset,
1027 	const void *buffer, size_t *_size)
1028 {
1029 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1030 
1031 	if (ref->disabled_count > 0) {
1032 		// Caching is disabled -- write directly to the file.
1033 
1034 		if (buffer != NULL) {
1035 			iovec vec;
1036 			vec.iov_base = (void*)buffer;
1037 			vec.iov_len = *_size;
1038 			return vfs_write_pages(ref->vnode, cookie, offset, &vec, 1, 0,
1039 				_size);
1040 		}
1041 
1042 		// NULL buffer -- use a dummy buffer to write zeroes
1043 		// TODO: This is not particularly efficient!
1044 		iovec vec;
1045 		vec.iov_base = (void*)kZeroBuffer;
1046 		vec.iov_len = sizeof(kZeroBuffer);
1047 		size_t size = *_size;
1048 		while (size > 0) {
1049 			size_t toWrite = min_c(size, vec.iov_len);
1050 			size_t written = toWrite;
1051 			status_t error = vfs_write_pages(ref->vnode, cookie, offset, &vec,
1052 				1, 0, &written);
1053 			if (error != B_OK)
1054 				return error;
1055 			if (written == 0)
1056 				break;
1057 
1058 			offset += written;
1059 			size -= written;
1060 		}
1061 
1062 		*_size -= size;
1063 		return B_OK;
1064 	}
1065 
1066 	status_t status = cache_io(ref, cookie, offset,
1067 		(addr_t)const_cast<void *>(buffer), _size, true);
1068 
1069 	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu)"
1070 		" = %ld\n", ref, offset, buffer, *_size, status));
1071 
1072 	return status;
1073 }
1074 
1075