xref: /haiku/src/system/kernel/cache/file_cache.cpp (revision b671e9bbdbd10268a042b4f4cc4317ccd03d105e)
1 /*
2  * Copyright 2004-2009, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "vnode_store.h"
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 
13 #include <KernelExport.h>
14 #include <fs_cache.h>
15 
16 #include <condition_variable.h>
17 #include <file_cache.h>
18 #include <generic_syscall.h>
19 #include <low_resource_manager.h>
20 #include <thread.h>
21 #include <util/AutoLock.h>
22 #include <util/kernel_cpp.h>
23 #include <vfs.h>
24 #include <vm.h>
25 #include <vm_page.h>
26 #include <vm_cache.h>
27 
28 #include "IORequest.h"
29 
30 
31 //#define TRACE_FILE_CACHE
32 #ifdef TRACE_FILE_CACHE
33 #	define TRACE(x) dprintf x
34 #else
35 #	define TRACE(x) ;
36 #endif
37 
38 // maximum number of iovecs per request
39 #define MAX_IO_VECS			32	// 128 kB
40 #define MAX_FILE_IO_VECS	32
41 
42 #define BYPASS_IO_SIZE		65536
43 #define LAST_ACCESSES		3
44 
45 struct file_cache_ref {
46 	vm_cache		*cache;
47 	struct vnode	*vnode;
48 	off_t			last_access[LAST_ACCESSES];
49 		// TODO: it would probably be enough to only store the least
50 		//	significant 31 bits, and make this uint32 (one bit for
51 		//	write vs. read)
52 	int32			last_access_index;
53 	uint16			disabled_count;
54 
55 	inline void SetLastAccess(int32 index, off_t access, bool isWrite)
56 	{
57 		// we remember writes as negative offsets
58 		last_access[index] = isWrite ? -access : access;
59 	}
60 
61 	inline off_t LastAccess(int32 index, bool isWrite)
62 	{
63 		return isWrite ? -last_access[index] : last_access[index];
64 	}
65 
66 	inline uint32 LastAccessPageOffset(int32 index, bool isWrite)
67 	{
68 		return LastAccess(index, isWrite) >> PAGE_SHIFT;
69 	}
70 };
71 
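// A PrecacheIO object represents one asynchronous read-ahead request:
// Prepare() allocates busy pages and inserts them into the cache, ReadAsync()
// starts the read, and IOFinished() publishes (or discards) the pages and
// deletes the object once the I/O has completed.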
72 class PrecacheIO : public AsyncIOCallback {
73 public:
74 								PrecacheIO(file_cache_ref* ref, off_t offset,
75 									size_t size);
76 								~PrecacheIO();
77 
78 			status_t			Prepare();
79 			void				ReadAsync();
80 
81 	virtual	void				IOFinished(status_t status,
82 									bool partialTransfer,
83 									size_t bytesTransferred);
84 
85 private:
86 			file_cache_ref*		fRef;
87 			VMCache*			fCache;
88 			vm_page**			fPages;
89 			size_t				fPageCount;
90 			ConditionVariable*	fBusyConditions;
91 			iovec*				fVecs;
92 			off_t				fOffset;
93 			uint32				fVecCount;
94 			size_t				fSize;
95 };
96 
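// Common signature of read_into_cache(), read_from_file(), write_to_cache()
// and write_to_file(); cache_io() picks one of these per request.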
97 typedef status_t (*cache_func)(file_cache_ref* ref, void* cookie, off_t offset,
98 	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
99 	size_t lastReservedPages, size_t reservePages);
100 
101 static void add_to_iovec(iovec* vecs, uint32 &index, uint32 max, addr_t address,
102 	size_t size);
103 
104 
105 static struct cache_module_info* sCacheModule;
106 static const uint8 kZeroBuffer[4096] = {};
107 
108 
109 //	#pragma mark -
110 
111 
112 PrecacheIO::PrecacheIO(file_cache_ref* ref, off_t offset, size_t size)
113 	:
114 	fRef(ref),
115 	fCache(ref->cache),
116 	fPages(NULL),
117 	fBusyConditions(NULL),
118 	fVecs(NULL),
119 	fOffset(offset),
120 	fVecCount(0),
121 	fSize(size)
122 {
123 	fPageCount = (size + B_PAGE_SIZE - 1) / B_PAGE_SIZE;
124 	fCache->AcquireRefLocked();
125 }
126 
127 
128 PrecacheIO::~PrecacheIO()
129 {
130 	delete[] fPages;
131 	delete[] fBusyConditions;
132 	delete[] fVecs;
133 	fCache->ReleaseRefLocked();
134 }
135 
136 
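/*!	Allocates the pages for this request, marks them busy, and inserts them
	into the cache. If not all pages could be allocated, everything is undone
	again and an error is returned. The cache must be locked when called.
*/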
137 status_t
138 PrecacheIO::Prepare()
139 {
140 	if (fPageCount == 0)
141 		return B_BAD_VALUE;
142 
143 	fPages = new(std::nothrow) vm_page*[fPageCount];
144 	if (fPages == NULL)
145 		return B_NO_MEMORY;
146 
147 	fBusyConditions = new(std::nothrow) ConditionVariable[fPageCount];
148 	if (fBusyConditions == NULL)
149 		return B_NO_MEMORY;
150 
151 	fVecs = new(std::nothrow) iovec[fPageCount];
152 	if (fVecs == NULL)
153 		return B_NO_MEMORY;
154 
155 	// allocate pages for the cache and mark them busy
156 	uint32 i = 0;
157 	for (size_t pos = 0; pos < fSize; pos += B_PAGE_SIZE) {
158 		vm_page* page = vm_page_allocate_page(PAGE_STATE_FREE, true);
159 		if (page == NULL)
160 			break;
161 
162 		fBusyConditions[i].Publish(page, "page");
163 		fCache->InsertPage(page, fOffset + pos);
164 
165 		add_to_iovec(fVecs, fVecCount, fPageCount,
166 			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
167 		fPages[i++] = page;
168 	}
169 
170 	if (i != fPageCount) {
171 		// allocating pages failed
172 		while (i-- > 0) {
173 			fBusyConditions[i].Unpublish();
174 			fCache->RemovePage(fPages[i]);
175 			vm_page_set_state(fPages[i], PAGE_STATE_FREE);
176 		}
177 		return B_NO_MEMORY;
178 	}
179 
180 	return B_OK;
181 }
182 
183 
184 void
185 PrecacheIO::ReadAsync()
186 {
187 	// This object is going to be deleted after the I/O request has been
188 	// fulfilled
189 	vfs_asynchronous_read_pages(fRef->vnode, NULL, fOffset, fVecs, fVecCount,
190 		fSize, B_PHYSICAL_IO_REQUEST, this);
191 }
192 
193 
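/*!	Invoked when the asynchronous read has finished. Unmarks the transferred
	pages as busy (zeroing the unread rest of a partially transferred last
	page), frees the pages that could not be read, and deletes this object.
*/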
194 void
195 PrecacheIO::IOFinished(status_t status, bool partialTransfer,
196 	size_t bytesTransferred)
197 {
198 	AutoLocker<VMCache> locker(fCache);
199 
200 	// Make the successfully loaded pages accessible again; the unread rest
201 	// of a partially transferred last page is cleared to zero below
202 	size_t pagesTransferred
203 		= (bytesTransferred + B_PAGE_SIZE - 1) / B_PAGE_SIZE;
204 
205 	if (fOffset + bytesTransferred > fCache->virtual_end)
206 		bytesTransferred = fCache->virtual_end - fOffset;
207 
208 	for (uint32 i = 0; i < pagesTransferred; i++) {
209 		if (i == pagesTransferred - 1
210 			&& (bytesTransferred % B_PAGE_SIZE) != 0) {
211 			// clear partial page
212 			size_t bytesTouched = bytesTransferred % B_PAGE_SIZE;
213 			vm_memset_physical((fPages[i]->physical_page_number << PAGE_SHIFT)
214 				+ bytesTouched, 0, B_PAGE_SIZE - bytesTouched);
215 		}
216 
217 		fPages[i]->state = PAGE_STATE_ACTIVE;
218 		fBusyConditions[i].Unpublish();
219 	}
220 
221 	// Free pages after failed I/O
222 	for (uint32 i = pagesTransferred; i < fPageCount; i++) {
223 		fBusyConditions[i].Unpublish();
224 		fCache->RemovePage(fPages[i]);
225 		vm_page_set_state(fPages[i], PAGE_STATE_FREE);
226 	}
227 
228 	delete this;
229 }
230 
231 
232 //	#pragma mark -
233 
234 
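/*!	Appends the given address range to the iovec array, merging it with the
	previous vector if the two ranges are contiguous. Panics if more than
	"max" vectors would be needed.
*/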
235 static void
236 add_to_iovec(iovec* vecs, uint32 &index, uint32 max, addr_t address,
237 	size_t size)
238 {
239 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
240 			+ vecs[index - 1].iov_len == address) {
241 		// the iovec can be combined with the previous one
242 		vecs[index - 1].iov_len += size;
243 		return;
244 	}
245 
246 	if (index == max)
247 		panic("no more space for iovecs!");
248 
249 	// we need to start a new iovec
250 	vecs[index].iov_base = (void*)address;
251 	vecs[index].iov_len = size;
252 	index++;
253 }
254 
255 
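/*!	Returns whether the recent accesses look sequential. push_access() clears
	an entry whenever the following access is not contiguous, so a non-zero
	oldest entry indicates that the last accesses followed each other.
*/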
256 static inline bool
257 access_is_sequential(file_cache_ref* ref)
258 {
259 	return ref->last_access[ref->last_access_index] != 0;
260 }
261 
262 
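/*!	Records the end offset of an access in the ring buffer of recent accesses.
	If the access does not start where the previous one ended, the previous
	entry is cleared, breaking the sequential pattern that
	access_is_sequential() checks for.
*/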
263 static inline void
264 push_access(file_cache_ref* ref, off_t offset, size_t bytes, bool isWrite)
265 {
266 	TRACE(("%p: push %Ld, %ld, %s\n", ref, offset, bytes,
267 		isWrite ? "write" : "read"));
268 
269 	int32 index = ref->last_access_index;
270 	int32 previous = index - 1;
271 	if (previous < 0)
272 		previous = LAST_ACCESSES - 1;
273 
274 	if (offset != ref->LastAccess(previous, isWrite))
275 		ref->last_access[previous] = 0;
276 
277 	ref->SetLastAccess(index, offset + bytes, isWrite);
278 
279 	if (++index >= LAST_ACCESSES)
280 		index = 0;
281 	ref->last_access_index = index;
282 }
283 
284 
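/*!	Reserves the given number of pages for an upcoming transfer. If the system
	is low on pages and the cache is unmapped and accessed sequentially, it
	first tries to relieve the pressure itself: for writes it schedules a
	write-back of the recently written pages, for reads it frees some of its
	own clean, unmodified pages.
*/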
285 static void
286 reserve_pages(file_cache_ref* ref, size_t reservePages, bool isWrite)
287 {
288 	if (low_resource_state(B_KERNEL_RESOURCE_PAGES) != B_NO_LOW_RESOURCE) {
289 		vm_cache* cache = ref->cache;
290 		cache->Lock();
291 
292 		if (list_is_empty(&cache->consumers) && cache->areas == NULL
293 			&& access_is_sequential(ref)) {
294 			// we are not mapped, and we're accessed sequentially
295 
296 			if (isWrite) {
297 				// just schedule some pages to be written back
298 				int32 index = ref->last_access_index;
299 				int32 previous = index - 1;
300 				if (previous < 0)
301 					previous = LAST_ACCESSES - 1;
302 
303 				vm_page_schedule_write_page_range(cache,
304 					ref->LastAccessPageOffset(previous, true),
305 					ref->LastAccessPageOffset(index, true));
306 			} else {
307 				// free some pages from our cache
308 				// TODO: start with oldest
309 				uint32 left = reservePages;
310 				vm_page* page;
311 				for (VMCachePagesTree::Iterator it = cache->pages.GetIterator();
312 						(page = it.Next()) != NULL && left > 0;) {
313 					if (page->state != PAGE_STATE_MODIFIED
314 						&& page->state != PAGE_STATE_BUSY) {
315 						cache->RemovePage(page);
316 						vm_page_set_state(page, PAGE_STATE_FREE);
317 						left--;
318 					}
319 				}
320 			}
321 		}
322 		cache->Unlock();
323 	}
324 
325 	vm_page_reserve_pages(reservePages);
326 }
327 
328 
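/*!	Wrapper around vfs_read_pages() that zeroes whatever part of the request
	was not filled by the file system (e.g. the range beyond the end of the
	file), so that no stale data ends up in the cache.
*/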
329 static inline status_t
330 read_pages_and_clear_partial(file_cache_ref* ref, void* cookie, off_t offset,
331 	const iovec* vecs, size_t count, uint32 flags, size_t* _numBytes)
332 {
333 	size_t bytesUntouched = *_numBytes;
334 
335 	status_t status = vfs_read_pages(ref->vnode, cookie, offset, vecs, count,
336 		flags, _numBytes);
337 
338 	size_t bytesEnd = *_numBytes;
339 
340 	if (offset + bytesEnd > ref->cache->virtual_end)
341 		bytesEnd = ref->cache->virtual_end - offset;
342 
343 	if (status == B_OK && bytesEnd < bytesUntouched) {
344 		// Clear out any leftovers that were not touched by the above read.
345 		// We're doing this here so that not every file system/device has to
346 		// implement this.
347 		bytesUntouched -= bytesEnd;
348 
349 		for (int32 i = count; i-- > 0 && bytesUntouched != 0; ) {
350 			size_t length = min_c(bytesUntouched, vecs[i].iov_len);
351 			vm_memset_physical((addr_t)vecs[i].iov_base + vecs[i].iov_len
352 				- length, 0, length);
353 
354 			bytesUntouched -= length;
355 		}
356 	}
357 
358 	return status;
359 }
360 
361 
362 /*!	Reads the requested amount of data into the cache, and allocates
363 	pages needed to fulfill that request. This function is called by cache_io().
364 	It can only handle a limited number of bytes per call (at most MAX_IO_VECS
365 	pages worth); the caller must make sure the request stays within that
366 	limit. The cache_ref lock must be held when calling this function; during
367 	operation it will unlock the cache, though.
368 */
369 static status_t
370 read_into_cache(file_cache_ref* ref, void* cookie, off_t offset,
371 	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
372 	size_t lastReservedPages, size_t reservePages)
373 {
374 	TRACE(("read_into_cache(offset = %Ld, pageOffset = %ld, buffer = %#lx, "
375 		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));
376 
377 	vm_cache* cache = ref->cache;
378 
379 	// TODO: We're using way too much stack! Rather allocate a sufficiently
380 	// large chunk on the heap.
381 	iovec vecs[MAX_IO_VECS];
382 	uint32 vecCount = 0;
383 
384 	size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
385 	vm_page* pages[MAX_IO_VECS];
386 	ConditionVariable busyConditions[MAX_IO_VECS];
387 	int32 pageIndex = 0;
388 
389 	// allocate pages for the cache and mark them busy
390 	for (size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
391 		vm_page* page = pages[pageIndex++] = vm_page_allocate_page(
392 			PAGE_STATE_FREE, true);
393 		if (page == NULL)
394 			panic("no more pages!");
395 
396 		busyConditions[pageIndex - 1].Publish(page, "page");
397 
398 		cache->InsertPage(page, offset + pos);
399 
400 		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
401 			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
402 			// TODO: check if the array is large enough (currently panics)!
403 	}
404 
405 	push_access(ref, offset, bufferSize, false);
406 	cache->Unlock();
407 	vm_page_unreserve_pages(lastReservedPages);
408 
409 	// read file into reserved pages
410 	status_t status = read_pages_and_clear_partial(ref, cookie, offset, vecs,
411 		vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
412 	if (status != B_OK) {
413 		// reading failed, free allocated pages
414 
415 		dprintf("file_cache: read pages failed: %s\n", strerror(status));
416 
417 		cache->Lock();
418 
419 		for (int32 i = 0; i < pageIndex; i++) {
420 			busyConditions[i].Unpublish();
421 			cache->RemovePage(pages[i]);
422 			vm_page_set_state(pages[i], PAGE_STATE_FREE);
423 		}
424 
425 		return status;
426 	}
427 
428 	// copy the page contents into the caller's buffer, if requested
429 
430 	for (int32 i = 0; i < pageIndex; i++) {
431 		if (useBuffer && bufferSize != 0) {
432 			size_t bytes = min_c(bufferSize, (size_t)B_PAGE_SIZE - pageOffset);
433 
434 			vm_memcpy_from_physical((void*)buffer,
435 				pages[i]->physical_page_number * B_PAGE_SIZE + pageOffset,
436 				bytes, true);
437 
438 			buffer += bytes;
439 			bufferSize -= bytes;
440 			pageOffset = 0;
441 		}
442 	}
443 
444 	reserve_pages(ref, reservePages, false);
445 	cache->Lock();
446 
447 	// make the pages accessible in the cache
448 	for (int32 i = pageIndex; i-- > 0;) {
449 		pages[i]->state = PAGE_STATE_ACTIVE;
450 
451 		busyConditions[i].Unpublish();
452 	}
453 
454 	return B_OK;
455 }
456 
457 
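/*!	Cache-bypassing counterpart of read_into_cache(): reads directly from the
	file into the caller's buffer without inserting any pages into the cache
	(if there is no buffer, nothing is read). Used for large requests in
	low-memory situations. The same locking rules as for read_into_cache()
	apply.
*/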
458 static status_t
459 read_from_file(file_cache_ref* ref, void* cookie, off_t offset,
460 	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
461 	size_t lastReservedPages, size_t reservePages)
462 {
463 	TRACE(("read_from_file(offset = %Ld, pageOffset = %ld, buffer = %#lx, "
464 		"bufferSize = %lu\n", offset, pageOffset, buffer, bufferSize));
465 
466 	if (!useBuffer)
467 		return B_OK;
468 
469 	iovec vec;
470 	vec.iov_base = (void*)buffer;
471 	vec.iov_len = bufferSize;
472 
473 	push_access(ref, offset, bufferSize, false);
474 	ref->cache->Unlock();
475 	vm_page_unreserve_pages(lastReservedPages);
476 
477 	status_t status = vfs_read_pages(ref->vnode, cookie, offset + pageOffset,
478 		&vec, 1, 0, &bufferSize);
479 
480 	if (status == B_OK)
481 		reserve_pages(ref, reservePages, false);
482 
483 	ref->cache->Lock();
484 
485 	return status;
486 }
487 
488 
489 /*!	Like read_into_cache() but writes data into the cache.
490 	If only a partial page gets written, it might also read that page from the
491 	file first, to keep the data in the cache consistent.
492 	The same restrictions as for read_into_cache() apply.
493 */
494 static status_t
495 write_to_cache(file_cache_ref* ref, void* cookie, off_t offset,
496 	int32 pageOffset, addr_t buffer, size_t bufferSize, bool useBuffer,
497 	size_t lastReservedPages, size_t reservePages)
498 {
499 	// TODO: We're using way too much stack! Rather allocate a sufficiently
500 	// large chunk on the heap.
501 	iovec vecs[MAX_IO_VECS];
502 	uint32 vecCount = 0;
503 	size_t numBytes = PAGE_ALIGN(pageOffset + bufferSize);
504 	vm_page* pages[MAX_IO_VECS];
505 	int32 pageIndex = 0;
506 	status_t status = B_OK;
507 	ConditionVariable busyConditions[MAX_IO_VECS];
508 
509 	// ToDo: this should be settable somewhere
510 	bool writeThrough = false;
511 
512 	// allocate pages for the cache and mark them busy
513 	for (size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
514 		// TODO: if space is becoming tight, and this cache is already grown
515 		//	big - shouldn't we better steal the pages directly in that case?
516 		//	(a working set like approach for the file cache)
517 		// TODO: the pages we allocate here should have been reserved upfront
518 		//	in cache_io()
519 		vm_page* page = pages[pageIndex++] = vm_page_allocate_page(
520 			PAGE_STATE_FREE, true);
521 		busyConditions[pageIndex - 1].Publish(page, "page");
522 
523 		ref->cache->InsertPage(page, offset + pos);
524 
525 		add_to_iovec(vecs, vecCount, MAX_IO_VECS,
526 			page->physical_page_number * B_PAGE_SIZE, B_PAGE_SIZE);
527 	}
528 
529 	push_access(ref, offset, bufferSize, true);
530 	ref->cache->Unlock();
531 	vm_page_unreserve_pages(lastReservedPages);
532 
533 	// copy contents (and read in partially written pages first)
534 
535 	if (pageOffset != 0) {
536 		// This is only a partial write, so we have to read the rest of the page
537 		// from the file to have consistent data in the cache
538 		iovec readVec = { vecs[0].iov_base, B_PAGE_SIZE };
539 		size_t bytesRead = B_PAGE_SIZE;
540 
541 		status = vfs_read_pages(ref->vnode, cookie, offset, &readVec, 1,
542 			B_PHYSICAL_IO_REQUEST, &bytesRead);
543 		// ToDo: handle errors for real!
544 		if (status < B_OK)
545 			panic("1. vfs_read_pages() failed: %s!\n", strerror(status));
546 	}
547 
548 	addr_t lastPageOffset = (pageOffset + bufferSize) & (B_PAGE_SIZE - 1);
549 	if (lastPageOffset != 0) {
550 		// get the last page in the I/O vectors
551 		addr_t last = (addr_t)vecs[vecCount - 1].iov_base
552 			+ vecs[vecCount - 1].iov_len - B_PAGE_SIZE;
553 
554 		if (offset + pageOffset + bufferSize == ref->cache->virtual_end) {
555 			// the space in the page after this write action needs to be cleaned
556 			vm_memset_physical(last + lastPageOffset, 0,
557 				B_PAGE_SIZE - lastPageOffset);
558 		} else {
559 			// the end of this write does not happen on a page boundary, so we
560 			// need to fetch the last page before we can update it
561 			iovec readVec = { (void*)last, B_PAGE_SIZE };
562 			size_t bytesRead = B_PAGE_SIZE;
563 
564 			status = vfs_read_pages(ref->vnode, cookie,
565 				PAGE_ALIGN(offset + pageOffset + bufferSize) - B_PAGE_SIZE,
566 				&readVec, 1, B_PHYSICAL_IO_REQUEST, &bytesRead);
567 			// ToDo: handle errors for real!
568 			if (status < B_OK)
569 				panic("vfs_read_pages() failed: %s!\n", strerror(status));
570 
571 			if (bytesRead < B_PAGE_SIZE) {
572 				// the space beyond the file size needs to be cleaned
573 				vm_memset_physical(last + bytesRead, 0,
574 					B_PAGE_SIZE - bytesRead);
575 			}
576 		}
577 	}
578 
579 	for (uint32 i = 0; i < vecCount; i++) {
580 		addr_t base = (addr_t)vecs[i].iov_base;
581 		size_t bytes = min_c(bufferSize,
582 			size_t(vecs[i].iov_len - pageOffset));
583 
584 		if (useBuffer) {
585 			// copy data from user buffer
586 			vm_memcpy_to_physical(base + pageOffset, (void*)buffer, bytes,
587 				true);
588 		} else {
589 			// clear buffer instead
590 			vm_memset_physical(base + pageOffset, 0, bytes);
591 		}
592 
593 		bufferSize -= bytes;
594 		if (bufferSize == 0)
595 			break;
596 
597 		buffer += bytes;
598 		pageOffset = 0;
599 	}
600 
601 	if (writeThrough) {
602 		// write cached pages back to the file if we were asked to do that
603 		status_t status = vfs_write_pages(ref->vnode, cookie, offset, vecs,
604 			vecCount, B_PHYSICAL_IO_REQUEST, &numBytes);
605 		if (status < B_OK) {
606 			// ToDo: remove allocated pages, ...?
607 			panic("file_cache: remove allocated pages! write pages failed: %s\n",
608 				strerror(status));
609 		}
610 	}
611 
612 	if (status == B_OK)
613 		reserve_pages(ref, reservePages, true);
614 
615 	ref->cache->Lock();
616 
617 	// make the pages accessible in the cache
618 	for (int32 i = pageIndex; i-- > 0;) {
619 		busyConditions[i].Unpublish();
620 
621 		if (writeThrough)
622 			pages[i]->state = PAGE_STATE_ACTIVE;
623 		else
624 			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
625 	}
626 
627 	return status;
628 }
629 
630 
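/*!	Cache-bypassing counterpart of write_to_cache(): writes the buffer
	directly to the file. If no buffer is given, a small zero-filled chunk is
	written repeatedly instead. The same locking rules as for write_to_cache()
	apply.
*/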
631 static status_t
632 write_to_file(file_cache_ref* ref, void* cookie, off_t offset, int32 pageOffset,
633 	addr_t buffer, size_t bufferSize, bool useBuffer, size_t lastReservedPages,
634 	size_t reservePages)
635 {
636 	size_t chunkSize = 0;
637 	if (!useBuffer) {
638 		// we need to allocate a zero buffer
639 		// TODO: use smaller buffers if this fails
640 		chunkSize = min_c(bufferSize, B_PAGE_SIZE);
641 		buffer = (addr_t)malloc(chunkSize);
642 		if (buffer == 0)
643 			return B_NO_MEMORY;
644 
645 		memset((void*)buffer, 0, chunkSize);
646 	}
647 
648 	iovec vec;
649 	vec.iov_base = (void*)buffer;
650 	vec.iov_len = bufferSize;
651 
652 	push_access(ref, offset, bufferSize, true);
653 	ref->cache->Unlock();
654 	vm_page_unreserve_pages(lastReservedPages);
655 
656 	status_t status = B_OK;
657 
658 	if (!useBuffer) {
659 		while (bufferSize > 0) {
660 			if (bufferSize < chunkSize)
661 				chunkSize = bufferSize;
662 
663 			status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
664 				&vec, 1, 0, &chunkSize);
665 			if (status < B_OK)
666 				break;
667 
668 			bufferSize -= chunkSize;
669 			pageOffset += chunkSize;
670 		}
671 
672 		free((void*)buffer);
673 	} else {
674 		status = vfs_write_pages(ref->vnode, cookie, offset + pageOffset,
675 			&vec, 1, 0, &bufferSize);
676 	}
677 
678 	if (status == B_OK)
679 		reserve_pages(ref, reservePages, true);
680 
681 	ref->cache->Lock();
682 
683 	return status;
684 }
685 
686 
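/*!	Flushes the part of the request between "lastBuffer" and "buffer" by
	calling the given cache function, and updates the "last*" bookkeeping on
	success. cache_io() uses this whenever it is about to give up the cache
	lock or the accumulated chunk has reached its maximum size.
*/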
687 static inline status_t
688 satisfy_cache_io(file_cache_ref* ref, void* cookie, cache_func function,
689 	off_t offset, addr_t buffer, bool useBuffer, int32 &pageOffset,
690 	size_t bytesLeft, size_t &reservePages, off_t &lastOffset,
691 	addr_t &lastBuffer, int32 &lastPageOffset, size_t &lastLeft,
692 	size_t &lastReservedPages)
693 {
694 	if (lastBuffer == buffer)
695 		return B_OK;
696 
697 	size_t requestSize = buffer - lastBuffer;
698 	reservePages = min_c(MAX_IO_VECS, (lastLeft - requestSize
699 		+ lastPageOffset + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
700 
701 	status_t status = function(ref, cookie, lastOffset, lastPageOffset,
702 		lastBuffer, requestSize, useBuffer, lastReservedPages, reservePages);
703 	if (status == B_OK) {
704 		lastReservedPages = reservePages;
705 		lastBuffer = buffer;
706 		lastLeft = bytesLeft;
707 		lastOffset = offset;
708 		lastPageOffset = 0;
709 		pageOffset = 0;
710 	}
711 	return status;
712 }
713 
714 
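/*!	Central read/write path of the file cache. Walks the request page by page:
	pages already in the cache are copied directly, missing ranges are
	collected and handed to the given cache function (read_into_cache(),
	write_to_cache(), or their cache-bypassing *_file() variants) in chunks of
	at most MAX_IO_VECS pages.
*/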
715 static status_t
716 cache_io(void* _cacheRef, void* cookie, off_t offset, addr_t buffer,
717 	size_t* _size, bool doWrite)
718 {
719 	if (_cacheRef == NULL)
720 		panic("cache_io() called with NULL ref!\n");
721 
722 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
723 	vm_cache* cache = ref->cache;
724 	off_t fileSize = cache->virtual_end;
725 	bool useBuffer = buffer != 0;
726 
727 	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
728 		ref, offset, (void*)buffer, *_size, doWrite ? "write" : "read"));
729 
730 	// out of bounds access?
731 	if (offset >= fileSize || offset < 0) {
732 		*_size = 0;
733 		return B_OK;
734 	}
735 
736 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
737 	size_t size = *_size;
738 	offset -= pageOffset;
739 
740 	if (offset + pageOffset + size > fileSize) {
741 		// adapt size to be within the file's offsets
742 		size = fileSize - pageOffset - offset;
743 		*_size = size;
744 	}
745 	if (size == 0)
746 		return B_OK;
747 
748 	cache_func function;
749 	if (doWrite) {
750 		// in low memory situations, we bypass the cache beyond a
751 		// certain I/O size
752 		if (size >= BYPASS_IO_SIZE
753 			&& low_resource_state(B_KERNEL_RESOURCE_PAGES)
754 				!= B_NO_LOW_RESOURCE) {
755 			function = write_to_file;
756 		} else
757 			function = write_to_cache;
758 	} else {
759 		if (size >= BYPASS_IO_SIZE
760 			&& low_resource_state(B_KERNEL_RESOURCE_PAGES)
761 				!= B_NO_LOW_RESOURCE) {
762 			function = read_from_file;
763 		} else
764 			function = read_into_cache;
765 	}
766 
767 	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE;
768 	// the "last*" variables always point to the end of the last
769 	// satisfied part of the request
770 
771 	const uint32 kMaxChunkSize = MAX_IO_VECS * B_PAGE_SIZE;
772 	size_t bytesLeft = size, lastLeft = size;
773 	int32 lastPageOffset = pageOffset;
774 	addr_t lastBuffer = buffer;
775 	off_t lastOffset = offset;
776 	size_t lastReservedPages = min_c(MAX_IO_VECS, (pageOffset + bytesLeft
777 		+ B_PAGE_SIZE - 1) >> PAGE_SHIFT);
778 	size_t reservePages = 0;
779 
780 	reserve_pages(ref, lastReservedPages, doWrite);
781 	AutoLocker<VMCache> locker(cache);
782 
783 	while (bytesLeft > 0) {
784 		// check if this page is already in memory
785 		vm_page* page = cache->LookupPage(offset);
786 		if (page != NULL) {
787 			// The page may be busy - since we need to unlock the cache sometime
788 			// in the near future, we need to satisfy the request for the pages
789 			// we didn't get yet (to make sure no one else interferes in the
790 			// meantime).
791 			status_t status = satisfy_cache_io(ref, cookie, function, offset,
792 				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
793 				lastOffset, lastBuffer, lastPageOffset, lastLeft,
794 				lastReservedPages);
795 			if (status != B_OK)
796 				return status;
797 
798 			if (page->state == PAGE_STATE_BUSY) {
799 				ConditionVariableEntry entry;
800 				entry.Add(page);
801 				locker.Unlock();
802 				entry.Wait();
803 				locker.Lock();
804 				continue;
805 			}
806 		}
807 
808 		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);
809 
810 		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset "
811 			"= %lu\n", offset, page, bytesLeft, pageOffset));
812 
813 		if (page != NULL) {
814 			// Since we don't actually map pages as part of an area, we have
815 			// to manually maintain their usage_count
816 			page->usage_count = 2;
817 
818 			if (doWrite || useBuffer) {
819 				// Since the following copy to or from the user buffer might
820 				// cause a page fault, which in turn might cause pages to be
821 				// reserved, we need to unlock the cache temporarily to avoid a
822 				// potential deadlock. To make sure that our page doesn't go
823 				// away, we mark it busy for the time being.
824 				uint8 oldPageState = page->state;
825 				page->state = PAGE_STATE_BUSY;
826 				locker.Unlock();
827 
828 				// copy the contents of the page already in memory
829 				addr_t pageAddress = page->physical_page_number * B_PAGE_SIZE
830 					+ pageOffset;
831 				if (doWrite) {
832 					if (useBuffer) {
833 						vm_memcpy_to_physical(pageAddress, (void*)buffer,
834 							bytesInPage, true);
835 					} else {
836 						vm_memset_physical(pageAddress, 0, bytesInPage);
837 					}
838 				} else if (useBuffer) {
839 					vm_memcpy_from_physical((void*)buffer, pageAddress,
840 						bytesInPage, true);
841 				}
842 
843 				locker.Lock();
844 
845 				page->state = oldPageState;
846 				if (doWrite && page->state != PAGE_STATE_MODIFIED)
847 					vm_page_set_state(page, PAGE_STATE_MODIFIED);
848 			}
849 
850 			if (bytesLeft <= bytesInPage) {
851 				// we've handled the last page of the request, so we're done!
852 				locker.Unlock();
853 				vm_page_unreserve_pages(lastReservedPages);
854 				return B_OK;
855 			}
856 
857 			// prepare a potential gap request
858 			lastBuffer = buffer + bytesInPage;
859 			lastLeft = bytesLeft - bytesInPage;
860 			lastOffset = offset + B_PAGE_SIZE;
861 			lastPageOffset = 0;
862 		}
863 
864 		if (bytesLeft <= bytesInPage)
865 			break;
866 
867 		buffer += bytesInPage;
868 		bytesLeft -= bytesInPage;
869 		pageOffset = 0;
870 		offset += B_PAGE_SIZE;
871 
872 		if (buffer - lastBuffer + lastPageOffset >= kMaxChunkSize) {
873 			status_t status = satisfy_cache_io(ref, cookie, function, offset,
874 				buffer, useBuffer, pageOffset, bytesLeft, reservePages,
875 				lastOffset, lastBuffer, lastPageOffset, lastLeft,
876 				lastReservedPages);
877 			if (status != B_OK)
878 				return status;
879 		}
880 	}
881 
882 	// fill the last remaining bytes of the request (either write or read)
883 
884 	return function(ref, cookie, lastOffset, lastPageOffset, lastBuffer,
885 		lastLeft, useBuffer, lastReservedPages, 0);
886 }
887 
888 
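/*!	Handler for the CACHE_SYSCALLS generic syscall: lets userland request a
	cache clear (currently a no-op, see the TODO below) or load/unload a cache
	policy module such as the launch speedup module.
*/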
889 static status_t
890 file_cache_control(const char* subsystem, uint32 function, void* buffer,
891 	size_t bufferSize)
892 {
893 	switch (function) {
894 		case CACHE_CLEAR:
895 			// ToDo: clear the cache
896 			dprintf("cache_control: clear cache!\n");
897 			return B_OK;
898 
899 		case CACHE_SET_MODULE:
900 		{
901 			cache_module_info* module = sCacheModule;
902 
903 			// unset previous module
904 
905 			if (sCacheModule != NULL) {
906 				sCacheModule = NULL;
907 				snooze(100000);	// 0.1 secs
908 				put_module(module->info.name);
909 			}
910 
911 			// get new module, if any
912 
913 			if (buffer == NULL)
914 				return B_OK;
915 
916 			char name[B_FILE_NAME_LENGTH];
917 			if (!IS_USER_ADDRESS(buffer)
918 				|| user_strlcpy(name, (char*)buffer,
919 						B_FILE_NAME_LENGTH) < B_OK)
920 				return B_BAD_ADDRESS;
921 
922 			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
923 				return B_BAD_VALUE;
924 
925 			dprintf("cache_control: set module %s!\n", name);
926 
927 			status_t status = get_module(name, (module_info**)&module);
928 			if (status == B_OK)
929 				sCacheModule = module;
930 
931 			return status;
932 		}
933 	}
934 
935 	return B_BAD_HANDLER;
936 }
937 
938 
939 //	#pragma mark - private kernel API
940 
941 
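/*!	Asynchronously reads the given range of the vnode's data into the file
	cache, skipping pages that are already present. Does nothing if the system
	is short on free pages or if the cache already holds a large part of the
	file.
*/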
942 extern "C" void
943 cache_prefetch_vnode(struct vnode* vnode, off_t offset, size_t size)
944 {
945 	if (size == 0)
946 		return;
947 
948 	vm_cache* cache;
949 	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
950 		return;
951 
952 	file_cache_ref* ref = ((VMVnodeCache*)cache)->FileCacheRef();
953 	off_t fileSize = cache->virtual_end;
954 
955 	if (offset + size > fileSize)
956 		size = fileSize - offset;
957 	size_t reservePages = size / B_PAGE_SIZE;
958 
959 	// Don't do anything if we don't have the resources left, or the cache
960 	// already contains more than 2/3 of the file's pages
961 	if (offset >= fileSize || vm_page_num_unused_pages() < 2 * reservePages
962 		|| 3 * cache->page_count > 2 * fileSize / B_PAGE_SIZE) {
963 		cache->ReleaseRef();
964 		return;
965 	}
966 
967 	// align "offset" and "size" to B_PAGE_SIZE boundaries
968 	offset &= ~(B_PAGE_SIZE - 1);
969 	size = ROUNDUP(size, B_PAGE_SIZE);
970 
971 	size_t bytesToRead = 0;
972 	off_t lastOffset = offset;
973 
974 	vm_page_reserve_pages(reservePages);
975 
976 	cache->Lock();
977 
978 	while (true) {
979 		// check if this page is already in memory
980 		if (size > 0) {
981 			vm_page* page = cache->LookupPage(offset);
982 
983 			offset += B_PAGE_SIZE;
984 			size -= B_PAGE_SIZE;
985 
986 			if (page == NULL) {
987 				bytesToRead += B_PAGE_SIZE;
988 				continue;
989 			}
990 		}
991 		if (bytesToRead != 0) {
992 			// read the part before the current page (or the end of the request)
993 			PrecacheIO* io
994 				= new(std::nothrow) PrecacheIO(ref, lastOffset, bytesToRead);
995 			if (io == NULL || io->Prepare() != B_OK) {
996 				delete io;
997 				break;
998 			}
999 
1000 			// we must not have the cache locked during I/O
1001 			cache->Unlock();
1002 			io->ReadAsync();
1003 			cache->Lock();
1004 
1005 			bytesToRead = 0;
1006 		}
1007 
1008 		if (size == 0) {
1009 			// we have reached the end of the request
1010 			break;
1011 		}
1012 
1013 		lastOffset = offset;
1014 	}
1015 
1016 	cache->ReleaseRefAndUnlock();
1017 	vm_page_unreserve_pages(reservePages);
1018 }
1019 
1020 
1021 extern "C" void
1022 cache_prefetch(dev_t mountID, ino_t vnodeID, off_t offset, size_t size)
1023 {
1024 	// ToDo: schedule prefetch
1025 
1026 	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));
1027 
1028 	// get the vnode for the object, this also grabs a ref to it
1029 	struct vnode* vnode;
1030 	if (vfs_get_vnode(mountID, vnodeID, true, &vnode) != B_OK)
1031 		return;
1032 
1033 	cache_prefetch_vnode(vnode, offset, size);
1034 	vfs_put_vnode(vnode);
1035 }
1036 
1037 
1038 extern "C" void
1039 cache_node_opened(struct vnode* vnode, int32 fdType, vm_cache* cache,
1040 	dev_t mountID, ino_t parentID, ino_t vnodeID, const char* name)
1041 {
1042 	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
1043 		return;
1044 
1045 	off_t size = -1;
1046 	if (cache != NULL) {
1047 		file_cache_ref* ref = ((VMVnodeCache*)cache)->FileCacheRef();
1048 		if (ref != NULL)
1049 			size = cache->virtual_end;
1050 	}
1051 
1052 	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name,
1053 		size);
1054 }
1055 
1056 
1057 extern "C" void
1058 cache_node_closed(struct vnode* vnode, int32 fdType, vm_cache* cache,
1059 	dev_t mountID, ino_t vnodeID)
1060 {
1061 	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
1062 		return;
1063 
1064 	int32 accessType = 0;
1065 	if (cache != NULL) {
1066 		// ToDo: set accessType
1067 	}
1068 
1069 	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
1070 }
1071 
1072 
1073 extern "C" void
1074 cache_node_launched(size_t argCount, char*  const* args)
1075 {
1076 	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
1077 		return;
1078 
1079 	sCacheModule->node_launched(argCount, args);
1080 }
1081 
1082 
1083 extern "C" status_t
1084 file_cache_init_post_boot_device(void)
1085 {
1086 	// ToDo: get cache module out of driver settings
1087 
1088 	if (get_module("file_cache/launch_speedup/v1",
1089 			(module_info**)&sCacheModule) == B_OK) {
1090 		dprintf("** opened launch speedup: %Ld\n", system_time());
1091 	}
1092 	return B_OK;
1093 }
1094 
1095 
1096 extern "C" status_t
1097 file_cache_init(void)
1098 {
1099 	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
1100 	return B_OK;
1101 }
1102 
1103 
1104 //	#pragma mark - public FS API
1105 
1106 
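// How a file system typically wires up this API (a rough sketch only; the
// "volume", "inode", and "cookie" names below are made up for illustration
// and are not part of this interface):
//
//	void* cache = file_cache_create(volume->id, inode->id, inode->size);
//	...
//	size_t length = bufferSize;
//	status_t status = file_cache_read(cache, cookie, pos, buffer, &length);
//	...
//	file_cache_set_size(cache, newSize);	// when the file is resized
//	file_cache_sync(cache);					// flush modified pages
//	file_cache_delete(cache);				// when the vnode is destroyed
//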
1107 extern "C" void*
1108 file_cache_create(dev_t mountID, ino_t vnodeID, off_t size)
1109 {
1110 	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld)\n",
1111 		mountID, vnodeID, size));
1112 
1113 	file_cache_ref* ref = new file_cache_ref;
1114 	if (ref == NULL)
1115 		return NULL;
1116 
1117 	memset(ref->last_access, 0, sizeof(ref->last_access));
1118 	ref->last_access_index = 0;
1119 	ref->disabled_count = 0;
1120 
1121 	// TODO: delay vm_cache creation until data is
1122 	//	requested/written for the first time? Listing lots of
1123 	//	files in Tracker (and elsewhere) could be slowed down.
1124 	//	Since the file_cache_ref itself doesn't have a lock,
1125 	//	we would need to "rent" one during construction, possibly
1126 	//	the vnode lock, maybe a dedicated one.
1127 	//	As there shouldn't be too much contention, we could also
1128 	//	use atomic_test_and_set(), and free the resources again
1129 	//	when that fails...
1130 
1131 	// Get the vnode for the object
1132 	// (note, this does not grab a reference to the node)
1133 	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
1134 		goto err1;
1135 
1136 	// Gets (usually creates) the cache for the node
1137 	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
1138 		goto err1;
1139 
1140 	ref->cache->virtual_end = size;
1141 	((VMVnodeCache*)ref->cache)->SetFileCacheRef(ref);
1142 	return ref;
1143 
1144 err1:
1145 	delete ref;
1146 	return NULL;
1147 }
1148 
1149 
1150 extern "C" void
1151 file_cache_delete(void* _cacheRef)
1152 {
1153 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
1154 
1155 	if (ref == NULL)
1156 		return;
1157 
1158 	TRACE(("file_cache_delete(ref = %p)\n", ref));
1159 
1160 	ref->cache->ReleaseRef();
1161 	delete ref;
1162 }
1163 
1164 
1165 extern "C" void
1166 file_cache_enable(void* _cacheRef)
1167 {
1168 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
1169 
1170 	AutoLocker<VMCache> _(ref->cache);
1171 
1172 	if (ref->disabled_count == 0) {
1173 		panic("Unbalanced file_cache_enable()!");
1174 		return;
1175 	}
1176 
1177 	ref->disabled_count--;
1178 }
1179 
1180 
1181 extern "C" status_t
1182 file_cache_disable(void* _cacheRef)
1183 {
1184 	// TODO: This function only removes all pages from the cache and prevents
1185 	// the file cache functions from adding any new ones until re-enabled. The
1186 	// VM (on page fault) can still add pages, if the file is mmap()ed. We
1187 	// should mark the cache to prevent shared mappings of the file and fix
1188 	// the page fault code to deal correctly with private mappings (i.e. only
1189 	// insert pages in consumer caches).
1190 
1191 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
1192 
1193 	AutoLocker<VMCache> _(ref->cache);
1194 
1195 	// If already disabled, there's nothing to do for us.
1196 	if (ref->disabled_count > 0) {
1197 		ref->disabled_count++;
1198 		return B_OK;
1199 	}
1200 
1201 	// The file cache is not yet disabled. We need to evict all cached pages.
1202 	status_t error = ref->cache->FlushAndRemoveAllPages();
1203 	if (error != B_OK)
1204 		return error;
1205 
1206 	ref->disabled_count++;
1207 	return B_OK;
1208 }
1209 
1210 
1211 extern "C" bool
1212 file_cache_is_enabled(void* _cacheRef)
1213 {
1214 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
1215 	AutoLocker<VMCache> _(ref->cache);
1216 
1217 	return ref->disabled_count == 0;
1218 }
1219 
1220 
1221 extern "C" status_t
1222 file_cache_set_size(void* _cacheRef, off_t newSize)
1223 {
1224 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
1225 
1226 	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, newSize));
1227 
1228 	if (ref == NULL)
1229 		return B_OK;
1230 
1231 	VMCache* cache = ref->cache;
1232 	AutoLocker<VMCache> _(cache);
1233 
1234 	off_t oldSize = cache->virtual_end;
1235 	status_t status = cache->Resize(newSize);
1236 	if (status == B_OK && newSize < oldSize) {
1237 		// We may have a new partial page at the end of the cache that must be
1238 		// cleared.
1239 		uint32 partialBytes = newSize % B_PAGE_SIZE;
1240 		if (partialBytes != 0) {
1241 			vm_page* page = cache->LookupPage(newSize - partialBytes);
1242 			if (page != NULL) {
1243 				vm_memset_physical(page->physical_page_number * B_PAGE_SIZE
1244 					+ partialBytes, 0, B_PAGE_SIZE - partialBytes);
1245 			}
1246 		}
1247 	}
1248 
1249 	return status;
1250 }
1251 
1252 
1253 extern "C" status_t
1254 file_cache_sync(void* _cacheRef)
1255 {
1256 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
1257 	if (ref == NULL)
1258 		return B_BAD_VALUE;
1259 
1260 	return ref->cache->WriteModified();
1261 }
1262 
1263 
1264 extern "C" status_t
1265 file_cache_read(void* _cacheRef, void* cookie, off_t offset, void* buffer,
1266 	size_t* _size)
1267 {
1268 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
1269 
1270 	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
1271 		ref, offset, buffer, *_size));
1272 
1273 	if (ref->disabled_count > 0) {
1274 		// Caching is disabled -- read directly from the file.
1275 		iovec vec;
1276 		vec.iov_base = buffer;
1277 		vec.iov_len = *_size;
1278 		return vfs_read_pages(ref->vnode, cookie, offset, &vec, 1, 0, _size);
1279 	}
1280 
1281 	return cache_io(ref, cookie, offset, (addr_t)buffer, _size, false);
1282 }
1283 
1284 
1285 extern "C" status_t
1286 file_cache_write(void* _cacheRef, void* cookie, off_t offset,
1287 	const void* buffer, size_t* _size)
1288 {
1289 	file_cache_ref* ref = (file_cache_ref*)_cacheRef;
1290 
1291 	if (ref->disabled_count > 0) {
1292 		// Caching is disabled -- write directly to the file.
1293 
1294 		if (buffer != NULL) {
1295 			iovec vec;
1296 			vec.iov_base = (void*)buffer;
1297 			vec.iov_len = *_size;
1298 			return vfs_write_pages(ref->vnode, cookie, offset, &vec, 1, 0,
1299 				_size);
1300 		}
1301 
1302 		// NULL buffer -- use a dummy buffer to write zeroes
1303 		// TODO: This is not particularly efficient!
1304 		iovec vec;
1305 		vec.iov_base = (void*)kZeroBuffer;
1306 		vec.iov_len = sizeof(kZeroBuffer);
1307 		size_t size = *_size;
1308 		while (size > 0) {
1309 			size_t toWrite = min_c(size, vec.iov_len);
1310 			size_t written = toWrite;
1311 			status_t error = vfs_write_pages(ref->vnode, cookie, offset, &vec,
1312 				1, 0, &written);
1313 			if (error != B_OK)
1314 				return error;
1315 			if (written == 0)
1316 				break;
1317 
1318 			offset += written;
1319 			size -= written;
1320 		}
1321 
1322 		*_size -= size;
1323 		return B_OK;
1324 	}
1325 
1326 	status_t status = cache_io(ref, cookie, offset,
1327 		(addr_t)const_cast<void*>(buffer), _size, true);
1328 
1329 	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu)"
1330 		" = %ld\n", ref, offset, buffer, *_size, status));
1331 
1332 	return status;
1333 }
1334 
1335