xref: /haiku/src/system/kernel/cache/file_cache.cpp (revision d9cebac2b77547b7064f22497514eecd2d047160)
1 /*
2  * Copyright 2004-2007, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "vnode_store.h"
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 
13 #include <KernelExport.h>
14 #include <fs_cache.h>
15 
16 #include <condition_variable.h>
17 #include <file_cache.h>
18 #include <generic_syscall.h>
19 #include <util/AutoLock.h>
20 #include <util/kernel_cpp.h>
21 #include <vfs.h>
22 #include <vm.h>
23 #include <vm_page.h>
24 #include <vm_cache.h>
25 
26 
27 //#define TRACE_FILE_CACHE
28 #ifdef TRACE_FILE_CACHE
29 #	define TRACE(x) dprintf x
30 #else
31 #	define TRACE(x) ;
32 #endif
33 
34 // maximum number of iovecs per request
35 #define MAX_IO_VECS			32	// 128 kB
36 #define MAX_FILE_IO_VECS	32
37 #define MAX_TEMP_IO_VECS	8
38 
39 #define CACHED_FILE_EXTENTS	2
40 	// must be smaller than MAX_FILE_IO_VECS
41 	// ToDo: find out how many of these are typically used
42 
43 struct file_extent {
44 	off_t			offset;
45 	file_io_vec		disk;
46 };
47 
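// Caches the on-disk layout of a file as a list of file_extents. Small maps
// (up to CACHED_FILE_EXTENTS entries) are stored inline in "direct"; larger
// ones are kept in the allocated "array".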
48 struct file_map {
49 	file_map();
50 	~file_map();
51 
52 	file_extent *operator[](uint32 index);
53 	file_extent *ExtentAt(uint32 index);
54 	status_t Add(file_io_vec *vecs, size_t vecCount, off_t &lastOffset);
55 	void Free();
56 
57 	union {
58 		file_extent	direct[CACHED_FILE_EXTENTS];
59 		file_extent	*array;
60 	};
61 	size_t			count;
62 };
63 
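// Private state of a file cache as created by file_cache_create(): the VM
// cache holding the file's pages, the node's vnode, the underlying device
// vnode plus its cookie, and the cached file map.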
64 struct file_cache_ref {
65 	vm_cache		*cache;
66 	struct vnode	*vnode;
67 	struct vnode	*device;
68 	void			*cookie;
69 	file_map		map;
70 };
71 
72 
73 static struct cache_module_info *sCacheModule;
74 
75 
76 file_map::file_map()
77 {
78 	array = NULL;
79 	count = 0;
80 }
81 
82 
83 file_map::~file_map()
84 {
85 	Free();
86 }
87 
88 
89 file_extent *
90 file_map::operator[](uint32 index)
91 {
92 	return ExtentAt(index);
93 }
94 
95 
96 file_extent *
97 file_map::ExtentAt(uint32 index)
98 {
99 	if (index >= count)
100 		return NULL;
101 
102 	if (count > CACHED_FILE_EXTENTS)
103 		return &array[index];
104 
105 	return &direct[index];
106 }
107 
108 
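/*!	Appends the given file_io_vecs to the map, assigning each one a continuous
	file offset, and returns the file offset right after the last added extent
	in \a lastOffset.
*/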
109 status_t
110 file_map::Add(file_io_vec *vecs, size_t vecCount, off_t &lastOffset)
111 {
112 	TRACE(("file_map::Add(vecCount = %ld)\n", vecCount));
113 
114 	off_t offset = 0;
115 
116 	if (vecCount <= CACHED_FILE_EXTENTS && count == 0) {
117 		// just use the reserved area in the file_cache_ref structure
118 	} else {
119 		// TODO: once we can invalidate only parts of the file map,
120 		//	we might need to copy the previously cached file extents
121 		//	from the direct range
122 		file_extent *newMap = (file_extent *)realloc(array,
123 			(count + vecCount) * sizeof(file_extent));
124 		if (newMap == NULL)
125 			return B_NO_MEMORY;
126 
127 		array = newMap;
128 
129 		if (count != 0) {
130 			file_extent *extent = ExtentAt(count - 1);
131 			offset = extent->offset + extent->disk.length;
132 		}
133 	}
134 
135 	int32 start = count;
136 	count += vecCount;
137 
138 	for (uint32 i = 0; i < vecCount; i++) {
139 		file_extent *extent = ExtentAt(start + i);
140 
141 		extent->offset = offset;
142 		extent->disk = vecs[i];
143 
144 		offset += extent->disk.length;
145 	}
146 
147 #ifdef TRACE_FILE_CACHE
148 	for (uint32 i = 0; i < count; i++) {
149 		file_extent *extent = ExtentAt(i);
150 		dprintf("[%ld] extent offset %Ld, disk offset %Ld, length %Ld\n",
151 			i, extent->offset, extent->disk.offset, extent->disk.length);
152 	}
153 #endif
154 
155 	lastOffset = offset;
156 	return B_OK;
157 }
158 
159 
160 void
161 file_map::Free()
162 {
163 	if (count > CACHED_FILE_EXTENTS)
164 		free(array);
165 
166 	array = NULL;
167 	count = 0;
168 }
169 
170 
171 //	#pragma mark -
172 
173 
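/*!	Adds the specified memory range to the iovec array, merging it with the
	previous vector if the range directly follows it; panics if more than
	\a max vectors would be needed.
*/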
174 static void
175 add_to_iovec(iovec *vecs, int32 &index, int32 max, addr_t address, size_t size)
176 {
177 	if (index > 0 && (addr_t)vecs[index - 1].iov_base
178 			+ vecs[index - 1].iov_len == address) {
179 		// the iovec can be combined with the previous one
180 		vecs[index - 1].iov_len += size;
181 		return;
182 	}
183 
184 	if (index == max)
185 		panic("no more space for iovecs!");
186 
187 	// we need to start a new iovec
188 	vecs[index].iov_base = (void *)address;
189 	vecs[index].iov_len = size;
190 	index++;
191 }
192 
193 
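/*!	Returns the cached file_extent that contains the given file \a offset,
	and optionally stores its index in \a _index; returns NULL if the offset
	is not covered by the map.
*/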
194 static file_extent *
195 find_file_extent(file_cache_ref *ref, off_t offset, uint32 *_index)
196 {
197 	// TODO: do binary search
198 
199 	for (uint32 index = 0; index < ref->map.count; index++) {
200 		file_extent *extent = ref->map[index];
201 
202 		if (extent->offset <= offset
203 			&& extent->offset + extent->disk.length > offset) {
204 			if (_index)
205 				*_index = index;
206 			return extent;
207 		}
208 	}
209 
210 	return NULL;
211 }
212 
213 
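/*!	Retrieves (and caches, if not yet done) the file map from the file system,
	and translates the byte range starting at \a offset into disk based
	file_io_vecs. On return, \a _count contains the number of vectors filled
	in; B_BUFFER_OVERFLOW indicates that more vectors would have been needed
	to cover the whole range.
*/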
214 static status_t
215 get_file_map(file_cache_ref *ref, off_t offset, size_t size,
216 	file_io_vec *vecs, size_t *_count)
217 {
218 	size_t maxVecs = *_count;
219 	status_t status = B_OK;
220 
221 	if (ref->map.count == 0) {
222 		// we don't yet have the map of this file, so let's grab it
223 		// (ordered by offset, so that we can do a binary search on them)
224 
225 		MutexLocker _(ref->cache->lock);
226 
227 		// the file map could have been requested in the mean time
228 		if (ref->map.count == 0) {
229 			size_t vecCount = maxVecs;
230 			off_t mapOffset = 0;
231 
232 			while (true) {
233 				status = vfs_get_file_map(ref->vnode, mapOffset, ~0UL, vecs,
234 					&vecCount);
235 				if (status < B_OK && status != B_BUFFER_OVERFLOW)
236 					return status;
237 
238 				status_t addStatus = ref->map.Add(vecs, vecCount, mapOffset);
239 				if (addStatus != B_OK) {
240 					// only clobber the status in case of failure
241 					status = addStatus;
242 				}
243 
244 				if (status != B_BUFFER_OVERFLOW)
245 					break;
246 
247 				// when we are here, the map has been stored in the array, and
248 				// the array size was still too small to cover the whole file
249 				vecCount = maxVecs;
250 			}
251 		}
252 	}
253 
254 	if (status != B_OK) {
255 		// We must invalidate the (part of the) map we already
256 		// have, as we cannot know if it's complete or not
257 		ref->map.Free();
258 		return status;
259 	}
260 
261 	// We now have the map of this file cached; we now need to
262 	// translate it for the requested access.
263 
264 	uint32 index;
265 	file_extent *fileExtent = find_file_extent(ref, offset, &index);
266 	if (fileExtent == NULL) {
267 		// access outside file bounds? But that's not our problem
268 		*_count = 0;
269 		return B_OK;
270 	}
271 
272 	offset -= fileExtent->offset;
273 	vecs[0].offset = fileExtent->disk.offset + offset;
274 	vecs[0].length = fileExtent->disk.length - offset;
275 
276 	if (vecs[0].length >= size || index >= ref->map.count - 1) {
277 		*_count = 1;
278 		return B_OK;
279 	}
280 
281 	// copy the rest of the vecs
282 
283 	size -= vecs[0].length;
284 
285 	for (index = 1; index < ref->map.count;) {
286 		fileExtent++;
287 
288 		vecs[index] = fileExtent->disk;
289 		index++;
290 
291 		if (size <= fileExtent->disk.length)
292 			break;
293 
294 		if (index >= maxVecs) {
295 			*_count = index;
296 			return B_BUFFER_OVERFLOW;
297 		}
298 
299 		size -= fileExtent->disk.length;
300 	}
301 
302 	*_count = index;
303 	return B_OK;
304 }
305 
306 
307 /*!
308 	Does the dirty work of translating the request into actual disk offsets
309 	and reads into or writes from the supplied iovecs as specified by \a doWrite.
310 */
311 static status_t
312 pages_io(file_cache_ref *ref, off_t offset, const iovec *vecs, size_t count,
313 	size_t *_numBytes, bool doWrite)
314 {
315 	TRACE(("pages_io: ref = %p, offset = %Ld, size = %lu, vecCount = %lu, %s\n",
316 		ref, offset, *_numBytes, count, doWrite ? "write" : "read"));
317 
318 	// translate the iovecs into direct device accesses
319 	file_io_vec fileVecs[MAX_FILE_IO_VECS];
320 	size_t fileVecCount = MAX_FILE_IO_VECS;
321 	size_t numBytes = *_numBytes;
322 
323 	status_t status = get_file_map(ref, offset, numBytes, fileVecs,
324 		&fileVecCount);
325 	if (status < B_OK && status != B_BUFFER_OVERFLOW) {
326 		TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed: %s\n",
327 			offset, numBytes, strerror(status)));
328 		return status;
329 	}
330 
331 	bool bufferOverflow = status == B_BUFFER_OVERFLOW;
332 
333 #ifdef TRACE_FILE_CACHE
334 	dprintf("got %lu file vecs for %Ld:%lu%s:\n", fileVecCount, offset,
335 		numBytes, bufferOverflow ? " (array too small)" : "");
336 	for (size_t i = 0; i < fileVecCount; i++) {
337 		dprintf("  [%lu] offset = %Ld, size = %Ld\n",
338 			i, fileVecs[i].offset, fileVecs[i].length);
339 	}
340 #endif
341 
342 	if (fileVecCount == 0) {
343 		// There are no file vecs at this offset, so we're obviously trying
344 		// to access the file outside of its bounds
345 		TRACE(("pages_io: access outside of vnode %p at offset %Ld\n",
346 			ref->vnode, offset));
347 		return B_BAD_VALUE;
348 	}
349 
350 	uint32 fileVecIndex;
351 	size_t size;
352 
353 	if (!doWrite) {
354 		// now directly read the data from the device
355 		// the first file_io_vec can be read directly
356 
357 		size = fileVecs[0].length;
358 		if (size > numBytes)
359 			size = numBytes;
360 
361 		status = vfs_read_pages(ref->device, ref->cookie, fileVecs[0].offset,
362 			vecs, count, &size, true, false);
363 		if (status < B_OK)
364 			return status;
365 
366 		// TODO: this is a work-around for buggy device drivers!
367 		//	When our own drivers honour the length, we can:
368 		//	a) also use this direct I/O for writes (otherwise, it would
369 		//	   overwrite precious data)
370 		//	b) panic if the condition below is true (at least for writes)
371 		if (size > fileVecs[0].length) {
372 			//dprintf("warning: device driver %p doesn't respect total length in read_pages() call!\n", ref->device);
373 			size = fileVecs[0].length;
374 		}
375 
376 		ASSERT(size <= fileVecs[0].length);
377 
378 		// If the file portion was contiguous, we're already done now
379 		if (size == numBytes)
380 			return B_OK;
381 
382 		// if we reached the end of the file, we can return as well
383 		if (size != fileVecs[0].length) {
384 			*_numBytes = size;
385 			return B_OK;
386 		}
387 
388 		fileVecIndex = 1;
389 	} else {
390 		fileVecIndex = 0;
391 		size = 0;
392 	}
393 
394 	// Too bad, let's process the rest of the file_io_vecs
395 
396 	size_t totalSize = size;
397 
398 	// first, find out where we have to continue in our iovecs
399 	uint32 i = 0;
400 	for (; i < count; i++) {
401 		if (size < vecs[i].iov_len)
402 			break;
403 
404 		size -= vecs[i].iov_len;
405 	}
406 
407 	size_t vecOffset = size;
408 	size_t bytesLeft = numBytes - size;
409 
410 	while (true) {
411 		for (; fileVecIndex < fileVecCount; fileVecIndex++) {
412 			file_io_vec &fileVec = fileVecs[fileVecIndex];
413 			off_t fileOffset = fileVec.offset;
414 			off_t fileLeft = min_c(fileVec.length, bytesLeft);
415 
416 			TRACE(("FILE VEC [%lu] length %Ld\n", fileVecIndex, fileLeft));
417 
418 			// process the complete fileVec
419 			while (fileLeft > 0) {
420 				iovec tempVecs[MAX_TEMP_IO_VECS];
421 				uint32 tempCount = 0;
422 
423 				// size tracks how much of what is left of the current fileVec
424 				// (fileLeft) has been assigned to tempVecs
425 				size = 0;
426 
427 				// assign what is left of the current fileVec to the tempVecs
428 				for (size = 0; size < fileLeft && i < count
429 						&& tempCount < MAX_TEMP_IO_VECS;) {
430 					// try to satisfy one iovec per iteration (or as much as
431 					// possible)
432 
433 					// bytes left of the current iovec
434 					size_t vecLeft = vecs[i].iov_len - vecOffset;
435 					if (vecLeft == 0) {
436 						vecOffset = 0;
437 						i++;
438 						continue;
439 					}
440 
441 					TRACE(("fill vec %ld, offset = %lu, size = %lu\n",
442 						i, vecOffset, size));
443 
444 					// actually available bytes
445 					size_t tempVecSize = min_c(vecLeft, fileLeft - size);
446 
447 					tempVecs[tempCount].iov_base
448 						= (void *)((addr_t)vecs[i].iov_base + vecOffset);
449 					tempVecs[tempCount].iov_len = tempVecSize;
450 					tempCount++;
451 
452 					size += tempVecSize;
453 					vecOffset += tempVecSize;
454 				}
455 
456 				size_t bytes = size;
457 				if (doWrite) {
458 					status = vfs_write_pages(ref->device, ref->cookie,
459 						fileOffset, tempVecs, tempCount, &bytes, true, false);
460 				} else {
461 					status = vfs_read_pages(ref->device, ref->cookie,
462 						fileOffset, tempVecs, tempCount, &bytes, true, false);
463 				}
464 				if (status < B_OK)
465 					return status;
466 
467 				totalSize += bytes;
468 				bytesLeft -= size;
469 				fileOffset += size;
470 				fileLeft -= size;
471 				//dprintf("-> file left = %Lu\n", fileLeft);
472 
473 				if (size != bytes || i >= count) {
474 					// there are no more bytes or iovecs, let's bail out
475 					*_numBytes = totalSize;
476 					return B_OK;
477 				}
478 			}
479 		}
480 
481 		if (bufferOverflow) {
482 			status = get_file_map(ref, offset + totalSize, bytesLeft, fileVecs,
483 				&fileVecCount);
484 			if (status < B_OK && status != B_BUFFER_OVERFLOW) {
485 				TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed: %s\n",
486 					offset, numBytes, strerror(status)));
487 				return status;
488 			}
489 
490 			bufferOverflow = status == B_BUFFER_OVERFLOW;
491 			fileVecIndex = 0;
492 
493 #ifdef TRACE_FILE_CACHE
494 			dprintf("got %lu file vecs for %Ld:%lu%s:\n", fileVecCount,
495 				offset + totalSize, numBytes,
496 				bufferOverflow ? " (array too small)" : "");
497 			for (size_t i = 0; i < fileVecCount; i++) {
498 				dprintf("  [%lu] offset = %Ld, size = %Ld\n",
499 					i, fileVecs[i].offset, fileVecs[i].length);
500 			}
501 #endif
502 		} else
503 			break;
504 	}
505 
506 	*_numBytes = totalSize;
507 	return B_OK;
508 }
509 
510 
511 /*!	Reads the requested amount of data into the cache, and allocates
512 	pages needed to fulfill that request. This function is called by cache_io().
513 	It can only handle a limited number of bytes (at most MAX_IO_VECS pages),
514 	and the caller must make sure the request stays within that limit.
515 	The cache_ref lock must be held when calling this function; during
516 	operation it will temporarily unlock the cache, though.
517 */
518 static status_t
519 read_into_cache(file_cache_ref *ref, off_t offset, size_t numBytes,
520 	int32 pageOffset, addr_t buffer, size_t bufferSize,
521 	size_t lastReservedPages, size_t reservePages)
522 {
523 	TRACE(("read_into_cache(offset = %Ld, size = %lu, pageOffset = %ld, buffer "
524 		"= %#lx, bufferSize = %lu\n", offset, numBytes, pageOffset, buffer,
525 		bufferSize));
526 
527 	vm_cache *cache = ref->cache;
528 
529 	// TODO: We're using way too much stack! Rather allocate a sufficiently
530 	// large chunk on the heap.
531 	iovec vecs[MAX_IO_VECS];
532 	int32 vecCount = 0;
533 
534 	vm_page *pages[MAX_IO_VECS];
535 	ConditionVariable<vm_page> busyConditions[MAX_IO_VECS];
536 	int32 pageIndex = 0;
537 
538 	// allocate pages for the cache and mark them busy
539 	for (size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
540 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(
541 			PAGE_STATE_FREE, true);
542 		if (page == NULL)
543 			panic("no more pages!");
544 
545 		busyConditions[pageIndex - 1].Publish(page, "page");
546 
547 		vm_cache_insert_page(cache, page, offset + pos);
548 
549 		addr_t virtualAddress;
550 		if (vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
551 				&virtualAddress, PHYSICAL_PAGE_CAN_WAIT) < B_OK)
552 			panic("could not get physical page");
553 
554 		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
555 			// TODO: check if the array is large enough (currently panics)!
556 	}
557 
558 	mutex_unlock(&cache->lock);
559 	vm_page_unreserve_pages(lastReservedPages);
560 
561 	// read file into reserved pages
562 	status_t status = pages_io(ref, offset, vecs, vecCount, &numBytes, false);
563 	if (status < B_OK) {
564 		// reading failed, free allocated pages
565 
566 		dprintf("file_cache: read pages failed: %s\n", strerror(status));
567 
568 		for (int32 i = 0; i < vecCount; i++) {
569 			addr_t base = (addr_t)vecs[i].iov_base;
570 			size_t size = vecs[i].iov_len;
571 
572 			for (size_t pos = 0; pos < size;
573 					pos += B_PAGE_SIZE, base += B_PAGE_SIZE) {
574 				vm_put_physical_page(base);
575 			}
576 		}
577 
578 		mutex_lock(&cache->lock);
579 
580 		for (int32 i = 0; i < pageIndex; i++) {
581 			busyConditions[i].Unpublish();
582 			vm_cache_remove_page(cache, pages[i]);
583 			vm_page_set_state(pages[i], PAGE_STATE_FREE);
584 		}
585 
586 		return status;
587 	}
588 
589 	// copy the pages and unmap them again
590 
591 	for (int32 i = 0; i < vecCount; i++) {
592 		addr_t base = (addr_t)vecs[i].iov_base;
593 		size_t size = vecs[i].iov_len;
594 
595 		// copy to user buffer if necessary
596 		if (bufferSize != 0) {
597 			size_t bytes = min_c(bufferSize, size - pageOffset);
598 
599 			user_memcpy((void *)buffer, (void *)(base + pageOffset), bytes);
600 			buffer += bytes;
601 			bufferSize -= bytes;
602 			pageOffset = 0;
603 		}
604 
605 		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE,
606 				base += B_PAGE_SIZE) {
607 			vm_put_physical_page(base);
608 		}
609 	}
610 
611 	vm_page_reserve_pages(reservePages);
612 	mutex_lock(&cache->lock);
613 
614 	// make the pages accessible in the cache
615 	for (int32 i = pageIndex; i-- > 0;) {
616 		pages[i]->state = PAGE_STATE_ACTIVE;
617 
618 		busyConditions[i].Unpublish();
619 	}
620 
621 	return B_OK;
622 }
623 
624 
625 /*!	Like read_into_cache() but writes data into the cache.
626 	To preserve data consistency, it might also have to read pages into the
627 	cache if only a partial page gets written.
628 	The same restrictions apply.
629 */
630 static status_t
631 write_to_cache(file_cache_ref *ref, off_t offset, size_t numBytes,
632 	int32 pageOffset, addr_t buffer, size_t bufferSize,
633 	size_t lastReservedPages, size_t reservePages)
634 {
635 	// TODO: We're using way too much stack! Rather allocate a sufficiently
636 	// large chunk on the heap.
637 	iovec vecs[MAX_IO_VECS];
638 	int32 vecCount = 0;
639 	vm_page *pages[MAX_IO_VECS];
640 	int32 pageIndex = 0;
641 	status_t status = B_OK;
642 	ConditionVariable<vm_page> busyConditions[MAX_IO_VECS];
643 
644 	// ToDo: this should be settable somewhere
645 	bool writeThrough = false;
646 
647 	// allocate pages for the cache and mark them busy
648 	for (size_t pos = 0; pos < numBytes; pos += B_PAGE_SIZE) {
649 		// TODO: if space is becoming tight, and this cache is already grown
650 		//	big - shouldn't we better steal the pages directly in that case?
651 		//	(a working set like approach for the file cache)
652 		// TODO: the pages we allocate here should have been reserved upfront
653 		//	in cache_io()
654 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(
655 			PAGE_STATE_FREE, true);
656 		busyConditions[pageIndex - 1].Publish(page, "page");
657 
658 		vm_cache_insert_page(ref->cache, page, offset + pos);
659 
660 		addr_t virtualAddress;
661 		vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
662 			&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
663 
664 		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
665 		// ToDo: check if the array is large enough!
666 	}
667 
668 	mutex_unlock(&ref->cache->lock);
669 	vm_page_unreserve_pages(lastReservedPages);
670 
671 	// copy contents (and read in partially written pages first)
672 
673 	if (pageOffset != 0) {
674 		// This is only a partial write, so we have to read the rest of the page
675 		// from the file to have consistent data in the cache
676 		iovec readVec = { vecs[0].iov_base, B_PAGE_SIZE };
677 		size_t bytesRead = B_PAGE_SIZE;
678 
679 		status = pages_io(ref, offset, &readVec, 1, &bytesRead, false);
680 		// ToDo: handle errors for real!
681 		if (status < B_OK)
682 			panic("1. pages_io() failed: %s!\n", strerror(status));
683 	}
684 
685 	addr_t lastPageOffset = (pageOffset + bufferSize) & (B_PAGE_SIZE - 1);
686 	if (lastPageOffset != 0) {
687 		// get the last page in the I/O vectors
688 		addr_t last = (addr_t)vecs[vecCount - 1].iov_base
689 			+ vecs[vecCount - 1].iov_len - B_PAGE_SIZE;
690 
691 		if (offset + pageOffset + bufferSize == ref->cache->virtual_size) {
692 			// the space in the page after this write action needs to be cleaned
693 			memset((void *)(last + lastPageOffset), 0,
694 				B_PAGE_SIZE - lastPageOffset);
695 		} else if (vecCount > 1) {
696 			// the end of this write does not happen on a page boundary, so we
697 			// need to fetch the last page before we can update it
698 			iovec readVec = { (void *)last, B_PAGE_SIZE };
699 			size_t bytesRead = B_PAGE_SIZE;
700 
701 			status = pages_io(ref, offset + numBytes - B_PAGE_SIZE, &readVec, 1,
702 				&bytesRead, false);
703 			// ToDo: handle errors for real!
704 			if (status < B_OK)
705 				panic("pages_io() failed: %s!\n", strerror(status));
706 		}
707 	}
708 
709 	for (int32 i = 0; i < vecCount; i++) {
710 		addr_t base = (addr_t)vecs[i].iov_base;
711 		size_t bytes = min_c(bufferSize, size_t(vecs[i].iov_len - pageOffset));
712 
713 		// copy data from user buffer
714 		user_memcpy((void *)(base + pageOffset), (void *)buffer, bytes);
715 
716 		bufferSize -= bytes;
717 		if (bufferSize == 0)
718 			break;
719 
720 		buffer += bytes;
721 		pageOffset = 0;
722 	}
723 
724 	if (writeThrough) {
725 		// write cached pages back to the file if we were asked to do that
726 		status_t status = pages_io(ref, offset, vecs, vecCount, &numBytes,
727 			true);
728 		if (status < B_OK) {
729 			// ToDo: remove allocated pages, ...?
730 			panic("file_cache: remove allocated pages! write pages failed: %s\n",
731 				strerror(status));
732 		}
733 	}
734 
735 	if (status == B_OK)
736 		vm_page_reserve_pages(reservePages);
737 
738 	mutex_lock(&ref->cache->lock);
739 
740 	// unmap the pages again
741 
742 	for (int32 i = 0; i < vecCount; i++) {
743 		addr_t base = (addr_t)vecs[i].iov_base;
744 		size_t size = vecs[i].iov_len;
745 		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE,
746 				base += B_PAGE_SIZE) {
747 			vm_put_physical_page(base);
748 		}
749 	}
750 
751 	// make the pages accessible in the cache
752 	for (int32 i = pageIndex; i-- > 0;) {
753 		busyConditions[i].Unpublish();
754 
755 		if (writeThrough)
756 			pages[i]->state = PAGE_STATE_ACTIVE;
757 		else
758 			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
759 	}
760 
761 	return status;
762 }
763 
764 
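/*!	Flushes the pending part of a cache_io() request, i.e. reads or writes
	the gap between \a lastBuffer and \a buffer via read_into_cache() or
	write_to_cache(), and updates the "last*" bookkeeping on success.
*/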
765 static status_t
766 satisfy_cache_io(file_cache_ref *ref, off_t offset, addr_t buffer,
767 	int32 &pageOffset, size_t bytesLeft, size_t &reservePages,
768 	off_t &lastOffset, addr_t &lastBuffer, int32 &lastPageOffset,
769 	size_t &lastLeft, size_t &lastReservedPages, bool doWrite)
770 {
771 	if (lastBuffer == buffer)
772 		return B_OK;
773 
774 	size_t requestSize = buffer - lastBuffer;
775 	reservePages = min_c(MAX_IO_VECS,
776 		(lastLeft - requestSize + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
777 
778 	status_t status;
779 	if (doWrite) {
780 		status = write_to_cache(ref, lastOffset, requestSize, lastPageOffset,
781 			lastBuffer, requestSize, lastReservedPages, reservePages);
782 	} else {
783 		status = read_into_cache(ref, lastOffset, requestSize, lastPageOffset,
784 			lastBuffer, requestSize, lastReservedPages, reservePages);
785 	}
786 	if (status == B_OK) {
787 		lastReservedPages = reservePages;
788 		lastBuffer = buffer;
789 		lastLeft = bytesLeft;
790 		lastOffset = offset;
791 		lastPageOffset = 0;
792 		pageOffset = 0;
793 	}
794 	return status;
795 }
796 
797 
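/*!	Performs the actual cached read or write: pages already in the cache are
	copied directly to/from the buffer, while gaps are collected and handed to
	read_into_cache()/write_to_cache() in chunks of at most
	MAX_IO_VECS * B_PAGE_SIZE bytes (kMaxChunkSize).
*/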
798 static status_t
799 cache_io(void *_cacheRef, off_t offset, addr_t buffer, size_t *_size,
800 	bool doWrite)
801 {
802 	if (_cacheRef == NULL)
803 		panic("cache_io() called with NULL ref!\n");
804 
805 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
806 	vm_cache *cache = ref->cache;
807 	off_t fileSize = cache->virtual_size;
808 
809 	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
810 		ref, offset, (void *)buffer, *_size, doWrite ? "write" : "read"));
811 
812 	// out of bounds access?
813 	if (offset >= fileSize || offset < 0) {
814 		*_size = 0;
815 		return B_OK;
816 	}
817 
818 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
819 	size_t size = *_size;
820 	offset -= pageOffset;
821 
822 	if (offset + pageOffset + size > fileSize) {
823 		// adapt size to be within the file's offsets
824 		size = fileSize - pageOffset - offset;
825 		*_size = size;
826 	}
827 	if (size == 0)
828 		return B_OK;
829 
830 	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE;
831 	// the "last*" variables always point to the end of the last
832 	// satisfied request part
833 
834 	const uint32 kMaxChunkSize = MAX_IO_VECS * B_PAGE_SIZE;
835 	size_t bytesLeft = size, lastLeft = size;
836 	int32 lastPageOffset = pageOffset;
837 	addr_t lastBuffer = buffer;
838 	off_t lastOffset = offset;
839 	size_t lastReservedPages = min_c(MAX_IO_VECS,
840 		(bytesLeft + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
841 	size_t reservePages = 0;
842 
843 	vm_page_reserve_pages(lastReservedPages);
844 	MutexLocker locker(cache->lock);
845 
846 	while (bytesLeft > 0) {
847 		// check if this page is already in memory
848 		vm_page *page = vm_cache_lookup_page(cache, offset);
849 		if (page != NULL) {
850 			// The page may be busy - since we need to unlock the cache sometime
851 			// in the near future, we need to satisfy the request for the pages
852 			// we didn't get yet (to make sure no one else interferes in the
853 			// mean time).
854 			status_t status = satisfy_cache_io(ref, offset, buffer, pageOffset,
855 				bytesLeft, reservePages, lastOffset, lastBuffer, lastPageOffset,
856 				lastLeft, lastReservedPages, doWrite);
857 			if (status != B_OK)
858 				return status;
859 
860 			if (page->state == PAGE_STATE_BUSY) {
861 				ConditionVariableEntry<vm_page> entry;
862 				entry.Add(page);
863 				locker.Unlock();
864 				entry.Wait();
865 				locker.Lock();
866 				continue;
867 			}
868 		}
869 
870 		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);
871 		addr_t virtualAddress;
872 
873 		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset "
874 			"= %lu\n", offset, page, bytesLeft, pageOffset));
875 
876 		if (page != NULL) {
877 			vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
878 				&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
879 
880 			// Since we don't actually map pages as part of an area, we have
881 			// to manually maintain its usage_count
882 			page->usage_count = 2;
883 
884 			// and copy the contents of the page already in memory
885 			if (doWrite) {
886 				user_memcpy((void *)(virtualAddress + pageOffset),
887 					(void *)buffer, bytesInPage);
888 
889 				// make sure the page is in the modified list
890 				if (page->state != PAGE_STATE_MODIFIED)
891 					vm_page_set_state(page, PAGE_STATE_MODIFIED);
892 			} else {
893 				user_memcpy((void *)buffer,
894 					(void *)(virtualAddress + pageOffset), bytesInPage);
895 			}
896 
897 			vm_put_physical_page(virtualAddress);
898 
899 			if (bytesLeft <= bytesInPage) {
900 				// we've read the last page, so we're done!
901 				locker.Unlock();
902 				vm_page_unreserve_pages(lastReservedPages);
903 				return B_OK;
904 			}
905 
906 			// prepare a potential gap request
907 			lastBuffer = buffer + bytesInPage;
908 			lastLeft = bytesLeft - bytesInPage;
909 			lastOffset = offset + B_PAGE_SIZE;
910 			lastPageOffset = 0;
911 		}
912 
913 		if (bytesLeft <= bytesInPage)
914 			break;
915 
916 		buffer += bytesInPage;
917 		bytesLeft -= bytesInPage;
918 		pageOffset = 0;
919 		offset += B_PAGE_SIZE;
920 
921 		if (buffer - lastBuffer + lastPageOffset >= kMaxChunkSize) {
922 			status_t status = satisfy_cache_io(ref, offset, buffer, pageOffset,
923 				bytesLeft, reservePages, lastOffset, lastBuffer, lastPageOffset,
924 				lastLeft, lastReservedPages, doWrite);
925 			if (status != B_OK)
926 				return status;
927 		}
928 	}
929 
930 	// fill the last remaining bytes of the request (either write or read)
931 
932 	status_t status;
933 	if (doWrite) {
934 		status = write_to_cache(ref, lastOffset, lastLeft, lastPageOffset,
935 			lastBuffer, lastLeft, lastReservedPages, 0);
936 	} else {
937 		status = read_into_cache(ref, lastOffset, lastLeft, lastPageOffset,
938 			lastBuffer, lastLeft, lastReservedPages, 0);
939 	}
940 
941 	return status;
942 }
943 
944 
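/*!	Generic syscall handler for the CACHE_SYSCALLS subsystem; currently
	implements CACHE_CLEAR (not yet functional) and CACHE_SET_MODULE to
	load/unload a cache policy module such as the launch speedup module.
*/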
945 static status_t
946 file_cache_control(const char *subsystem, uint32 function, void *buffer,
947 	size_t bufferSize)
948 {
949 	switch (function) {
950 		case CACHE_CLEAR:
951 			// ToDo: clear the cache
952 			dprintf("cache_control: clear cache!\n");
953 			return B_OK;
954 
955 		case CACHE_SET_MODULE:
956 		{
957 			cache_module_info *module = sCacheModule;
958 
959 			// unset previous module
960 
961 			if (sCacheModule != NULL) {
962 				sCacheModule = NULL;
963 				snooze(100000);	// 0.1 secs
964 				put_module(module->info.name);
965 			}
966 
967 			// get new module, if any
968 
969 			if (buffer == NULL)
970 				return B_OK;
971 
972 			char name[B_FILE_NAME_LENGTH];
973 			if (!IS_USER_ADDRESS(buffer)
974 				|| user_strlcpy(name, (char *)buffer,
975 						B_FILE_NAME_LENGTH) < B_OK)
976 				return B_BAD_ADDRESS;
977 
978 			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
979 				return B_BAD_VALUE;
980 
981 			dprintf("cache_control: set module %s!\n", name);
982 
983 			status_t status = get_module(name, (module_info **)&module);
984 			if (status == B_OK)
985 				sCacheModule = module;
986 
987 			return status;
988 		}
989 	}
990 
991 	return B_BAD_HANDLER;
992 }
993 
994 
995 //	#pragma mark - private kernel API
996 
997 
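/*!	Would read the given file range ahead into the cache; the implementation
	is currently disabled via "#if 0", so this is a no-op.
*/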
998 extern "C" void
999 cache_prefetch_vnode(struct vnode *vnode, off_t offset, size_t size)
1000 {
1001 #if 0
1002 	vm_cache *cache;
1003 	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
1004 		return;
1005 
1006 	file_cache_ref *ref = (struct file_cache_ref *)
1007 		((vnode_store *)cache->store)->file_cache_ref;
1008 	off_t fileSize = cache->virtual_size;
1009 
1010 	if (size > fileSize)
1011 		size = fileSize;
1012 
1013 	// we never fetch more than 4 MB at once
1014 	if (size > 4 * 1024 * 1024)
1015 		size = 4 * 1024 * 1024;
1016 
1017 	size_t bytesLeft = size, lastLeft = size;
1018 	off_t lastOffset = offset;
1019 	size_t lastSize = 0;
1020 
1021 	mutex_lock(&cache->lock);
1022 
1023 	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
1024 		// check if this page is already in memory
1025 		addr_t virtualAddress;
1026 	restart:
1027 		vm_page *page = vm_cache_lookup_page(cache, offset);
1028 		if (page != NULL) {
1029 			if (page->state == PAGE_STATE_BUSY) {
1030 				// if busy retry again later
1031 				ConditionVariableEntry<vm_page> entry;
1032 				entry.Add(page);
1033 				mutex_unlock(&cache->lock);
1034 				entry.Wait();
1035 				mutex_lock(&cache->lock);
1036 
1037 				goto restart;
1038 			}
1039 
1040 			// it is, so let's satisfy the first part of the request
1041 			if (lastOffset < offset) {
1042 				size_t requestSize = offset - lastOffset;
1043 				read_into_cache(ref, lastOffset, requestSize, NULL, 0);
1044 			}
1045 
1046 			if (bytesLeft <= B_PAGE_SIZE) {
1047 				// we've read the last page, so we're done!
1048 				goto out;
1049 			}
1050 
1051 			// prepare a potential gap request
1052 			lastOffset = offset + B_PAGE_SIZE;
1053 			lastLeft = bytesLeft - B_PAGE_SIZE;
1054 		}
1055 
1056 		if (bytesLeft <= B_PAGE_SIZE)
1057 			break;
1058 
1059 		bytesLeft -= B_PAGE_SIZE;
1060 	}
1061 
1062 	// read in the last part
1063 	read_into_cache(ref, lastOffset, lastLeft, NULL, 0);
1064 
1065 out:
1066 	mutex_unlock(&cache->lock);
1067 	vm_cache_release_ref(cache);
1068 #endif
1069 }
1070 
1071 
1072 extern "C" void
1073 cache_prefetch(dev_t mountID, ino_t vnodeID, off_t offset, size_t size)
1074 {
1075 	// ToDo: schedule prefetch
1076 
1077 	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));
1078 
1079 	// get the vnode for the object; this also grabs a ref to it
1080 	struct vnode *vnode;
1081 	if (vfs_get_vnode(mountID, vnodeID, true, &vnode) != B_OK)
1082 		return;
1083 
1084 	cache_prefetch_vnode(vnode, offset, size);
1085 	vfs_put_vnode(vnode);
1086 }
1087 
1088 
1089 extern "C" void
1090 cache_node_opened(struct vnode *vnode, int32 fdType, vm_cache *cache,
1091 	dev_t mountID, ino_t parentID, ino_t vnodeID, const char *name)
1092 {
1093 	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
1094 		return;
1095 
1096 	off_t size = -1;
1097 	if (cache != NULL) {
1098 		file_cache_ref *ref = (file_cache_ref *)
1099 			((vnode_store *)cache->store)->file_cache_ref;
1100 		if (ref != NULL)
1101 			size = cache->virtual_size;
1102 	}
1103 
1104 	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name,
1105 		size);
1106 }
1107 
1108 
1109 extern "C" void
1110 cache_node_closed(struct vnode *vnode, int32 fdType, vm_cache *cache,
1111 	dev_t mountID, ino_t vnodeID)
1112 {
1113 	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
1114 		return;
1115 
1116 	int32 accessType = 0;
1117 	if (cache != NULL) {
1118 		// ToDo: set accessType
1119 	}
1120 
1121 	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
1122 }
1123 
1124 
1125 extern "C" void
1126 cache_node_launched(size_t argCount, char * const *args)
1127 {
1128 	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
1129 		return;
1130 
1131 	sCacheModule->node_launched(argCount, args);
1132 }
1133 
1134 
1135 extern "C" status_t
1136 file_cache_init_post_boot_device(void)
1137 {
1138 	// ToDo: get cache module out of driver settings
1139 
1140 	if (get_module("file_cache/launch_speedup/v1",
1141 			(module_info **)&sCacheModule) == B_OK) {
1142 		dprintf("** opened launch speedup: %Ld\n", system_time());
1143 	}
1144 	return B_OK;
1145 }
1146 
1147 
1148 extern "C" status_t
1149 file_cache_init(void)
1150 {
1151 	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
1152 	return B_OK;
1153 }
1154 
1155 
1156 //	#pragma mark - public FS API
1157 
1158 
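/*!	Creates the file cache for the specified node: it looks up the node's
	vnode and VM cache, and remembers the underlying device (referenced by
	\a fd) plus its cookie for later I/O. Returns an opaque file_cache_ref
	pointer, or NULL on failure.
*/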
1159 extern "C" void *
1160 file_cache_create(dev_t mountID, ino_t vnodeID, off_t size, int fd)
1161 {
1162 	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld, "
1163 		"fd = %d)\n", mountID, vnodeID, size, fd));
1164 
1165 	file_cache_ref *ref = new file_cache_ref;
1166 	if (ref == NULL)
1167 		return NULL;
1168 
1169 	// TODO: delay vm_cache creation until data is
1170 	//	requested/written for the first time? Listing lots of
1171 	//	files in Tracker (and elsewhere) could be slowed down.
1172 	//	Since the file_cache_ref itself doesn't have a lock,
1173 	//	we would need to "rent" one during construction, possibly
1174 	//	the vnode lock, maybe a dedicated one.
1175 	//	As there shouldn't be too much contention, we could also
1176 	//	use atomic_test_and_set(), and free the resources again
1177 	//	when that fails...
1178 
1179 	// Get the vnode of the underlying device
1180 	if (vfs_get_vnode_from_fd(fd, true, &ref->device) != B_OK)
1181 		goto err1;
1182 
1183 	// We also need the cookie of the underlying device to properly access it
1184 	if (vfs_get_cookie_from_fd(fd, &ref->cookie) != B_OK)
1185 		goto err2;
1186 
1187 	// Get the vnode for the object
1188 	// (note, this does not grab a reference to the node)
1189 	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
1190 		goto err2;
1191 
1192 	// Gets (usually creates) the cache for the node
1193 	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
1194 		goto err2;
1195 
1196 	ref->cache->virtual_size = size;
1197 	((vnode_store *)ref->cache->store)->file_cache_ref = ref;
1198 	return ref;
1199 
1200 err2:
1201 	vfs_put_vnode(ref->device);
1202 err1:
1203 	delete ref;
1204 	return NULL;
1205 }
1206 
1207 
1208 extern "C" void
1209 file_cache_delete(void *_cacheRef)
1210 {
1211 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1212 
1213 	if (ref == NULL)
1214 		return;
1215 
1216 	TRACE(("file_cache_delete(ref = %p)\n", ref));
1217 
1218 	vm_cache_release_ref(ref->cache);
1219 	vfs_put_vnode(ref->device);
1220 	delete ref;
1221 }
1222 
1223 
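/*!	Resizes the underlying VM cache to \a newSize and invalidates the cached
	file map for the range affected by the resize (which currently frees the
	whole map).
*/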
1224 extern "C" status_t
1225 file_cache_set_size(void *_cacheRef, off_t newSize)
1226 {
1227 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1228 
1229 	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, newSize));
1230 
1231 	if (ref == NULL)
1232 		return B_OK;
1233 
1234 	mutex_lock(&ref->cache->lock);
1235 
1236 	off_t offset = ref->cache->virtual_size;
1237 	off_t size = newSize;
1238 	if (offset > newSize) {
1239 		size = offset - newSize;
1240 		offset = newSize;
1241 	} else
1242 		size = newSize - offset;
1243 
1244 	status_t status = vm_cache_resize(ref->cache, newSize);
1245 	mutex_unlock(&ref->cache->lock);
1246 
1247 	file_cache_invalidate_file_map(_cacheRef, offset, size);
1248 
1249 	return status;
1250 }
1251 
1252 
1253 extern "C" status_t
1254 file_cache_sync(void *_cacheRef)
1255 {
1256 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1257 	if (ref == NULL)
1258 		return B_BAD_VALUE;
1259 
1260 	return vm_cache_write_modified(ref->cache, true);
1261 }
1262 
1263 
1264 extern "C" status_t
1265 file_cache_read_pages(void *_cacheRef, off_t offset, const iovec *vecs,
1266 	size_t count, size_t *_numBytes)
1267 {
1268 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1269 
1270 	return pages_io(ref, offset, vecs, count, _numBytes, false);
1271 }
1272 
1273 
1274 extern "C" status_t
1275 file_cache_write_pages(void *_cacheRef, off_t offset, const iovec *vecs,
1276 	size_t count, size_t *_numBytes)
1277 {
1278 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1279 
1280 	status_t status = pages_io(ref, offset, vecs, count, _numBytes, true);
1281 
1282 	TRACE(("file_cache_write_pages(ref = %p, offset = %Ld, vecs = %p, "
1283 		"count = %lu, bytes = %lu) = %ld\n", ref, offset, vecs, count,
1284 		*_numBytes, status));
1285 
1286 	return status;
1287 }
1288 
1289 
1290 extern "C" status_t
1291 file_cache_read(void *_cacheRef, off_t offset, void *bufferBase, size_t *_size)
1292 {
1293 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1294 
1295 	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
1296 		ref, offset, bufferBase, *_size));
1297 
1298 	return cache_io(ref, offset, (addr_t)bufferBase, _size, false);
1299 }
1300 
1301 
1302 extern "C" status_t
1303 file_cache_write(void *_cacheRef, off_t offset, const void *buffer,
1304 	size_t *_size)
1305 {
1306 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1307 
1308 	status_t status = cache_io(ref, offset, (addr_t)const_cast<void *>(buffer),
1309 		_size, true);
1310 
1311 	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu)"
1312 		" = %ld\n", ref, offset, buffer, *_size, status));
1313 
1314 	return status;
1315 }
1316 
1317 
1318 extern "C" status_t
1319 file_cache_invalidate_file_map(void *_cacheRef, off_t offset, off_t size)
1320 {
1321 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1322 
1323 	// ToDo: honour offset/size parameters
1324 
1325 	TRACE(("file_cache_invalidate_file_map(offset = %Ld, size = %Ld)\n", offset,
1326 		size));
1327 
1328 	MutexLocker _(ref->cache->lock);
1329 	ref->map.Free();
1330 	return B_OK;
1331 }
1332