xref: /haiku/src/system/kernel/cache/file_cache.cpp (revision d3d8b26997fac34a84981e6d2b649521de2cc45a)
1 /*
2  * Copyright 2004-2005, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "vnode_store.h"
8 
9 #include <KernelExport.h>
10 #include <fs_cache.h>
11 
12 #include <util/kernel_cpp.h>
13 #include <file_cache.h>
14 #include <vfs.h>
15 #include <vm.h>
16 #include <vm_page.h>
17 #include <vm_cache.h>
18 #include <generic_syscall.h>
19 
20 #include <unistd.h>
21 #include <stdlib.h>
22 #include <string.h>
23 
24 
25 //#define TRACE_FILE_CACHE
26 #ifdef TRACE_FILE_CACHE
27 #	define TRACE(x) dprintf x
28 #else
29 #	define TRACE(x) ;
30 #endif
31 
32 // maximum number of iovecs per request
33 #define MAX_IO_VECS			64	// 256 kB
34 #define MAX_FILE_IO_VECS	32
35 
36 #define CACHED_FILE_EXTENTS	2
37 	// must be smaller than MAX_FILE_IO_VECS
38 	// ToDo: find out how many of these are typically used
39 
40 struct file_extent {
41 	off_t			offset;
42 	file_io_vec		disk;
43 };
44 
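// The file_map caches the on-disk extents of a file. Up to CACHED_FILE_EXTENTS
// extents are kept inline in "direct"; beyond that they live in the
// heap-allocated "array". The union shares the storage, and "count" determines
// which of the two members is currently in use (see ExtentAt()).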
45 struct file_map {
46 	file_map();
47 	~file_map();
48 
49 	file_extent *operator[](uint32 index);
50 	file_extent *ExtentAt(uint32 index);
51 	status_t Add(file_io_vec *vecs, size_t vecCount);
52 	void Free();
53 
54 	union {
55 		file_extent	direct[CACHED_FILE_EXTENTS];
56 		file_extent	*array;
57 	};
58 	size_t			count;
59 };
60 
61 struct file_cache_ref {
62 	vm_cache_ref	*cache;
63 	void			*vnode;
64 	void			*device;
65 	void			*cookie;
66 	file_map		map;
67 };
68 
69 
70 static struct cache_module_info *sCacheModule;
71 
72 
73 file_map::file_map()
74 {
75 	array = NULL;
76 	count = 0;
77 }
78 
79 
80 file_map::~file_map()
81 {
82 	Free();
83 }
84 
85 
86 file_extent *
87 file_map::operator[](uint32 index)
88 {
89 	return ExtentAt(index);
90 }
91 
92 
93 file_extent *
94 file_map::ExtentAt(uint32 index)
95 {
96 	if (index >= count)
97 		return NULL;
98 
99 	if (count > CACHED_FILE_EXTENTS)
100 		return &array[index];
101 
102 	return &direct[index];
103 }
104 
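/**	Appends the given file system vectors to the map, assigning each new extent
 *	its logical offset within the file (continuing from the last cached extent).
 */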
105 
106 status_t
107 file_map::Add(file_io_vec *vecs, size_t vecCount)
108 {
109 	off_t offset = 0;
110 
111 	if (vecCount <= CACHED_FILE_EXTENTS && count == 0) {
112 		// just use the reserved area in the file_cache_ref structure
113 	} else {
		// If the cached extents still live in "direct", we must not pass that
		// to realloc() - it is not a heap pointer. Allocate/resize the array
		// and copy the extents over first.
		file_extent *newMap = (file_extent *)realloc(
			count > CACHED_FILE_EXTENTS ? array : NULL,
			(count + vecCount) * sizeof(file_extent));
		if (newMap == NULL)
			return B_NO_MEMORY;

		if (count != 0) {
			if (count <= CACHED_FILE_EXTENTS)
				memcpy(newMap, direct, count * sizeof(file_extent));

			file_extent *last = &newMap[count - 1];
			offset = last->offset + last->disk.length;
		}

		array = newMap;
	}
126 
127 	count += vecCount;
128 
129 	for (uint32 i = 0; i < vecCount; i++) {
130 		file_extent *extent = ExtentAt(i);
131 
132 		extent->offset = offset;
133 		extent->disk = vecs[i];
134 
135 		offset += extent->disk.length;
136 	}
137 
138 	return B_OK;
139 }
140 
141 
142 void
143 file_map::Free()
144 {
145 	if (count > CACHED_FILE_EXTENTS)
146 		free(array);
147 
148 	array = NULL;
149 	count = 0;
150 }
151 
152 
153 //	#pragma mark -
154 
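/**	Appends the given memory range to the iovec array, merging it with the
 *	previous entry if the two ranges are contiguous.
 */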
155 
156 static void
157 add_to_iovec(iovec *vecs, int32 &index, int32 max, addr_t address, size_t size)
158 {
159 	if (index > 0 && (addr_t)vecs[index - 1].iov_base + vecs[index - 1].iov_len == address) {
160 		// the iovec can be combined with the previous one
161 		vecs[index - 1].iov_len += size;
162 		return;
163 	}
164 
165 	if (index == max)
166 		panic("no more space for iovecs!");
167 
168 	// we need to start a new iovec
169 	vecs[index].iov_base = (void *)address;
170 	vecs[index].iov_len = size;
171 	index++;
172 }
173 
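/**	Returns the cached file extent that contains the given file offset, and
 *	optionally its index - or NULL if the offset is not covered by the map.
 */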
174 
175 static file_extent *
176 find_file_extent(file_cache_ref *ref, off_t offset, uint32 *_index)
177 {
178 	// ToDo: do binary search
179 
180 	for (uint32 index = 0; index < ref->map.count; index++) {
181 		file_extent *extent = ref->map[index];
182 
183 		if (extent->offset <= offset
184 			&& extent->offset + extent->disk.length > offset) {
185 			if (_index)
186 				*_index = index;
187 			return extent;
188 		}
189 	}
190 
191 	return NULL;
192 }
193 
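/**	Retrieves the file's extent map from the file system on first use and
 *	caches it in the file_cache_ref. The byte range starting at \a offset is
 *	then translated into on-disk \a vecs; \a _count is updated to the number
 *	of vectors used, and B_BUFFER_OVERFLOW is returned if they did not suffice.
 */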
194 
195 static status_t
196 get_file_map(file_cache_ref *ref, off_t offset, size_t size,
197 	file_io_vec *vecs, size_t *_count)
198 {
199 	size_t maxVecs = *_count;
200 
201 	if (ref->map.count == 0) {
202 		// we don't yet have the map of this file, so let's grab it
203 		// (ordered by offset, so that we can do a binary search on them)
204 
205 		mutex_lock(&ref->cache->lock);
206 
207 		// the file map might have been retrieved by someone else in the meantime
208 		if (ref->map.count == 0) {
209 			size_t vecCount = maxVecs;
210 			status_t status;
211 			off_t mapOffset = 0;
212 
213 			while (true) {
214 				status = vfs_get_file_map(ref->vnode, mapOffset, ~0UL, vecs, &vecCount);
215 				if (status < B_OK && status != B_BUFFER_OVERFLOW) {
216 					mutex_unlock(&ref->cache->lock);
217 					return status;
218 				}
219 
220 				ref->map.Add(vecs, vecCount);
221 
222 				if (status != B_BUFFER_OVERFLOW)
223 					break;
224 
225 				// if we get here, the vectors have been added to the map, but
226 				// the vector array was still too small to cover the whole file
227 				file_io_vec *last = &vecs[vecCount - 1];
228 				mapOffset += last->length;
229 				vecCount = maxVecs;
230 			}
231 		}
232 
233 		mutex_unlock(&ref->cache->lock);
234 	}
235 
236 	// We now have the file map cached; next we need to translate it
237 	// into on-disk vectors for the requested range.
238 
239 	uint32 index;
240 	file_extent *fileExtent = find_file_extent(ref, offset, &index);
241 	if (fileExtent == NULL) {
242 		// access outside file bounds? But that's not our problem
243 		*_count = 0;
244 		return B_OK;
245 	}
246 
247 	offset -= fileExtent->offset;
248 	vecs[0].offset = fileExtent->disk.offset + offset;
249 	vecs[0].length = fileExtent->disk.length - offset;
250 
251 	if (vecs[0].length >= size || index >= ref->map.count - 1) {
252 		*_count = 1;
253 		return B_OK;
254 	}
255 
256 	// copy the rest of the vecs
257 
258 	size -= vecs[0].length;
259 
260 	for (index = 1; index < ref->map.count;) {
261 		fileExtent++;
262 
263 		vecs[index] = fileExtent->disk;
264 		index++;
265 
266 		if (index >= maxVecs) {
267 			*_count = index;
268 			return B_BUFFER_OVERFLOW;
269 		}
270 
271 		if (size <= fileExtent->disk.length)
272 			break;
273 
274 		size -= fileExtent->disk.length;
275 	}
276 
277 	*_count = index;
278 	return B_OK;
279 }
280 
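/**	Translates the memory described by \a vecs into on-disk vectors using
 *	get_file_map() and transfers the data directly from/to the underlying
 *	device, bypassing the cache. \a _numBytes is updated to the number of bytes
 *	actually transferred if the transfer ends early (e.g. at the end of the file).
 */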
281 
282 static status_t
283 pages_io(file_cache_ref *ref, off_t offset, const iovec *vecs, size_t count,
284 	size_t *_numBytes, bool doWrite)
285 {
286 	TRACE(("pages_io: ref = %p, offset = %Ld, size = %lu, %s\n", ref, offset,
287 		*_numBytes, doWrite ? "write" : "read"));
288 
289 	// translate the iovecs into direct device accesses
290 	file_io_vec fileVecs[MAX_FILE_IO_VECS];
291 	size_t fileVecCount = MAX_FILE_IO_VECS;
292 	size_t numBytes = *_numBytes;
293 
294 	status_t status = get_file_map(ref, offset, numBytes, fileVecs, &fileVecCount);
295 	if (status < B_OK) {
296 		TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed\n", offset,
297 			numBytes));
298 		return status;
299 	}
300 
301 	// ToDo: handle array overflow gracefully!
302 
303 #ifdef TRACE_FILE_CACHE
304 	dprintf("got %lu file vecs for %Ld:%lu:\n", fileVecCount, offset, numBytes);
305 	for (size_t i = 0; i < fileVecCount; i++)
306 		dprintf("[%lu] offset = %Ld, size = %Ld\n", i, fileVecs[i].offset, fileVecs[i].length);
307 #endif
308 
309 	uint32 fileVecIndex;
310 	size_t size;
311 
312 	if (!doWrite) {
313 		// now directly read the data from the device
314 		// the first file_io_vec can be read directly
315 
316 		size = fileVecs[0].length;
317 		if (size > numBytes)
318 			size = numBytes;
319 
320 		status = vfs_read_pages(ref->device, ref->cookie, fileVecs[0].offset, vecs,
321 			count, &size, false);
322 		if (status < B_OK)
323 			return status;
324 
325 		// ToDo: this is a work-around for buggy device drivers!
326 		//	When our own drivers honour the length, we can:
327 		//	a) also use this direct I/O for writes (otherwise, it would overwrite precious data)
328 		//	b) panic if the condition below is true (at least for writes)
329 		if (size > fileVecs[0].length) {
330 			//dprintf("warning: device driver %p doesn't respect total length in read_pages() call!\n", ref->device);
331 			size = fileVecs[0].length;
332 		}
333 
334 		ASSERT(size <= fileVecs[0].length);
335 
336 		// If the file portion was contiguous, we're already done now
337 		if (size == numBytes)
338 			return B_OK;
339 
340 		// if we reached the end of the file, we can return as well
341 		if (size != fileVecs[0].length) {
342 			*_numBytes = size;
343 			return B_OK;
344 		}
345 
346 		fileVecIndex = 1;
347 	} else {
348 		fileVecIndex = 0;
349 		size = 0;
350 	}
351 
352 	// Too bad, let's process the rest of the file_io_vecs
353 
354 	size_t totalSize = size;
355 
356 	// first, find out where we have to continue in our iovecs
357 	uint32 i = 0;
358 	for (; i < count; i++) {
359 		if (size <= vecs[i].iov_len)
360 			break;
361 
362 		size -= vecs[i].iov_len;
363 	}
364 
365 	size_t vecOffset = size;
366 
367 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
368 		file_io_vec &fileVec = fileVecs[fileVecIndex];
369 		iovec tempVecs[8];
370 		uint32 tempCount = 1;
371 
372 		tempVecs[0].iov_base = (void *)((addr_t)vecs[i].iov_base + vecOffset);
373 
374 		size = min_c(vecs[i].iov_len - vecOffset, fileVec.length);
375 		tempVecs[0].iov_len = size;
376 
377 		TRACE(("fill vec %ld, offset = %lu, size = %lu\n", i, vecOffset, size));
378 
379 		if (size >= fileVec.length)
380 			vecOffset += size;
381 		else
382 			vecOffset = 0;
383 
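		// gather the following iovecs that fall into this extent into tempVecs
		// (note that it can hold at most 8 entries)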
		while (size < fileVec.length && ++i < count) {
			tempVecs[tempCount].iov_base = vecs[i].iov_base;

			// is this iovec larger than the file_io_vec?
			if (vecs[i].iov_len + size > fileVec.length) {
				size += tempVecs[tempCount++].iov_len = vecOffset = fileVec.length - size;
				break;
			}

			size += tempVecs[tempCount++].iov_len = vecs[i].iov_len;
		}
396 
397 		size_t bytes = size;
398 		if (doWrite) {
399 			status = vfs_write_pages(ref->device, ref->cookie, fileVec.offset, tempVecs,
400 				tempCount, &bytes, false);
401 		} else {
402 			status = vfs_read_pages(ref->device, ref->cookie, fileVec.offset, tempVecs,
403 				tempCount, &bytes, false);
404 		}
405 		if (status < B_OK)
406 			return status;
407 
408 		totalSize += size;
409 
410 		if (size != bytes) {
411 			// there are no more bytes, let's bail out
412 			*_numBytes = totalSize;
413 			return B_OK;
414 		}
415 	}
416 
417 	return B_OK;
418 }
419 
420 
421 /**	This function is called by read_into_cache() (and from there only) - it
422  *	can only handle a certain number of bytes at a time, and read_into_cache()
423  *	makes sure that the request matches that criterion.
424  */
425 
426 static inline status_t
427 read_chunk_into_cache(file_cache_ref *ref, off_t offset, size_t size,
428 	int32 pageOffset, addr_t buffer, size_t bufferSize)
429 {
430 	TRACE(("read_chunk(offset = %Ld, size = %lu, pageOffset = %ld, buffer = %#lx, bufferSize = %lu\n",
431 		offset, size, pageOffset, buffer, bufferSize));
432 
433 	vm_cache_ref *cache = ref->cache;
434 
435 	iovec vecs[MAX_IO_VECS];
436 	int32 vecCount = 0;
437 
438 	vm_page *pages[MAX_IO_VECS];
439 	int32 pageIndex = 0;
440 
441 	// allocate pages for the cache and mark them busy
442 	for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE) {
443 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
444 		if (page == NULL)
445 			panic("no more pages!");
446 
447 		page->state = PAGE_STATE_BUSY;
448 
449 		vm_cache_insert_page(cache, page, offset + pos);
450 
451 		addr_t virtualAddress;
452 		if (vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE, &virtualAddress, PHYSICAL_PAGE_CAN_WAIT) < B_OK)
453 			panic("could not get physical page");
454 
455 		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
456 		// ToDo: check if the array is large enough!
457 	}
458 
459 	mutex_unlock(&cache->lock);
460 
461 	// read file into reserved pages
462 	status_t status = pages_io(ref, offset, vecs, vecCount, &size, false);
463 	if (status < B_OK) {
464 		// reading failed, free allocated pages
465 
466 		dprintf("file_cache: read pages failed: %s\n", strerror(status));
467 
468 		for (int32 i = 0; i < vecCount; i++) {
469 			addr_t base = (addr_t)vecs[i].iov_base;
470 			size_t size = vecs[i].iov_len;
471 
472 			for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
473 				vm_put_physical_page(base);
474 		}
475 
476 		mutex_lock(&cache->lock);
477 
478 		for (int32 i = 0; i < pageIndex; i++) {
479 			vm_cache_remove_page(cache, pages[i]);
480 			vm_page_set_state(pages[i], PAGE_STATE_FREE);
481 		}
482 
483 		return status;
484 	}
485 
486 	// copy the pages and unmap them again
487 
488 	for (int32 i = 0; i < vecCount; i++) {
489 		addr_t base = (addr_t)vecs[i].iov_base;
490 		size_t size = vecs[i].iov_len;
491 
492 		// copy to user buffer if necessary
493 		if (bufferSize != 0) {
494 			size_t bytes = min_c(bufferSize, size - pageOffset);
495 
496 			user_memcpy((void *)buffer, (void *)(base + pageOffset), bytes);
497 			buffer += bytes;
498 			bufferSize -= bytes;
499 			pageOffset = 0;
500 		}
501 
502 		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
503 			vm_put_physical_page(base);
504 	}
505 
506 	mutex_lock(&cache->lock);
507 
508 	// make the pages accessible in the cache
509 	for (int32 i = pageIndex; i-- > 0;)
510 		pages[i]->state = PAGE_STATE_ACTIVE;
511 
512 	return B_OK;
513 }
514 
515 
516 /**	This function reads \a size bytes directly from the file into the cache.
517  *	If \a bufferSize does not equal zero, \a bufferSize bytes from the data
518  *	read in are also copied to the provided \a buffer.
519  *	This function always allocates all pages; it is the responsibility of the
520  *	calling function to only ask for yet uncached ranges.
521  *	The cache_ref lock must be held when calling this function.
522  */
523 
524 static status_t
525 read_into_cache(file_cache_ref *ref, off_t offset, size_t size, addr_t buffer, size_t bufferSize)
526 {
527 	TRACE(("read_into_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
528 		ref, offset, size, (void *)buffer, bufferSize));
529 
530 	// do we have to read in anything at all?
531 	if (size == 0)
532 		return B_OK;
533 
534 	// make sure "offset" is page aligned - but also remember the page offset
535 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
536 	size = PAGE_ALIGN(size + pageOffset);
537 	offset -= pageOffset;
538 
539 	while (true) {
540 		size_t chunkSize = size;
541 		if (chunkSize > (MAX_IO_VECS * B_PAGE_SIZE))
542 			chunkSize = MAX_IO_VECS * B_PAGE_SIZE;
543 
544 		status_t status = read_chunk_into_cache(ref, offset, chunkSize, pageOffset,
545 								buffer, bufferSize);
546 		if (status != B_OK)
547 			return status;
548 
549 		if ((size -= chunkSize) == 0)
550 			return B_OK;
551 
552 		if (chunkSize >= bufferSize) {
553 			bufferSize = 0;
554 			buffer = NULL;
555 		} else {
556 			bufferSize -= chunkSize - pageOffset;
557 			buffer += chunkSize - pageOffset;
558 		}
559 
560 		offset += chunkSize;
561 		pageOffset = 0;
562 	}
563 
564 	return B_OK;
565 }
566 
567 
568 /**	Like read_chunk_into_cache() but writes data into the cache */
569 
570 static inline status_t
571 write_chunk_to_cache(file_cache_ref *ref, off_t offset, size_t size,
572 	int32 pageOffset, addr_t buffer, size_t bufferSize)
573 {
574 	iovec vecs[MAX_IO_VECS];
575 	int32 vecCount = 0;
576 	vm_page *pages[MAX_IO_VECS];
577 	int32 pageIndex = 0;
578 	status_t status = B_OK;
579 
580 	// ToDo: this should be settable somewhere
581 	bool writeThrough = false;
582 
583 	// allocate pages for the cache and mark them busy
584 	for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE) {
585 		// ToDo: if space is becoming tight, and this cache has already grown
586 		//	big - shouldn't we rather steal the pages directly in that case?
587 		//	(a working-set-like approach for the file cache)
588 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
589 		page->state = PAGE_STATE_BUSY;
590 
591 		vm_cache_insert_page(ref->cache, page, offset + pos);
592 
593 		addr_t virtualAddress;
594 		vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE, &virtualAddress,
595 			PHYSICAL_PAGE_CAN_WAIT);
596 
597 		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
598 		// ToDo: check if the array is large enough!
599 	}
600 
601 	mutex_unlock(&ref->cache->lock);
602 
603 	// copy contents (and read in partially written pages first)
604 
605 	if (pageOffset != 0) {
606 		// This is only a partial write, so we have to read the rest of the page
607 		// from the file to have consistent data in the cache
608 		iovec readVec = { vecs[0].iov_base, B_PAGE_SIZE };
609 		size_t bytesRead = B_PAGE_SIZE;
610 
611 		status = pages_io(ref, offset, &readVec, 1, &bytesRead, false);
612 		// ToDo: handle errors for real!
613 		if (status < B_OK)
614 			panic("1. pages_io() failed: %s!\n", strerror(status));
615 	}
616 
617 	addr_t lastPageOffset = (pageOffset + bufferSize) & (B_PAGE_SIZE - 1);
618 	if (lastPageOffset != 0) {
619 		// get the last page in the I/O vectors
620 		addr_t last = (addr_t)vecs[vecCount - 1].iov_base
621 			+ vecs[vecCount - 1].iov_len - B_PAGE_SIZE;
622 
623 		if (offset + pageOffset + bufferSize == ref->cache->cache->virtual_size) {
624 			// the space in the page after this write action needs to be cleared
625 			memset((void *)(last + lastPageOffset), 0, B_PAGE_SIZE - lastPageOffset);
626 		} else if (vecCount > 1) {
627 			// the end of this write does not happen on a page boundary, so we
628 			// need to fetch the last page before we can update it
629 			iovec readVec = { (void *)last, B_PAGE_SIZE };
630 			size_t bytesRead = B_PAGE_SIZE;
631 
632 			status = pages_io(ref, offset + size - B_PAGE_SIZE, &readVec, 1,
633 				&bytesRead, false);
634 			// ToDo: handle errors for real!
635 			if (status < B_OK)
636 				panic("pages_io() failed: %s!\n", strerror(status));
637 		}
638 	}
639 
640 	for (int32 i = 0; i < vecCount; i++) {
641 		addr_t base = (addr_t)vecs[i].iov_base;
642 		size_t bytes = min_c(bufferSize, size_t(vecs[i].iov_len - pageOffset));
643 
644 		// copy data from user buffer
645 		user_memcpy((void *)(base + pageOffset), (void *)buffer, bytes);
646 
647 		bufferSize -= bytes;
648 		if (bufferSize == 0)
649 			break;
650 
651 		buffer += bytes;
652 		pageOffset = 0;
653 	}
654 
655 	if (writeThrough) {
656 		// write cached pages back to the file if we were asked to do that
657 		status_t status = pages_io(ref, offset, vecs, vecCount, &size, true);
658 		if (status < B_OK) {
659 			// ToDo: remove allocated pages, ...?
660 			panic("file_cache: remove allocated pages! write pages failed: %s\n",
661 				strerror(status));
662 		}
663 	}
664 
665 	mutex_lock(&ref->cache->lock);
666 
667 	// unmap the pages again
668 
669 	for (int32 i = 0; i < vecCount; i++) {
670 		addr_t base = (addr_t)vecs[i].iov_base;
671 		size_t size = vecs[i].iov_len;
672 		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
673 			vm_put_physical_page(base);
674 	}
675 
676 	// make the pages accessible in the cache
677 	for (int32 i = pageIndex; i-- > 0;) {
678 		if (writeThrough)
679 			pages[i]->state = PAGE_STATE_ACTIVE;
680 		else
681 			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
682 	}
683 
684 	return status;
685 }
686 
687 
688 /**	Like read_into_cache() but writes data into the cache. To preserve data consistency,
689  *	it might also read pages into the cache if only a partial page gets written.
690  *	The cache_ref lock must be held when calling this function.
691  */
692 
693 static status_t
694 write_to_cache(file_cache_ref *ref, off_t offset, size_t size, addr_t buffer, size_t bufferSize)
695 {
696 	TRACE(("write_to_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
697 		ref, offset, size, (void *)buffer, bufferSize));
698 
699 	// make sure "offset" is page aligned - but also remember the page offset
700 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
701 	size = PAGE_ALIGN(size + pageOffset);
702 	offset -= pageOffset;
703 
704 	while (true) {
705 		size_t chunkSize = size;
706 		if (chunkSize > (MAX_IO_VECS * B_PAGE_SIZE))
707 			chunkSize = MAX_IO_VECS * B_PAGE_SIZE;
708 
709 		status_t status = write_chunk_to_cache(ref, offset, chunkSize, pageOffset, buffer, bufferSize);
710 		if (status != B_OK)
711 			return status;
712 
713 		if ((size -= chunkSize) == 0)
714 			return B_OK;
715 
716 		if (chunkSize >= bufferSize) {
717 			bufferSize = 0;
718 			buffer = NULL;
719 		} else {
720 			bufferSize -= chunkSize - pageOffset;
721 			buffer += chunkSize - pageOffset;
722 		}
723 
724 		offset += chunkSize;
725 		pageOffset = 0;
726 	}
727 
728 	return B_OK;
729 }
730 
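/**	Passes the part of the request that has been gathered so far (the range
 *	between \a lastBuffer and \a buffer) on to the cache, either reading it
 *	into or writing it from the cache, depending on \a doWrite.
 */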
731 
732 static status_t
733 satisfy_cache_io(file_cache_ref *ref, off_t offset, addr_t buffer, addr_t lastBuffer,
734 	bool doWrite)
735 {
736 	size_t requestSize = buffer - lastBuffer;
737 
738 	if (doWrite)
739 		return write_to_cache(ref, offset, requestSize, lastBuffer, requestSize);
740 
741 	return read_into_cache(ref, offset, requestSize, lastBuffer, requestSize);
742 }
743 
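/**	The central I/O routine of the file cache: it walks the request page by
 *	page, copies data to/from pages that are already present in the cache, and
 *	collects contiguous runs of missing pages, which are then handed to
 *	read_into_cache() or write_to_cache() in one go.
 */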
744 
745 static status_t
746 cache_io(void *_cacheRef, off_t offset, addr_t buffer, size_t *_size, bool doWrite)
747 {
748 	if (_cacheRef == NULL)
749 		panic("cache_io() called with NULL ref!\n");
750 
751 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
752 	vm_cache_ref *cache = ref->cache;
753 	off_t fileSize = cache->cache->virtual_size;
754 
755 	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
756 		ref, offset, (void *)buffer, *_size, doWrite ? "write" : "read"));
757 
758 	// out of bounds access?
759 	if (offset >= fileSize || offset < 0) {
760 		*_size = 0;
761 		return B_OK;
762 	}
763 
764 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
765 	size_t size = *_size;
766 	offset -= pageOffset;
767 
768 	if (offset + pageOffset + size > fileSize) {
769 		// adapt size to be within the file's bounds
770 		size = fileSize - pageOffset - offset;
771 		*_size = size;
772 	}
773 
774 	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE,
775 	// the "last*" variables always point to the end of the last
776 	// satisfied request part
777 
778 	size_t bytesLeft = size, lastLeft = size;
779 	int32 lastPageOffset = pageOffset;
780 	addr_t lastBuffer = buffer;
781 	off_t lastOffset = offset;
782 
783 	mutex_lock(&cache->lock);
784 
785 	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
786 		// check if this page is already in memory
787 	restart:
788 		vm_page *page = vm_cache_lookup_page(cache, offset);
789 		vm_page *dummyPage = NULL;
790 		if (page != NULL) {
791 			// The page is in the cache - since dealing with it may require
792 			// unlocking the cache at some point, we first satisfy the request
793 			// for the pages we haven't handled yet (to make sure no one else
794 			// interferes in the meantime).
795 			status_t status = B_OK;
796 
797 			if (lastBuffer != buffer) {
798 				status = satisfy_cache_io(ref, lastOffset + lastPageOffset,
799 					buffer, lastBuffer, doWrite);
800 				if (status == B_OK) {
801 					lastBuffer = buffer;
802 					lastLeft = bytesLeft;
803 					lastOffset = offset;
804 					lastPageOffset = 0;
805 					pageOffset = 0;
806 				}
807 			}
808 
809 			if (status != B_OK) {
810 				mutex_unlock(&cache->lock);
811 				return status;
812 			}
813 
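			// If the page is busy, it is being worked on by someone else. A
			// dummy page is only a placeholder without data behind it, so we
			// allocate a real page and take care of its contents ourselves
			// below; for a real busy page we just wait and try again.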
814 			if (page->state == PAGE_STATE_BUSY) {
815 				if (page->type == PAGE_TYPE_DUMMY) {
816 					dummyPage = page;
817 					page = vm_page_allocate_page(PAGE_STATE_FREE);
818 					if (page == NULL) {
819 						mutex_unlock(&cache->lock);
820 						return B_NO_MEMORY;
821 					}
822 				} else {
823 					mutex_unlock(&cache->lock);
824 					// ToDo: don't wait forever!
825 					snooze(20000);
826 					mutex_lock(&cache->lock);
827 					goto restart;
828 				}
829 			}
830 		}
831 
832 		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);
833 		addr_t virtualAddress;
834 
835 		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset = %lu\n", offset, page, bytesLeft, pageOffset));
836 		if (page != NULL) {
837 			vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
838 				&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
839 
840 			if (dummyPage != NULL && (!doWrite || bytesInPage != B_PAGE_SIZE)) {
841 				// This page is currently in use by someone else - since we cannot
842 				// know if this someone does what we want, and if it even can do
843 				// what we want (we may own a lock that blocks the other request),
844 				// we need to handle this case specifically
845 				iovec vec;
846 				vec.iov_base = (void *)virtualAddress;
847 				vec.iov_len = B_PAGE_SIZE;
848 
849 				size_t size = B_PAGE_SIZE;
850 				status_t status = pages_io(ref, offset, &vec, 1, &size, false);
851 				if (status != B_OK) {
852 					vm_put_physical_page(virtualAddress);
853 					mutex_unlock(&cache->lock);
854 					return status;
855 				}
856 			}
857 
858 			// and copy the contents of the page already in memory
859 			if (doWrite) {
860 				user_memcpy((void *)(virtualAddress + pageOffset), (void *)buffer, bytesInPage);
861 
862 				// make sure the page is in the modified list
863 				if (page->state != PAGE_STATE_MODIFIED)
864 					vm_page_set_state(page, PAGE_STATE_MODIFIED);
865 			} else
866 				user_memcpy((void *)buffer, (void *)(virtualAddress + pageOffset), bytesInPage);
867 
868 			vm_put_physical_page(virtualAddress);
869 
870 			if (dummyPage != NULL) {
871 				// check if the dummy page is still in place
872 			restart_dummy_lookup:
873 				vm_page *currentPage = vm_cache_lookup_page(cache, offset);
874 				if (currentPage != NULL && currentPage->state == PAGE_STATE_BUSY) {
875 					if (currentPage->type == PAGE_TYPE_DUMMY) {
876 						// we let the other party add our page
877 						currentPage->queue_next = page;
878 					} else {
879 						mutex_unlock(&cache->lock);
880 						// ToDo: don't wait forever!
881 						snooze(20000);
882 						mutex_lock(&cache->lock);
883 						goto restart_dummy_lookup;
884 					}
885 				} else if (currentPage != NULL) {
886 					// we need to copy our new page into the old one
887 					addr_t destinationAddress;
888 					vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
889 						&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
890 					vm_get_physical_page(currentPage->physical_page_number * B_PAGE_SIZE,
891 						&destinationAddress, PHYSICAL_PAGE_CAN_WAIT);
892 
893 					memcpy((void *)destinationAddress, (void *)virtualAddress, B_PAGE_SIZE);
894 
895 					vm_put_physical_page(destinationAddress);
896 					vm_put_physical_page(virtualAddress);
897 
898 					vm_page_set_state(page, PAGE_STATE_FREE);
899 				} else {
900 					// there is no page in place anymore, we'll put ours into it
901 					vm_cache_insert_page(cache, page, offset);
902 				}
903 			}
904 
905 			if (bytesLeft <= bytesInPage) {
906 				// we've handled the last page, so we're done!
907 				mutex_unlock(&cache->lock);
908 				return B_OK;
909 			}
910 
911 			// prepare a potential gap request
912 			lastBuffer = buffer + bytesInPage;
913 			lastLeft = bytesLeft - bytesInPage;
914 			lastOffset = offset + B_PAGE_SIZE;
915 			lastPageOffset = 0;
916 		}
917 
918 		if (bytesLeft <= bytesInPage)
919 			break;
920 
921 		buffer += bytesInPage;
922 		bytesLeft -= bytesInPage;
923 		pageOffset = 0;
924 	}
925 
926 	// fill the last remaining bytes of the request (either write or read)
927 
928 	status_t status;
929 	if (doWrite)
930 		status = write_to_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
931 	else
932 		status = read_into_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
933 
934 	mutex_unlock(&cache->lock);
935 	return status;
936 }
937 
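/**	Handler for the CACHE_SYSCALLS generic syscall; allows userland to request
 *	clearing the cache (not yet implemented) and to set or unset the active
 *	cache module (such as the launch speedup module).
 */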
938 
939 static status_t
940 file_cache_control(const char *subsystem, uint32 function, void *buffer, size_t bufferSize)
941 {
942 	switch (function) {
943 		case CACHE_CLEAR:
944 			// ToDo: clear the cache
945 			dprintf("cache_control: clear cache!\n");
946 			return B_OK;
947 
948 		case CACHE_SET_MODULE:
949 		{
950 			cache_module_info *module = sCacheModule;
951 
952 			// unset previous module
953 
954 			if (sCacheModule != NULL) {
955 				sCacheModule = NULL;
956 				snooze(100000);	// 0.1 secs
957 				put_module(module->info.name);
958 			}
959 
960 			// get new module, if any
961 
962 			if (buffer == NULL)
963 				return B_OK;
964 
965 			char name[B_FILE_NAME_LENGTH];
966 			if (!IS_USER_ADDRESS(buffer)
967 				|| user_strlcpy(name, (char *)buffer, B_FILE_NAME_LENGTH) < B_OK)
968 				return B_BAD_ADDRESS;
969 
970 			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
971 				return B_BAD_VALUE;
972 
973 			dprintf("cache_control: set module %s!\n", name);
974 
975 			status_t status = get_module(name, (module_info **)&module);
976 			if (status == B_OK)
977 				sCacheModule = module;
978 
979 			return status;
980 		}
981 	}
982 
983 	return B_BAD_HANDLER;
984 }
985 
986 
987 //	#pragma mark -
988 //	kernel public API
989 
990 
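/**	Reads the given file range into the cache ahead of time, skipping over any
 *	pages that are already present. At most 4 MB are prefetched at once.
 */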
991 extern "C" void
992 cache_prefetch_vnode(void *vnode, off_t offset, size_t size)
993 {
994 	vm_cache_ref *cache;
995 	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
996 		return;
997 
998 	file_cache_ref *ref = (struct file_cache_ref *)((vnode_store *)cache->cache->store)->file_cache_ref;
999 	off_t fileSize = cache->cache->virtual_size;
1000 
1001 	if (size > fileSize)
1002 		size = fileSize;
1003 
1004 	// we never fetch more than 4 MB at once
1005 	if (size > 4 * 1024 * 1024)
1006 		size = 4 * 1024 * 1024;
1007 
1008 	size_t bytesLeft = size, lastLeft = size;
1009 	off_t lastOffset = offset;
1011 
1012 	mutex_lock(&cache->lock);
1013 
1014 	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
1015 		// check if this page is already in memory
1018 		vm_page *page = vm_cache_lookup_page(cache, offset);
1019 		if (page != NULL) {
1020 			// it is, so let's satisfy the first part of the request
1021 			if (lastOffset < offset) {
1022 				size_t requestSize = offset - lastOffset;
1023 				read_into_cache(ref, lastOffset, requestSize, NULL, 0);
1024 			}
1025 
1026 			if (bytesLeft <= B_PAGE_SIZE) {
1027 				// we've read the last page, so we're done!
1028 				goto out;
1029 			}
1030 
1031 			// prepare a potential gap request
1032 			lastOffset = offset + B_PAGE_SIZE;
1033 			lastLeft = bytesLeft - B_PAGE_SIZE;
1034 		}
1035 
1036 		if (bytesLeft <= B_PAGE_SIZE)
1037 			break;
1038 
1039 		bytesLeft -= B_PAGE_SIZE;
1040 	}
1041 
1042 	// read in the last part
1043 	read_into_cache(ref, lastOffset, lastLeft, NULL, 0);
1044 
1045 out:
1046 	mutex_unlock(&cache->lock);
1047 	vm_cache_release_ref(cache);
1048 }
1049 
1050 
1051 extern "C" void
1052 cache_prefetch(mount_id mountID, vnode_id vnodeID, off_t offset, size_t size)
1053 {
1054 	void *vnode;
1055 
1056 	// ToDo: schedule prefetch
1057 
1058 	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));
1059 
1060 	// get the vnode for the object; this also grabs a ref to it
1061 	if (vfs_get_vnode(mountID, vnodeID, &vnode) != B_OK)
1062 		return;
1063 
1064 	cache_prefetch_vnode(vnode, offset, size);
1065 	vfs_put_vnode(vnode);
1066 }
1067 
1068 
1069 extern "C" void
1070 cache_node_opened(void *vnode, int32 fdType, vm_cache_ref *cache, mount_id mountID,
1071 	vnode_id parentID, vnode_id vnodeID, const char *name)
1072 {
1073 	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
1074 		return;
1075 
1076 	off_t size = -1;
1077 	if (cache != NULL) {
1078 		file_cache_ref *ref = (file_cache_ref *)((vnode_store *)cache->cache->store)->file_cache_ref;
1079 		if (ref != NULL)
1080 			size = ref->cache->cache->virtual_size;
1081 	}
1082 
1083 	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name, size);
1084 }
1085 
1086 
1087 extern "C" void
1088 cache_node_closed(void *vnode, int32 fdType, vm_cache_ref *cache,
1089 	mount_id mountID, vnode_id vnodeID)
1090 {
1091 	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
1092 		return;
1093 
1094 	int32 accessType = 0;
1095 	if (cache != NULL) {
1096 		// ToDo: set accessType
1097 	}
1098 
1099 	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
1100 }
1101 
1102 
1103 extern "C" void
1104 cache_node_launched(size_t argCount, char * const *args)
1105 {
1106 	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
1107 		return;
1108 
1109 	sCacheModule->node_launched(argCount, args);
1110 }
1111 
1112 
1113 extern "C" status_t
1114 file_cache_init_post_boot_device(void)
1115 {
1116 	// ToDo: get cache module out of driver settings
1117 
1118 	if (get_module("file_cache/launch_speedup/v1", (module_info **)&sCacheModule) == B_OK) {
1119 		dprintf("** opened launch speedup: %Ld\n", system_time());
1120 	} else
1121 		dprintf("** could not open launch speedup!\n");
1122 
1123 	return B_OK;
1124 }
1125 
1126 
1127 extern "C" status_t
1128 file_cache_init(void)
1129 {
1130 	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
1131 	return B_OK;
1132 }
1133 
1134 
1135 //	#pragma mark -
1136 //	public FS API
1137 
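// A rough usage sketch (hypothetical file system code, for illustration only):
// a file system typically creates the cache when it publishes a vnode, routes
// its read/write hooks through it, and deletes it again when the vnode goes
// away:
//
//	void *cache = file_cache_create(volumeID, vnodeID, inodeSize, deviceFD);
//	...
//	size_t length = bufferSize;
//	status_t status = file_cache_read(cache, pos, buffer, &length);
//	...
//	file_cache_delete(cache);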
1138 
1139 extern "C" void *
1140 file_cache_create(mount_id mountID, vnode_id vnodeID, off_t size, int fd)
1141 {
1142 	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld, fd = %d)\n", mountID, vnodeID, size, fd));
1143 
1144 	file_cache_ref *ref = new file_cache_ref;
1145 	if (ref == NULL)
1146 		return NULL;
1147 
1148 	// ToDo: delay vm_cache/vm_cache_ref creation until data is
1149 	//	requested/written for the first time? Listing lots of
1150 	//	files in Tracker (and elsewhere) could be slowed down.
1151 	//	Since the file_cache_ref itself doesn't have a lock,
1152 	//	we would need to "rent" one during construction, possibly
1153 	//	the vnode lock, maybe a dedicated one.
1154 	//	As there shouldn't be too much contention, we could also
1155 	//	use atomic_test_and_set(), and free the resources again
1156 	//	when that fails...
1157 
1158 	// get the vnode of the underlying device
1159 	if (vfs_get_vnode_from_fd(fd, true, &ref->device) != B_OK)
1160 		goto err1;
1161 
1162 	// we also need the cookie of the underlying device to properly access it
1163 	if (vfs_get_cookie_from_fd(fd, &ref->cookie) != B_OK)
1164 		goto err2;
1165 
1166 	// get the vnode for the object (note, this does not grab a reference to the node)
1167 	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
1168 		goto err2;
1169 
1170 	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
1171 		goto err2;
1172 
1173 	ref->cache->cache->virtual_size = size;
1174 	((vnode_store *)ref->cache->cache->store)->file_cache_ref = ref;
1175 	return ref;
1176 
1177 err2:
1178 	vfs_put_vnode(ref->device);
1179 err1:
1180 	delete ref;
1181 	return NULL;
1182 }
1183 
1184 
1185 extern "C" void
1186 file_cache_delete(void *_cacheRef)
1187 {
1188 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1189 
1190 	if (ref == NULL)
1191 		return;
1192 
1193 	TRACE(("file_cache_delete(ref = %p)\n", ref));
1194 
1195 	vfs_put_vnode(ref->device);
1196 	delete ref;
1197 }
1198 
1199 
1200 extern "C" status_t
1201 file_cache_set_size(void *_cacheRef, off_t size)
1202 {
1203 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1204 
1205 	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, size));
1206 
1207 	if (ref == NULL)
1208 		return B_OK;
1209 
1210 	file_cache_invalidate_file_map(_cacheRef, 0, size);
1211 		// ToDo: make this better (we would only need to extend or shrink the map)
1212 
1213 	mutex_lock(&ref->cache->lock);
1214 	status_t status = vm_cache_resize(ref->cache, size);
1215 	mutex_unlock(&ref->cache->lock);
1216 
1217 	return status;
1218 }
1219 
1220 
1221 extern "C" status_t
1222 file_cache_sync(void *_cacheRef)
1223 {
1224 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1225 	if (ref == NULL)
1226 		return B_BAD_VALUE;
1227 
1228 	return vm_cache_write_modified(ref->cache, true);
1229 }
1230 
1231 
1232 extern "C" status_t
1233 file_cache_read_pages(void *_cacheRef, off_t offset, const iovec *vecs, size_t count, size_t *_numBytes)
1234 {
1235 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1236 
1237 	return pages_io(ref, offset, vecs, count, _numBytes, false);
1238 }
1239 
1240 
1241 extern "C" status_t
1242 file_cache_write_pages(void *_cacheRef, off_t offset, const iovec *vecs, size_t count, size_t *_numBytes)
1243 {
1244 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1245 
1246 	status_t status = pages_io(ref, offset, vecs, count, _numBytes, true);
1247 	TRACE(("file_cache_write_pages(ref = %p, offset = %Ld, vecs = %p, count = %lu, bytes = %lu) = %ld\n",
1248 		ref, offset, vecs, count, *_numBytes, status));
1249 
1250 	return status;
1251 }
1252 
1253 
1254 extern "C" status_t
1255 file_cache_read(void *_cacheRef, off_t offset, void *bufferBase, size_t *_size)
1256 {
1257 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1258 
1259 	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
1260 		ref, offset, bufferBase, *_size));
1261 
1262 	return cache_io(ref, offset, (addr_t)bufferBase, _size, false);
1263 }
1264 
1265 
1266 extern "C" status_t
1267 file_cache_write(void *_cacheRef, off_t offset, const void *buffer, size_t *_size)
1268 {
1269 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1270 
1271 	status_t status = cache_io(ref, offset, (addr_t)const_cast<void *>(buffer), _size, true);
1272 	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu) = %ld\n",
1273 		ref, offset, buffer, *_size, status));
1274 
1275 	return status;
1276 }
1277 
1278 
1279 extern "C" status_t
1280 file_cache_invalidate_file_map(void *_cacheRef, off_t offset, off_t size)
1281 {
1282 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1283 
1284 	// ToDo: honour offset/size parameters
1285 
1286 	TRACE(("file_cache_invalidate_file_map(offset = %Ld, size = %Ld)\n", offset, size));
1287 	mutex_lock(&ref->cache->lock);
1288 	ref->map.Free();
1289 	mutex_unlock(&ref->cache->lock);
1290 	return B_OK;
1291 }
1292