xref: /haiku/src/system/kernel/cache/file_cache.cpp (revision fef6144999c2fa611f59ee6ffe6dd7999501385c)
1 /*
2  * Copyright 2004-2005, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "vnode_store.h"
8 
9 #include <KernelExport.h>
10 #include <fs_cache.h>
11 
12 #include <util/kernel_cpp.h>
13 #include <file_cache.h>
14 #include <vfs.h>
15 #include <vm.h>
16 #include <vm_page.h>
17 #include <vm_cache.h>
18 #include <generic_syscall.h>
19 
20 #include <unistd.h>
21 #include <stdlib.h>
22 #include <string.h>
23 
24 
25 //#define TRACE_FILE_CACHE
26 #ifdef TRACE_FILE_CACHE
27 #	define TRACE(x) dprintf x
28 #else
29 #	define TRACE(x) ;
30 #endif
31 
32 // maximum number of iovecs per request
33 #define MAX_IO_VECS			64	// 256 kB
34 #define MAX_FILE_IO_VECS	32
35 
36 #define CACHED_FILE_EXTENTS	2
37 	// must be smaller than MAX_FILE_IO_VECS
38 	// ToDo: find out how many of these are typically used
39 
40 struct file_extent {
41 	off_t			offset;
42 	file_io_vec		disk;
43 };
44 
45 struct file_map {
46 	file_map();
47 	~file_map();
48 
49 	file_extent *operator[](uint32 index);
50 	file_extent *ExtentAt(uint32 index);
51 	status_t Add(file_io_vec *vecs, size_t vecCount);
52 	void Free();
53 
54 	union {
55 		file_extent	direct[CACHED_FILE_EXTENTS];
56 		file_extent	*array;
57 	};
58 	size_t			count;
59 };
60 
61 struct file_cache_ref {
62 	vm_cache_ref	*cache;
63 	void			*vnode;
64 	void			*device;
65 	void			*cookie;
66 	file_map		map;
67 };
68 
69 
70 static struct cache_module_info *sCacheModule;
71 
72 
73 file_map::file_map()
74 {
75 	array = NULL;
76 	count = 0;
77 }
78 
79 
80 file_map::~file_map()
81 {
82 	Free();
83 }
84 
85 
86 file_extent *
87 file_map::operator[](uint32 index)
88 {
89 	return ExtentAt(index);
90 }
91 
92 
93 file_extent *
94 file_map::ExtentAt(uint32 index)
95 {
96 	if (index >= count)
97 		return NULL;
98 
99 	if (count > CACHED_FILE_EXTENTS)
100 		return &array[index];
101 
102 	return &direct[index];
103 }
104 
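/**	Appends the given file_io_vecs to the map, assigning each extent its
 *	cumulative offset within the file. As long as the whole map fits into
 *	CACHED_FILE_EXTENTS entries, the inline "direct" array is used; larger
 *	maps are kept in a heap allocated array.
 */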
105 
106 status_t
107 file_map::Add(file_io_vec *vecs, size_t vecCount)
108 {
109 	off_t offset = 0;
110 
111 	if (vecCount <= CACHED_FILE_EXTENTS && count == 0) {
112 		// just use the reserved area in the file_cache_ref structure
113 	} else {
114 		file_extent *newMap = (file_extent *)realloc(array,
115 			(count + vecCount) * sizeof(file_extent));
116 		if (newMap == NULL)
117 			return B_NO_MEMORY;
118 
119 		array = newMap;
120 
121 		if (count != 0) {
122 			file_extent *extent = ExtentAt(count - 1);
123 			offset = extent->offset + extent->disk.length;
124 		}
125 	}
126 
127 	count += vecCount;
128 
129 	for (uint32 i = 0; i < vecCount; i++) {
130 		file_extent *extent = ExtentAt(i);
131 
132 		extent->offset = offset;
133 		extent->disk = vecs[i];
134 
135 		offset += extent->disk.length;
136 	}
137 
138 	return B_OK;
139 }
140 
141 
142 void
143 file_map::Free()
144 {
145 	if (count > CACHED_FILE_EXTENTS)
146 		free(array);
147 
148 	array = NULL;
149 	count = 0;
150 }
151 
152 
153 //	#pragma mark -
154 
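/**	Appends the given address range to the iovec array at \a index, merging
 *	it with the previous entry if the two ranges are adjacent in memory.
 *	Panics if more than \a max iovecs would be needed.
 */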
155 
156 static void
157 add_to_iovec(iovec *vecs, int32 &index, int32 max, addr_t address, size_t size)
158 {
159 	if (index > 0 && (addr_t)vecs[index - 1].iov_base + vecs[index - 1].iov_len == address) {
160 		// the iovec can be combined with the previous one
161 		vecs[index - 1].iov_len += size;
162 		return;
163 	}
164 
165 	if (index == max)
166 		panic("no more space for iovecs!");
167 
168 	// we need to start a new iovec
169 	vecs[index].iov_base = (void *)address;
170 	vecs[index].iov_len = size;
171 	index++;
172 }
173 
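/**	Returns the cached file_extent that contains the given file \a offset,
 *	or NULL if the offset is not covered by the map. If \a _index is not
 *	NULL, it is set to the index of the extent found.
 */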
174 
175 static file_extent *
176 find_file_extent(file_cache_ref *ref, off_t offset, uint32 *_index)
177 {
178 	// ToDo: do binary search
179 
180 	for (uint32 index = 0; index < ref->map.count; index++) {
181 		file_extent *extent = ref->map[index];
182 
183 		if (extent->offset <= offset
184 			&& extent->offset + extent->disk.length > offset) {
185 			if (_index)
186 				*_index = index;
187 			return extent;
188 		}
189 	}
190 
191 	return NULL;
192 }
193 
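/**	Translates the file range starting at \a offset into on-disk file_io_vecs.
 *	On first use, the complete file map is retrieved via vfs_get_file_map()
 *	and cached in the file_cache_ref; later calls only consult the cached
 *	extents. On return, \a _count contains the number of vecs filled in (zero
 *	if the offset lies outside the mapped file); B_BUFFER_OVERFLOW is returned
 *	if more vecs would have been needed than the caller provided.
 */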
194 
195 static status_t
196 get_file_map(file_cache_ref *ref, off_t offset, size_t size,
197 	file_io_vec *vecs, size_t *_count)
198 {
199 	size_t maxVecs = *_count;
200 
201 	if (ref->map.count == 0) {
202 		// we don't yet have the map of this file, so let's grab it
203 		// (ordered by offset, so that we can do a binary search on them)
204 
205 		mutex_lock(&ref->cache->lock);
206 
207 		// the file map could have been requested in the meantime
208 		if (ref->map.count == 0) {
209 			size_t vecCount = maxVecs;
210 			status_t status;
211 			off_t mapOffset = 0;
212 
213 			while (true) {
214 				status = vfs_get_file_map(ref->vnode, mapOffset, ~0UL, vecs, &vecCount);
215 				if (status < B_OK && status != B_BUFFER_OVERFLOW) {
216 					mutex_unlock(&ref->cache->lock);
217 					return status;
218 				}
219 
220 				ref->map.Add(vecs, vecCount);
221 
222 				if (status != B_BUFFER_OVERFLOW)
223 					break;
224 
225 				// if we get here, the vecs have been added to the map, but the
226 				// vecs array was too small to cover the whole file at once
227 				file_io_vec *last = &vecs[vecCount - 1];
228 				mapOffset += last->length;
229 				vecCount = maxVecs;
230 			}
231 		}
232 
233 		mutex_unlock(&ref->cache->lock);
234 	}
235 
236 	// The map of this file is now cached; we need to translate it
237 	// for the requested access.
238 
239 	uint32 index;
240 	file_extent *fileExtent = find_file_extent(ref, offset, &index);
241 	if (fileExtent == NULL) {
242 		// access outside file bounds? But that's not our problem
243 		*_count = 0;
244 		return B_OK;
245 	}
246 
247 	offset -= fileExtent->offset;
248 	vecs[0].offset = fileExtent->disk.offset + offset;
249 	vecs[0].length = fileExtent->disk.length - offset;
250 
251 	if (vecs[0].length >= size || index >= ref->map.count - 1) {
252 		*_count = 1;
253 		return B_OK;
254 	}
255 
256 	// copy the rest of the vecs
257 
258 	size -= vecs[0].length;
259 
260 	for (index = 1; index < ref->map.count;) {
261 		fileExtent++;
262 
263 		vecs[index] = fileExtent->disk;
264 		index++;
265 
266 		if (index >= maxVecs) {
267 			*_count = index;
268 			return B_BUFFER_OVERFLOW;
269 		}
270 
271 		if (size <= fileExtent->disk.length)
272 			break;
273 
274 		size -= fileExtent->disk.length;
275 	}
276 
277 	*_count = index;
278 	return B_OK;
279 }
280 
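/**	Reads from or writes to the underlying device directly, bypassing the
 *	page cache. The memory described by \a vecs is mapped to its on-disk
 *	location(s) via get_file_map() and transferred with vfs_read_pages()
 *	or vfs_write_pages(), one file_io_vec at a time. If fewer than the
 *	requested bytes could be transferred, \a _numBytes is updated accordingly.
 */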
281 
282 static status_t
283 pages_io(file_cache_ref *ref, off_t offset, const iovec *vecs, size_t count,
284 	size_t *_numBytes, bool doWrite)
285 {
286 	TRACE(("pages_io: ref = %p, offset = %Ld, size = %lu, %s\n", ref, offset,
287 		*_numBytes, doWrite ? "write" : "read"));
288 
289 	// translate the iovecs into direct device accesses
290 	file_io_vec fileVecs[MAX_FILE_IO_VECS];
291 	size_t fileVecCount = MAX_FILE_IO_VECS;
292 	size_t numBytes = *_numBytes;
293 
294 	status_t status = get_file_map(ref, offset, numBytes, fileVecs, &fileVecCount);
295 	if (status < B_OK)
296 		return status;
297 
298 	// ToDo: handle array overflow gracefully!
299 
300 #ifdef TRACE_FILE_CACHE
301 	dprintf("got %lu file vecs for %Ld:%lu:\n", fileVecCount, offset, numBytes);
302 	for (size_t i = 0; i < fileVecCount; i++)
303 		dprintf("[%lu] offset = %Ld, size = %Ld\n", i, fileVecs[i].offset, fileVecs[i].length);
304 #endif
305 
306 	uint32 fileVecIndex;
307 	size_t size;
308 
309 	if (!doWrite) {
310 		// now directly read the data from the device
311 		// the first file_io_vec can be read directly
312 
313 		size = fileVecs[0].length;
314 		if (size > numBytes)
315 			size = numBytes;
316 
317 		status = vfs_read_pages(ref->device, ref->cookie, fileVecs[0].offset, vecs, count, &size);
318 		if (status < B_OK)
319 			return status;
320 
321 		// ToDo: this is a work-around for buggy device drivers!
322 		//	When our own drivers honour the length, we can:
323 		//	a) also use this direct I/O for writes (otherwise, it would overwrite precious data)
324 		//	b) panic if the condition below is true (at least for writes)
325 		if (size > fileVecs[0].length) {
326 			dprintf("warning: device driver %p doesn't respect total length in read_pages() call!\n", ref->device);
327 			size = fileVecs[0].length;
328 		}
329 
330 		ASSERT(size <= fileVecs[0].length);
331 
332 		// If the file portion was contiguous, we're already done now
333 		if (size == numBytes)
334 			return B_OK;
335 
336 		// if we reached the end of the file, we can return as well
337 		if (size != fileVecs[0].length) {
338 			*_numBytes = size;
339 			return B_OK;
340 		}
341 
342 		fileVecIndex = 1;
343 	} else {
344 		fileVecIndex = 0;
345 		size = 0;
346 	}
347 
348 	// Too bad, let's process the rest of the file_io_vecs
349 
350 	size_t totalSize = size;
351 
352 	// first, find out where we have to continue in our iovecs
353 	uint32 i = 0;
354 	for (; i < count; i++) {
355 		if (size <= vecs[i].iov_len)
356 			break;
357 
358 		size -= vecs[i].iov_len;
359 	}
360 
361 	size_t vecOffset = size;
362 
363 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
364 		file_io_vec &fileVec = fileVecs[fileVecIndex];
365 		iovec tempVecs[8];
366 		uint32 tempCount = 1;
367 
368 		tempVecs[0].iov_base = (void *)((addr_t)vecs[i].iov_base + vecOffset);
369 
370 		size = min_c(vecs[i].iov_len - vecOffset, fileVec.length);
371 		tempVecs[0].iov_len = size;
372 
373 		TRACE(("fill vec %ld, offset = %lu, size = %lu\n", i, vecOffset, size));
374 
375 		if (size >= fileVec.length)
376 			vecOffset += size;
377 		else
378 			vecOffset = 0;
379 
380 		while (size < fileVec.length && ++i < count) {
381 			tempVecs[tempCount].iov_base = vecs[i].iov_base;
382 
383 			// is this iovec larger than the file_io_vec?
384 			if (vecs[i].iov_len + size > fileVec.length) {
385 				// assign iov_len to the current element, then advance tempCount
386 				size += tempVecs[tempCount++].iov_len = vecOffset = fileVec.length - size;
387 				break;
388 			}
389 
390 			size += tempVecs[tempCount++].iov_len = vecs[i].iov_len;
391 		}
392 
393 		size_t bytes = size;
394 		if (doWrite)
395 			status = vfs_write_pages(ref->device, ref->cookie, fileVec.offset, tempVecs, tempCount, &bytes);
396 		else
397 			status = vfs_read_pages(ref->device, ref->cookie, fileVec.offset, tempVecs, tempCount, &bytes);
398 		if (status < B_OK)
399 			return status;
400 
401 		totalSize += size;
402 
403 		if (size != bytes) {
404 			// there are no more bytes, let's bail out
405 			*_numBytes = totalSize;
406 			return B_OK;
407 		}
408 	}
409 
410 	return B_OK;
411 }
412 
413 
414 /**	This function is called by read_into_cache() (and from there only) - it
415  *	can only handle a certain number of bytes at a time, and read_into_cache()
416  *	makes sure that each request matches that criterion.
417  */
418 
419 static inline status_t
420 read_chunk_into_cache(file_cache_ref *ref, off_t offset, size_t size,
421 	int32 pageOffset, addr_t buffer, size_t bufferSize)
422 {
423 	TRACE(("read_chunk(offset = %Ld, size = %lu, pageOffset = %ld, buffer = %#lx, bufferSize = %lu\n",
424 		offset, size, pageOffset, buffer, bufferSize));
425 
426 	vm_cache_ref *cache = ref->cache;
427 
428 	iovec vecs[MAX_IO_VECS];
429 	int32 vecCount = 0;
430 
431 	vm_page *pages[MAX_IO_VECS];
432 	int32 pageIndex = 0;
433 
434 	// allocate pages for the cache and mark them busy
435 	for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE) {
436 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
437 		if (page == NULL)
438 			panic("no more pages!");
439 
440 		page->state = PAGE_STATE_BUSY;
441 
442 		vm_cache_insert_page(cache, page, offset + pos);
443 
444 		addr_t virtualAddress;
445 		if (vm_get_physical_page(page->ppn * B_PAGE_SIZE, &virtualAddress, PHYSICAL_PAGE_CAN_WAIT) < B_OK)
446 			panic("could not get physical page");
447 
448 		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
449 		// ToDo: check if the array is large enough!
450 	}
451 
452 	mutex_unlock(&cache->lock);
453 
454 	// read file into reserved pages
455 	status_t status = pages_io(ref, offset, vecs, vecCount, &size, false);
456 	if (status < B_OK) {
457 		// ToDo: remove allocated pages...
458 		panic("file_cache: remove allocated pages! read pages failed: %s\n", strerror(status));
459 		mutex_lock(&cache->lock);
460 		return status;
461 	}
462 
463 	// copy the pages and unmap them again
464 
465 	for (int32 i = 0; i < vecCount; i++) {
466 		addr_t base = (addr_t)vecs[i].iov_base;
467 		size_t size = vecs[i].iov_len;
468 
469 		// copy to user buffer if necessary
470 		if (bufferSize != 0) {
471 			size_t bytes = min_c(bufferSize, size - pageOffset);
472 
473 			user_memcpy((void *)buffer, (void *)(base + pageOffset), bytes);
474 			buffer += bytes;
475 			bufferSize -= bytes;
476 			pageOffset = 0;
477 		}
478 
479 		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
480 			vm_put_physical_page(base);
481 	}
482 
483 	mutex_lock(&cache->lock);
484 
485 	// make the pages accessible in the cache
486 	for (int32 i = pageIndex; i-- > 0;)
487 		pages[i]->state = PAGE_STATE_ACTIVE;
488 
489 	return B_OK;
490 }
491 
492 
493 /**	This function reads \a size bytes directly from the file into the cache.
494  *	If \a bufferSize does not equal zero, \a bufferSize bytes from the data
495  *	read in are also copied to the provided \a buffer.
496  *	This function always allocates all pages; it is the responsibility of
497  *	the caller to request only ranges that are not yet cached.
498  *	The cache_ref lock must be held when calling this function.
499  */
500 
501 static status_t
502 read_into_cache(file_cache_ref *ref, off_t offset, size_t size, addr_t buffer, size_t bufferSize)
503 {
504 	TRACE(("read_into_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
505 		ref, offset, size, (void *)buffer, bufferSize));
506 
507 	// do we have to read in anything at all?
508 	if (size == 0)
509 		return B_OK;
510 
511 	// make sure "offset" is page aligned - but also remember the page offset
512 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
513 	size = PAGE_ALIGN(size + pageOffset);
514 	offset -= pageOffset;
515 
516 	while (true) {
517 		size_t chunkSize = size;
518 		if (chunkSize > (MAX_IO_VECS * B_PAGE_SIZE))
519 			chunkSize = MAX_IO_VECS * B_PAGE_SIZE;
520 
521 		status_t status = read_chunk_into_cache(ref, offset, chunkSize, pageOffset,
522 								buffer, bufferSize);
523 		if (status != B_OK)
524 			return status;
525 
526 		if ((size -= chunkSize) == 0)
527 			return B_OK;
528 
529 		if (chunkSize >= bufferSize) {
530 			bufferSize = 0;
531 			buffer = NULL;
532 		} else {
533 			bufferSize -= chunkSize - pageOffset;
534 			buffer += chunkSize - pageOffset;
535 		}
536 
537 		offset += chunkSize;
538 		pageOffset = 0;
539 	}
540 
541 	return B_OK;
542 }
543 
544 
545 /**	Like read_chunk_into_cache() but writes data into the cache */
546 
547 static inline status_t
548 write_chunk_to_cache(file_cache_ref *ref, off_t offset, size_t size,
549 	int32 pageOffset, addr_t buffer, size_t bufferSize)
550 {
551 	iovec vecs[MAX_IO_VECS];
552 	int32 vecCount = 0;
553 	vm_page *pages[MAX_IO_VECS];
554 	int32 pageIndex = 0;
555 	status_t status = B_OK;
556 
557 	// ToDo: this should be settable somewhere
558 	bool writeThrough = false;
559 
560 	// allocate pages for the cache and mark them busy
561 	for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE) {
562 		// ToDo: if space is becoming tight and this cache has already grown
563 		//	big, shouldn't we rather steal the pages directly in that case?
564 		//	(a working-set-like approach for the file cache)
565 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
566 		page->state = PAGE_STATE_BUSY;
567 
568 		vm_cache_insert_page(ref->cache, page, offset + pos);
569 
570 		addr_t virtualAddress;
571 		vm_get_physical_page(page->ppn * B_PAGE_SIZE, &virtualAddress,
572 			PHYSICAL_PAGE_CAN_WAIT);
573 
574 		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
575 		// ToDo: check if the array is large enough!
576 	}
577 
578 	mutex_unlock(&ref->cache->lock);
579 
580 	// copy contents (and read in partially written pages first)
581 
582 	if (pageOffset != 0) {
583 		// This is only a partial write, so we have to read the rest of the page
584 		// from the file to have consistent data in the cache
585 		iovec readVec = { vecs[0].iov_base, B_PAGE_SIZE };
586 		size_t bytesRead = B_PAGE_SIZE;
587 
588 		status = pages_io(ref, offset, &readVec, 1, &bytesRead, false);
589 		// ToDo: handle errors for real!
590 		if (status < B_OK)
591 			panic("pages_io() failed!\n");
592 	}
593 
594 	addr_t lastPageOffset = (pageOffset + bufferSize) & (B_PAGE_SIZE - 1);
595 	if (lastPageOffset != 0) {
596 		// get the last page in the I/O vectors
597 		addr_t last = (addr_t)vecs[vecCount - 1].iov_base
598 			+ vecs[vecCount - 1].iov_len - B_PAGE_SIZE;
599 
600 		if (offset + pageOffset + bufferSize == ref->cache->cache->virtual_size) {
601 			// the space in the page after this write action needs to be cleared
602 			memset((void *)(last + lastPageOffset), 0, B_PAGE_SIZE - lastPageOffset);
603 		} else if (vecCount > 1) {
604 			// the end of this write does not happen on a page boundary, so we
605 			// need to fetch the last page before we can update it
606 			iovec readVec = { (void *)last, B_PAGE_SIZE };
607 			size_t bytesRead = B_PAGE_SIZE;
608 
609 			status = pages_io(ref, offset + size - B_PAGE_SIZE, &readVec, 1,
610 				&bytesRead, false);
611 			// ToDo: handle errors for real!
612 			if (status < B_OK)
613 				panic("pages_io() failed!\n");
614 		}
615 	}
616 
617 	for (int32 i = 0; i < vecCount; i++) {
618 		addr_t base = (addr_t)vecs[i].iov_base;
619 		size_t bytes = min_c(bufferSize, size_t(vecs[i].iov_len - pageOffset));
620 
621 		// copy data from user buffer
622 		user_memcpy((void *)(base + pageOffset), (void *)buffer, bytes);
623 
624 		bufferSize -= bytes;
625 		if (bufferSize == 0)
626 			break;
627 
628 		buffer += bytes;
629 		pageOffset = 0;
630 	}
631 
632 	if (writeThrough) {
633 		// write cached pages back to the file if we were asked to do that
634 		status_t status = pages_io(ref, offset, vecs, vecCount, &size, true);
635 		if (status < B_OK) {
636 			// ToDo: remove allocated pages, ...?
637 			panic("file_cache: remove allocated pages! write pages failed: %s\n",
638 				strerror(status));
639 		}
640 	}
641 
642 	mutex_lock(&ref->cache->lock);
643 
644 	// unmap the pages again
645 
646 	for (int32 i = 0; i < vecCount; i++) {
647 		addr_t base = (addr_t)vecs[i].iov_base;
648 		size_t size = vecs[i].iov_len;
649 		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
650 			vm_put_physical_page(base);
651 	}
652 
653 	// make the pages accessible in the cache
654 	for (int32 i = pageIndex; i-- > 0;) {
655 		if (writeThrough)
656 			pages[i]->state = PAGE_STATE_ACTIVE;
657 		else
658 			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
659 	}
660 
661 	return status;
662 }
663 
664 
665 /**	Like read_into_cache() but writes data into the cache. To preserve data
666  *	consistency, it might also read pages into the cache if only a partial page
667  *	gets written. The cache_ref lock must be held when calling this function.
668  */
669 
670 static status_t
671 write_to_cache(file_cache_ref *ref, off_t offset, size_t size, addr_t buffer, size_t bufferSize)
672 {
673 	TRACE(("write_to_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
674 		ref, offset, size, (void *)buffer, bufferSize));
675 
676 	// make sure "offset" is page aligned - but also remember the page offset
677 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
678 	size = PAGE_ALIGN(size + pageOffset);
679 	offset -= pageOffset;
680 
681 	while (true) {
682 		size_t chunkSize = size;
683 		if (chunkSize > (MAX_IO_VECS * B_PAGE_SIZE))
684 			chunkSize = MAX_IO_VECS * B_PAGE_SIZE;
685 
686 		status_t status = write_chunk_to_cache(ref, offset, chunkSize, pageOffset, buffer, bufferSize);
687 		if (status != B_OK)
688 			return status;
689 
690 		if ((size -= chunkSize) == 0)
691 			return B_OK;
692 
693 		if (chunkSize >= bufferSize) {
694 			bufferSize = 0;
695 			buffer = NULL;
696 		} else {
697 			bufferSize -= chunkSize - pageOffset;
698 			buffer += chunkSize - pageOffset;
699 		}
700 
701 		offset += chunkSize;
702 		pageOffset = 0;
703 	}
704 
705 	return B_OK;
706 }
707 
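/**	Does the actual work for both file_cache_read() and file_cache_write().
 *	The request is clipped to the file size and then walked page by page:
 *	pages already present in the cache are copied to/from the buffer right
 *	away, while contiguous runs of missing pages are handed to
 *	read_into_cache() or write_to_cache() in a single call.
 */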
708 
709 static status_t
710 cache_io(void *_cacheRef, off_t offset, addr_t buffer, size_t *_size, bool doWrite)
711 {
712 	if (_cacheRef == NULL)
713 		panic("cache_io() called with NULL ref!\n");
714 
715 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
716 	vm_cache_ref *cache = ref->cache;
717 	off_t fileSize = cache->cache->virtual_size;
718 
719 	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
720 		ref, offset, (void *)buffer, *_size, doWrite ? "write" : "read"));
721 
722 	// out of bounds access?
723 	if (offset >= fileSize || offset < 0) {
724 		*_size = 0;
725 		return B_OK;
726 	}
727 
728 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
729 	size_t size = *_size;
730 	offset -= pageOffset;
731 
732 	if (offset + pageOffset + size > fileSize) {
733 		// clamp the size so that it does not reach beyond the end of the file
734 		size = fileSize - pageOffset - offset;
735 		*_size = size;
736 	}
737 
738 	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE,
739 	// the "last*" variables always point to the end of the last
740 	// satisfied request part
741 
742 	size_t bytesLeft = size, lastLeft = size;
743 	int32 lastPageOffset = pageOffset;
744 	addr_t lastBuffer = buffer;
745 	off_t lastOffset = offset;
746 
747 	mutex_lock(&cache->lock);
748 
749 	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
750 		// check if this page is already in memory
751 		addr_t virtualAddress;
752 	restart:
753 		vm_page *page = vm_cache_lookup_page(cache, offset);
754 		if (page != NULL && page->state == PAGE_STATE_BUSY) {
755 			// ToDo: don't wait forever!
756 			mutex_unlock(&cache->lock);
757 			snooze(20000);
758 			mutex_lock(&cache->lock);
759 			goto restart;
760 		}
761 
762 		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);
763 
764 		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset = %lu\n", offset, page, bytesLeft, pageOffset));
765 		if (page != NULL
766 			&& vm_get_physical_page(page->ppn * B_PAGE_SIZE,
767 					&virtualAddress, PHYSICAL_PAGE_CAN_WAIT) == B_OK) {
768 			// it is, so let's satisfy the first part of the request, if we have to
769 			if (lastBuffer != buffer) {
770 				size_t requestSize = buffer - lastBuffer;
771 				status_t status;
772 				if (doWrite) {
773 					status = write_to_cache(ref, lastOffset + lastPageOffset,
774 						requestSize, lastBuffer, requestSize);
775 				} else {
776 					status = read_into_cache(ref, lastOffset + lastPageOffset,
777 						requestSize, lastBuffer, requestSize);
778 				}
779 				if (status != B_OK) {
780 					vm_put_physical_page(virtualAddress);
781 					mutex_unlock(&cache->lock);
782 					return B_IO_ERROR;
783 				}
784 			}
785 
786 			// and copy the contents of the page already in memory
787 			if (doWrite) {
788 				user_memcpy((void *)(virtualAddress + pageOffset), (void *)buffer, bytesInPage);
789 
790 				// make sure the page is in the modified list
791 				if (page->state != PAGE_STATE_MODIFIED)
792 					vm_page_set_state(page, PAGE_STATE_MODIFIED);
793 			} else
794 				user_memcpy((void *)buffer, (void *)(virtualAddress + pageOffset), bytesInPage);
795 
796 			vm_put_physical_page(virtualAddress);
797 
798 			if (bytesLeft <= bytesInPage) {
799 				// we've read the last page, so we're done!
800 				mutex_unlock(&cache->lock);
801 				return B_OK;
802 			}
803 
804 			// prepare a potential gap request
805 			lastBuffer = buffer + bytesInPage;
806 			lastLeft = bytesLeft - bytesInPage;
807 			lastOffset = offset + B_PAGE_SIZE;
808 			lastPageOffset = 0;
809 		}
810 
811 		if (bytesLeft <= bytesInPage)
812 			break;
813 
814 		buffer += bytesInPage;
815 		bytesLeft -= bytesInPage;
816 		pageOffset = 0;
817 	}
818 
819 	// fill the last remaining bytes of the request (either write or read)
820 
821 	status_t status;
822 	if (doWrite)
823 		status = write_to_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
824 	else
825 		status = read_into_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
826 
827 	mutex_unlock(&cache->lock);
828 	return status;
829 }
830 
831 
832 static status_t
833 file_cache_control(const char *subsystem, uint32 function, void *buffer, size_t bufferSize)
834 {
835 	switch (function) {
836 		case CACHE_CLEAR:
837 			// ToDo: clear the cache
838 			dprintf("cache_control: clear cache!\n");
839 			break;
840 		case CACHE_SET_MODULE:
841 		{
842 			cache_module_info *module = sCacheModule;
843 
844 			// unset previous module
845 
846 			if (sCacheModule != NULL) {
847 				sCacheModule = NULL;
848 				snooze(100000);	// 0.1 secs
849 				put_module(module->info.name);
850 			}
851 
852 			// get new module, if any
853 
854 			if (buffer == NULL)
855 				break;
856 
857 			char name[B_FILE_NAME_LENGTH];
858 			if (!IS_USER_ADDRESS(buffer)
859 				|| user_strlcpy(name, (char *)buffer, B_FILE_NAME_LENGTH) < B_OK)
860 				return B_BAD_ADDRESS;
861 
862 			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
863 				return B_BAD_VALUE;
864 
865 			dprintf("cache_control: set module %s!\n", name);
866 
867 			if (get_module(name, (module_info **)&module) == B_OK)
868 				sCacheModule = module;
869 			break;
870 		}
871 	}
872 
873 	return B_OK;
874 }
875 
876 
877 //	#pragma mark -
878 //	kernel public API
879 
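/**	Reads the given file range into the cache ahead of time, skipping runs
 *	of pages that are already present. At most 4 MB are prefetched at once,
 *	and the range is clipped to the file size.
 */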
880 
881 extern "C" void
882 cache_prefetch_vnode(void *vnode, off_t offset, size_t size)
883 {
884 	vm_cache_ref *cache;
885 	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
886 		return;
887 
888 	file_cache_ref *ref = (struct file_cache_ref *)((vnode_store *)cache->cache->store)->file_cache_ref;
889 	off_t fileSize = cache->cache->virtual_size;
890 
891 	if (size > fileSize)
892 		size = fileSize;
893 
894 	// we never fetch more than 4 MB at once
895 	if (size > 4 * 1024 * 1024)
896 		size = 4 * 1024 * 1024;
897 
898 	size_t bytesLeft = size, lastLeft = size;
899 	off_t lastOffset = offset;
900 	size_t lastSize = 0;
901 
902 	mutex_lock(&cache->lock);
903 
904 	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
905 		// check if this page is already in memory
906 		addr_t virtualAddress;
907 	restart:
908 		vm_page *page = vm_cache_lookup_page(cache, offset);
909 		if (page != NULL) {
910 			// it is, so let's satisfy the first part of the request
911 			if (lastOffset < offset) {
912 				size_t requestSize = offset - lastOffset;
913 				read_into_cache(ref, lastOffset, requestSize, NULL, 0);
914 			}
915 
916 			if (bytesLeft <= B_PAGE_SIZE) {
917 				// we've read the last page, so we're done!
918 				goto out;
919 			}
920 
921 			// prepare a potential gap request
922 			lastOffset = offset + B_PAGE_SIZE;
923 			lastLeft = bytesLeft - B_PAGE_SIZE;
924 		}
925 
926 		if (bytesLeft <= B_PAGE_SIZE)
927 			break;
928 
929 		bytesLeft -= B_PAGE_SIZE;
930 	}
931 
932 	// read in the last part
933 	read_into_cache(ref, lastOffset, lastLeft, NULL, 0);
934 
935 out:
936 	mutex_unlock(&cache->lock);
937 }
938 
939 
940 extern "C" void
941 cache_prefetch(mount_id mountID, vnode_id vnodeID, off_t offset, size_t size)
942 {
943 	void *vnode;
944 
945 	// ToDo: schedule prefetch
946 
947 	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));
948 
949 	// get the vnode for the object; this also grabs a ref to it
950 	if (vfs_get_vnode(mountID, vnodeID, &vnode) != B_OK)
951 		return;
952 
953 	cache_prefetch_vnode(vnode, offset, size);
954 	vfs_put_vnode(vnode);
955 }
956 
957 
958 extern "C" void
959 cache_node_opened(void *vnode, int32 fdType, vm_cache_ref *cache, mount_id mountID,
960 	vnode_id parentID, vnode_id vnodeID, const char *name)
961 {
962 	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
963 		return;
964 
965 	off_t size = -1;
966 	if (cache != NULL) {
967 		file_cache_ref *ref = (file_cache_ref *)((vnode_store *)cache->cache->store)->file_cache_ref;
968 		if (ref != NULL)
969 			size = ref->cache->cache->virtual_size;
970 	}
971 
972 	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name, size);
973 }
974 
975 
976 extern "C" void
977 cache_node_closed(void *vnode, int32 fdType, vm_cache_ref *cache,
978 	mount_id mountID, vnode_id vnodeID)
979 {
980 	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
981 		return;
982 
983 	int32 accessType = 0;
984 	if (cache != NULL) {
985 		// ToDo: set accessType
986 	}
987 
988 	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
989 }
990 
991 
992 extern "C" void
993 cache_node_launched(size_t argCount, char * const *args)
994 {
995 	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
996 		return;
997 
998 	sCacheModule->node_launched(argCount, args);
999 }
1000 
1001 
1002 extern "C" status_t
1003 file_cache_init_post_boot_device(void)
1004 {
1005 	// ToDo: get cache module out of driver settings
1006 
1007 	if (get_module("file_cache/launch_speedup/v1", (module_info **)&sCacheModule) == B_OK) {
1008 		dprintf("** opened launch speedup: %Ld\n", system_time());
1009 	} else
1010 		dprintf("** could not open launch speedup!\n");
1011 
1012 	return B_OK;
1013 }
1014 
1015 
1016 extern "C" status_t
1017 file_cache_init(void)
1018 {
1019 	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
1020 	return B_OK;
1021 }
1022 
1023 
1024 //	#pragma mark -
1025 //	public FS API
1026 
1027 
1028 extern "C" void *
1029 file_cache_create(mount_id mountID, vnode_id vnodeID, off_t size, int fd)
1030 {
1031 	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld, fd = %d)\n", mountID, vnodeID, size, fd));
1032 
1033 	file_cache_ref *ref = new file_cache_ref;
1034 	if (ref == NULL)
1035 		return NULL;
1036 
1037 	// ToDo: delay vm_cache/vm_cache_ref creation until data is
1038 	//	requested/written for the first time? Listing lots of
1039 	//	files in Tracker (and elsewhere) could be slowed down.
1040 	//	Since the file_cache_ref itself doesn't have a lock,
1041 	//	we would need to "rent" one during construction, possibly
1042 	//	the vnode lock, maybe a dedicated one.
1043 	//	As there shouldn't be too much contention, we could also
1044 	//	use atomic_test_and_set(), and free the resources again
1045 	//	when that fails...
1046 
1047 	// get the vnode of the underlying device
1048 	if (vfs_get_vnode_from_fd(fd, true, &ref->device) != B_OK)
1049 		goto err1;
1050 
1051 	// we also need the cookie of the underlying device to properly access it
1052 	if (vfs_get_cookie_from_fd(fd, &ref->cookie) != B_OK)
1053 		goto err2;
1054 
1055 	// get the vnode for the object (note, this does not grab a reference to the node)
1056 	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
1057 		goto err2;
1058 
1059 	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
1060 		goto err3;
1061 
1062 	ref->cache->cache->virtual_size = size;
1063 	((vnode_store *)ref->cache->cache->store)->file_cache_ref = ref;
1064 	return ref;
1065 
1066 err3:
1067 	vfs_put_vnode(ref->vnode);
1068 err2:
1069 	vfs_put_vnode(ref->device);
1070 err1:
1071 	delete ref;
1072 	return NULL;
1073 }
1074 
1075 
1076 extern "C" void
1077 file_cache_delete(void *_cacheRef)
1078 {
1079 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1080 
1081 	if (ref == NULL)
1082 		return;
1083 
1084 	TRACE(("file_cache_delete(ref = %p)\n", ref));
1085 
1086 	vfs_put_vnode(ref->device);
1087 	delete ref;
1088 }
1089 
1090 
1091 extern "C" status_t
1092 file_cache_set_size(void *_cacheRef, off_t size)
1093 {
1094 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1095 
1096 	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, size));
1097 
1098 	if (ref == NULL)
1099 		return B_OK;
1100 
1101 	file_cache_invalidate_file_map(_cacheRef, 0, size);
1102 		// ToDo: make this better (we would only need to extend or shrink the map)
1103 
1104 	mutex_lock(&ref->cache->lock);
1105 	status_t status = vm_cache_resize(ref->cache, size);
1106 	mutex_unlock(&ref->cache->lock);
1107 
1108 	return status;
1109 }
1110 
1111 
1112 extern "C" status_t
1113 file_cache_sync(void *_cacheRef)
1114 {
1115 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1116 	if (ref == NULL)
1117 		return B_BAD_VALUE;
1118 
1119 	return vm_cache_write_modified(ref->cache);
1120 }
1121 
1122 
1123 extern "C" status_t
1124 file_cache_read_pages(void *_cacheRef, off_t offset, const iovec *vecs, size_t count, size_t *_numBytes)
1125 {
1126 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1127 
1128 	return pages_io(ref, offset, vecs, count, _numBytes, false);
1129 }
1130 
1131 
1132 extern "C" status_t
1133 file_cache_write_pages(void *_cacheRef, off_t offset, const iovec *vecs, size_t count, size_t *_numBytes)
1134 {
1135 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1136 
1137 	status_t status = pages_io(ref, offset, vecs, count, _numBytes, true);
1138 	TRACE(("file_cache_write_pages(ref = %p, offset = %Ld, vecs = %p, count = %lu, bytes = %lu) = %ld\n",
1139 		ref, offset, vecs, count, *_numBytes, status));
1140 
1141 	return status;
1142 }
1143 
1144 
1145 extern "C" status_t
1146 file_cache_read(void *_cacheRef, off_t offset, void *bufferBase, size_t *_size)
1147 {
1148 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1149 
1150 	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
1151 		ref, offset, bufferBase, *_size));
1152 
1153 	return cache_io(ref, offset, (addr_t)bufferBase, _size, false);
1154 }
1155 
1156 
1157 extern "C" status_t
1158 file_cache_write(void *_cacheRef, off_t offset, const void *buffer, size_t *_size)
1159 {
1160 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1161 
1162 	status_t status = cache_io(ref, offset, (addr_t)const_cast<void *>(buffer), _size, true);
1163 	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu) = %ld\n",
1164 		ref, offset, buffer, *_size, status));
1165 
1166 	return status;
1167 }
1168 
1169 
1170 extern "C" status_t
1171 file_cache_invalidate_file_map(void *_cacheRef, off_t offset, off_t size)
1172 {
1173 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1174 
1175 	// ToDo: honour offset/size parameters
1176 
1177 	TRACE(("file_cache_invalidate_file_map(offset = %Ld, size = %Ld)\n", offset, size));
1178 	mutex_lock(&ref->cache->lock);
1179 	ref->map.Free();
1180 	mutex_unlock(&ref->cache->lock);
1181 	return B_OK;
1182 }
1183