xref: /haiku/src/system/kernel/cache/file_cache.cpp (revision 9d6d3fcf5fe8308cd020cecf89dede440346f8c4)
1 /*
2  * Copyright 2004-2006, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "vnode_store.h"
8 
9 #include <KernelExport.h>
10 #include <fs_cache.h>
11 
12 #include <util/kernel_cpp.h>
13 #include <file_cache.h>
14 #include <vfs.h>
15 #include <vm.h>
16 #include <vm_page.h>
17 #include <vm_cache.h>
18 #include <generic_syscall.h>
19 
20 #include <unistd.h>
21 #include <stdlib.h>
22 #include <string.h>
23 
24 
25 //#define TRACE_FILE_CACHE
26 #ifdef TRACE_FILE_CACHE
27 #	define TRACE(x) dprintf x
28 #else
29 #	define TRACE(x) ;
30 #endif
31 
32 // maximum number of iovecs per request
33 #define MAX_IO_VECS			64	// 256 kB
34 #define MAX_FILE_IO_VECS	32
35 
36 #define CACHED_FILE_EXTENTS	2
37 	// must be smaller than MAX_FILE_IO_VECS
38 	// ToDo: find out how many of these are typically used
39 
40 struct file_extent {
41 	off_t			offset;
42 	file_io_vec		disk;
43 };
44 
45 struct file_map {
46 	file_map();
47 	~file_map();
48 
49 	file_extent *operator[](uint32 index);
50 	file_extent *ExtentAt(uint32 index);
51 	status_t Add(file_io_vec *vecs, size_t vecCount, off_t &lastOffset);
52 	void Free();
53 
54 	union {
55 		file_extent	direct[CACHED_FILE_EXTENTS];
56 		file_extent	*array;
57 	};
58 	size_t			count;
59 };
60 
61 struct file_cache_ref {
62 	vm_cache_ref	*cache;
63 	void			*vnode;
64 	void			*device;
65 	void			*cookie;
66 	file_map		map;
67 };
68 
69 
70 static struct cache_module_info *sCacheModule;
71 
72 
73 file_map::file_map()
74 {
75 	array = NULL;
76 	count = 0;
77 }
78 
79 
80 file_map::~file_map()
81 {
82 	Free();
83 }
84 
85 
86 file_extent *
87 file_map::operator[](uint32 index)
88 {
89 	return ExtentAt(index);
90 }
91 
92 
93 file_extent *
94 file_map::ExtentAt(uint32 index)
95 {
96 	if (index >= count)
97 		return NULL;
98 
99 	if (count > CACHED_FILE_EXTENTS)
100 		return &array[index];
101 
102 	return &direct[index];
103 }
104 
105 
106 status_t
107 file_map::Add(file_io_vec *vecs, size_t vecCount, off_t &lastOffset)
108 {
109 	TRACE(("file_map::Add(vecCount = %ld)\n", vecCount));
110 
111 	off_t offset = 0;
112 
113 	if (vecCount <= CACHED_FILE_EXTENTS && count == 0) {
114 		// just use the reserved area in the file_cache_ref structure
115 	} else {
116 		// TODO: once we can invalidate only parts of the file map,
117 		//	we might need to copy the previously cached file extents
118 		//	from the direct range
119 		file_extent *newMap = (file_extent *)realloc(array,
120 			(count + vecCount) * sizeof(file_extent));
121 		if (newMap == NULL)
122 			return B_NO_MEMORY;
123 
124 		array = newMap;
125 
126 		if (count != 0) {
127 			file_extent *extent = ExtentAt(count - 1);
128 			offset = extent->offset + extent->disk.length;
129 		}
130 	}
131 
132 	int32 start = count;
133 	count += vecCount;
134 
135 	for (uint32 i = 0; i < vecCount; i++) {
136 		file_extent *extent = ExtentAt(start + i);
137 
138 		extent->offset = offset;
139 		extent->disk = vecs[i];
140 
141 		offset += extent->disk.length;
142 	}
143 
144 #ifdef TRACE_FILE_CACHE
145 	for (uint32 i = 0; i < count; i++) {
146 		file_extent *extent = ExtentAt(i);
147 		dprintf("[%ld] extent offset %Ld, disk offset %Ld, length %Ld\n",
148 			i, extent->offset, extent->disk.offset, extent->disk.length);
149 	}
150 #endif
151 
152 	lastOffset = offset;
153 	return B_OK;
154 }
155 
156 
157 void
158 file_map::Free()
159 {
160 	if (count > CACHED_FILE_EXTENTS)
161 		free(array);
162 
163 	array = NULL;
164 	count = 0;
165 }
166 
167 
168 //	#pragma mark -
169 
170 
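/**	Appends the given memory range to the iovec array. If the range is
 *	contiguous with the previous entry, the two are merged into a single
 *	iovec; otherwise a new entry is started. Panics if more than \a max
 *	entries would be needed.
 */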
171 static void
172 add_to_iovec(iovec *vecs, int32 &index, int32 max, addr_t address, size_t size)
173 {
174 	if (index > 0 && (addr_t)vecs[index - 1].iov_base + vecs[index - 1].iov_len == address) {
175 		// the iovec can be combined with the previous one
176 		vecs[index - 1].iov_len += size;
177 		return;
178 	}
179 
180 	if (index == max)
181 		panic("no more space for iovecs!");
182 
183 	// we need to start a new iovec
184 	vecs[index].iov_base = (void *)address;
185 	vecs[index].iov_len = size;
186 	index++;
187 }
188 
189 
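/**	Returns the cached file extent that covers the given file \a offset,
 *	or NULL if the offset lies outside the file map. If \a _index is not
 *	NULL, the index of the extent within the map is returned as well.
 */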
190 static file_extent *
191 find_file_extent(file_cache_ref *ref, off_t offset, uint32 *_index)
192 {
193 	// ToDo: do binary search
194 
195 	for (uint32 index = 0; index < ref->map.count; index++) {
196 		file_extent *extent = ref->map[index];
197 
198 		if (extent->offset <= offset
199 			&& extent->offset + extent->disk.length > offset) {
200 			if (_index)
201 				*_index = index;
202 			return extent;
203 		}
204 	}
205 
206 	return NULL;
207 }
208 
209 
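/**	Translates an access of \a size bytes at file position \a offset into
 *	a list of file_io_vecs describing the corresponding on-disk ranges.
 *	The complete file map is retrieved via vfs_get_file_map() and cached
 *	in the file_cache_ref the first time it is needed.
 *	On return, \a _count holds the number of vecs filled in (0 if the
 *	offset is outside the file bounds); B_BUFFER_OVERFLOW is returned if
 *	more vecs would have been needed than the caller provided.
 */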
210 static status_t
211 get_file_map(file_cache_ref *ref, off_t offset, size_t size,
212 	file_io_vec *vecs, size_t *_count)
213 {
214 	size_t maxVecs = *_count;
215 	status_t status = B_OK;
216 
217 	if (ref->map.count == 0) {
218 		// we don't yet have the map of this file, so let's grab it
219 		// (ordered by offset, so that we can do a binary search on them)
220 
221 		mutex_lock(&ref->cache->lock);
222 
223 		// the file map could have been requested in the meantime
224 		if (ref->map.count == 0) {
225 			size_t vecCount = maxVecs;
226 			off_t mapOffset = 0;
227 
228 			while (true) {
229 				status = vfs_get_file_map(ref->vnode, mapOffset, ~0UL, vecs, &vecCount);
230 				if (status < B_OK && status != B_BUFFER_OVERFLOW) {
231 					mutex_unlock(&ref->cache->lock);
232 					return status;
233 				}
234 
235 				status_t addStatus = ref->map.Add(vecs, vecCount, mapOffset);
236 				if (addStatus != B_OK) {
237 					// only clobber the status in case of failure
238 					status = addStatus;
239 				}
240 
241 				if (status != B_BUFFER_OVERFLOW)
242 					break;
243 
244 				// When we get here, the map has been stored in the array, but
245 				// the array was still too small to cover the whole file
246 				vecCount = maxVecs;
247 			}
248 		}
249 
250 		mutex_unlock(&ref->cache->lock);
251 	}
252 
253 	if (status != B_OK) {
254 		// We must invalidate the (part of the) map we already
255 		// have, as we cannot know if it's complete or not
256 		ref->map.Free();
257 		return status;
258 	}
259 
260 	// We now have cached the map of this file; we need to
261 	// translate it for the requested access.
262 
263 	uint32 index;
264 	file_extent *fileExtent = find_file_extent(ref, offset, &index);
265 	if (fileExtent == NULL) {
266 		// access outside file bounds? But that's not our problem
267 		*_count = 0;
268 		return B_OK;
269 	}
270 
271 	offset -= fileExtent->offset;
272 	vecs[0].offset = fileExtent->disk.offset + offset;
273 	vecs[0].length = fileExtent->disk.length - offset;
274 
275 	if (vecs[0].length >= size || index >= ref->map.count - 1) {
276 		*_count = 1;
277 		return B_OK;
278 	}
279 
280 	// copy the rest of the vecs
281 
282 	size -= vecs[0].length;
283 
284 	for (index = 1; index < ref->map.count;) {
285 		fileExtent++;
286 
287 		vecs[index] = fileExtent->disk;
288 		index++;
289 
290 		if (index >= maxVecs) {
291 			*_count = index;
292 			return B_BUFFER_OVERFLOW;
293 		}
294 
295 		if (size <= fileExtent->disk.length)
296 			break;
297 
298 		size -= fileExtent->disk.length;
299 	}
300 
301 	*_count = index;
302 	return B_OK;
303 }
304 
305 
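/**	Does the actual device I/O for the cache: translates the file \a offset
 *	into device offsets via get_file_map() and reads/writes the given iovecs
 *	directly from/to the underlying device, matching them piecewise against
 *	the resulting file_io_vecs. On return, \a _numBytes contains the number
 *	of bytes actually transferred, which may be less than requested if the
 *	end of the file was reached.
 */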
306 static status_t
307 pages_io(file_cache_ref *ref, off_t offset, const iovec *vecs, size_t count,
308 	size_t *_numBytes, bool doWrite)
309 {
310 	TRACE(("pages_io: ref = %p, offset = %Ld, size = %lu, %s\n", ref, offset,
311 		*_numBytes, doWrite ? "write" : "read"));
312 
313 	// translate the iovecs into direct device accesses
314 	file_io_vec fileVecs[MAX_FILE_IO_VECS];
315 	size_t fileVecCount = MAX_FILE_IO_VECS;
316 	size_t numBytes = *_numBytes;
317 
318 	status_t status = get_file_map(ref, offset, numBytes, fileVecs, &fileVecCount);
319 	if (status < B_OK) {
320 		TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed\n", offset,
321 			numBytes));
322 		return status;
323 	}
324 
325 	// ToDo: handle array overflow gracefully!
326 
327 #ifdef TRACE_FILE_CACHE
328 	dprintf("got %lu file vecs for %Ld:%lu:\n", fileVecCount, offset, numBytes);
329 	for (size_t i = 0; i < fileVecCount; i++)
330 		dprintf("[%lu] offset = %Ld, size = %Ld\n", i, fileVecs[i].offset, fileVecs[i].length);
331 #endif
332 
333 	if (fileVecCount == 0) {
334 		// There are no file vecs at this offset, so we're obviously trying
335 		// to access the file outside of its bounds
336 		TRACE(("pages_io: access outside of vnode %p at offset %Ld\n", ref->vnode, offset));
337 		return B_BAD_VALUE;
338 	}
339 
340 	uint32 fileVecIndex;
341 	size_t size;
342 
343 	if (!doWrite) {
344 		// now directly read the data from the device
345 		// the first file_io_vec can be read directly
346 
347 		size = fileVecs[0].length;
348 		if (size > numBytes)
349 			size = numBytes;
350 
351 		status = vfs_read_pages(ref->device, ref->cookie, fileVecs[0].offset, vecs,
352 			count, &size, false);
353 		if (status < B_OK)
354 			return status;
355 
356 		// ToDo: this is a work-around for buggy device drivers!
357 		//	When our own drivers honour the length, we can:
358 		//	a) also use this direct I/O for writes (otherwise, it would overwrite precious data)
359 		//	b) panic if the term below is true (at least for writes)
360 		if (size > fileVecs[0].length) {
361 			//dprintf("warning: device driver %p doesn't respect total length in read_pages() call!\n", ref->device);
362 			size = fileVecs[0].length;
363 		}
364 
365 		ASSERT(size <= fileVecs[0].length);
366 
367 		// If the file portion was contiguous, we're already done now
368 		if (size == numBytes)
369 			return B_OK;
370 
371 		// if we reached the end of the file, we can return as well
372 		if (size != fileVecs[0].length) {
373 			*_numBytes = size;
374 			return B_OK;
375 		}
376 
377 		fileVecIndex = 1;
378 	} else {
379 		fileVecIndex = 0;
380 		size = 0;
381 	}
382 
383 	// Too bad, let's process the rest of the file_io_vecs
384 
385 	size_t totalSize = size;
386 
387 	// first, find out where we have to continue in our iovecs
388 	uint32 i = 0;
389 	for (; i < count; i++) {
390 		if (size <= vecs[i].iov_len)
391 			break;
392 
393 		size -= vecs[i].iov_len;
394 	}
395 
396 	size_t vecOffset = size;
397 
398 	for (; fileVecIndex < fileVecCount; fileVecIndex++) {
399 		file_io_vec &fileVec = fileVecs[fileVecIndex];
400 		iovec tempVecs[8];
401 		uint32 tempCount = 1;
402 
403 		tempVecs[0].iov_base = (void *)((addr_t)vecs[i].iov_base + vecOffset);
404 
405 		size = min_c(vecs[i].iov_len - vecOffset, fileVec.length);
406 		tempVecs[0].iov_len = size;
407 
408 		TRACE(("fill vec %ld, offset = %lu, size = %lu\n", i, vecOffset, size));
409 
410 		if (size >= fileVec.length)
411 			vecOffset += size;
412 		else
413 			vecOffset = 0;
414 
415 		while (size < fileVec.length && ++i < count) {
416 			tempVecs[tempCount].iov_base = vecs[i].iov_base;
417 
418 			// is this iovec larger than the file_io_vec?
419 			if (vecs[i].iov_len + size > fileVec.length) {
420 				size += tempVecs[tempCount++].iov_len = vecOffset
421 					= fileVec.length - size;
422 				break;
423 			}
424 
425 			size += tempVecs[tempCount++].iov_len = vecs[i].iov_len;
426 		}
427 
428 		size_t bytes = size;
429 		if (doWrite) {
430 			status = vfs_write_pages(ref->device, ref->cookie, fileVec.offset, tempVecs,
431 				tempCount, &bytes, false);
432 		} else {
433 			status = vfs_read_pages(ref->device, ref->cookie, fileVec.offset, tempVecs,
434 				tempCount, &bytes, false);
435 		}
436 		if (status < B_OK)
437 			return status;
438 
439 		totalSize += size;
440 
441 		if (size != bytes) {
442 			// there are no more bytes, let's bail out
443 			*_numBytes = totalSize;
444 			return B_OK;
445 		}
446 	}
447 
448 	return B_OK;
449 }
450 
451 
452 /**	This function is called by read_into_cache() (and from there only) - it
453  *	can only handle a limited number of bytes at a time, and read_into_cache()
454  *	makes sure that every request stays within that limit.
455  */
456 
457 static inline status_t
458 read_chunk_into_cache(file_cache_ref *ref, off_t offset, size_t size,
459 	int32 pageOffset, addr_t buffer, size_t bufferSize)
460 {
461 	TRACE(("read_chunk(offset = %Ld, size = %lu, pageOffset = %ld, buffer = %#lx, bufferSize = %lu\n",
462 		offset, size, pageOffset, buffer, bufferSize));
463 
464 	vm_cache_ref *cache = ref->cache;
465 
466 	iovec vecs[MAX_IO_VECS];
467 	int32 vecCount = 0;
468 
469 	vm_page *pages[MAX_IO_VECS];
470 	int32 pageIndex = 0;
471 
472 	// allocate pages for the cache and mark them busy
473 	for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE) {
474 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
475 		if (page == NULL)
476 			panic("no more pages!");
477 
478 		page->state = PAGE_STATE_BUSY;
479 
480 		vm_cache_insert_page(cache, page, offset + pos);
481 
482 		addr_t virtualAddress;
483 		if (vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE, &virtualAddress, PHYSICAL_PAGE_CAN_WAIT) < B_OK)
484 			panic("could not get physical page");
485 
486 		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
487 		// ToDo: check if the array is large enough!
488 	}
489 
490 	mutex_unlock(&cache->lock);
491 
492 	// read file into reserved pages
493 	status_t status = pages_io(ref, offset, vecs, vecCount, &size, false);
494 	if (status < B_OK) {
495 		// reading failed, free allocated pages
496 
497 		dprintf("file_cache: read pages failed: %s\n", strerror(status));
498 
499 		for (int32 i = 0; i < vecCount; i++) {
500 			addr_t base = (addr_t)vecs[i].iov_base;
501 			size_t size = vecs[i].iov_len;
502 
503 			for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
504 				vm_put_physical_page(base);
505 		}
506 
507 		mutex_lock(&cache->lock);
508 
509 		for (int32 i = 0; i < pageIndex; i++) {
510 			vm_cache_remove_page(cache, pages[i]);
511 			vm_page_set_state(pages[i], PAGE_STATE_FREE);
512 		}
513 
514 		return status;
515 	}
516 
517 	// copy the pages and unmap them again
518 
519 	for (int32 i = 0; i < vecCount; i++) {
520 		addr_t base = (addr_t)vecs[i].iov_base;
521 		size_t size = vecs[i].iov_len;
522 
523 		// copy to user buffer if necessary
524 		if (bufferSize != 0) {
525 			size_t bytes = min_c(bufferSize, size - pageOffset);
526 
527 			user_memcpy((void *)buffer, (void *)(base + pageOffset), bytes);
528 			buffer += bytes;
529 			bufferSize -= bytes;
530 			pageOffset = 0;
531 		}
532 
533 		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
534 			vm_put_physical_page(base);
535 	}
536 
537 	mutex_lock(&cache->lock);
538 
539 	// make the pages accessible in the cache
540 	for (int32 i = pageIndex; i-- > 0;)
541 		pages[i]->state = PAGE_STATE_ACTIVE;
542 
543 	return B_OK;
544 }
545 
546 
547 /**	This function reads \a size bytes directly from the file into the cache.
548  *	If \a bufferSize does not equal zero, \a bufferSize bytes from the data
549  *	read in are also copied to the provided \a buffer.
550  *	This function always allocates all pages; it is the responsibility of the
551  *	calling function to only ask for yet uncached ranges.
552  *	The cache_ref lock must be held when calling this function.
553  */
554 
555 static status_t
556 read_into_cache(file_cache_ref *ref, off_t offset, size_t size, addr_t buffer, size_t bufferSize)
557 {
558 	TRACE(("read_into_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
559 		ref, offset, size, (void *)buffer, bufferSize));
560 
561 	// do we have to read in anything at all?
562 	if (size == 0)
563 		return B_OK;
564 
565 	// make sure "offset" is page aligned - but also remember the page offset
566 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
567 	size = PAGE_ALIGN(size + pageOffset);
568 	offset -= pageOffset;
569 
570 	while (true) {
571 		size_t chunkSize = size;
572 		if (chunkSize > (MAX_IO_VECS * B_PAGE_SIZE))
573 			chunkSize = MAX_IO_VECS * B_PAGE_SIZE;
574 
575 		status_t status = read_chunk_into_cache(ref, offset, chunkSize, pageOffset,
576 								buffer, bufferSize);
577 		if (status != B_OK)
578 			return status;
579 
580 		if ((size -= chunkSize) == 0)
581 			return B_OK;
582 
583 		if (chunkSize >= bufferSize) {
584 			bufferSize = 0;
585 			buffer = NULL;
586 		} else {
587 			bufferSize -= chunkSize - pageOffset;
588 			buffer += chunkSize - pageOffset;
589 		}
590 
591 		offset += chunkSize;
592 		pageOffset = 0;
593 	}
594 
595 	return B_OK;
596 }
597 
598 
599 /**	Like read_chunk_into_cache() but writes data into the cache */
600 
601 static inline status_t
602 write_chunk_to_cache(file_cache_ref *ref, off_t offset, size_t size,
603 	int32 pageOffset, addr_t buffer, size_t bufferSize)
604 {
605 	iovec vecs[MAX_IO_VECS];
606 	int32 vecCount = 0;
607 	vm_page *pages[MAX_IO_VECS];
608 	int32 pageIndex = 0;
609 	status_t status = B_OK;
610 
611 	// ToDo: this should be settable somewhere
612 	bool writeThrough = false;
613 
614 	// allocate pages for the cache and mark them busy
615 	for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE) {
616 		// ToDo: if space is becoming tight, and this cache has already grown
617 		//	big - shouldn't we rather steal the pages directly in that case?
618 		//	(a working set like approach for the file cache)
619 		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
620 		page->state = PAGE_STATE_BUSY;
621 
622 		vm_cache_insert_page(ref->cache, page, offset + pos);
623 
624 		addr_t virtualAddress;
625 		vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE, &virtualAddress,
626 			PHYSICAL_PAGE_CAN_WAIT);
627 
628 		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, B_PAGE_SIZE);
629 		// ToDo: check if the array is large enough!
630 	}
631 
632 	mutex_unlock(&ref->cache->lock);
633 
634 	// copy contents (and read in partially written pages first)
635 
636 	if (pageOffset != 0) {
637 		// This is only a partial write, so we have to read the rest of the page
638 		// from the file to have consistent data in the cache
639 		iovec readVec = { vecs[0].iov_base, B_PAGE_SIZE };
640 		size_t bytesRead = B_PAGE_SIZE;
641 
642 		status = pages_io(ref, offset, &readVec, 1, &bytesRead, false);
643 		// ToDo: handle errors for real!
644 		if (status < B_OK)
645 			panic("1. pages_io() failed: %s!\n", strerror(status));
646 	}
647 
648 	addr_t lastPageOffset = (pageOffset + bufferSize) & (B_PAGE_SIZE - 1);
649 	if (lastPageOffset != 0) {
650 		// get the last page in the I/O vectors
651 		addr_t last = (addr_t)vecs[vecCount - 1].iov_base
652 			+ vecs[vecCount - 1].iov_len - B_PAGE_SIZE;
653 
654 		if (offset + pageOffset + bufferSize == ref->cache->cache->virtual_size) {
655 			// the space in the page after this write action needs to be cleared
656 			memset((void *)(last + lastPageOffset), 0, B_PAGE_SIZE - lastPageOffset);
657 		} else if (vecCount > 1) {
658 			// the end of this write does not happen on a page boundary, so we
659 			// need to fetch the last page before we can update it
660 			iovec readVec = { (void *)last, B_PAGE_SIZE };
661 			size_t bytesRead = B_PAGE_SIZE;
662 
663 			status = pages_io(ref, offset + size - B_PAGE_SIZE, &readVec, 1,
664 				&bytesRead, false);
665 			// ToDo: handle errors for real!
666 			if (status < B_OK)
667 				panic("pages_io() failed: %s!\n", strerror(status));
668 		}
669 	}
670 
671 	for (int32 i = 0; i < vecCount; i++) {
672 		addr_t base = (addr_t)vecs[i].iov_base;
673 		size_t bytes = min_c(bufferSize, size_t(vecs[i].iov_len - pageOffset));
674 
675 		// copy data from user buffer
676 		user_memcpy((void *)(base + pageOffset), (void *)buffer, bytes);
677 
678 		bufferSize -= bytes;
679 		if (bufferSize == 0)
680 			break;
681 
682 		buffer += bytes;
683 		pageOffset = 0;
684 	}
685 
686 	if (writeThrough) {
687 		// write cached pages back to the file if we were asked to do that
688 		status_t status = pages_io(ref, offset, vecs, vecCount, &size, true);
689 		if (status < B_OK) {
690 			// ToDo: remove allocated pages, ...?
691 			panic("file_cache: remove allocated pages! write pages failed: %s\n",
692 				strerror(status));
693 		}
694 	}
695 
696 	mutex_lock(&ref->cache->lock);
697 
698 	// unmap the pages again
699 
700 	for (int32 i = 0; i < vecCount; i++) {
701 		addr_t base = (addr_t)vecs[i].iov_base;
702 		size_t size = vecs[i].iov_len;
703 		for (size_t pos = 0; pos < size; pos += B_PAGE_SIZE, base += B_PAGE_SIZE)
704 			vm_put_physical_page(base);
705 	}
706 
707 	// make the pages accessible in the cache
708 	for (int32 i = pageIndex; i-- > 0;) {
709 		if (writeThrough)
710 			pages[i]->state = PAGE_STATE_ACTIVE;
711 		else
712 			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
713 	}
714 
715 	return status;
716 }
717 
718 
719 /**	Like read_into_cache() but writes data into the cache. To preserve data consistency,
720  *	it might also read pages into the cache, though, if only a partial page gets written.
721  *	The cache_ref lock must be held when calling this function.
722  */
723 
724 static status_t
725 write_to_cache(file_cache_ref *ref, off_t offset, size_t size, addr_t buffer, size_t bufferSize)
726 {
727 	TRACE(("write_to_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
728 		ref, offset, size, (void *)buffer, bufferSize));
729 
730 	// make sure "offset" is page aligned - but also remember the page offset
731 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
732 	size = PAGE_ALIGN(size + pageOffset);
733 	offset -= pageOffset;
734 
735 	while (true) {
736 		size_t chunkSize = size;
737 		if (chunkSize > (MAX_IO_VECS * B_PAGE_SIZE))
738 			chunkSize = MAX_IO_VECS * B_PAGE_SIZE;
739 
740 		status_t status = write_chunk_to_cache(ref, offset, chunkSize, pageOffset, buffer, bufferSize);
741 		if (status != B_OK)
742 			return status;
743 
744 		if ((size -= chunkSize) == 0)
745 			return B_OK;
746 
747 		if (chunkSize >= bufferSize) {
748 			bufferSize = 0;
749 			buffer = NULL;
750 		} else {
751 			bufferSize -= chunkSize - pageOffset;
752 			buffer += chunkSize - pageOffset;
753 		}
754 
755 		offset += chunkSize;
756 		pageOffset = 0;
757 	}
758 
759 	return B_OK;
760 }
761 
762 
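/**	Helper for cache_io(): handles the pending part of a request, i.e. the
 *	range between \a lastBuffer and \a buffer, by passing it on to either
 *	write_to_cache() or read_into_cache(), depending on \a doWrite.
 */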
763 static status_t
764 satisfy_cache_io(file_cache_ref *ref, off_t offset, addr_t buffer, addr_t lastBuffer,
765 	bool doWrite)
766 {
767 	size_t requestSize = buffer - lastBuffer;
768 
769 	if (doWrite)
770 		return write_to_cache(ref, offset, requestSize, lastBuffer, requestSize);
771 
772 	return read_into_cache(ref, offset, requestSize, lastBuffer, requestSize);
773 }
774 
775 
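/**	The heart of the file cache: implements both file_cache_read() and
 *	file_cache_write(). The request is walked page by page; pages that are
 *	already in the cache are copied directly from/to the buffer, while runs
 *	of missing pages are collected and then satisfied in one go via
 *	read_into_cache() or write_to_cache().
 */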
776 static status_t
777 cache_io(void *_cacheRef, off_t offset, addr_t buffer, size_t *_size, bool doWrite)
778 {
779 	if (_cacheRef == NULL)
780 		panic("cache_io() called with NULL ref!\n");
781 
782 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
783 	vm_cache_ref *cache = ref->cache;
784 	off_t fileSize = cache->cache->virtual_size;
785 
786 	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
787 		ref, offset, (void *)buffer, *_size, doWrite ? "write" : "read"));
788 
789 	// out of bounds access?
790 	if (offset >= fileSize || offset < 0) {
791 		*_size = 0;
792 		return B_OK;
793 	}
794 
795 	int32 pageOffset = offset & (B_PAGE_SIZE - 1);
796 	size_t size = *_size;
797 	offset -= pageOffset;
798 
799 	if (offset + pageOffset + size > fileSize) {
800 		// adapt size to be within the file's bounds
801 		size = fileSize - pageOffset - offset;
802 		*_size = size;
803 	}
804 
805 	// "offset" and "lastOffset" are always aligned to B_PAGE_SIZE,
806 	// the "last*" variables always point to the end of the last
807 	// satisfied request part
808 
809 	size_t bytesLeft = size, lastLeft = size;
810 	int32 lastPageOffset = pageOffset;
811 	addr_t lastBuffer = buffer;
812 	off_t lastOffset = offset;
813 
814 	mutex_lock(&cache->lock);
815 
816 	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
817 		// check if this page is already in memory
818 	restart:
819 		vm_page *page = vm_cache_lookup_page(cache, offset);
820 		vm_page *dummyPage = NULL;
821 		if (page != NULL) {
822 			// The page is in the cache - since we may need to unlock the cache
823 			// sometime in the near future, we first satisfy the request for the
824 			// pages we didn't get yet (to make sure no one else interferes in
825 			// the meantime).
826 			status_t status = B_OK;
827 
828 			if (lastBuffer != buffer) {
829 				status = satisfy_cache_io(ref, lastOffset + lastPageOffset,
830 					buffer, lastBuffer, doWrite);
831 				if (status == B_OK) {
832 					lastBuffer = buffer;
833 					lastLeft = bytesLeft;
834 					lastOffset = offset;
835 					lastPageOffset = 0;
836 					pageOffset = 0;
837 				}
838 			}
839 
840 			if (status != B_OK) {
841 				mutex_unlock(&cache->lock);
842 				return status;
843 			}
844 
845 			if (page->state == PAGE_STATE_BUSY) {
846 				if (page->type == PAGE_TYPE_DUMMY) {
847 					dummyPage = page;
848 					page = vm_page_allocate_page(PAGE_STATE_FREE);
849 					if (page == NULL) {
850 						mutex_unlock(&cache->lock);
851 						return B_NO_MEMORY;
852 					}
853 				} else {
854 					mutex_unlock(&cache->lock);
855 					// ToDo: don't wait forever!
856 					snooze(20000);
857 					mutex_lock(&cache->lock);
858 					goto restart;
859 				}
860 			}
861 		}
862 
863 		size_t bytesInPage = min_c(size_t(B_PAGE_SIZE - pageOffset), bytesLeft);
864 		addr_t virtualAddress;
865 
866 		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset = %lu\n", offset, page, bytesLeft, pageOffset));
867 		if (page != NULL) {
868 			vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
869 				&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
870 
871 			if (dummyPage != NULL && (!doWrite || bytesInPage != B_PAGE_SIZE)) {
872 				// This page is currently in use by someone else - since we cannot
873 				// know if this someone does what we want, or if it even can do
874 				// what we want (we may hold a lock that blocks the other request),
875 				// we need to handle this case specifically
876 				iovec vec;
877 				vec.iov_base = (void *)virtualAddress;
878 				vec.iov_len = B_PAGE_SIZE;
879 
880 				size_t size = B_PAGE_SIZE;
881 				status_t status = pages_io(ref, offset, &vec, 1, &size, false);
882 				if (status != B_OK) {
883 					vm_put_physical_page(virtualAddress);
884 					mutex_unlock(&cache->lock);
885 					return status;
886 				}
887 			}
888 
889 			// and copy the contents of the page already in memory
890 			if (doWrite) {
891 				user_memcpy((void *)(virtualAddress + pageOffset), (void *)buffer, bytesInPage);
892 
893 				// make sure the page is in the modified list
894 				if (page->state != PAGE_STATE_MODIFIED)
895 					vm_page_set_state(page, PAGE_STATE_MODIFIED);
896 			} else
897 				user_memcpy((void *)buffer, (void *)(virtualAddress + pageOffset), bytesInPage);
898 
899 			vm_put_physical_page(virtualAddress);
900 
901 			if (dummyPage != NULL) {
902 				// check if the dummy page is still in place
903 			restart_dummy_lookup:
904 				vm_page *currentPage = vm_cache_lookup_page(cache, offset);
905 				if (currentPage->state == PAGE_STATE_BUSY) {
906 					if (currentPage->type == PAGE_TYPE_DUMMY) {
907 						// we let the other party add our page
908 						currentPage->queue_next = page;
909 					} else {
910 						mutex_unlock(&cache->lock);
911 						// ToDo: don't wait forever!
912 						snooze(20000);
913 						mutex_lock(&cache->lock);
914 						goto restart_dummy_lookup;
915 					}
916 				} else if (currentPage != NULL) {
917 					// we need to copy our new page into the old one
918 					addr_t destinationAddress;
919 					vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
920 						&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
921 					vm_get_physical_page(currentPage->physical_page_number * B_PAGE_SIZE,
922 						&destinationAddress, PHYSICAL_PAGE_CAN_WAIT);
923 
924 					memcpy((void *)destinationAddress, (void *)virtualAddress, B_PAGE_SIZE);
925 
926 					vm_put_physical_page(destinationAddress);
927 					vm_put_physical_page(virtualAddress);
928 
929 					vm_page_set_state(page, PAGE_STATE_FREE);
930 				} else {
931 					// there is no page in place anymore, so we put ours there
932 					vm_cache_insert_page(cache, page, offset);
933 				}
934 			}
935 
936 			if (bytesLeft <= bytesInPage) {
937 				// we've handled the last page, so we're done!
938 				mutex_unlock(&cache->lock);
939 				return B_OK;
940 			}
941 
942 			// prepare a potential gap request
943 			lastBuffer = buffer + bytesInPage;
944 			lastLeft = bytesLeft - bytesInPage;
945 			lastOffset = offset + B_PAGE_SIZE;
946 			lastPageOffset = 0;
947 		}
948 
949 		if (bytesLeft <= bytesInPage)
950 			break;
951 
952 		buffer += bytesInPage;
953 		bytesLeft -= bytesInPage;
954 		pageOffset = 0;
955 	}
956 
957 	// fill the last remaining bytes of the request (either write or read)
958 
959 	status_t status;
960 	if (doWrite)
961 		status = write_to_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
962 	else
963 		status = read_into_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
964 
965 	mutex_unlock(&cache->lock);
966 	return status;
967 }
968 
969 
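/**	Handler for the generic CACHE_SYSCALLS syscall: allows clearing the
 *	cache (not yet implemented) and loading/unloading the cache policy
 *	module (such as the launch speedup module) at runtime.
 */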
970 static status_t
971 file_cache_control(const char *subsystem, uint32 function, void *buffer, size_t bufferSize)
972 {
973 	switch (function) {
974 		case CACHE_CLEAR:
975 			// ToDo: clear the cache
976 			dprintf("cache_control: clear cache!\n");
977 			return B_OK;
978 
979 		case CACHE_SET_MODULE:
980 		{
981 			cache_module_info *module = sCacheModule;
982 
983 			// unset previous module
984 
985 			if (sCacheModule != NULL) {
986 				sCacheModule = NULL;
987 				snooze(100000);	// 0.1 secs
988 				put_module(module->info.name);
989 			}
990 
991 			// get new module, if any
992 
993 			if (buffer == NULL)
994 				return B_OK;
995 
996 			char name[B_FILE_NAME_LENGTH];
997 			if (!IS_USER_ADDRESS(buffer)
998 				|| user_strlcpy(name, (char *)buffer, B_FILE_NAME_LENGTH) < B_OK)
999 				return B_BAD_ADDRESS;
1000 
1001 			if (strncmp(name, CACHE_MODULES_NAME, strlen(CACHE_MODULES_NAME)))
1002 				return B_BAD_VALUE;
1003 
1004 			dprintf("cache_control: set module %s!\n", name);
1005 
1006 			status_t status = get_module(name, (module_info **)&module);
1007 			if (status == B_OK)
1008 				sCacheModule = module;
1009 
1010 			return status;
1011 		}
1012 	}
1013 
1014 	return B_BAD_HANDLER;
1015 }
1016 
1017 
1018 //	#pragma mark -
1019 //	kernel public API
1020 
1021 
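/**	Reads the given range of the vnode into the file cache ahead of time,
 *	but never more than 4 MB at once. Pages that are already cached are
 *	skipped; only the gaps between them are actually read in.
 */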
1022 extern "C" void
1023 cache_prefetch_vnode(void *vnode, off_t offset, size_t size)
1024 {
1025 	vm_cache_ref *cache;
1026 	if (vfs_get_vnode_cache(vnode, &cache, false) != B_OK)
1027 		return;
1028 
1029 	file_cache_ref *ref = (struct file_cache_ref *)((vnode_store *)cache->cache->store)->file_cache_ref;
1030 	off_t fileSize = cache->cache->virtual_size;
1031 
1032 	if (size > fileSize)
1033 		size = fileSize;
1034 
1035 	// we never fetch more than 4 MB at once
1036 	if (size > 4 * 1024 * 1024)
1037 		size = 4 * 1024 * 1024;
1038 
1039 	size_t bytesLeft = size, lastLeft = size;
1040 	off_t lastOffset = offset;
1041 	size_t lastSize = 0;
1042 
1043 	mutex_lock(&cache->lock);
1044 
1045 	for (; bytesLeft > 0; offset += B_PAGE_SIZE) {
1046 		// check if this page is already in memory
1047 		addr_t virtualAddress;
1048 	restart:
1049 		vm_page *page = vm_cache_lookup_page(cache, offset);
1050 		if (page != NULL) {
1051 			// it is, so let's satisfy the first part of the request
1052 			if (lastOffset < offset) {
1053 				size_t requestSize = offset - lastOffset;
1054 				read_into_cache(ref, lastOffset, requestSize, NULL, 0);
1055 			}
1056 
1057 			if (bytesLeft <= B_PAGE_SIZE) {
1058 				// we've read the last page, so we're done!
1059 				goto out;
1060 			}
1061 
1062 			// prepare a potential gap request
1063 			lastOffset = offset + B_PAGE_SIZE;
1064 			lastLeft = bytesLeft - B_PAGE_SIZE;
1065 		}
1066 
1067 		if (bytesLeft <= B_PAGE_SIZE)
1068 			break;
1069 
1070 		bytesLeft -= B_PAGE_SIZE;
1071 	}
1072 
1073 	// read in the last part
1074 	read_into_cache(ref, lastOffset, lastLeft, NULL, 0);
1075 
1076 out:
1077 	mutex_unlock(&cache->lock);
1078 	vm_cache_release_ref(cache);
1079 }
1080 
1081 
1082 extern "C" void
1083 cache_prefetch(mount_id mountID, vnode_id vnodeID, off_t offset, size_t size)
1084 {
1085 	void *vnode;
1086 
1087 	// ToDo: schedule prefetch
1088 
1089 	TRACE(("cache_prefetch(vnode %ld:%Ld)\n", mountID, vnodeID));
1090 
1091 	// get the vnode for the object, this also grabs a ref to it
1092 	if (vfs_get_vnode(mountID, vnodeID, &vnode) != B_OK)
1093 		return;
1094 
1095 	cache_prefetch_vnode(vnode, offset, size);
1096 	vfs_put_vnode(vnode);
1097 }
1098 
1099 
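/**	Notifies the installed cache policy module (if any) that a node has
 *	been opened, passing along the node's current size.
 */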
1100 extern "C" void
1101 cache_node_opened(void *vnode, int32 fdType, vm_cache_ref *cache, mount_id mountID,
1102 	vnode_id parentID, vnode_id vnodeID, const char *name)
1103 {
1104 	if (sCacheModule == NULL || sCacheModule->node_opened == NULL)
1105 		return;
1106 
1107 	off_t size = -1;
1108 	if (cache != NULL) {
1109 		file_cache_ref *ref = (file_cache_ref *)((vnode_store *)cache->cache->store)->file_cache_ref;
1110 		if (ref != NULL)
1111 			size = ref->cache->cache->virtual_size;
1112 	}
1113 
1114 	sCacheModule->node_opened(vnode, fdType, mountID, parentID, vnodeID, name, size);
1115 }
1116 
1117 
1118 extern "C" void
1119 cache_node_closed(void *vnode, int32 fdType, vm_cache_ref *cache,
1120 	mount_id mountID, vnode_id vnodeID)
1121 {
1122 	if (sCacheModule == NULL || sCacheModule->node_closed == NULL)
1123 		return;
1124 
1125 	int32 accessType = 0;
1126 	if (cache != NULL) {
1127 		// ToDo: set accessType
1128 	}
1129 
1130 	sCacheModule->node_closed(vnode, fdType, mountID, vnodeID, accessType);
1131 }
1132 
1133 
1134 extern "C" void
1135 cache_node_launched(size_t argCount, char * const *args)
1136 {
1137 	if (sCacheModule == NULL || sCacheModule->node_launched == NULL)
1138 		return;
1139 
1140 	sCacheModule->node_launched(argCount, args);
1141 }
1142 
1143 
1144 extern "C" status_t
1145 file_cache_init_post_boot_device(void)
1146 {
1147 	// ToDo: get cache module out of driver settings
1148 
1149 	if (get_module("file_cache/launch_speedup/v1", (module_info **)&sCacheModule) == B_OK) {
1150 		dprintf("** opened launch speedup: %Ld\n", system_time());
1151 	} else
1152 		dprintf("** could not open launch speedup!\n");
1153 
1154 	return B_OK;
1155 }
1156 
1157 
1158 extern "C" status_t
1159 file_cache_init(void)
1160 {
1161 	register_generic_syscall(CACHE_SYSCALLS, file_cache_control, 1, 0);
1162 	return B_OK;
1163 }
1164 
1165 
1166 //	#pragma mark -
1167 //	public FS API
1168 
1169 
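/**	Creates the file cache for the specified vnode, using \a size as the
 *	initial file size. The returned file_cache_ref references the vnode's
 *	vm_cache as well as the underlying device and its cookie (obtained
 *	from the given \a fd), which are needed for the direct device access
 *	in pages_io().
 */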
1170 extern "C" void *
1171 file_cache_create(mount_id mountID, vnode_id vnodeID, off_t size, int fd)
1172 {
1173 	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld, fd = %d)\n", mountID, vnodeID, size, fd));
1174 
1175 	file_cache_ref *ref = new file_cache_ref;
1176 	if (ref == NULL)
1177 		return NULL;
1178 
1179 	// TODO: delay vm_cache/vm_cache_ref creation until data is
1180 	//	requested/written for the first time? Listing lots of
1181 	//	files in Tracker (and elsewhere) could be slowed down.
1182 	//	Since the file_cache_ref itself doesn't have a lock,
1183 	//	we would need to "rent" one during construction, possibly
1184 	//	the vnode lock, maybe a dedicated one.
1185 	//	As there shouldn't be too much contention, we could also
1186 	//	use atomic_test_and_set(), and free the resources again
1187 	//	when that fails...
1188 
1189 	// Get the vnode of the underlying device
1190 	if (vfs_get_vnode_from_fd(fd, true, &ref->device) != B_OK)
1191 		goto err1;
1192 
1193 	// We also need the cookie of the underlying device to properly access it
1194 	if (vfs_get_cookie_from_fd(fd, &ref->cookie) != B_OK)
1195 		goto err2;
1196 
1197 	// Get the vnode for the object (note, this does not grab a reference to the node)
1198 	if (vfs_lookup_vnode(mountID, vnodeID, &ref->vnode) != B_OK)
1199 		goto err2;
1200 
1201 	// Gets (usually creates) the cache for the node - note, this does grab a
1202 	// reference to the node...
1203 	if (vfs_get_vnode_cache(ref->vnode, &ref->cache, true) != B_OK)
1204 		goto err2;
1205 
1206 	// ... that we don't need, and therefore release it again.
1207 	// Our caller already holds a reference to the vnode; it will destroy us
1208 	// when the last one goes away (which, of course, can only ever happen if
1209 	// we don't grab an extra reference).
1210 	vfs_put_vnode(ref->vnode);
1211 
1212 	ref->cache->cache->virtual_size = size;
1213 	((vnode_store *)ref->cache->cache->store)->file_cache_ref = ref;
1214 	return ref;
1215 
1216 err2:
1217 	vfs_put_vnode(ref->device);
1218 err1:
1219 	delete ref;
1220 	return NULL;
1221 }
1222 
1223 
1224 extern "C" void
1225 file_cache_delete(void *_cacheRef)
1226 {
1227 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1228 
1229 	if (ref == NULL)
1230 		return;
1231 
1232 	TRACE(("file_cache_delete(ref = %p)\n", ref));
1233 
1234 	vfs_put_vnode(ref->device);
1235 	delete ref;
1236 }
1237 
1238 
1239 extern "C" status_t
1240 file_cache_set_size(void *_cacheRef, off_t size)
1241 {
1242 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1243 
1244 	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, size));
1245 
1246 	if (ref == NULL)
1247 		return B_OK;
1248 
1249 	file_cache_invalidate_file_map(_cacheRef, 0, size);
1250 		// ToDo: make this better (we would only need to extend or shrink the map)
1251 
1252 	mutex_lock(&ref->cache->lock);
1253 	status_t status = vm_cache_resize(ref->cache, size);
1254 	mutex_unlock(&ref->cache->lock);
1255 
1256 	return status;
1257 }
1258 
1259 
1260 extern "C" status_t
1261 file_cache_sync(void *_cacheRef)
1262 {
1263 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1264 	if (ref == NULL)
1265 		return B_BAD_VALUE;
1266 
1267 	return vm_cache_write_modified(ref->cache, true);
1268 }
1269 
1270 
1271 extern "C" status_t
1272 file_cache_read_pages(void *_cacheRef, off_t offset, const iovec *vecs, size_t count, size_t *_numBytes)
1273 {
1274 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1275 
1276 	return pages_io(ref, offset, vecs, count, _numBytes, false);
1277 }
1278 
1279 
1280 extern "C" status_t
1281 file_cache_write_pages(void *_cacheRef, off_t offset, const iovec *vecs, size_t count, size_t *_numBytes)
1282 {
1283 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1284 
1285 	status_t status = pages_io(ref, offset, vecs, count, _numBytes, true);
1286 	TRACE(("file_cache_write_pages(ref = %p, offset = %Ld, vecs = %p, count = %lu, bytes = %lu) = %ld\n",
1287 		ref, offset, vecs, count, *_numBytes, status));
1288 
1289 	return status;
1290 }
1291 
1292 
1293 extern "C" status_t
1294 file_cache_read(void *_cacheRef, off_t offset, void *bufferBase, size_t *_size)
1295 {
1296 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1297 
1298 	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
1299 		ref, offset, bufferBase, *_size));
1300 
1301 	return cache_io(ref, offset, (addr_t)bufferBase, _size, false);
1302 }
1303 
1304 
1305 extern "C" status_t
1306 file_cache_write(void *_cacheRef, off_t offset, const void *buffer, size_t *_size)
1307 {
1308 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1309 
1310 	status_t status = cache_io(ref, offset, (addr_t)const_cast<void *>(buffer), _size, true);
1311 	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu) = %ld\n",
1312 		ref, offset, buffer, *_size, status));
1313 
1314 	return status;
1315 }
1316 
1317 
1318 extern "C" status_t
1319 file_cache_invalidate_file_map(void *_cacheRef, off_t offset, off_t size)
1320 {
1321 	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
1322 
1323 	// ToDo: honour offset/size parameters
1324 
1325 	TRACE(("file_cache_invalidate_file_map(offset = %Ld, size = %Ld)\n", offset, size));
1326 	mutex_lock(&ref->cache->lock);
1327 	ref->map.Free();
1328 	mutex_unlock(&ref->cache->lock);
1329 	return B_OK;
1330 }
1331