/*
 * Copyright 2004-2007, Haiku Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 * 		Axel Dörfler <axeld@pinc-software.de>
 * 		Ingo Weinhold <bonefish@cs.tu-berlin.de>
 */

#include "fssh_fs_cache.h"

#include <new>

#include <stdlib.h>

#include "DoublyLinkedList.h"
#include "fssh_kernel_export.h"
#include "fssh_stdio.h"
#include "fssh_string.h"
#include "fssh_uio.h"
#include "fssh_unistd.h"
#include "hash.h"
#include "lock.h"
#include "vfs.h"


#undef TRACE
//#define TRACE_FILE_CACHE
#ifdef TRACE_FILE_CACHE
#	define TRACE(x) fssh_dprintf x
#else
#	define TRACE(x) ;
#endif

using std::nothrow;


// This is a hacked version of the kernel file cache implementation. The main
// part of the implementation didn't change that much -- some code not needed
// in userland has been removed, file_cache_ref, vm_cache_ref and vm_cache
// have been unified, and the complete interface to the VM (vm_*()) and the
// VFS (vfs_*()) has been re-implemented to suit our needs.
//
// The PagePool class is a data structure used for managing the pages (vm_page)
// allocated and assigned to caches (file_cache_ref). It has a list for free
// pages, i.e. those that are not assigned to any cache and can be reused at
// once. A second list contains those pages that belong to a cache, but are
// not in use by any of the functions. These pages have a reference count of
// 0. They are stolen from the owning cache when a usable page is needed,
// no free pages remain, and the maximum number of pages that shall be used
// has already been reached.
//
// The only purpose of the page reference count (vm_page::ref_count) is to
// indicate whether the page is in use (i.e. known to one or more of the cache
// functions currently being executed). vm_page_get_page(),
// vm_page_allocate_page(), and vm_cache_lookup_page() acquire a reference to
// the page, vm_page_put_page() releases a reference.
//
// vm_page::state indicates in what state a page currently is.
// PAGE_STATE_FREE is only encountered for pages not belonging to a cache
// (being in the page pool's free pages list, or just having been removed or
// not yet added). PAGE_STATE_ACTIVE/MODIFIED indicate that the page contains
// valid file data, in the latter case not yet written to disk.
// PAGE_STATE_BUSY means that the page is currently being manipulated by an
// operation, usually meaning that page data are being read from/written to
// disk. The required behavior when encountering a busy page is to put the
// page, release the page pool and cache locks, wait a short time, and retry
// later.
//
// Changing the page state requires having a reference to the page,
// holding the lock of the owning cache (if any) and the lock of the page pool
// (in case the page is not newly allocated).
//
// A page is in up to three lists. The free or unused list in the page pool
// (guarded by the page pool lock), the pages list of the cache the page
// belongs to, and the modified pages list of the cache (both cache lists
// are guarded by both the page pool and the cache lock). The modified pages
// list only contains PAGE_STATE_MODIFIED or PAGE_STATE_BUSY pages.
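//
// A minimal sketch of the busy-page wait protocol described above, assuming
// a caller that holds the cache lock; this is roughly the pattern cache_io()
// and vm_cache_resize() below use:
#if 0
	while (true) {
		vm_page* page = vm_cache_lookup_page(cache, offset);
			// acquires a reference for us
		if (page == NULL || page->state != PAGE_STATE_BUSY)
			break;

		// busy -- drop our reference and the cache lock, wait a short
		// time, and retry
		vm_page_put_page(page);
		mutex_unlock(&cache->lock);
		fssh_snooze(20000);
		mutex_lock(&cache->lock);
	}
#endif	// 0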


// maximum number of iovecs per request
#define MAX_IO_VECS			64	// 256 kB
#define MAX_FILE_IO_VECS	32
#define MAX_TEMP_IO_VECS	8

#define CACHED_FILE_EXTENTS	2
	// must be smaller than MAX_FILE_IO_VECS
	// ToDo: find out how many of these are typically used

#define user_memcpy(a, b, c) fssh_memcpy(a, b, c)

#define PAGE_ALIGN(x) (((x) + (FSSH_B_PAGE_SIZE - 1)) & ~(FSSH_B_PAGE_SIZE - 1))
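	// rounds up to the next multiple of the page size; e.g. with a 4096 byte
	// page size: PAGE_ALIGN(0) == 0, PAGE_ALIGN(1) == 4096,
	// PAGE_ALIGN(4096) == 4096, PAGE_ALIGN(4097) == 8192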
#define ASSERT(x)

namespace FSShell {

enum {
	PAGE_STATE_ACTIVE = 0,
	PAGE_STATE_BUSY,
	PAGE_STATE_MODIFIED,
	PAGE_STATE_FREE
};

enum {
	PHYSICAL_PAGE_NO_WAIT = 0,
	PHYSICAL_PAGE_CAN_WAIT,
};

struct vm_page;
struct file_cache_ref;

typedef DoublyLinkedListLink<vm_page> page_link;

// vm page
struct vm_page {
	vm_page*			next;
	page_link			link_cache;
	page_link			link_cache_modified;
	page_link			link_pool;
	file_cache_ref*		cache;
	fssh_off_t			offset;
	void*				data;
	uint8_t				state;
	uint32_t			ref_count;

	vm_page()
		: next(NULL),
		  cache(NULL),
		  offset(0),
		  data(malloc(FSSH_B_PAGE_SIZE)),
		  state(PAGE_STATE_FREE),
		  ref_count(1)
	{
	}

	~vm_page()
	{
		free(data);
	}

	fssh_addr_t			Address() const	{ return (fssh_addr_t)data; }

	static int Compare(void *_cacheEntry, const void *_key);
	static uint32_t Hash(void *_cacheEntry, const void *_key, uint32_t range);
};
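
// Note: vm_page's constructor allocates the page buffer with malloc(), which
// can fail; vm_page_allocate_page() below therefore checks 'data' for NULL
// before handing out a newly created page.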

struct file_extent {
	fssh_off_t			offset;
	fssh_file_io_vec	disk;
};

struct file_map {
	file_map();
	~file_map();

	file_extent *operator[](uint32_t index);
	file_extent *ExtentAt(uint32_t index);
	fssh_status_t Add(fssh_file_io_vec *vecs, fssh_size_t vecCount, fssh_off_t &lastOffset);
	void Free();

	union {
		file_extent	direct[CACHED_FILE_EXTENTS];
		file_extent	*array;
	};
	fssh_size_t			count;
};


static vm_page *vm_page_allocate_page(int state);
static void vm_page_get_page(vm_page* page);
static fssh_status_t vm_page_put_page(vm_page* page);
static fssh_status_t vm_page_set_state(vm_page *page, int state);

static void vm_cache_insert_page(file_cache_ref *cacheRef, vm_page *page,
	fssh_off_t offset);
static void vm_cache_remove_page(file_cache_ref *cacheRef, vm_page *page);
static vm_page *vm_cache_lookup_page(file_cache_ref *cacheRef, fssh_off_t page);
static fssh_status_t vm_cache_resize(file_cache_ref *cacheRef, fssh_off_t newSize);
static fssh_status_t vm_cache_write_modified(file_cache_ref *ref, bool fsReenter);

static fssh_status_t vfs_read_pages(int fd, fssh_off_t pos, const fssh_iovec *vecs,
	fssh_size_t count, fssh_size_t *_numBytes, bool fsReenter);
static fssh_status_t vfs_write_pages(int fd, fssh_off_t pos, const fssh_iovec *vecs,
	fssh_size_t count, fssh_size_t *_numBytes, bool fsReenter);

static fssh_status_t pages_io(file_cache_ref *ref, fssh_off_t offset, const fssh_iovec *vecs,
	fssh_size_t count, fssh_size_t *_numBytes, bool doWrite);


typedef DoublyLinkedList<vm_page,
	DoublyLinkedListMemberGetLink<vm_page,
		&vm_page::link_cache_modified> > cache_modified_page_list;

typedef DoublyLinkedList<vm_page,
	DoublyLinkedListMemberGetLink<vm_page,
		&vm_page::link_cache> > cache_page_list;

struct file_cache_ref {
	mutex						lock;
	fssh_mount_id				mountID;
	fssh_vnode_id				nodeID;
	void*						node;
	int							deviceFD;
	fssh_off_t					virtual_size;

	cache_page_list				pages;
	cache_modified_page_list	modifiedPages;

	file_map					map;
};

struct page_hash_key {
	page_hash_key() {}
	page_hash_key(int fd, fssh_vnode_id id, fssh_off_t offset)
		: deviceFD(fd),
		  nodeID(id),
		  offset(offset)
	{
	}

	int				deviceFD;
	fssh_vnode_id	nodeID;
	fssh_off_t		offset;
};


typedef DoublyLinkedList<vm_page,
	DoublyLinkedListMemberGetLink<vm_page,
		&vm_page::link_pool> > pool_page_list;

struct PagePool {

	PagePool()
		: pageHash(NULL),
		  unusedPages(),
		  freePages(),
		  allocatedPages(0)
	{
	}

	~PagePool()
	{
	}

	fssh_status_t Init()
	{
		fssh_status_t error = recursive_lock_init(&lock, "page pool");
		if (error != FSSH_B_OK) {
			fssh_panic("PagePool: Failed to init lock\n");
			return error;
		}

		pageHash = hash_init(256, 0, &vm_page::Compare, &vm_page::Hash);
		if (pageHash == NULL) {
			fssh_panic("PagePool: Failed to create page hash\n");
			return FSSH_B_NO_MEMORY;
		}

		return FSSH_B_OK;
	}

	bool Lock() { return (recursive_lock_lock(&lock) == FSSH_B_OK); }
	void Unlock() { recursive_lock_unlock(&lock); }

	recursive_lock	lock;
	hash_table*		pageHash;
	pool_page_list	unusedPages;
	pool_page_list	freePages;
	uint32_t		allocatedPages;
};

struct PagePutter {
	PagePutter(vm_page* page)
		: fPage(page)
	{
	}

	~PagePutter()
	{
		if (fPage)
			vm_page_put_page(fPage);
	}

private:
	vm_page*	fPage;
};
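
// Usage sketch: PagePutter is an RAII guard that drops the page reference on
// scope exit, e.g. as in cache_io() below:
//
//	vm_page *page = vm_cache_lookup_page(ref, offset);	// acquires a reference
//	PagePutter pagePutter(page);						// releases it on return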

static PagePool sPagePool;
static const uint32_t kMaxAllocatedPages = 1024;


int
vm_page::Compare(void *_cacheEntry, const void *_key)
{
	vm_page *page = (vm_page*)_cacheEntry;
	const page_hash_key *key = (const page_hash_key *)_key;

	// device FD
	if (page->cache->deviceFD != key->deviceFD)
		return page->cache->deviceFD - key->deviceFD;

	// node ID
	if (page->cache->nodeID < key->nodeID)
		return -1;
	if (page->cache->nodeID > key->nodeID)
		return 1;

	// offset
	if (page->offset < key->offset)
		return -1;
	if (page->offset > key->offset)
		return 1;

	return 0;
}

uint32_t
vm_page::Hash(void *_cacheEntry, const void *_key, uint32_t range)
{
	vm_page *page = (vm_page*)_cacheEntry;
	const page_hash_key *key = (const page_hash_key *)_key;

	int fd = (page ? page->cache->deviceFD : key->deviceFD);
	fssh_vnode_id id = (page ? page->cache->nodeID : key->nodeID);
	fssh_off_t offset = (page ? page->offset : key->offset);

	uint32_t value = fd;
	value = value * 17 + id;
	value = value * 17 + offset;

	return value % range;
}


vm_page *
vm_page_allocate_page(int state)
{
	AutoLocker<PagePool> poolLocker(sPagePool);

	// is a queued free page available?
	vm_page* page = sPagePool.freePages.RemoveHead();
	if (page) {
		page->ref_count++;
		return page;
	}

	// no free page

#if 0
// TODO: Nice idea in principle, but causes a serious locking problem.
// vm_page_allocate_page() is always invoked with some cache locked at the
// same time. Thus locking a cache here to steal a page can cause a deadlock.
	// If the limit for allocated pages has been reached, we try to steal an
	// unused page.
	if (sPagePool.allocatedPages >= kMaxAllocatedPages
		&& !sPagePool.unusedPages.IsEmpty()) {
		// we grab the first non-busy page
		for (pool_page_list::Iterator it(&sPagePool.unusedPages);
			vm_page* currentPage = it.Next();) {
			if (currentPage->state != PAGE_STATE_BUSY) {
				it.Remove();
				page = currentPage;
				page->ref_count++;
				break;
			}
		}

		// If we've found a suitable page, we need to mark it busy, write it
		// if it was modified, and finally remove it from its cache.
		if (page != NULL) {
			bool modified = (page->state == PAGE_STATE_MODIFIED);

			// mark the page busy
			page->state = PAGE_STATE_BUSY;

			// We don't need the pool lock anymore.
			poolLocker.Unlock();

			file_cache_ref* cache = page->cache;

			// If the page is modified, we write it to the disk.
			if (modified) {
				// find out how much to write, and remove the page from the
				// cache's modified pages list
				MutexLocker cacheLocker(cache->lock);
				fssh_size_t bytes = fssh_min_c(FSSH_B_PAGE_SIZE,
					cache->virtual_size - page->offset);
				cache->modifiedPages.Remove(page);
				cacheLocker.Unlock();

				// if we need to write anything, do it now
				if (bytes > 0) {
					fssh_iovec vecs[1];
					vecs[0].iov_base = page->data;
					vecs[0].iov_len = bytes;
					fssh_status_t error = pages_io(cache, page->offset, vecs, 1,
						&bytes, true);
					if (error != FSSH_B_OK) {
						// failed to write the page: bad, but we can't do
						// much about it
						fssh_dprintf("vm_page_allocate_page(): Failed to write "
							"page %p (cache %p, offset: %lld).\n", page,
							cache, page->offset);
					}
				}
			}

			// remove the page from the cache
			MutexLocker cacheLocker(cache->lock);
			vm_cache_remove_page(cache, page);

			// now it's ours
			page->state = PAGE_STATE_FREE;

			return page;
		}
	}
#endif	// 0

	// no page yet -- allocate a new one

	page = new(nothrow) vm_page;
	if (!page || !page->data) {
		delete page;
		return NULL;
	}

	sPagePool.allocatedPages++;

	return page;
}


static void
vm_page_get_page(vm_page* page)
{
	if (page) {
		AutoLocker<PagePool> _(sPagePool);

		// increase ref count
		page->ref_count++;

		// if the page was unused before, remove it from the unused pages list
		if (page->ref_count == 1)
			sPagePool.unusedPages.Remove(page);
	}
}


static fssh_status_t
vm_page_put_page(vm_page* page)
{
	if (!page)
		return FSSH_B_BAD_VALUE;

	AutoLocker<PagePool> _(sPagePool);

	if (page->ref_count <= 0) {
		fssh_panic("vm_page_put_page(): page %p already unreferenced!\n", page);
		return FSSH_B_BAD_VALUE;
	}

	// decrease ref count
	page->ref_count--;

	if (page->ref_count > 0)
		return FSSH_B_OK;

	// the page is unreferenced now: add it to the unused or free pages list

	if (page->state == PAGE_STATE_FREE) {
		// page is free
		// if we've maxed out the allowed number of allocated pages, free this
		// one, otherwise add it to the free list
		if (sPagePool.allocatedPages > kMaxAllocatedPages) {
			delete page;
			sPagePool.allocatedPages--;
		} else
			sPagePool.freePages.Add(page);
	} else {
		// page is not free; add to unused pages list
		sPagePool.unusedPages.Add(page);
	}

	return FSSH_B_OK;
}


fssh_status_t
vm_page_set_state(vm_page *page, int state)
{
	AutoLocker<PagePool> _(sPagePool);

	if (page->ref_count <= 0) {
		fssh_panic("vm_page_set_state(): page %p is already unreferenced!\n",
			page);
		return FSSH_B_BAD_VALUE;
	}

	if (state == page->state)
		return FSSH_B_OK;

	// If it was modified before, remove the page from the cache's modified
	// pages list.
	if (page->state == PAGE_STATE_MODIFIED && page->cache)
		page->cache->modifiedPages.Remove(page);

	switch (state) {
		case PAGE_STATE_ACTIVE:
		case PAGE_STATE_BUSY:
		case PAGE_STATE_FREE:
			page->state = state;
			break;

		case PAGE_STATE_MODIFIED:
		{
			page->state = state;

			// add page to the modified list of the cache
			if (!page->cache) {
				fssh_panic("vm_page_set_state(): setting page state of page %p "
					"to PAGE_STATE_MODIFIED, but page is not in a cache\n",
					page);
				return FSSH_B_BAD_VALUE;
			}

			page->cache->modifiedPages.Add(page);

			break;
		}

		default:
			fssh_panic("vm_page_set_state(): invalid page state: %d\n", state);
			return FSSH_B_BAD_VALUE;
	}

	return FSSH_B_OK;
}


// cache must be locked
//
void
vm_cache_insert_page(file_cache_ref *cache, vm_page *page, fssh_off_t offset)
{
	AutoLocker<PagePool> _(sPagePool);

	if (page->cache != NULL) {
		fssh_panic("vm_cache_insert_page(%p, %p): page already in cache %p\n",
			cache, page, page->cache);
		return;
	}

	page->cache = cache;
	page->offset = offset;

	// insert page into hash
	fssh_status_t error = hash_insert(sPagePool.pageHash, page);
	if (error != FSSH_B_OK) {
		fssh_panic("vm_cache_insert_page(): Failed to insert page %p into hash!\n",
			page);
		page->cache = NULL;
		page->offset = 0;
		return;
	}

	// add page to cache page list
	cache->pages.Add(page);
}


// cache must be locked
//
void
vm_cache_remove_page(file_cache_ref *cache, vm_page *page)
{
	if (cache != page->cache) {
		fssh_panic("vm_cache_remove_page(%p, %p): page is in cache %p\n",
			cache, page, page->cache);
		return;
	}

	AutoLocker<PagePool> _(sPagePool);

	if (page->state == PAGE_STATE_MODIFIED)
		cache->modifiedPages.Remove(page);

	cache->pages.Remove(page);

	hash_remove(sPagePool.pageHash, page);

	page->cache = NULL;
	page->offset = 0;
}


vm_page *
vm_cache_lookup_page(file_cache_ref *cache, fssh_off_t offset)
{
	if (!cache)
		return NULL;

	AutoLocker<PagePool> _(sPagePool);

	page_hash_key key(cache->deviceFD, cache->nodeID, offset);

	vm_page* page = (vm_page*)hash_lookup(sPagePool.pageHash, &key);

	if (page)
		vm_page_get_page(page);

	return page;
}


// cache must be locked
//
fssh_status_t
vm_cache_resize(file_cache_ref *cache, fssh_off_t newSize)
{
	fssh_off_t oldAlignedSize = PAGE_ALIGN(cache->virtual_size);
	fssh_off_t newAlignedSize = PAGE_ALIGN(newSize);

	cache->virtual_size = newSize;

	// if the aligned cache size increased or remained the same, we're done
	if (newAlignedSize >= oldAlignedSize)
		return FSSH_B_OK;

	// the file shrinks, so we need to get rid of excess pages

	// Hold the page pool lock virtually all the time from now on.
	AutoLocker<PagePool> poolLocker(sPagePool);

	// For the sake of efficiency we sort the cache's list of pages so that
	// all pages that need to be removed are at the beginning of the list.
	vm_page* page = cache->pages.Head();
	if (newAlignedSize > 0) {
		while (page) {
			vm_page* nextPage = cache->pages.GetNext(page);

			if (page->offset >= newAlignedSize) {
				// move to the beginning of the list
				cache->pages.Remove(page);
				cache->pages.Add(page, false);
			}

			page = nextPage;
		}
	}

	// now we remove and free the excess pages one by one
	while (true) {
		// get the first page in the list to remove
		// (since we sorted the list, this is usually very cheap)
		for (cache_page_list::Iterator it(&cache->pages); (page = it.Next());) {
			if (page->offset >= newAlignedSize)
				break;
		}

		// no more pages? -- then we're done
		if (!page)
			return FSSH_B_OK;

		if (page->state == PAGE_STATE_BUSY) {
			// the page is busy -- wait a while and try again
			poolLocker.Unlock();
			mutex_unlock(&cache->lock);
			fssh_snooze(20000);
			mutex_lock(&cache->lock);
			poolLocker.Lock();
		} else {
			// the page isn't busy -- get rid of it
			vm_page_get_page(page);
			vm_cache_remove_page(cache, page);
			vm_page_set_state(page, PAGE_STATE_FREE);
			vm_page_put_page(page);
		}
	}

	return FSSH_B_OK;
}


fssh_status_t
vm_cache_write_modified(file_cache_ref *cache, bool fsReenter)
{
	// TODO: Write more than one page at a time. To avoid deadlocks, when a
	// busy page is encountered the previously collected pages need to be
	// written. Otherwise as many pages as our on-stack array can contain
	// can be processed at once.
	MutexLocker locker(cache->lock);
	while (true) {
		// get the next modified page and mark it busy
		vm_page* page = NULL;

		while (true) {
			// get the first modified page
			AutoLocker<PagePool> poolLocker(sPagePool);
			page = cache->modifiedPages.Head();

			if (!page)
				return FSSH_B_OK;

			// if not marked busy, remove it and mark it busy
			if (page->state != PAGE_STATE_BUSY) {
				cache->modifiedPages.Remove(page);
				vm_page_get_page(page);
				page->state = PAGE_STATE_BUSY;
				break;
			}

			// page is busy -- wait a while (note: Head() doesn't acquire a
			// reference, so there is nothing to put here)
			poolLocker.Unlock();
			locker.Unlock();
			fssh_snooze(20000);
			locker.Lock();
		}

		locker.Unlock();

		// write the page
		fssh_size_t bytes = fssh_min_c(FSSH_B_PAGE_SIZE, cache->virtual_size - page->offset);
		fssh_iovec vecs[1];
		vecs[0].iov_base = page->data;
		vecs[0].iov_len = bytes;
		fssh_status_t error = pages_io(cache, page->offset, vecs, 1, &bytes, true);
		if (error != FSSH_B_OK)
			return error;

		locker.Lock();

		vm_page_set_state(page, PAGE_STATE_ACTIVE);
		vm_page_put_page(page);
	}

	return FSSH_B_OK;
}


fssh_status_t
vfs_read_pages(int fd, fssh_off_t pos, const fssh_iovec *vecs, fssh_size_t count,
	fssh_size_t *_numBytes, bool fsReenter)
{
	// check how much the iovecs allow us to read
	fssh_size_t toRead = 0;
	for (fssh_size_t i = 0; i < count; i++)
		toRead += vecs[i].iov_len;

	fssh_iovec* newVecs = NULL;
	if (*_numBytes < toRead) {
		// We're supposed to read less than specified by the vecs. Since
		// readv_pos() doesn't support this, we need to clone the vecs.
		newVecs = new(nothrow) fssh_iovec[count];
		if (!newVecs)
			return FSSH_B_NO_MEMORY;

		fssh_size_t newCount = 0;
		for (fssh_size_t i = 0; i < count && toRead > 0; i++) {
			fssh_size_t vecLen = fssh_min_c(vecs[i].iov_len, toRead);
			newVecs[i].iov_base = vecs[i].iov_base;
			newVecs[i].iov_len = vecLen;
			toRead -= vecLen;
			newCount++;
		}

		vecs = newVecs;
		count = newCount;
	}

	fssh_ssize_t bytesRead = fssh_readv_pos(fd, pos, vecs, count);
	delete[] newVecs;
	if (bytesRead < 0)
		return bytesRead;

	*_numBytes = bytesRead;
	return FSSH_B_OK;
}


fssh_status_t
vfs_write_pages(int fd, fssh_off_t pos, const fssh_iovec *vecs, fssh_size_t count,
	fssh_size_t *_numBytes, bool fsReenter)
{
	// check how much the iovecs allow us to write
	fssh_size_t toWrite = 0;
	for (fssh_size_t i = 0; i < count; i++)
		toWrite += vecs[i].iov_len;

	fssh_iovec* newVecs = NULL;
	if (*_numBytes < toWrite) {
		// We're supposed to write less than specified by the vecs. Since
		// writev_pos() doesn't support this, we need to clone the vecs.
		newVecs = new(nothrow) fssh_iovec[count];
		if (!newVecs)
			return FSSH_B_NO_MEMORY;

		fssh_size_t newCount = 0;
		for (fssh_size_t i = 0; i < count && toWrite > 0; i++) {
			fssh_size_t vecLen = fssh_min_c(vecs[i].iov_len, toWrite);
			newVecs[i].iov_base = vecs[i].iov_base;
			newVecs[i].iov_len = vecLen;
			toWrite -= vecLen;
			newCount++;
		}

		vecs = newVecs;
		count = newCount;
	}

	fssh_ssize_t bytesWritten = fssh_writev_pos(fd, pos, vecs, count);
	delete[] newVecs;
	if (bytesWritten < 0)
		return bytesWritten;

	*_numBytes = bytesWritten;
	return FSSH_B_OK;
}


fssh_status_t
file_cache_init()
{
	fssh_status_t error = sPagePool.Init();
	if (error != FSSH_B_OK)
		return error;

	return FSSH_B_OK;
}


// #pragma mark -


file_map::file_map()
{
	array = NULL;
	count = 0;
}


file_map::~file_map()
{
	Free();
}


file_extent *
file_map::operator[](uint32_t index)
{
	return ExtentAt(index);
}


file_extent *
file_map::ExtentAt(uint32_t index)
{
	if (index >= count)
		return NULL;

	if (count > CACHED_FILE_EXTENTS)
		return &array[index];

	return &direct[index];
}


fssh_status_t
file_map::Add(fssh_file_io_vec *vecs, fssh_size_t vecCount, fssh_off_t &lastOffset)
{
	TRACE(("file_map::Add(vecCount = %ld)\n", vecCount));

	fssh_off_t offset = 0;

	if (vecCount <= CACHED_FILE_EXTENTS && count == 0) {
		// just use the reserved area in the file_cache_ref structure
	} else {
		// TODO: once we can invalidate only parts of the file map,
		//	we might need to copy the previously cached file extents
		//	from the direct range
		file_extent *newMap = (file_extent *)realloc(array,
			(count + vecCount) * sizeof(file_extent));
		if (newMap == NULL)
			return FSSH_B_NO_MEMORY;

		array = newMap;

		if (count != 0) {
			file_extent *extent = ExtentAt(count - 1);
			offset = extent->offset + extent->disk.length;
		}
	}

	int32_t start = count;
	count += vecCount;

	for (uint32_t i = 0; i < vecCount; i++) {
		file_extent *extent = ExtentAt(start + i);

		extent->offset = offset;
		extent->disk = vecs[i];

		offset += extent->disk.length;
	}

#ifdef TRACE_FILE_CACHE
	for (uint32_t i = 0; i < count; i++) {
		file_extent *extent = ExtentAt(i);
		fssh_dprintf("[%ld] extent offset %Ld, disk offset %Ld, length %Ld\n",
			i, extent->offset, extent->disk.offset, extent->disk.length);
	}
#endif

	lastOffset = offset;
	return FSSH_B_OK;
}


void
file_map::Free()
{
	if (count > CACHED_FILE_EXTENTS)
		free(array);

	array = NULL;
	count = 0;
}


//	#pragma mark -


static void
add_to_iovec(fssh_iovec *vecs, int32_t &index, int32_t max, fssh_addr_t address, fssh_size_t size)
{
	if (index > 0 && (fssh_addr_t)vecs[index - 1].iov_base + vecs[index - 1].iov_len == address) {
		// the iovec can be combined with the previous one
		vecs[index - 1].iov_len += size;
		return;
	}

	if (index == max)
		fssh_panic("no more space for iovecs!");

	// we need to start a new iovec
	vecs[index].iov_base = (void *)address;
	vecs[index].iov_len = size;
	index++;
}


static file_extent *
find_file_extent(file_cache_ref *ref, fssh_off_t offset, uint32_t *_index)
{
	// TODO: do binary search (a sketch follows below)

	for (uint32_t index = 0; index < ref->map.count; index++) {
		file_extent *extent = ref->map[index];

		if (extent->offset <= offset
			&& extent->offset + extent->disk.length > offset) {
			if (_index)
				*_index = index;
			return extent;
		}
	}

	return NULL;
}
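

#if 0
// A binary-search variant of find_file_extent(), as the TODO above suggests.
// This is an illustrative, untested sketch; it relies on the extents being
// sorted by ascending offset, which get_file_map() establishes.
static file_extent *
find_file_extent_binary(file_cache_ref *ref, fssh_off_t offset, uint32_t *_index)
{
	uint32_t low = 0;
	uint32_t high = ref->map.count;

	while (low < high) {
		uint32_t mid = low + (high - low) / 2;
		file_extent *extent = ref->map[mid];

		if (offset < extent->offset)
			high = mid;
		else if (offset >= extent->offset + extent->disk.length)
			low = mid + 1;
		else {
			if (_index)
				*_index = mid;
			return extent;
		}
	}

	return NULL;
}
#endif	// 0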


static fssh_status_t
get_file_map(file_cache_ref *ref, fssh_off_t offset, fssh_size_t size,
	fssh_file_io_vec *vecs, fssh_size_t *_count)
{
	fssh_size_t maxVecs = *_count;
	fssh_status_t status = FSSH_B_OK;

	if (ref->map.count == 0) {
		// we don't yet have the map of this file, so let's grab it
		// (ordered by offset, so that we can do a binary search on them)

		mutex_lock(&ref->lock);

		// the file map could have been requested in the meantime
		if (ref->map.count == 0) {
			fssh_size_t vecCount = maxVecs;
			fssh_off_t mapOffset = 0;

			while (true) {
				status = vfs_get_file_map(ref->node, mapOffset, ~0UL, vecs,
					&vecCount);
				if (status < FSSH_B_OK && status != FSSH_B_BUFFER_OVERFLOW) {
					mutex_unlock(&ref->lock);
					return status;
				}

				fssh_status_t addStatus = ref->map.Add(vecs, vecCount, mapOffset);
				if (addStatus != FSSH_B_OK) {
					// only clobber the status in case of failure
					status = addStatus;
				}

				if (status != FSSH_B_BUFFER_OVERFLOW)
					break;

				// when we are here, the map has been stored in the array, and
				// the array size was still too small to cover the whole file
				vecCount = maxVecs;
			}
		}

		mutex_unlock(&ref->lock);
	}

	if (status != FSSH_B_OK) {
		// We must invalidate the (part of the) map we already
		// have, as we cannot know if it's complete or not
		ref->map.Free();
		return status;
	}

	// We now have cached the map of this file, and need to
	// translate it for the requested access.

	uint32_t index;
	file_extent *fileExtent = find_file_extent(ref, offset, &index);
	if (fileExtent == NULL) {
		// access outside file bounds? But that's not our problem
		*_count = 0;
		return FSSH_B_OK;
	}

	offset -= fileExtent->offset;
	vecs[0].offset = fileExtent->disk.offset + offset;
	vecs[0].length = fileExtent->disk.length - offset;

	if (vecs[0].length >= size || index >= ref->map.count - 1) {
		*_count = 1;
		return FSSH_B_OK;
	}

	// copy the rest of the vecs

	size -= vecs[0].length;

	for (index = 1; index < ref->map.count;) {
		fileExtent++;

		vecs[index] = fileExtent->disk;
		index++;

		if (size <= fileExtent->disk.length)
			break;

		if (index >= maxVecs) {
			*_count = index;
			return FSSH_B_BUFFER_OVERFLOW;
		}

		size -= fileExtent->disk.length;
	}

	*_count = index;
	return FSSH_B_OK;
}


/*!
	Does the dirty work of translating the request into actual disk offsets
	and reads to or writes from the supplied iovecs as specified by \a doWrite.
*/
static fssh_status_t
pages_io(file_cache_ref *ref, fssh_off_t offset, const fssh_iovec *vecs, fssh_size_t count,
	fssh_size_t *_numBytes, bool doWrite)
{
	TRACE(("pages_io: ref = %p, offset = %Ld, size = %lu, vecCount = %lu, %s\n",
		ref, offset, *_numBytes, count, doWrite ? "write" : "read"));

	// translate the iovecs into direct device accesses
	fssh_file_io_vec fileVecs[MAX_FILE_IO_VECS];
	fssh_size_t fileVecCount = MAX_FILE_IO_VECS;
	fssh_size_t numBytes = *_numBytes;

	fssh_status_t status = get_file_map(ref, offset, numBytes, fileVecs,
		&fileVecCount);
	if (status < FSSH_B_OK && status != FSSH_B_BUFFER_OVERFLOW) {
		TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed: %s\n",
			offset, numBytes, fssh_strerror(status)));
		return status;
	}

	bool bufferOverflow = status == FSSH_B_BUFFER_OVERFLOW;

#ifdef TRACE_FILE_CACHE
	fssh_dprintf("got %lu file vecs for %Ld:%lu%s:\n", fileVecCount, offset,
		numBytes, bufferOverflow ? " (array too small)" : "");
	for (fssh_size_t i = 0; i < fileVecCount; i++) {
		fssh_dprintf("  [%lu] offset = %Ld, size = %Ld\n",
			i, fileVecs[i].offset, fileVecs[i].length);
	}
#endif

	if (fileVecCount == 0) {
		// There are no file vecs at this offset, so we're obviously trying
		// to access the file outside of its bounds
		TRACE(("pages_io: access outside of node %p at offset %Ld\n",
			ref->node, offset));
		return FSSH_B_BAD_VALUE;
	}

	uint32_t fileVecIndex;
	fssh_size_t size;

	if (!doWrite) {
		// now directly read the data from the device
		// the first file_io_vec can be read directly

		size = fileVecs[0].length;
		if (size > numBytes)
			size = numBytes;

		status = vfs_read_pages(ref->deviceFD, fileVecs[0].offset, vecs,
			count, &size, false);
		if (status < FSSH_B_OK)
			return status;

		// TODO: this is a work-around for buggy device drivers!
		//	When our own drivers honour the length, we can:
		//	a) also use this direct I/O for writes (otherwise, it would
		//	   overwrite precious data)
		//	b) panic if the term below is true (at least for writes)
		if (size > fileVecs[0].length) {
			//fssh_dprintf("warning: device driver %p doesn't respect total length in read_pages() call!\n", ref->device);
			size = fileVecs[0].length;
		}

		ASSERT(size <= fileVecs[0].length);

		// If the file portion was contiguous, we're already done now
		if (size == numBytes)
			return FSSH_B_OK;

		// if we reached the end of the file, we can return as well
		if (size != fileVecs[0].length) {
			*_numBytes = size;
			return FSSH_B_OK;
		}

		fileVecIndex = 1;
	} else {
		fileVecIndex = 0;
		size = 0;
	}

	// Too bad, let's process the rest of the file_io_vecs

	fssh_size_t totalSize = size;

	// first, find out where we have to continue in our iovecs
	uint32_t i = 0;
	for (; i < count; i++) {
		if (size < vecs[i].iov_len)
			break;

		size -= vecs[i].iov_len;
	}

	fssh_size_t vecOffset = size;
	fssh_size_t bytesLeft = numBytes - size;

	while (true) {
		for (; fileVecIndex < fileVecCount; fileVecIndex++) {
			fssh_file_io_vec &fileVec = fileVecs[fileVecIndex];
			fssh_off_t fileOffset = fileVec.offset;
			fssh_off_t fileLeft = fssh_min_c(fileVec.length, bytesLeft);

			TRACE(("FILE VEC [%lu] length %Ld\n", fileVecIndex, fileLeft));

			// process the complete fileVec
			while (fileLeft > 0) {
				fssh_iovec tempVecs[MAX_TEMP_IO_VECS];
				uint32_t tempCount = 0;

				// size tracks how much of what is left of the current fileVec
				// (fileLeft) has been assigned to tempVecs
				size = 0;

				// assign what is left of the current fileVec to the tempVecs
				for (size = 0; size < fileLeft && i < count
						&& tempCount < MAX_TEMP_IO_VECS;) {
					// try to satisfy one iovec per iteration (or as much as
					// possible)

					// bytes left of the current iovec
					fssh_size_t vecLeft = vecs[i].iov_len - vecOffset;
					if (vecLeft == 0) {
						vecOffset = 0;
						i++;
						continue;
					}

					TRACE(("fill vec %ld, offset = %lu, size = %lu\n",
						i, vecOffset, size));

					// actually available bytes
					fssh_size_t tempVecSize = fssh_min_c(vecLeft, fileLeft - size);

					tempVecs[tempCount].iov_base
						= (void *)((fssh_addr_t)vecs[i].iov_base + vecOffset);
					tempVecs[tempCount].iov_len = tempVecSize;
					tempCount++;

					size += tempVecSize;
					vecOffset += tempVecSize;
				}

				fssh_size_t bytes = size;
				if (doWrite) {
					status = vfs_write_pages(ref->deviceFD, fileOffset,
						tempVecs, tempCount, &bytes, false);
				} else {
					status = vfs_read_pages(ref->deviceFD, fileOffset,
						tempVecs, tempCount, &bytes, false);
				}
				if (status < FSSH_B_OK)
					return status;

				totalSize += bytes;
				bytesLeft -= size;
				fileOffset += size;
				fileLeft -= size;
				//fssh_dprintf("-> file left = %Lu\n", fileLeft);

				if (size != bytes || i >= count) {
					// there are no more bytes or iovecs, let's bail out
					*_numBytes = totalSize;
					return FSSH_B_OK;
				}
			}
		}

		if (bufferOverflow) {
			status = get_file_map(ref, offset + totalSize, bytesLeft, fileVecs,
				&fileVecCount);
			if (status < FSSH_B_OK && status != FSSH_B_BUFFER_OVERFLOW) {
				TRACE(("get_file_map(offset = %Ld, numBytes = %lu) failed: %s\n",
					offset, numBytes, fssh_strerror(status)));
				return status;
			}

			bufferOverflow = status == FSSH_B_BUFFER_OVERFLOW;
			fileVecIndex = 0;

#ifdef TRACE_FILE_CACHE
			fssh_dprintf("got %lu file vecs for %Ld:%lu%s:\n", fileVecCount,
				offset + totalSize, numBytes,
				bufferOverflow ? " (array too small)" : "");
			for (fssh_size_t i = 0; i < fileVecCount; i++) {
				fssh_dprintf("  [%lu] offset = %Ld, size = %Ld\n",
					i, fileVecs[i].offset, fileVecs[i].length);
			}
#endif
		} else
			break;
	}

	*_numBytes = totalSize;
	return FSSH_B_OK;
}


/*!
	This function is called by read_into_cache() (and from there only) - it
	can only handle a certain amount of bytes, and read_into_cache() makes
	sure that it matches that criterion.
*/
static inline fssh_status_t
read_chunk_into_cache(file_cache_ref *ref, fssh_off_t offset, fssh_size_t numBytes,
	int32_t pageOffset, fssh_addr_t buffer, fssh_size_t bufferSize)
{
	TRACE(("read_chunk(offset = %Ld, size = %lu, pageOffset = %ld, buffer = %#lx, bufferSize = %lu)\n",
		offset, numBytes, pageOffset, buffer, bufferSize));

	fssh_iovec vecs[MAX_IO_VECS];
	int32_t vecCount = 0;

	vm_page *pages[MAX_IO_VECS];
	int32_t pageIndex = 0;

	// allocate pages for the cache and mark them busy
	for (fssh_size_t pos = 0; pos < numBytes; pos += FSSH_B_PAGE_SIZE) {
		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
		if (page == NULL)
			fssh_panic("no more pages!");

		page->state = PAGE_STATE_BUSY;

		vm_cache_insert_page(ref, page, offset + pos);

		fssh_addr_t virtualAddress = page->Address();

		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, FSSH_B_PAGE_SIZE);
		// ToDo: check if the array is large enough!
	}

	mutex_unlock(&ref->lock);

	// read file into reserved pages
	fssh_status_t status = pages_io(ref, offset, vecs, vecCount, &numBytes, false);
	if (status < FSSH_B_OK) {
		// reading failed, free allocated pages

		fssh_dprintf("file_cache: read pages failed: %s\n", fssh_strerror(status));

		mutex_lock(&ref->lock);

		for (int32_t i = 0; i < pageIndex; i++) {
			vm_cache_remove_page(ref, pages[i]);
			vm_page_set_state(pages[i], PAGE_STATE_FREE);
			vm_page_put_page(pages[i]);
		}

		return status;
	}

	// copy the page contents to the user buffer

	for (int32_t i = 0; i < vecCount; i++) {
		fssh_addr_t base = (fssh_addr_t)vecs[i].iov_base;
		fssh_size_t size = vecs[i].iov_len;

		// copy to user buffer if necessary
		if (bufferSize != 0) {
			fssh_size_t bytes = fssh_min_c(bufferSize, size - pageOffset);

			user_memcpy((void *)buffer, (void *)(base + pageOffset), bytes);
			buffer += bytes;
			bufferSize -= bytes;
			pageOffset = 0;
		}
	}

	mutex_lock(&ref->lock);

	// make the pages accessible in the cache
	for (int32_t i = pageIndex; i-- > 0;) {
		vm_page_set_state(pages[i], PAGE_STATE_ACTIVE);
		vm_page_put_page(pages[i]);
	}

	return FSSH_B_OK;
}


/*!
	This function reads \a size bytes directly from the file into the cache.
	If \a bufferSize does not equal zero, \a bufferSize bytes from the data
	read in are also copied to the provided \a buffer.
	This function always allocates all pages; it is the responsibility of the
	calling function to only ask for yet uncached ranges.
	The cache_ref lock must be held when calling this function.
*/
static fssh_status_t
read_into_cache(file_cache_ref *ref, fssh_off_t offset, fssh_size_t size, fssh_addr_t buffer, fssh_size_t bufferSize)
{
	TRACE(("read_into_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
		ref, offset, size, (void *)buffer, bufferSize));

	// do we have to read in anything at all?
	if (size == 0)
		return FSSH_B_OK;

	// make sure "offset" is page aligned - but also remember the page offset
	int32_t pageOffset = offset & (FSSH_B_PAGE_SIZE - 1);
	size = PAGE_ALIGN(size + pageOffset);
	offset -= pageOffset;

	while (true) {
		fssh_size_t chunkSize = size;
		if (chunkSize > (MAX_IO_VECS * FSSH_B_PAGE_SIZE))
			chunkSize = MAX_IO_VECS * FSSH_B_PAGE_SIZE;

		fssh_status_t status = read_chunk_into_cache(ref, offset, chunkSize, pageOffset,
								buffer, bufferSize);
		if (status != FSSH_B_OK)
			return status;

		if ((size -= chunkSize) == 0)
			return FSSH_B_OK;

		if (chunkSize >= bufferSize) {
			bufferSize = 0;
			buffer = 0;
		} else {
			bufferSize -= chunkSize - pageOffset;
			buffer += chunkSize - pageOffset;
		}

		offset += chunkSize;
		pageOffset = 0;
	}

	return FSSH_B_OK;
}


/**	Like read_chunk_into_cache() but writes data into the cache */

static inline fssh_status_t
write_chunk_to_cache(file_cache_ref *ref, fssh_off_t offset, fssh_size_t numBytes,
	int32_t pageOffset, fssh_addr_t buffer, fssh_size_t bufferSize)
{
	fssh_iovec vecs[MAX_IO_VECS];
	int32_t vecCount = 0;
	vm_page *pages[MAX_IO_VECS];
	int32_t pageIndex = 0;
	fssh_status_t status = FSSH_B_OK;

	// ToDo: this should be settable somewhere
	bool writeThrough = false;

	// allocate pages for the cache and mark them busy
	for (fssh_size_t pos = 0; pos < numBytes; pos += FSSH_B_PAGE_SIZE) {
		// ToDo: if space is becoming tight, and this cache is already grown
		//	big - shouldn't we better steal the pages directly in that case?
		//	(a working set like approach for the file cache)
		vm_page *page = pages[pageIndex++] = vm_page_allocate_page(PAGE_STATE_FREE);
		page->state = PAGE_STATE_BUSY;

		vm_cache_insert_page(ref, page, offset + pos);

		fssh_addr_t virtualAddress = page->Address();

		add_to_iovec(vecs, vecCount, MAX_IO_VECS, virtualAddress, FSSH_B_PAGE_SIZE);
		// ToDo: check if the array is large enough!
	}

	mutex_unlock(&ref->lock);

	// copy contents (and read in partially written pages first)

	if (pageOffset != 0) {
		// This is only a partial write, so we have to read the rest of the page
		// from the file to have consistent data in the cache
		fssh_iovec readVec = { vecs[0].iov_base, FSSH_B_PAGE_SIZE };
		fssh_size_t bytesRead = FSSH_B_PAGE_SIZE;

		status = pages_io(ref, offset, &readVec, 1, &bytesRead, false);
		// ToDo: handle errors for real!
		if (status < FSSH_B_OK)
			fssh_panic("1. pages_io() failed: %s!\n", fssh_strerror(status));
	}

	fssh_addr_t lastPageOffset = (pageOffset + bufferSize) & (FSSH_B_PAGE_SIZE - 1);
	if (lastPageOffset != 0) {
		// get the last page in the I/O vectors
		fssh_addr_t last = (fssh_addr_t)vecs[vecCount - 1].iov_base
			+ vecs[vecCount - 1].iov_len - FSSH_B_PAGE_SIZE;

		if (offset + pageOffset + bufferSize == ref->virtual_size) {
			// the space in the page after this write action needs to be cleaned
			fssh_memset((void *)(last + lastPageOffset), 0, FSSH_B_PAGE_SIZE - lastPageOffset);
		} else if (vecCount > 1) {
			// the end of this write does not happen on a page boundary, so we
			// need to fetch the last page before we can update it
			fssh_iovec readVec = { (void *)last, FSSH_B_PAGE_SIZE };
			fssh_size_t bytesRead = FSSH_B_PAGE_SIZE;

			status = pages_io(ref, offset + numBytes - FSSH_B_PAGE_SIZE, &readVec, 1,
				&bytesRead, false);
			// ToDo: handle errors for real!
			if (status < FSSH_B_OK)
				fssh_panic("pages_io() failed: %s!\n", fssh_strerror(status));
		}
	}

	for (int32_t i = 0; i < vecCount; i++) {
		fssh_addr_t base = (fssh_addr_t)vecs[i].iov_base;
		fssh_size_t bytes = fssh_min_c(bufferSize, fssh_size_t(vecs[i].iov_len - pageOffset));

		// copy data from user buffer
		user_memcpy((void *)(base + pageOffset), (void *)buffer, bytes);

		bufferSize -= bytes;
		if (bufferSize == 0)
			break;

		buffer += bytes;
		pageOffset = 0;
	}

	if (writeThrough) {
		// write cached pages back to the file if we were asked to do that
		fssh_status_t status = pages_io(ref, offset, vecs, vecCount, &numBytes, true);
		if (status < FSSH_B_OK) {
			// ToDo: remove allocated pages, ...?
			fssh_panic("file_cache: remove allocated pages! write pages failed: %s\n",
				fssh_strerror(status));
		}
	}

	mutex_lock(&ref->lock);

	// make the pages accessible in the cache again
	for (int32_t i = pageIndex; i-- > 0;) {
		if (writeThrough)
			vm_page_set_state(pages[i], PAGE_STATE_ACTIVE);
		else
			vm_page_set_state(pages[i], PAGE_STATE_MODIFIED);
		vm_page_put_page(pages[i]);
	}

	return status;
}


/**	Like read_into_cache() but writes data into the cache. To preserve data consistency,
 *	it might also read pages into the cache, though, if only a partial page gets written.
 *	The cache_ref lock must be held when calling this function.
 */

static fssh_status_t
write_to_cache(file_cache_ref *ref, fssh_off_t offset, fssh_size_t size, fssh_addr_t buffer, fssh_size_t bufferSize)
{
	TRACE(("write_to_cache: ref = %p, offset = %Ld, size = %lu, buffer = %p, bufferSize = %lu\n",
		ref, offset, size, (void *)buffer, bufferSize));

	// make sure "offset" is page aligned - but also remember the page offset
	int32_t pageOffset = offset & (FSSH_B_PAGE_SIZE - 1);
	size = PAGE_ALIGN(size + pageOffset);
	offset -= pageOffset;

	while (true) {
		fssh_size_t chunkSize = size;
		if (chunkSize > (MAX_IO_VECS * FSSH_B_PAGE_SIZE))
			chunkSize = MAX_IO_VECS * FSSH_B_PAGE_SIZE;

		fssh_status_t status = write_chunk_to_cache(ref, offset, chunkSize, pageOffset, buffer, bufferSize);
		if (status != FSSH_B_OK)
			return status;

		if ((size -= chunkSize) == 0)
			return FSSH_B_OK;

		if (chunkSize >= bufferSize) {
			bufferSize = 0;
			buffer = 0;
		} else {
			bufferSize -= chunkSize - pageOffset;
			buffer += chunkSize - pageOffset;
		}

		offset += chunkSize;
		pageOffset = 0;
	}

	return FSSH_B_OK;
}


static fssh_status_t
satisfy_cache_io(file_cache_ref *ref, fssh_off_t offset, fssh_addr_t buffer, fssh_addr_t lastBuffer,
	bool doWrite)
{
	fssh_size_t requestSize = buffer - lastBuffer;

	if (doWrite)
		return write_to_cache(ref, offset, requestSize, lastBuffer, requestSize);

	return read_into_cache(ref, offset, requestSize, lastBuffer, requestSize);
}


static fssh_status_t
cache_io(void *_cacheRef, fssh_off_t offset, fssh_addr_t buffer, fssh_size_t *_size, bool doWrite)
{
	if (_cacheRef == NULL)
		fssh_panic("cache_io() called with NULL ref!\n");

	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
	fssh_off_t fileSize = ref->virtual_size;

	TRACE(("cache_io(ref = %p, offset = %Ld, buffer = %p, size = %lu, %s)\n",
		ref, offset, (void *)buffer, *_size, doWrite ? "write" : "read"));

	// out of bounds access?
	if (offset >= fileSize || offset < 0) {
		*_size = 0;
		return FSSH_B_OK;
	}

	int32_t pageOffset = offset & (FSSH_B_PAGE_SIZE - 1);
	fssh_size_t size = *_size;
	offset -= pageOffset;

	if (offset + pageOffset + size > fileSize) {
		// adapt size to be within the file's offsets
		size = fileSize - pageOffset - offset;
		*_size = size;
	}

	// "offset" and "lastOffset" are always aligned to FSSH_B_PAGE_SIZE,
	// the "last*" variables always point to the end of the last
	// satisfied request part

	fssh_size_t bytesLeft = size, lastLeft = size;
	int32_t lastPageOffset = pageOffset;
	fssh_addr_t lastBuffer = buffer;
	fssh_off_t lastOffset = offset;

	mutex_lock(&ref->lock);

	for (; bytesLeft > 0; offset += FSSH_B_PAGE_SIZE) {
		// check if this page is already in memory
	restart:
		vm_page *page = vm_cache_lookup_page(ref, offset);
		PagePutter pagePutter(page);

		if (page != NULL) {
			// The page may be busy - since we need to unlock the cache
			// sometime in the near future, we need to satisfy the request
			// for the pages we didn't get yet (to make sure no one else
			// interferes in the meantime).
			fssh_status_t status = FSSH_B_OK;

			if (lastBuffer != buffer) {
				status = satisfy_cache_io(ref, lastOffset + lastPageOffset,
					buffer, lastBuffer, doWrite);
				if (status == FSSH_B_OK) {
					lastBuffer = buffer;
					lastLeft = bytesLeft;
					lastOffset = offset;
					lastPageOffset = 0;
					pageOffset = 0;
				}
			}

			if (status != FSSH_B_OK) {
				mutex_unlock(&ref->lock);
				return status;
			}

			if (page->state == PAGE_STATE_BUSY) {
				mutex_unlock(&ref->lock);
				// ToDo: don't wait forever!
				fssh_snooze(20000);
				mutex_lock(&ref->lock);
				goto restart;
			}
		}

		fssh_size_t bytesInPage = fssh_min_c(fssh_size_t(FSSH_B_PAGE_SIZE - pageOffset), bytesLeft);
		fssh_addr_t virtualAddress;

		TRACE(("lookup page from offset %Ld: %p, size = %lu, pageOffset = %lu\n", offset, page, bytesLeft, pageOffset));
		if (page != NULL) {
			virtualAddress = page->Address();

			// and copy the contents of the page already in memory
			if (doWrite) {
				user_memcpy((void *)(virtualAddress + pageOffset), (void *)buffer, bytesInPage);

				// make sure the page is in the modified list
				if (page->state != PAGE_STATE_MODIFIED)
					vm_page_set_state(page, PAGE_STATE_MODIFIED);
			} else
				user_memcpy((void *)buffer, (void *)(virtualAddress + pageOffset), bytesInPage);

			if (bytesLeft <= bytesInPage) {
				// we've read the last page, so we're done!
				mutex_unlock(&ref->lock);
				return FSSH_B_OK;
			}

			// prepare a potential gap request
			lastBuffer = buffer + bytesInPage;
			lastLeft = bytesLeft - bytesInPage;
			lastOffset = offset + FSSH_B_PAGE_SIZE;
			lastPageOffset = 0;
		}

		if (bytesLeft <= bytesInPage)
			break;

		buffer += bytesInPage;
		bytesLeft -= bytesInPage;
		pageOffset = 0;
	}

	// fill the last remaining bytes of the request (either write or read)

	fssh_status_t status;
	if (doWrite)
		status = write_to_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);
	else
		status = read_into_cache(ref, lastOffset + lastPageOffset, lastLeft, lastBuffer, lastLeft);

	mutex_unlock(&ref->lock);
	return status;
}


}	// namespace FSShell


//	#pragma mark - public FS API


using namespace FSShell;


void *
fssh_file_cache_create(fssh_mount_id mountID, fssh_vnode_id vnodeID,
	fssh_off_t size, int fd)
{
	TRACE(("file_cache_create(mountID = %ld, vnodeID = %Ld, size = %Ld, fd = %d)\n", mountID, vnodeID, size, fd));

	file_cache_ref *ref = new(nothrow) file_cache_ref;
	if (ref == NULL)
		return NULL;

	ref->mountID = mountID;
	ref->nodeID = vnodeID;
	ref->deviceFD = fd;
	ref->virtual_size = size;

	// get vnode
	fssh_status_t error = vfs_lookup_vnode(mountID, vnodeID, &ref->node);
	if (error != FSSH_B_OK) {
		fssh_dprintf("file_cache_create(): Failed to get vnode %d:%lld: %s\n",
			mountID, vnodeID, fssh_strerror(error));
		delete ref;
		return NULL;
	}

	// create lock
	char buffer[32];
	fssh_snprintf(buffer, sizeof(buffer), "file cache %d:%lld", (int)mountID, vnodeID);
	error = mutex_init(&ref->lock, buffer);
	if (error != FSSH_B_OK) {
		fssh_dprintf("file_cache_create(): Failed to init mutex: %s\n",
			fssh_strerror(error));
		delete ref;
		return NULL;
	}

	return ref;
}


void
fssh_file_cache_delete(void *_cacheRef)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	if (ref == NULL)
		return;

	TRACE(("file_cache_delete(ref = %p)\n", ref));

	// write all modified pages and resize the cache to 0 to free all pages
	// it contains
	vm_cache_write_modified(ref, false);

	mutex_lock(&ref->lock);
	vm_cache_resize(ref, 0);

	mutex_destroy(&ref->lock);

	delete ref;
}


fssh_status_t
fssh_file_cache_set_size(void *_cacheRef, fssh_off_t size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	TRACE(("file_cache_set_size(ref = %p, size = %Ld)\n", ref, size));

	if (ref == NULL)
		return FSSH_B_OK;

	fssh_file_cache_invalidate_file_map(_cacheRef, 0, size);
		// ToDo: make this better (we would only need to extend or shrink the map)

	mutex_lock(&ref->lock);
	fssh_status_t status = vm_cache_resize(ref, size);
	mutex_unlock(&ref->lock);

	return status;
}


fssh_status_t
fssh_file_cache_sync(void *_cacheRef)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;
	if (ref == NULL)
		return FSSH_B_BAD_VALUE;

	return vm_cache_write_modified(ref, true);
}


fssh_status_t
fssh_file_cache_read_pages(void *_cacheRef, fssh_off_t offset, const fssh_iovec *vecs, fssh_size_t count, fssh_size_t *_numBytes)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	return pages_io(ref, offset, vecs, count, _numBytes, false);
}


fssh_status_t
fssh_file_cache_write_pages(void *_cacheRef, fssh_off_t offset, const fssh_iovec *vecs, fssh_size_t count, fssh_size_t *_numBytes)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	fssh_status_t status = pages_io(ref, offset, vecs, count, _numBytes, true);
	TRACE(("file_cache_write_pages(ref = %p, offset = %Ld, vecs = %p, count = %lu, bytes = %lu) = %ld\n",
		ref, offset, vecs, count, *_numBytes, status));

	return status;
}


fssh_status_t
fssh_file_cache_read(void *_cacheRef, fssh_off_t offset, void *bufferBase, fssh_size_t *_size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	TRACE(("file_cache_read(ref = %p, offset = %Ld, buffer = %p, size = %lu)\n",
		ref, offset, bufferBase, *_size));

	return cache_io(ref, offset, (fssh_addr_t)bufferBase, _size, false);
}


fssh_status_t
fssh_file_cache_write(void *_cacheRef, fssh_off_t offset, const void *buffer, fssh_size_t *_size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	fssh_status_t status = cache_io(ref, offset, (fssh_addr_t)const_cast<void *>(buffer), _size, true);
	TRACE(("file_cache_write(ref = %p, offset = %Ld, buffer = %p, size = %lu) = %ld\n",
		ref, offset, buffer, *_size, status));

	return status;
}


fssh_status_t
fssh_file_cache_invalidate_file_map(void *_cacheRef, fssh_off_t offset, fssh_off_t size)
{
	file_cache_ref *ref = (file_cache_ref *)_cacheRef;

	// ToDo: honour offset/size parameters

	TRACE(("file_cache_invalidate_file_map(offset = %Ld, size = %Ld)\n", offset, size));
	mutex_lock(&ref->lock);
	ref->map.Free();
	mutex_unlock(&ref->lock);
	return FSSH_B_OK;
}
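

#if 0
// Illustrative usage sketch of the public API above (not part of this file).
// The mountID/vnodeID/deviceFD values are placeholders for identifiers the
// FS shell's VFS layer would provide.
static void
example_usage(fssh_mount_id mountID, fssh_vnode_id vnodeID, int deviceFD)
{
	const char data[] = "hello";
	fssh_size_t size = sizeof(data);

	void *cache = fssh_file_cache_create(mountID, vnodeID, /*size*/ 0, deviceFD);
	if (cache == NULL)
		return;

	fssh_file_cache_set_size(cache, size);	// grow the cached file
	fssh_file_cache_write(cache, 0, data, &size);
	fssh_file_cache_sync(cache);		// flush modified pages to disk
	fssh_file_cache_delete(cache);		// writes back and frees all pages
}
#endif	// 0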
1870