/*
 * Copyright 2008, Zhao Shuai, upczhsh@163.com.
 * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
 * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
 * Distributed under the terms of the MIT License.
 *
 * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
 * Distributed under the terms of the NewOS License.
 *
 * Copyright 2011-2012 Haiku, Inc. All rights reserved.
 * Distributed under the terms of the MIT License.
 *
 * Authors:
 *		Hamish Morrison, hamish@lavabit.com
 *		Alexander von Gluck IV, kallisti5@unixzen.com
 */


#include "VMAnonymousCache.h"

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <FindDirectory.h>
#include <KernelExport.h>
#include <NodeMonitor.h>
#include <StackOrHeapArray.h>

#include <arch_config.h>
#include <boot_device.h>
#include <disk_device_manager/KDiskDevice.h>
#include <disk_device_manager/KDiskDeviceManager.h>
#include <disk_device_manager/KDiskSystem.h>
#include <disk_device_manager/KPartitionVisitor.h>
#include <driver_settings.h>
#include <fs/fd.h>
#include <fs/KPath.h>
#include <fs_info.h>
#include <fs_interface.h>
#include <heap.h>
#include <kernel_daemon.h>
#include <slab/Slab.h>
#include <syscalls.h>
#include <system_info.h>
#include <tracing.h>
#include <util/AutoLock.h>
#include <util/DoublyLinkedList.h>
#include <util/OpenHashTable.h>
#include <util/RadixBitmap.h>
#include <vfs.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_priv.h>
#include <vm/VMAddressSpace.h>

#include "IORequest.h"


#if	ENABLE_SWAP_SUPPORT

//#define TRACE_VM_ANONYMOUS_CACHE
#ifdef TRACE_VM_ANONYMOUS_CACHE
#	define TRACE(x...) dprintf(x)
#else
#	define TRACE(x...) do { } while (false)
#endif


// minimum number of free swap blocks the object cache shall keep in reserve
#define MIN_SWAP_BLOCK_RESERVE	4096

// interval at which the hash resizer is triggered (in units of 0.1s)
#define SWAP_HASH_RESIZE_INTERVAL	5

#define INITIAL_SWAP_HASH_SIZE		1024

#define SWAP_SLOT_NONE	RADIX_SLOT_NONE

#define SWAP_BLOCK_PAGES 32
#define SWAP_BLOCK_SHIFT 5		/* 1 << SWAP_BLOCK_SHIFT == SWAP_BLOCK_PAGES */
#define SWAP_BLOCK_MASK  (SWAP_BLOCK_PAGES - 1)
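// A cache page index p thus belongs to the swap block covering
// p & ~SWAP_BLOCK_MASK, at in-block slot p & SWAP_BLOCK_MASK; e.g. with
// SWAP_BLOCK_PAGES == 32, page index 70 lands in the block starting at
// page 64, slot 6.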


static const char* const kDefaultSwapPath = "/var/swap";

struct swap_file : DoublyLinkedListLinkImpl<swap_file> {
	int				fd;
	struct vnode*	vnode;
	void*			cookie;
	swap_addr_t		first_slot;
	swap_addr_t		last_slot;
	radix_bitmap*	bmp;
};

struct swap_hash_key {
	VMAnonymousCache	*cache;
	off_t				page_index;  // page index in the cache
};

// Each swap block contains the swap address information for
// SWAP_BLOCK_PAGES contiguous pages from the same cache.
struct swap_block {
	swap_block*		hash_link;
	swap_hash_key	key;
	uint32			used;
	swap_addr_t		swap_slots[SWAP_BLOCK_PAGES];
};

struct SwapHashTableDefinition {
	typedef swap_hash_key KeyType;
	typedef swap_block ValueType;

	SwapHashTableDefinition() {}

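	// The hash mixes the swap block index with the cache pointer, so that
	// blocks belonging to different caches scatter across the table.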
	size_t HashKey(const swap_hash_key& key) const
	{
		off_t blockIndex = key.page_index >> SWAP_BLOCK_SHIFT;
		VMAnonymousCache* cache = key.cache;
		return blockIndex ^ (size_t)(int*)cache;
	}

	size_t Hash(const swap_block* value) const
	{
		return HashKey(value->key);
	}

	bool Compare(const swap_hash_key& key, const swap_block* value) const
	{
		return (key.page_index & ~(off_t)SWAP_BLOCK_MASK)
				== (value->key.page_index & ~(off_t)SWAP_BLOCK_MASK)
			&& key.cache == value->key.cache;
	}

	swap_block*& GetLink(swap_block* value) const
	{
		return value->hash_link;
	}
};

typedef BOpenHashTable<SwapHashTableDefinition> SwapHashTable;
typedef DoublyLinkedList<swap_file> SwapFileList;

static SwapHashTable sSwapHashTable;
static rw_lock sSwapHashLock;

static SwapFileList sSwapFileList;
static mutex sSwapFileListLock;
static swap_file* sSwapFileAlloc = NULL; // allocate from here
static uint32 sSwapFileCount = 0;

static off_t sAvailSwapSpace = 0;
static mutex sAvailSwapSpaceLock;

static object_cache* sSwapBlockCache;


#if SWAP_TRACING
namespace SwapTracing {

class SwapTraceEntry : public AbstractTraceEntry {
public:
	SwapTraceEntry(VMAnonymousCache* cache)
		:
		fCache(cache)
	{
	}

protected:
	VMAnonymousCache*	fCache;
};


class ReadPage : public SwapTraceEntry {
public:
	ReadPage(VMAnonymousCache* cache, page_num_t pageIndex,
		swap_addr_t swapSlotIndex)
		:
		SwapTraceEntry(cache),
		fPageIndex(pageIndex),
		fSwapSlotIndex(swapSlotIndex)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("swap read:  cache %p, page index: %lu <- swap slot: %lu",
			fCache, fPageIndex, fSwapSlotIndex);
	}

private:
	page_num_t		fPageIndex;
	swap_addr_t		fSwapSlotIndex;
};


class WritePage : public SwapTraceEntry {
public:
	WritePage(VMAnonymousCache* cache, page_num_t pageIndex,
		swap_addr_t swapSlotIndex)
		:
		SwapTraceEntry(cache),
		fPageIndex(pageIndex),
		fSwapSlotIndex(swapSlotIndex)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		out.Print("swap write: cache %p, page index: %lu -> swap slot: %lu",
			fCache, fPageIndex, fSwapSlotIndex);
	}

private:
	page_num_t		fPageIndex;
	swap_addr_t		fSwapSlotIndex;
};

}	// namespace SwapTracing

#	define T(x) new(std::nothrow) SwapTracing::x;
#else
#	define T(x) ;
#endif


static int
dump_swap_info(int argc, char** argv)
{
	swap_addr_t totalSwapPages = 0;
	swap_addr_t freeSwapPages = 0;

	kprintf("swap files:\n");

	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* file = it.Next();) {
		swap_addr_t total = file->last_slot - file->first_slot;
		kprintf("  vnode: %p, pages: total: %" B_PRIu32 ", free: %" B_PRIu32
			"\n", file->vnode, total, file->bmp->free_slots);

		totalSwapPages += total;
		freeSwapPages += file->bmp->free_slots;
	}

	kprintf("\n");
	kprintf("swap space in pages:\n");
	kprintf("total:     %9" B_PRIu32 "\n", totalSwapPages);
	kprintf("available: %9" B_PRIdOFF "\n", sAvailSwapSpace / B_PAGE_SIZE);
	kprintf("reserved:  %9" B_PRIdOFF "\n",
		totalSwapPages - sAvailSwapSpace / B_PAGE_SIZE);
	kprintf("used:      %9" B_PRIu32 "\n", totalSwapPages - freeSwapPages);
	kprintf("free:      %9" B_PRIu32 "\n", freeSwapPages);

	return 0;
}


static swap_addr_t
swap_slot_alloc(uint32 count)
{
	mutex_lock(&sSwapFileListLock);

	if (sSwapFileList.IsEmpty()) {
		mutex_unlock(&sSwapFileListLock);
		panic("swap_slot_alloc(): no swap file in the system\n");
		return SWAP_SLOT_NONE;
	}

	// Since the radix bitmap cannot handle allocations of more than
	// BITMAP_RADIX pages, we return SWAP_SLOT_NONE; this forces Write() to
	// adjust the allocation amount.
	if (count > BITMAP_RADIX) {
		mutex_unlock(&sSwapFileListLock);
		return SWAP_SLOT_NONE;
	}

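	// Round-robin over the swap files, starting with the one we last
	// allocated from.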
	swap_addr_t j, addr = SWAP_SLOT_NONE;
	for (j = 0; j < sSwapFileCount; j++) {
		if (sSwapFileAlloc == NULL)
			sSwapFileAlloc = sSwapFileList.First();

		addr = radix_bitmap_alloc(sSwapFileAlloc->bmp, count);
		if (addr != SWAP_SLOT_NONE) {
			addr += sSwapFileAlloc->first_slot;
			break;
		}

		// this swap_file is full, find another
		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
	}

	if (j == sSwapFileCount) {
		mutex_unlock(&sSwapFileListLock);
		panic("swap_slot_alloc: swap space exhausted!\n");
		return SWAP_SLOT_NONE;
	}

	// If this swap file has used more than 90% of its space, switch to
	// another one.
	if (sSwapFileAlloc->bmp->free_slots
		< (sSwapFileAlloc->last_slot - sSwapFileAlloc->first_slot) / 10) {
		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
	}

	mutex_unlock(&sSwapFileListLock);

	return addr;
}


static swap_file*
find_swap_file(swap_addr_t slotIndex)
{
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* swapFile = it.Next();) {
		if (slotIndex >= swapFile->first_slot
			&& slotIndex < swapFile->last_slot) {
			return swapFile;
		}
	}

	panic("find_swap_file(): can't find swap file for slot %" B_PRIu32 "\n",
		slotIndex);
	return NULL;
}


static void
swap_slot_dealloc(swap_addr_t slotIndex, uint32 count)
{
	if (slotIndex == SWAP_SLOT_NONE)
		return;

	mutex_lock(&sSwapFileListLock);
	swap_file* swapFile = find_swap_file(slotIndex);
	slotIndex -= swapFile->first_slot;
	radix_bitmap_dealloc(swapFile->bmp, slotIndex, count);
	mutex_unlock(&sSwapFileListLock);
}


static off_t
swap_space_reserve(off_t amount)
{
	mutex_lock(&sAvailSwapSpaceLock);
	if (sAvailSwapSpace >= amount)
		sAvailSwapSpace -= amount;
	else {
		amount = sAvailSwapSpace;
		sAvailSwapSpace = 0;
	}
	mutex_unlock(&sAvailSwapSpaceLock);

	return amount;
}


static void
swap_space_unreserve(off_t amount)
{
	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace += amount;
	mutex_unlock(&sAvailSwapSpaceLock);
}


static void
swap_hash_resizer(void*, int)
{
	WriteLocker locker(sSwapHashLock);

	size_t size;
	void* allocation;

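	// The allocation must not happen while the hash lock is held, so the
	// lock is dropped for malloc(); since the required size may change in
	// the meantime, the resize is retried until it succeeds.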
	do {
		size = sSwapHashTable.ResizeNeeded();
		if (size == 0)
			return;

		locker.Unlock();

		allocation = malloc(size);
		if (allocation == NULL)
			return;

		locker.Lock();

	} while (!sSwapHashTable.Resize(allocation, size));
}


// #pragma mark -


class VMAnonymousCache::WriteCallback : public StackableAsyncIOCallback {
public:
	WriteCallback(VMAnonymousCache* cache, AsyncIOCallback* callback)
		:
		StackableAsyncIOCallback(callback),
		fCache(cache)
	{
	}

	void SetTo(page_num_t pageIndex, swap_addr_t slotIndex, bool newSlot)
	{
		fPageIndex = pageIndex;
		fSlotIndex = slotIndex;
		fNewSlot = newSlot;
	}

	virtual void IOFinished(status_t status, bool partialTransfer,
		generic_size_t bytesTransferred)
	{
		if (fNewSlot) {
			if (status == B_OK) {
				fCache->_SwapBlockBuild(fPageIndex, fSlotIndex, 1);
			} else {
				AutoLocker<VMCache> locker(fCache);
				fCache->fAllocatedSwapSize -= B_PAGE_SIZE;
				locker.Unlock();

				swap_slot_dealloc(fSlotIndex, 1);
			}
		}

		fNextCallback->IOFinished(status, partialTransfer, bytesTransferred);

		delete this;
	}

private:
	VMAnonymousCache*	fCache;
	page_num_t			fPageIndex;
	swap_addr_t			fSlotIndex;
	bool				fNewSlot;
};


// #pragma mark -


VMAnonymousCache::~VMAnonymousCache()
{
	// free the allocated swap space and the swap blocks
	for (off_t offset = virtual_base, toFree = fAllocatedSwapSize;
		offset < virtual_end && toFree > 0; offset += B_PAGE_SIZE) {
		swap_addr_t slotIndex = _SwapBlockGetAddress(offset >> PAGE_SHIFT);
		if (slotIndex == SWAP_SLOT_NONE)
			continue;

		swap_slot_dealloc(slotIndex, 1);
		_SwapBlockFree(offset >> PAGE_SHIFT, 1);
		toFree -= B_PAGE_SIZE;
	}

	swap_space_unreserve(fCommittedSwapSize);
	if (committed_size > fCommittedSwapSize)
		vm_unreserve_memory(committed_size - fCommittedSwapSize);
}


status_t
VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
	int32 numGuardPages, uint32 allocationFlags)
{
	TRACE("%p->VMAnonymousCache::Init(canOvercommit = %s, "
		"numPrecommittedPages = %" B_PRId32 ", numGuardPages = %" B_PRId32
		")\n", this, canOvercommit ? "yes" : "no", numPrecommittedPages,
		numGuardPages);

	status_t error = VMCache::Init(CACHE_TYPE_RAM, allocationFlags);
	if (error != B_OK)
		return error;

	fCanOvercommit = canOvercommit;
	fHasPrecommitted = false;
	fPrecommittedPages = min_c(numPrecommittedPages, 255);
	fGuardedSize = numGuardPages * B_PAGE_SIZE;
	fCommittedSwapSize = 0;
	fAllocatedSwapSize = 0;

	return B_OK;
}


status_t
VMAnonymousCache::Resize(off_t newSize, int priority)
{
	// If the cache size shrinks, drop all swap pages beyond the new size.
	if (fAllocatedSwapSize > 0) {
		off_t oldPageCount = (virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
		swap_block* swapBlock = NULL;

		for (off_t pageIndex = (newSize + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
			pageIndex < oldPageCount && fAllocatedSwapSize > 0; pageIndex++) {

			WriteLocker locker(sSwapHashLock);

			// Get the swap slot index for the page.
			swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
			if (swapBlock == NULL || blockIndex == 0) {
				swap_hash_key key = { this, pageIndex };
				swapBlock = sSwapHashTable.Lookup(key);

				if (swapBlock == NULL) {
					pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES);
					continue;
				}
			}

			swap_addr_t slotIndex = swapBlock->swap_slots[blockIndex];
			vm_page* page;
			if (slotIndex != SWAP_SLOT_NONE
				&& ((page = LookupPage((off_t)pageIndex * B_PAGE_SIZE)) == NULL
					|| !page->busy)) {
					// TODO: We skip (i.e. leak) swap space of busy pages, since
					// there could be I/O going on (paging in/out). Waiting is
					// not an option as 1. unlocking the cache means that new
					// swap pages could be added in a range we've already
					// cleared (since the cache still has the old size) and 2.
					// we'd risk a deadlock in case we come from the file cache
					// and the FS holds the node's write-lock. We should mark
					// the page invalid and let the one responsible clean up.
					// There's just no such mechanism yet.
				swap_slot_dealloc(slotIndex, 1);
				fAllocatedSwapSize -= B_PAGE_SIZE;

				swapBlock->swap_slots[blockIndex] = SWAP_SLOT_NONE;
				if (--swapBlock->used == 0) {
					// All swap pages have been freed -- we can discard the swap
					// block.
					sSwapHashTable.RemoveUnchecked(swapBlock);
					object_cache_free(sSwapBlockCache, swapBlock,
						CACHE_DONT_WAIT_FOR_MEMORY
							| CACHE_DONT_LOCK_KERNEL_SPACE);
				}
			}
		}
	}

	return VMCache::Resize(newSize, priority);
}


status_t
VMAnonymousCache::Commit(off_t size, int priority)
{
	TRACE("%p->VMAnonymousCache::Commit(%" B_PRIdOFF ")\n", this, size);

	// If we can overcommit, we don't commit here, but in Fault(). We do
	// always unreserve memory if we're asked to shrink our commitment,
	// though.
	if (fCanOvercommit && size > committed_size) {
		if (fHasPrecommitted)
			return B_OK;

		// pre-commit some pages to make a later failure less probable
		fHasPrecommitted = true;
		uint32 precommitted = fPrecommittedPages * B_PAGE_SIZE;
		if (size > precommitted)
			size = precommitted;
	}

	return _Commit(size, priority);
}


bool
VMAnonymousCache::HasPage(off_t offset)
{
	if (_SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE)
		return true;

	return false;
}


bool
VMAnonymousCache::DebugHasPage(off_t offset)
{
	off_t pageIndex = offset >> PAGE_SHIFT;
	swap_hash_key key = { this, pageIndex };
	swap_block* swap = sSwapHashTable.Lookup(key);
	if (swap == NULL)
		return false;

	return swap->swap_slots[pageIndex & SWAP_BLOCK_MASK] != SWAP_SLOT_NONE;
}


status_t
VMAnonymousCache::Read(off_t offset, const generic_io_vec* vecs, size_t count,
	uint32 flags, generic_size_t* _numBytes)
{
	off_t pageIndex = offset >> PAGE_SHIFT;

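	// Coalesce runs of vecs whose swap slots are contiguous on disk into a
	// single vfs_read_pages() call.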
	for (uint32 i = 0, j = 0; i < count; i = j) {
		swap_addr_t startSlotIndex = _SwapBlockGetAddress(pageIndex + i);
		for (j = i + 1; j < count; j++) {
			swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + j);
			if (slotIndex != startSlotIndex + j - i)
				break;
		}

		T(ReadPage(this, pageIndex, startSlotIndex));
			// TODO: Assumes that only one page is read.

		swap_file* swapFile = find_swap_file(startSlotIndex);

		off_t pos = (off_t)(startSlotIndex - swapFile->first_slot)
			* B_PAGE_SIZE;

		status_t status = vfs_read_pages(swapFile->vnode, swapFile->cookie, pos,
			vecs + i, j - i, flags, _numBytes);
		if (status != B_OK)
			return status;
	}

	return B_OK;
}


status_t
VMAnonymousCache::Write(off_t offset, const generic_io_vec* vecs, size_t count,
	uint32 flags, generic_size_t* _numBytes)
{
	off_t pageIndex = offset >> PAGE_SHIFT;

	AutoLocker<VMCache> locker(this);

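	// First free any swap slots that were previously assigned to the pages
	// being written; fresh slots are allocated below.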
	page_num_t totalPages = 0;
	for (uint32 i = 0; i < count; i++) {
		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
		swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + totalPages);
		if (slotIndex != SWAP_SLOT_NONE) {
			swap_slot_dealloc(slotIndex, pageCount);
			_SwapBlockFree(pageIndex + totalPages, pageCount);
			fAllocatedSwapSize -= pageCount * B_PAGE_SIZE;
		}

		totalPages += pageCount;
	}

	off_t totalSize = totalPages * B_PAGE_SIZE;
	if (fAllocatedSwapSize + totalSize > fCommittedSwapSize)
		return B_ERROR;

	fAllocatedSwapSize += totalSize;
	locker.Unlock();

	page_num_t pagesLeft = totalPages;
	totalPages = 0;

	for (uint32 i = 0; i < count; i++) {
		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;

		generic_addr_t vectorBase = vecs[i].base;
		generic_size_t vectorLength = vecs[i].length;
		page_num_t n = pageCount;

		for (page_num_t j = 0; j < pageCount; j += n) {
			swap_addr_t slotIndex;
			// Try to allocate n slots; if that fails, retry with n/2.
			while ((slotIndex = swap_slot_alloc(n)) == SWAP_SLOT_NONE && n >= 2)
				n >>= 1;

			if (slotIndex == SWAP_SLOT_NONE)
				panic("VMAnonymousCache::Write(): can't allocate swap space\n");

			T(WritePage(this, pageIndex, slotIndex));
				// TODO: Assumes that only one page is written.

			swap_file* swapFile = find_swap_file(slotIndex);

			off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;

			generic_size_t length = (phys_addr_t)n * B_PAGE_SIZE;
			generic_io_vec vector[1];
			vector->base = vectorBase;
			vector->length = length;

			status_t status = vfs_write_pages(swapFile->vnode, swapFile->cookie,
				pos, vector, 1, flags, &length);
			if (status != B_OK) {
				locker.Lock();
				fAllocatedSwapSize -= (off_t)pagesLeft * B_PAGE_SIZE;
				locker.Unlock();

				swap_slot_dealloc(slotIndex, n);
				return status;
			}

			_SwapBlockBuild(pageIndex + totalPages, slotIndex, n);
			pagesLeft -= n;

			if (n != pageCount) {
				vectorBase = vectorBase + n * B_PAGE_SIZE;
				vectorLength -= n * B_PAGE_SIZE;
			}
		}

		totalPages += pageCount;
	}

	ASSERT(pagesLeft == 0);
	return B_OK;
}


status_t
VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
	size_t count, generic_size_t numBytes, uint32 flags,
	AsyncIOCallback* _callback)
{
	// TODO: Currently this method is only used for single pages. Either make
	// more flexible use of it or change the interface!
	// This implementation relies on the current usage!
	ASSERT(count == 1);
	ASSERT(numBytes <= B_PAGE_SIZE);

	page_num_t pageIndex = offset >> PAGE_SHIFT;
	swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex);
	bool newSlot = slotIndex == SWAP_SLOT_NONE;

	// If the page doesn't have any swap space yet, allocate it.
	if (newSlot) {
		AutoLocker<VMCache> locker(this);
		if (fAllocatedSwapSize + B_PAGE_SIZE > fCommittedSwapSize) {
			_callback->IOFinished(B_ERROR, true, 0);
			return B_ERROR;
		}

		fAllocatedSwapSize += B_PAGE_SIZE;

		slotIndex = swap_slot_alloc(1);
	}

	// create our callback
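	// (For VIP I/O the callback is allocated from the reserved VIP heap, so
	// that paging out memory under memory pressure cannot itself fail on a
	// regular heap allocation.)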
	WriteCallback* callback = (flags & B_VIP_IO_REQUEST) != 0
		? new(malloc_flags(HEAP_PRIORITY_VIP)) WriteCallback(this, _callback)
		: new(std::nothrow) WriteCallback(this, _callback);
	if (callback == NULL) {
		if (newSlot) {
			AutoLocker<VMCache> locker(this);
			fAllocatedSwapSize -= B_PAGE_SIZE;
			locker.Unlock();

			swap_slot_dealloc(slotIndex, 1);
		}
		_callback->IOFinished(B_NO_MEMORY, true, 0);
		return B_NO_MEMORY;
	}
	// TODO: If the page already had swap space assigned, we wouldn't need a
	// callback of our own.

	callback->SetTo(pageIndex, slotIndex, newSlot);

	T(WritePage(this, pageIndex, slotIndex));

	// write the page asynchronously
	swap_file* swapFile = find_swap_file(slotIndex);
	off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;

	return vfs_asynchronous_write_pages(swapFile->vnode, swapFile->cookie, pos,
		vecs, 1, numBytes, flags, callback);
}


bool
VMAnonymousCache::CanWritePage(off_t offset)
{
	// We can write the page if we have not used all of our committed swap
	// space, or if the page already has a swap slot assigned.
	return fAllocatedSwapSize < fCommittedSwapSize
		|| _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
}


int32
VMAnonymousCache::MaxPagesPerAsyncWrite() const
{
	return 1;
}


status_t
VMAnonymousCache::Fault(struct VMAddressSpace* aspace, off_t offset)
{
	if (fGuardedSize > 0) {
		uint32 guardOffset;

#ifdef STACK_GROWS_DOWNWARDS
		guardOffset = 0;
#elif defined(STACK_GROWS_UPWARDS)
		guardOffset = virtual_size - fGuardedSize;
#else
#	error Stack direction has not been defined in arch_config.h
#endif
		// report stack fault, guard page hit!
		if (offset >= guardOffset && offset < guardOffset + fGuardedSize) {
			TRACE(("stack overflow!\n"));
			return B_BAD_ADDRESS;
		}
	}

	if (fCanOvercommit && LookupPage(offset) == NULL && !HasPage(offset)) {
		if (fPrecommittedPages == 0) {
			// never commit more than needed
			if (committed_size / B_PAGE_SIZE > page_count)
				return B_BAD_HANDLER;

			// try to commit additional swap space/memory
			if (swap_space_reserve(B_PAGE_SIZE) == B_PAGE_SIZE) {
				fCommittedSwapSize += B_PAGE_SIZE;
			} else {
				int priority = aspace == VMAddressSpace::Kernel()
					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
				if (vm_try_reserve_memory(B_PAGE_SIZE, priority, 0) != B_OK) {
					dprintf("%p->VMAnonymousCache::Fault(): Failed to reserve "
						"%d bytes of RAM.\n", this, (int)B_PAGE_SIZE);
					return B_NO_MEMORY;
				}
			}

			committed_size += B_PAGE_SIZE;
		} else
			fPrecommittedPages--;
	}

	// This will cause vm_soft_fault() to handle the fault
	return B_BAD_HANDLER;
}


void
VMAnonymousCache::Merge(VMCache* _source)
{
	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
	if (source == NULL) {
		panic("VMAnonymousCache::MergeStore(): merge with incompatible cache "
			"%p requested", _source);
		return;
	}

	// take over the source's committed size
	fCommittedSwapSize += source->fCommittedSwapSize;
	source->fCommittedSwapSize = 0;
	committed_size += source->committed_size;
	source->committed_size = 0;

	off_t actualSize = virtual_end - virtual_base;
	if (committed_size > actualSize)
		_Commit(actualSize, VM_PRIORITY_USER);

	// Move all non-shadowed swap pages from the source to the consumer cache.
	// Also remove all source pages that are shadowed by consumer swap pages.
	_MergeSwapPages(source);

	// Move all non-shadowed pages from the source to the consumer cache.
	if (source->page_count < page_count)
		_MergePagesSmallerSource(source);
	else
		_MergePagesSmallerConsumer(source);
}


void
VMAnonymousCache::DeleteObject()
{
	object_cache_delete(gAnonymousCacheObjectCache, this);
}


void
VMAnonymousCache::_SwapBlockBuild(off_t startPageIndex,
	swap_addr_t startSlotIndex, uint32 count)
{
	WriteLocker locker(sSwapHashLock);

	uint32 left = count;
	for (uint32 i = 0, j = 0; i < count; i += j) {
		off_t pageIndex = startPageIndex + i;
		swap_addr_t slotIndex = startSlotIndex + i;

		swap_hash_key key = { this, pageIndex };

		swap_block* swap = sSwapHashTable.Lookup(key);
		while (swap == NULL) {
			swap = (swap_block*)object_cache_alloc(sSwapBlockCache,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
			if (swap == NULL) {
				// Wait a short time until memory is available again.
				locker.Unlock();
				snooze(10000);
				locker.Lock();
				swap = sSwapHashTable.Lookup(key);
				continue;
			}

			swap->key.cache = this;
			swap->key.page_index = pageIndex & ~(off_t)SWAP_BLOCK_MASK;
			swap->used = 0;
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
				swap->swap_slots[i] = SWAP_SLOT_NONE;

			sSwapHashTable.InsertUnchecked(swap);
		}

		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
			swap->swap_slots[blockIndex++] = slotIndex + j;
			left--;
		}

		swap->used += j;
	}
}


void
VMAnonymousCache::_SwapBlockFree(off_t startPageIndex, uint32 count)
{
	WriteLocker locker(sSwapHashLock);

	uint32 left = count;
	for (uint32 i = 0, j = 0; i < count; i += j) {
		off_t pageIndex = startPageIndex + i;
		swap_hash_key key = { this, pageIndex };
		swap_block* swap = sSwapHashTable.Lookup(key);

		ASSERT(swap != NULL);

		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
			swap->swap_slots[blockIndex++] = SWAP_SLOT_NONE;
			left--;
		}

		swap->used -= j;
		if (swap->used == 0) {
			sSwapHashTable.RemoveUnchecked(swap);
			object_cache_free(sSwapBlockCache, swap,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		}
	}
}


swap_addr_t
VMAnonymousCache::_SwapBlockGetAddress(off_t pageIndex)
{
	ReadLocker locker(sSwapHashLock);

	swap_hash_key key = { this, pageIndex };
	swap_block* swap = sSwapHashTable.Lookup(key);
	swap_addr_t slotIndex = SWAP_SLOT_NONE;

	if (swap != NULL) {
		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
		slotIndex = swap->swap_slots[blockIndex];
	}

	return slotIndex;
}


status_t
VMAnonymousCache::_Commit(off_t size, int priority)
{
	TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), already committed: "
		"%" B_PRIdOFF " (%" B_PRIdOFF " swap)\n", this, size, committed_size,
		fCommittedSwapSize);

	// Basic strategy: reserve swap space first; only when running out of
	// swap space, reserve real memory.
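	// Example: with nothing committed yet and only three pages of swap
	// left, committing four pages reserves those three pages of swap plus
	// one page of RAM.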

	off_t committedMemory = committed_size - fCommittedSwapSize;

	// Regardless of whether we're asked to grow or shrink the commitment,
	// we always try to reserve as much as possible of the final commitment
	// in the swap space.
	if (size > fCommittedSwapSize) {
		fCommittedSwapSize += swap_space_reserve(size - fCommittedSwapSize);
		committed_size = fCommittedSwapSize + committedMemory;
		if (size > fCommittedSwapSize) {
			TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), reserved "
				"only %" B_PRIdOFF " swap\n", this, size, fCommittedSwapSize);
		}
	}

	if (committed_size == size)
		return B_OK;

	if (committed_size > size) {
		// The commitment shrinks -- unreserve real memory first.
		off_t toUnreserve = committed_size - size;
		if (committedMemory > 0) {
			off_t unreserved = min_c(toUnreserve, committedMemory);
			vm_unreserve_memory(unreserved);
			committedMemory -= unreserved;
			committed_size -= unreserved;
			toUnreserve -= unreserved;
		}

		// Unreserve swap space.
		if (toUnreserve > 0) {
			swap_space_unreserve(toUnreserve);
			fCommittedSwapSize -= toUnreserve;
			committed_size -= toUnreserve;
		}

		return B_OK;
	}

	// The commitment grows -- we have already tried to reserve swap space at
	// the start of the method, so now we try to reserve real memory.

	off_t toReserve = size - committed_size;
	if (vm_try_reserve_memory(toReserve, priority, 1000000) != B_OK) {
		dprintf("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "): Failed to "
			"reserve %" B_PRIdOFF " bytes of RAM\n", this, size, toReserve);
		return B_NO_MEMORY;
	}

	committed_size = size;
	return B_OK;
}


void
VMAnonymousCache::_MergePagesSmallerSource(VMAnonymousCache* source)
{
	// The source cache has fewer pages than the consumer (this cache), so we
	// iterate through the source's pages and move the ones that are not
	// shadowed up to the consumer.

	for (VMCachePagesTree::Iterator it = source->pages.GetIterator();
			vm_page* page = it.Next();) {
		// Note: Removing the current node while iterating through an
		// IteratableSplayTree is safe.
		vm_page* consumerPage = LookupPage(
			(off_t)page->cache_offset << PAGE_SHIFT);
		if (consumerPage == NULL) {
			// the page is not yet in the consumer cache - move it upwards
			ASSERT_PRINT(!page->busy, "page: %p", page);
			MovePage(page);
		}
	}
}


void
VMAnonymousCache::_MergePagesSmallerConsumer(VMAnonymousCache* source)
{
	// The consumer (this cache) has fewer pages than the source, so we move
	// the consumer's pages to the source (freeing shadowed ones) and finally
	// move all of the source's pages back to the consumer.

	for (VMCachePagesTree::Iterator it = pages.GetIterator();
		vm_page* page = it.Next();) {
		// If a source page is in the way, remove and free it.
		vm_page* sourcePage = source->LookupPage(
			(off_t)page->cache_offset << PAGE_SHIFT);
		if (sourcePage != NULL) {
			DEBUG_PAGE_ACCESS_START(sourcePage);
			ASSERT_PRINT(!sourcePage->busy, "page: %p", sourcePage);
			ASSERT_PRINT(sourcePage->WiredCount() == 0
					&& sourcePage->mappings.IsEmpty(),
				"sourcePage: %p, page: %p", sourcePage, page);
			source->RemovePage(sourcePage);
			vm_page_free(source, sourcePage);
		}

		// Note: Removing the current node while iterating through an
		// IteratableSplayTree is safe.
		source->MovePage(page);
	}

	MoveAllPages(source);
}


void
VMAnonymousCache::_MergeSwapPages(VMAnonymousCache* source)
{
	// If neither source nor consumer have swap pages, we don't have to do
	// anything.
	if (source->fAllocatedSwapSize == 0 && fAllocatedSwapSize == 0)
		return;

	for (off_t offset = source->virtual_base
		& ~(off_t)(B_PAGE_SIZE * SWAP_BLOCK_PAGES - 1);
		offset < source->virtual_end;
		offset += B_PAGE_SIZE * SWAP_BLOCK_PAGES) {

		WriteLocker locker(sSwapHashLock);

		off_t swapBlockPageIndex = offset >> PAGE_SHIFT;
		swap_hash_key key = { source, swapBlockPageIndex };
		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(key);

		// remove the source swap block -- we will either take over the swap
		// space (and the block) or free it
		if (sourceSwapBlock != NULL)
			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);

		key.cache = this;
		swap_block* swapBlock = sSwapHashTable.Lookup(key);

		locker.Unlock();

		// remove all source pages that are shadowed by consumer swap pages
		if (swapBlock != NULL) {
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
				if (swapBlock->swap_slots[i] != SWAP_SLOT_NONE) {
					vm_page* page = source->LookupPage(
						(off_t)(swapBlockPageIndex + i) << PAGE_SHIFT);
					if (page != NULL) {
						DEBUG_PAGE_ACCESS_START(page);
						ASSERT_PRINT(!page->busy, "page: %p", page);
						source->RemovePage(page);
						vm_page_free(source, page);
					}
				}
			}
		}

		if (sourceSwapBlock == NULL)
			continue;

		for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
			off_t pageIndex = swapBlockPageIndex + i;
			swap_addr_t sourceSlotIndex = sourceSwapBlock->swap_slots[i];

			if (sourceSlotIndex == SWAP_SLOT_NONE)
				continue;

			if ((swapBlock != NULL
					&& swapBlock->swap_slots[i] != SWAP_SLOT_NONE)
				|| LookupPage((off_t)pageIndex << PAGE_SHIFT) != NULL) {
				// The consumer already has a page or a swapped out page
				// at this index. So we can free the source swap space.
				swap_slot_dealloc(sourceSlotIndex, 1);
				sourceSwapBlock->swap_slots[i] = SWAP_SLOT_NONE;
				sourceSwapBlock->used--;
			}

			// We've either freed the source swap page or are going to move it
			// to the consumer. At any rate, the source cache doesn't own it
			// anymore.
			source->fAllocatedSwapSize -= B_PAGE_SIZE;
		}

		// All source swap pages that have not been freed yet are taken over by
		// the consumer.
		fAllocatedSwapSize += B_PAGE_SIZE * (off_t)sourceSwapBlock->used;

		if (sourceSwapBlock->used == 0) {
			// All swap pages have been freed -- we can discard the source swap
			// block.
			object_cache_free(sSwapBlockCache, sourceSwapBlock,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		} else if (swapBlock == NULL) {
			// We need to take over some of the source's swap pages and there's
			// no swap block in the consumer cache. Just take over the source
			// swap block.
			sourceSwapBlock->key.cache = this;
			locker.Lock();
			sSwapHashTable.InsertUnchecked(sourceSwapBlock);
			locker.Unlock();
		} else {
			// We need to take over some of the source's swap pages and there's
			// already a swap block in the consumer cache. Copy the respective
			// swap addresses and discard the source swap block.
			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
				if (sourceSwapBlock->swap_slots[i] != SWAP_SLOT_NONE)
					swapBlock->swap_slots[i] = sourceSwapBlock->swap_slots[i];
			}

			object_cache_free(sSwapBlockCache, sourceSwapBlock,
				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
		}
	}
}


// #pragma mark -


// TODO: This can be removed if we get BFS uuid's
struct VolumeInfo {
	char name[B_FILE_NAME_LENGTH];
	char device[B_FILE_NAME_LENGTH];
	char filesystem[B_OS_NAME_LENGTH];
	off_t capacity;
};


class PartitionScorer : public KPartitionVisitor {
public:
	PartitionScorer(VolumeInfo& volumeInfo)
		:
		fBestPartition(NULL),
		fBestScore(-1),
		fVolumeInfo(volumeInfo)
	{
	}

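	// Scores each visited partition against the configured volume info: a
	// matching name counts 4, device path 3, capacity 2 and file system 1.
	// A partition is only considered if it scores at least 4, i.e. at least
	// the name, or the device path plus one other property, matches.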
	virtual bool VisitPre(KPartition* partition)
	{
		if (!partition->ContainsFileSystem())
			return false;

		KPath path;
		partition->GetPath(&path);

		int score = 0;
		if (strcmp(fVolumeInfo.name, partition->ContentName()) == 0)
			score += 4;
		if (strcmp(fVolumeInfo.device, path.Path()) == 0)
			score += 3;
		if (fVolumeInfo.capacity == partition->Size())
			score += 2;
		if (strcmp(fVolumeInfo.filesystem,
			partition->DiskSystem()->ShortName()) == 0) {
			score += 1;
		}
		if (score >= 4 && score > fBestScore) {
			fBestPartition = partition;
			fBestScore = score;
		}

		return false;
	}

	KPartition* fBestPartition;

private:
	int32		fBestScore;
	VolumeInfo&	fVolumeInfo;
};


status_t
get_mount_point(KPartition* partition, KPath* mountPoint)
{
	if (!mountPoint || !partition->ContainsFileSystem())
		return B_BAD_VALUE;

	int nameLength = 0;
	const char* volumeName = partition->ContentName();
	if (volumeName != NULL)
		nameLength = strlen(volumeName);
	if (nameLength == 0) {
		volumeName = partition->Name();
		if (volumeName != NULL)
			nameLength = strlen(volumeName);
		if (nameLength == 0) {
			volumeName = "unnamed volume";
			nameLength = strlen(volumeName);
		}
	}

	BStackOrHeapArray<char, 128> basePath(nameLength + 1);
	if (!basePath.IsValid())
		return B_NO_MEMORY;
	int32 len = snprintf(basePath, nameLength + 1, "/%s", volumeName);
	for (int32 i = 1; i < len; i++) {
		if (basePath[i] == '/')
			basePath[i] = '-';
	}
	char* path = mountPoint->LockBuffer();
	int32 pathLen = mountPoint->BufferSize();
	strncpy(path, basePath, pathLen);

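	// If the mount point already exists, append an increasing counter until
	// an unused path is found (e.g. "/volume", "/volume1", ...).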
	struct stat dummy;
	for (int i = 1; ; i++) {
		if (stat(path, &dummy) != 0)
			break;
		snprintf(path, pathLen, "%s%d", (char*)basePath, i);
	}

	mountPoint->UnlockBuffer();
	return B_OK;
}


status_t
swap_file_add(const char* path)
{
	// open the file
	int fd = open(path, O_RDWR | O_NOCACHE, S_IRUSR | S_IWUSR);
	if (fd < 0)
		return errno;

	// fstat() it and check whether we can use it
	struct stat st;
	if (fstat(fd, &st) < 0) {
		close(fd);
		return errno;
	}

	if (!(S_ISREG(st.st_mode) || S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
		close(fd);
		return B_BAD_VALUE;
	}

	if (st.st_size < B_PAGE_SIZE) {
		close(fd);
		return B_BAD_VALUE;
	}

	// get file descriptor, vnode, and cookie
	file_descriptor* descriptor = get_fd(get_current_io_context(true), fd);
	put_fd(descriptor);

	vnode* node = fd_vnode(descriptor);
	if (node == NULL) {
		close(fd);
		return B_BAD_VALUE;
	}

	// do the allocations and prepare the swap_file structure
	swap_file* swap = (swap_file*)malloc(sizeof(swap_file));
	if (swap == NULL) {
		close(fd);
		return B_NO_MEMORY;
	}

	swap->fd = fd;
	swap->vnode = node;
	swap->cookie = descriptor->cookie;

	uint32 pageCount = st.st_size >> PAGE_SHIFT;
	swap->bmp = radix_bitmap_create(pageCount);
	if (swap->bmp == NULL) {
		free(swap);
		close(fd);
		return B_NO_MEMORY;
	}

	// set slot index and add this file to swap file list
	mutex_lock(&sSwapFileListLock);
	// TODO: Also check whether the swap file is already registered!
	if (sSwapFileList.IsEmpty()) {
		swap->first_slot = 0;
		swap->last_slot = pageCount;
	} else {
		// leave one page gap between two swap files
		swap->first_slot = sSwapFileList.Last()->last_slot + 1;
		swap->last_slot = swap->first_slot + pageCount;
	}
	sSwapFileList.Add(swap);
	sSwapFileCount++;
	mutex_unlock(&sSwapFileListLock);

	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace += (off_t)pageCount * B_PAGE_SIZE;
	mutex_unlock(&sAvailSwapSpaceLock);

	return B_OK;
}


status_t
swap_file_delete(const char* path)
{
	vnode* node = NULL;
	status_t status = vfs_get_vnode_from_path(path, true, &node);
	if (status != B_OK)
		return status;

	MutexLocker locker(sSwapFileListLock);

	swap_file* swapFile = NULL;
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
			(swapFile = it.Next()) != NULL;) {
		if (swapFile->vnode == node)
			break;
	}

	vfs_put_vnode(node);

	if (swapFile == NULL)
		return B_ERROR;

	// If the swap file is currently in use, we can't delete it.
	// TODO: Mark the swap file as being deleted, and remove it once all of
	// its swap space has been released.
	if (swapFile->bmp->free_slots < swapFile->last_slot - swapFile->first_slot)
		return B_ERROR;

	sSwapFileList.Remove(swapFile);
	sSwapFileCount--;
	locker.Unlock();

	mutex_lock(&sAvailSwapSpaceLock);
	sAvailSwapSpace -= (off_t)(swapFile->last_slot - swapFile->first_slot)
		* B_PAGE_SIZE;
	mutex_unlock(&sAvailSwapSpaceLock);

	close(swapFile->fd);
	radix_bitmap_destroy(swapFile->bmp);
	free(swapFile);

	return B_OK;
}


void
swap_init(void)
{
	// create swap block cache
	sSwapBlockCache = create_object_cache("swapblock", sizeof(swap_block),
		sizeof(void*), NULL, NULL, NULL);
	if (sSwapBlockCache == NULL)
		panic("swap_init(): can't create object cache for swap blocks\n");

	status_t error = object_cache_set_minimum_reserve(sSwapBlockCache,
		MIN_SWAP_BLOCK_RESERVE);
	if (error != B_OK) {
		panic("swap_init(): object_cache_set_minimum_reserve() failed: %s",
			strerror(error));
	}

	// init swap hash table
	sSwapHashTable.Init(INITIAL_SWAP_HASH_SIZE);
	rw_lock_init(&sSwapHashLock, "swaphash");

	error = register_resource_resizer(swap_hash_resizer, NULL,
		SWAP_HASH_RESIZE_INTERVAL);
	if (error != B_OK) {
		panic("swap_init(): Failed to register swap hash resizer: %s",
			strerror(error));
	}

	// init swap file list
	mutex_init(&sSwapFileListLock, "swaplist");
	sSwapFileAlloc = NULL;
	sSwapFileCount = 0;

	// init available swap space
	mutex_init(&sAvailSwapSpaceLock, "avail swap space");
	sAvailSwapSpace = 0;

	add_debugger_command_etc("swap", &dump_swap_info,
		"Print info about the swap usage",
		"\n"
		"Print info about the swap usage.\n", 0);
}


void
swap_init_post_modules()
{
	// Never try to create a swap file on a read-only device - when booting
	// from CD, the write overlay is used.
	if (gReadOnlyBootDevice)
		return;

	bool swapEnabled = true;
	bool swapAutomatic = true;
	off_t swapSize = 0;

	dev_t swapDeviceID = -1;
	VolumeInfo selectedVolume = {};

	void* settings = load_driver_settings("virtual_memory");

	if (settings != NULL) {
		// We pass a lot of information on the swap device; this is mostly to
		// ensure that we are dealing with the same device that was configured.

		// TODO: Some kind of BFS uuid would be great here :)
		const char* enabled = get_driver_parameter(settings, "vm", NULL, NULL);

		if (enabled != NULL) {
			swapEnabled = get_driver_boolean_parameter(settings, "vm",
				true, false);
			swapAutomatic = get_driver_boolean_parameter(settings, "swap_auto",
				true, false);

			if (swapEnabled && !swapAutomatic) {
				const char* size = get_driver_parameter(settings, "swap_size",
					NULL, NULL);
				const char* volume = get_driver_parameter(settings,
					"swap_volume_name", NULL, NULL);
				const char* device = get_driver_parameter(settings,
					"swap_volume_device", NULL, NULL);
				const char* filesystem = get_driver_parameter(settings,
					"swap_volume_filesystem", NULL, NULL);
				const char* capacity = get_driver_parameter(settings,
					"swap_volume_capacity", NULL, NULL);

				if (size != NULL && device != NULL && volume != NULL
					&& filesystem != NULL && capacity != NULL) {
					// User specified a size / volume that seems valid
					swapAutomatic = false;
					swapSize = atoll(size);
					strlcpy(selectedVolume.name, volume,
						sizeof(selectedVolume.name));
					strlcpy(selectedVolume.device, device,
						sizeof(selectedVolume.device));
					strlcpy(selectedVolume.filesystem, filesystem,
						sizeof(selectedVolume.filesystem));
					selectedVolume.capacity = atoll(capacity);
				} else {
					// Something isn't right with the swap config; fall back
					// to automatic swap.
					swapAutomatic = true;
					dprintf("%s: virtual_memory configuration is invalid, "
						"using automatic swap\n", __func__);
				}
			}
		}
		unload_driver_settings(settings);
	}

	if (swapAutomatic) {
		swapSize = (off_t)vm_page_num_pages() * B_PAGE_SIZE;
		if (swapSize <= (1024 * 1024 * 1024)) {
			// Memory under 1GB? Double the swap.
			swapSize *= 2;
		}
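		// (E.g. a machine with 512 MB of RAM gets 1 GB of automatic swap,
		// while one with 4 GB of RAM gets 4 GB; the doubling only applies
		// up to 1 GB.)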
		// Automatic swap defaults to the boot device
		swapDeviceID = gBootDevice;
	}

	if (!swapEnabled || swapSize < B_PAGE_SIZE) {
		dprintf("%s: virtual_memory is disabled\n", __func__);
		return;
	}

	if (!swapAutomatic && swapDeviceID < 0) {
		// If user-specified swap, and no swap device has been chosen yet...
		KDiskDeviceManager::CreateDefault();
		KDiskDeviceManager* manager = KDiskDeviceManager::Default();
		PartitionScorer visitor(selectedVolume);

		KDiskDevice* device;
		int32 cookie = 0;
		while ((device = manager->NextDevice(&cookie)) != NULL) {
			if (device->IsReadOnlyMedia() || device->IsWriteOnce()
				|| device->IsRemovable()) {
				continue;
			}
			device->VisitEachDescendant(&visitor);
		}

		if (!visitor.fBestPartition) {
			dprintf("%s: Can't find configured swap partition '%s'\n",
				__func__, selectedVolume.name);
		} else {
			if (visitor.fBestPartition->IsMounted())
				swapDeviceID = visitor.fBestPartition->VolumeID();
			else {
				KPath devPath, mountPoint;
				visitor.fBestPartition->GetPath(&devPath);
				get_mount_point(visitor.fBestPartition, &mountPoint);
				const char* mountPath = mountPoint.Path();
				mkdir(mountPath, S_IRWXU | S_IRWXG | S_IRWXO);
				swapDeviceID = _kern_mount(mountPath, devPath.Path(),
					NULL, 0, NULL, 0);
				if (swapDeviceID < 0) {
					dprintf("%s: Can't mount configured swap partition '%s'\n",
						__func__, selectedVolume.name);
				}
			}
		}
	}

	if (swapDeviceID < 0)
		swapDeviceID = gBootDevice;

	// We now have a swapDeviceID which is used for the swap file

	KPath path;
	struct fs_info info;
	_kern_read_fs_info(swapDeviceID, &info);
	if (swapDeviceID == gBootDevice)
		path = kDefaultSwapPath;
	else {
		vfs_entry_ref_to_path(info.dev, info.root, ".", true, path.LockBuffer(),
			path.BufferSize());
		path.UnlockBuffer();
		path.Append("swap");
	}

	const char* swapPath = path.Path();

	// Swap size limits prevent oversized swap files
	if (swapAutomatic) {
		off_t existingSwapSize = 0;
		struct stat existingSwapStat;
		if (stat(swapPath, &existingSwapStat) == 0)
			existingSwapSize = existingSwapStat.st_size;

		off_t freeSpace = info.free_blocks * info.block_size + existingSwapSize;

		// Adjust automatic swap to a maximum of 25% of the free space
		if (swapSize > (freeSpace / 4))
			swapSize = (freeSpace / 4);
	}

	// Create swap file
	int fd = open(swapPath, O_RDWR | O_CREAT | O_NOCACHE, S_IRUSR | S_IWUSR);
	if (fd < 0) {
		dprintf("%s: Can't open/create %s: %s\n", __func__,
			swapPath, strerror(errno));
		return;
	}

	struct stat stat;
	stat.st_size = swapSize;
	status_t error = _kern_write_stat(fd, NULL, false, &stat,
		sizeof(struct stat), B_STAT_SIZE | B_STAT_SIZE_INSECURE);
	if (error != B_OK) {
		dprintf("%s: Failed to resize %s to %" B_PRIdOFF " bytes: %s\n",
			__func__, swapPath, swapSize, strerror(error));
	}

	close(fd);

	error = swap_file_add(swapPath);
	if (error != B_OK) {
		dprintf("%s: Failed to add swap file %s: %s\n", __func__, swapPath,
			strerror(error));
	}
}


//! Used by the page daemon to free swap space.
bool
swap_free_page_swap_space(vm_page* page)
{
	VMAnonymousCache* cache = dynamic_cast<VMAnonymousCache*>(page->Cache());
	if (cache == NULL)
		return false;

	swap_addr_t slotIndex = cache->_SwapBlockGetAddress(page->cache_offset);
	if (slotIndex == SWAP_SLOT_NONE)
		return false;

	swap_slot_dealloc(slotIndex, 1);
	cache->fAllocatedSwapSize -= B_PAGE_SIZE;
	cache->_SwapBlockFree(page->cache_offset, 1);

	return true;
}


uint32
swap_available_pages()
{
	mutex_lock(&sAvailSwapSpaceLock);
	uint32 avail = sAvailSwapSpace >> PAGE_SHIFT;
	mutex_unlock(&sAvailSwapSpaceLock);

	return avail;
}


uint32
swap_total_swap_pages()
{
	mutex_lock(&sSwapFileListLock);

	uint32 totalSwapSlots = 0;
	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
		swap_file* swapFile = it.Next();) {
		totalSwapSlots += swapFile->last_slot - swapFile->first_slot;
	}

	mutex_unlock(&sSwapFileListLock);

	return totalSwapSlots;
}


#endif	// ENABLE_SWAP_SUPPORT


void
swap_get_info(system_info* info)
{
#if ENABLE_SWAP_SUPPORT
	info->max_swap_pages = swap_total_swap_pages();
	info->free_swap_pages = swap_available_pages();
#else
	info->max_swap_pages = 0;
	info->free_swap_pages = 0;
#endif
}