xref: /haiku/src/system/kernel/vm/VMAnonymousCache.cpp (revision 529cd177b573aaba391c8adc9c9f5ad76a14bf81)
1 /*
2  * Copyright 2008, Zhao Shuai, upczhsh@163.com.
3  * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
4  * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
5  * Distributed under the terms of the MIT License.
6  *
7  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
8  * Distributed under the terms of the NewOS License.
9  *
10  * Copyright 2011-2012 Haiku, Inc. All rights reserved.
11  * Distributed under the terms of the MIT License.
12  *
13  * Authors:
14  *		Hamish Morrison, hamish@lavabit.com
15  *		Alexander von Gluck IV, kallisti5@unixzen.com
16  */
17 
18 
19 #include "VMAnonymousCache.h"
20 
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <unistd.h>
26 
27 #include <FindDirectory.h>
28 #include <KernelExport.h>
29 #include <NodeMonitor.h>
30 
31 #include <arch_config.h>
32 #include <boot_device.h>
33 #include <disk_device_manager/KDiskDevice.h>
34 #include <disk_device_manager/KDiskDeviceManager.h>
35 #include <disk_device_manager/KDiskSystem.h>
36 #include <disk_device_manager/KPartitionVisitor.h>
37 #include <driver_settings.h>
38 #include <fs/fd.h>
39 #include <fs/KPath.h>
40 #include <fs_info.h>
41 #include <fs_interface.h>
42 #include <heap.h>
43 #include <kernel_daemon.h>
44 #include <slab/Slab.h>
45 #include <syscalls.h>
46 #include <system_info.h>
47 #include <tracing.h>
48 #include <util/AutoLock.h>
49 #include <util/DoublyLinkedList.h>
50 #include <util/OpenHashTable.h>
51 #include <util/RadixBitmap.h>
52 #include <vfs.h>
53 #include <vm/vm.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_priv.h>
56 #include <vm/VMAddressSpace.h>
57 
58 #include "IORequest.h"
59 
60 
61 #if	ENABLE_SWAP_SUPPORT
62 
63 //#define TRACE_VM_ANONYMOUS_CACHE
64 #ifdef TRACE_VM_ANONYMOUS_CACHE
65 #	define TRACE(x...) dprintf(x)
66 #else
67 #	define TRACE(x...) do { } while (false)
68 #endif
69 
70 
71 // minimum number of free swap blocks the object cache shall keep in reserve
72 #define MIN_SWAP_BLOCK_RESERVE	4096
73 
74 // interval at which the hash resizer is triggered (in units of 0.1s)
75 #define SWAP_HASH_RESIZE_INTERVAL	5
76 
77 #define INITIAL_SWAP_HASH_SIZE		1024
78 
79 #define SWAP_SLOT_NONE	RADIX_SLOT_NONE
80 
81 #define SWAP_BLOCK_PAGES 32
82 #define SWAP_BLOCK_SHIFT 5		/* 1 << SWAP_BLOCK_SHIFT == SWAP_BLOCK_PAGES */
83 #define SWAP_BLOCK_MASK  (SWAP_BLOCK_PAGES - 1)
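// So for a cache page index, index >> SWAP_BLOCK_SHIFT yields the swap block
// index and index & SWAP_BLOCK_MASK the slot within that block; e.g. page
// index 71 falls into block 2, slot 7 (71 / 32 == 2, 71 % 32 == 7).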
84 
85 
86 static const char* const kDefaultSwapPath = "/var/swap";
87 
88 struct swap_file : DoublyLinkedListLinkImpl<swap_file> {
89 	int				fd;
90 	struct vnode*	vnode;
91 	void*			cookie;
92 	swap_addr_t		first_slot;
93 	swap_addr_t		last_slot;
94 	radix_bitmap*	bmp;
95 };
96 
97 struct swap_hash_key {
98 	VMAnonymousCache	*cache;
99 	off_t				page_index;  // page index in the cache
100 };
101 
102 // Each swap block contains the swap address information for
103 // SWAP_BLOCK_PAGES consecutive pages from the same cache.
104 struct swap_block {
105 	swap_block*		hash_link;
106 	swap_hash_key	key;
107 	uint32			used;
108 	swap_addr_t		swap_slots[SWAP_BLOCK_PAGES];
109 };
110 
111 struct SwapHashTableDefinition {
112 	typedef swap_hash_key KeyType;
113 	typedef swap_block ValueType;
114 
115 	SwapHashTableDefinition() {}
116 
117 	size_t HashKey(const swap_hash_key& key) const
118 	{
119 		off_t blockIndex = key.page_index >> SWAP_BLOCK_SHIFT;
120 		VMAnonymousCache* cache = key.cache;
121 		return blockIndex ^ (size_t)(int*)cache;
122 	}
123 
124 	size_t Hash(const swap_block* value) const
125 	{
126 		return HashKey(value->key);
127 	}
128 
129 	bool Compare(const swap_hash_key& key, const swap_block* value) const
130 	{
131 		return (key.page_index & ~(off_t)SWAP_BLOCK_MASK)
132 				== (value->key.page_index & ~(off_t)SWAP_BLOCK_MASK)
133 			&& key.cache == value->key.cache;
134 	}
135 
136 	swap_block*& GetLink(swap_block* value) const
137 	{
138 		return value->hash_link;
139 	}
140 };
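// Note: Compare() masks off the low SWAP_BLOCK_SHIFT bits of the page index,
// so a lookup with any page index belonging to a block finds that block;
// HashKey() mixes the block index with the cache pointer to keep the blocks
// of different caches apart in the table.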
141 
142 typedef BOpenHashTable<SwapHashTableDefinition> SwapHashTable;
143 typedef DoublyLinkedList<swap_file> SwapFileList;
144 
145 static SwapHashTable sSwapHashTable;
146 static rw_lock sSwapHashLock;
147 
148 static SwapFileList sSwapFileList;
149 static mutex sSwapFileListLock;
150 static swap_file* sSwapFileAlloc = NULL; // allocate from here
151 static uint32 sSwapFileCount = 0;
152 
153 static off_t sAvailSwapSpace = 0;
154 static mutex sAvailSwapSpaceLock;
155 
156 static object_cache* sSwapBlockCache;
157 
158 
159 #if SWAP_TRACING
160 namespace SwapTracing {
161 
162 class SwapTraceEntry : public AbstractTraceEntry {
163 public:
164 	SwapTraceEntry(VMAnonymousCache* cache)
165 		:
166 		fCache(cache)
167 	{
168 	}
169 
170 protected:
171 	VMAnonymousCache*	fCache;
172 };
173 
174 
175 class ReadPage : public SwapTraceEntry {
176 public:
177 	ReadPage(VMAnonymousCache* cache, page_num_t pageIndex,
178 		swap_addr_t swapSlotIndex)
179 		:
180 		SwapTraceEntry(cache),
181 		fPageIndex(pageIndex),
182 		fSwapSlotIndex(swapSlotIndex)
183 	{
184 		Initialized();
185 	}
186 
187 	virtual void AddDump(TraceOutput& out)
188 	{
189 		out.Print("swap read:  cache %p, page index: %lu <- swap slot: %lu",
190 			fCache, fPageIndex, fSwapSlotIndex);
191 	}
192 
193 private:
194 	page_num_t		fPageIndex;
195 	swap_addr_t		fSwapSlotIndex;
196 };
197 
198 
199 class WritePage : public SwapTraceEntry {
200 public:
201 	WritePage(VMAnonymousCache* cache, page_num_t pageIndex,
202 		swap_addr_t swapSlotIndex)
203 		:
204 		SwapTraceEntry(cache),
205 		fPageIndex(pageIndex),
206 		fSwapSlotIndex(swapSlotIndex)
207 	{
208 		Initialized();
209 	}
210 
211 	virtual void AddDump(TraceOutput& out)
212 	{
213 		out.Print("swap write: cache %p, page index: %lu -> swap slot: %lu",
214 			fCache, fPageIndex, fSwapSlotIndex);
215 	}
216 
217 private:
218 	page_num_t		fPageIndex;
219 	swap_addr_t		fSwapSlotIndex;
220 };
221 
222 }	// namespace SwapTracing
223 
224 #	define T(x) new(std::nothrow) SwapTracing::x;
225 #else
226 #	define T(x) ;
227 #endif
228 
229 
230 static int
231 dump_swap_info(int argc, char** argv)
232 {
233 	swap_addr_t totalSwapPages = 0;
234 	swap_addr_t freeSwapPages = 0;
235 
236 	kprintf("swap files:\n");
237 
238 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
239 		swap_file* file = it.Next();) {
240 		swap_addr_t total = file->last_slot - file->first_slot;
241 		kprintf("  vnode: %p, pages: total: %" B_PRIu32 ", free: %" B_PRIu32
242 			"\n", file->vnode, total, file->bmp->free_slots);
243 
244 		totalSwapPages += total;
245 		freeSwapPages += file->bmp->free_slots;
246 	}
247 
248 	kprintf("\n");
249 	kprintf("swap space in pages:\n");
250 	kprintf("total:     %9" B_PRIu32 "\n", totalSwapPages);
251 	kprintf("available: %9" B_PRIdOFF "\n", sAvailSwapSpace / B_PAGE_SIZE);
252 	kprintf("reserved:  %9" B_PRIdOFF "\n",
253 		totalSwapPages - sAvailSwapSpace / B_PAGE_SIZE);
254 	kprintf("used:      %9" B_PRIu32 "\n", totalSwapPages - freeSwapPages);
255 	kprintf("free:      %9" B_PRIu32 "\n", freeSwapPages);
256 
257 	return 0;
258 }
259 
260 
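//!	Allocates \a count contiguous swap slots (at most BITMAP_RADIX of them)
//	and returns the global index of the first one, or SWAP_SLOT_NONE. The
//	allocation cursor (sSwapFileAlloc) advances to the next swap file once
//	the current one is full or nearly full.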
261 static swap_addr_t
262 swap_slot_alloc(uint32 count)
263 {
264 	mutex_lock(&sSwapFileListLock);
265 
266 	if (sSwapFileList.IsEmpty()) {
267 		mutex_unlock(&sSwapFileListLock);
268 		panic("swap_slot_alloc(): no swap file in the system\n");
269 		return SWAP_SLOT_NONE;
270 	}
271 
272 	// Since the radix bitmap cannot handle more than 32 pages, we return
273 	// SWAP_SLOT_NONE; this forces Write() to adjust the allocation amount.
274 	if (count > BITMAP_RADIX) {
275 		mutex_unlock(&sSwapFileListLock);
276 		return SWAP_SLOT_NONE;
277 	}
278 
279 	swap_addr_t j, addr = SWAP_SLOT_NONE;
280 	for (j = 0; j < sSwapFileCount; j++) {
281 		if (sSwapFileAlloc == NULL)
282 			sSwapFileAlloc = sSwapFileList.First();
283 
284 		addr = radix_bitmap_alloc(sSwapFileAlloc->bmp, count);
285 		if (addr != SWAP_SLOT_NONE) {
286 			addr += sSwapFileAlloc->first_slot;
287 			break;
288 		}
289 
290 		// this swap_file is full, find another
291 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
292 	}
293 
294 	if (j == sSwapFileCount) {
295 		mutex_unlock(&sSwapFileListLock);
296 		panic("swap_slot_alloc: swap space exhausted!\n");
297 		return SWAP_SLOT_NONE;
298 	}
299 
300 	// if this swap file has used more than 90% of its space,
301 	// switch to another
302 	if (sSwapFileAlloc->bmp->free_slots
303 		< (sSwapFileAlloc->last_slot - sSwapFileAlloc->first_slot) / 10) {
304 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
305 	}
306 
307 	mutex_unlock(&sSwapFileListLock);
308 
309 	return addr;
310 }
311 
312 
313 static swap_file*
314 find_swap_file(swap_addr_t slotIndex)
315 {
316 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
317 		swap_file* swapFile = it.Next();) {
318 		if (slotIndex >= swapFile->first_slot
319 			&& slotIndex < swapFile->last_slot) {
320 			return swapFile;
321 		}
322 	}
323 
324 	panic("find_swap_file(): can't find swap file for slot %" B_PRIu32 "\n",
325 		slotIndex);
326 	return NULL;
327 }
328 
329 
330 static void
331 swap_slot_dealloc(swap_addr_t slotIndex, uint32 count)
332 {
333 	if (slotIndex == SWAP_SLOT_NONE)
334 		return;
335 
336 	mutex_lock(&sSwapFileListLock);
337 	swap_file* swapFile = find_swap_file(slotIndex);
338 	slotIndex -= swapFile->first_slot;
339 	radix_bitmap_dealloc(swapFile->bmp, slotIndex, count);
340 	mutex_unlock(&sSwapFileListLock);
341 }
342 
343 
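//!	Reserves up to \a amount bytes of swap space and returns the amount
//	actually reserved, which may be less when the available space is almost
//	exhausted. Callers fall back to reserving real memory for the remainder
//	(see VMAnonymousCache::_Commit()).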
344 static off_t
345 swap_space_reserve(off_t amount)
346 {
347 	mutex_lock(&sAvailSwapSpaceLock);
348 	if (sAvailSwapSpace >= amount)
349 		sAvailSwapSpace -= amount;
350 	else {
351 		amount = sAvailSwapSpace;
352 		sAvailSwapSpace = 0;
353 	}
354 	mutex_unlock(&sAvailSwapSpaceLock);
355 
356 	return amount;
357 }
358 
359 
360 static void
361 swap_space_unreserve(off_t amount)
362 {
363 	mutex_lock(&sAvailSwapSpaceLock);
364 	sAvailSwapSpace += amount;
365 	mutex_unlock(&sAvailSwapSpaceLock);
366 }
367 
368 
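//!	Kernel daemon hook that grows the swap hash table on demand. The actual
//	malloc() is done with the hash lock dropped (presumably to avoid holding
//	the write lock across an allocation), and the loop re-checks
//	ResizeNeeded() afterwards in case the required size changed meanwhile.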
369 static void
370 swap_hash_resizer(void*, int)
371 {
372 	WriteLocker locker(sSwapHashLock);
373 
374 	size_t size;
375 	void* allocation;
376 
377 	do {
378 		size = sSwapHashTable.ResizeNeeded();
379 		if (size == 0)
380 			return;
381 
382 		locker.Unlock();
383 
384 		allocation = malloc(size);
385 		if (allocation == NULL)
386 			return;
387 
388 		locker.Lock();
389 
390 	} while (!sSwapHashTable.Resize(allocation, size));
391 }
392 
393 
394 // #pragma mark -
395 
396 
397 class VMAnonymousCache::WriteCallback : public StackableAsyncIOCallback {
398 public:
399 	WriteCallback(VMAnonymousCache* cache, AsyncIOCallback* callback)
400 		:
401 		StackableAsyncIOCallback(callback),
402 		fCache(cache)
403 	{
404 	}
405 
406 	void SetTo(page_num_t pageIndex, swap_addr_t slotIndex, bool newSlot)
407 	{
408 		fPageIndex = pageIndex;
409 		fSlotIndex = slotIndex;
410 		fNewSlot = newSlot;
411 	}
412 
413 	virtual void IOFinished(status_t status, bool partialTransfer,
414 		generic_size_t bytesTransferred)
415 	{
416 		if (fNewSlot) {
417 			if (status == B_OK) {
418 				fCache->_SwapBlockBuild(fPageIndex, fSlotIndex, 1);
419 			} else {
420 				AutoLocker<VMCache> locker(fCache);
421 				fCache->fAllocatedSwapSize -= B_PAGE_SIZE;
422 				locker.Unlock();
423 
424 				swap_slot_dealloc(fSlotIndex, 1);
425 			}
426 		}
427 
428 		fNextCallback->IOFinished(status, partialTransfer, bytesTransferred);
429 
430 		delete this;
431 	}
432 
433 private:
434 	VMAnonymousCache*	fCache;
435 	page_num_t			fPageIndex;
436 	swap_addr_t			fSlotIndex;
437 	bool				fNewSlot;
438 };
439 
440 
441 // #pragma mark -
442 
443 
444 VMAnonymousCache::~VMAnonymousCache()
445 {
446 	// free allocated swap space and swap block
447 	for (off_t offset = virtual_base, toFree = fAllocatedSwapSize;
448 		offset < virtual_end && toFree > 0; offset += B_PAGE_SIZE) {
449 		swap_addr_t slotIndex = _SwapBlockGetAddress(offset >> PAGE_SHIFT);
450 		if (slotIndex == SWAP_SLOT_NONE)
451 			continue;
452 
453 		swap_slot_dealloc(slotIndex, 1);
454 		_SwapBlockFree(offset >> PAGE_SHIFT, 1);
455 		toFree -= B_PAGE_SIZE;
456 	}
457 
458 	swap_space_unreserve(fCommittedSwapSize);
459 	if (committed_size > fCommittedSwapSize)
460 		vm_unreserve_memory(committed_size - fCommittedSwapSize);
461 }
462 
463 
464 status_t
465 VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
466 	int32 numGuardPages, uint32 allocationFlags)
467 {
468 	TRACE("%p->VMAnonymousCache::Init(canOvercommit = %s, "
469 		"numPrecommittedPages = %" B_PRId32 ", numGuardPages = %" B_PRId32
470 		")\n", this, canOvercommit ? "yes" : "no", numPrecommittedPages,
471 		numGuardPages);
472 
473 	status_t error = VMCache::Init(CACHE_TYPE_RAM, allocationFlags);
474 	if (error != B_OK)
475 		return error;
476 
477 	fCanOvercommit = canOvercommit;
478 	fHasPrecommitted = false;
479 	fPrecommittedPages = min_c(numPrecommittedPages, 255);
480 	fGuardedSize = numGuardPages * B_PAGE_SIZE;
481 	fCommittedSwapSize = 0;
482 	fAllocatedSwapSize = 0;
483 
484 	return B_OK;
485 }
486 
487 
488 status_t
489 VMAnonymousCache::Resize(off_t newSize, int priority)
490 {
491 	// If the cache size shrinks, drop all swap pages beyond the new size.
492 	if (fAllocatedSwapSize > 0) {
493 		off_t oldPageCount = (virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
494 		swap_block* swapBlock = NULL;
495 
496 		for (off_t pageIndex = (newSize + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
497 			pageIndex < oldPageCount && fAllocatedSwapSize > 0; pageIndex++) {
498 
499 			WriteLocker locker(sSwapHashLock);
500 
501 			// Get the swap slot index for the page.
502 			swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
503 			if (swapBlock == NULL || blockIndex == 0) {
504 				swap_hash_key key = { this, pageIndex };
505 				swapBlock = sSwapHashTable.Lookup(key);
506 
507 				if (swapBlock == NULL) {
508 					pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES);
509 					continue;
510 				}
511 			}
512 
513 			swap_addr_t slotIndex = swapBlock->swap_slots[blockIndex];
514 			vm_page* page;
515 			if (slotIndex != SWAP_SLOT_NONE
516 				&& ((page = LookupPage((off_t)pageIndex * B_PAGE_SIZE)) == NULL
517 					|| !page->busy)) {
518 					// TODO: We skip (i.e. leak) swap space of busy pages, since
519 					// there could be I/O going on (paging in/out). Waiting is
520 					// not an option as 1. unlocking the cache means that new
521 					// swap pages could be added in a range we've already
522 					// cleared (since the cache still has the old size) and 2.
523 					// we'd risk a deadlock in case we come from the file cache
524 					// and the FS holds the node's write-lock. We should mark
525 					// the page invalid and let the one responsible clean up.
526 					// There's just no such mechanism yet.
527 				swap_slot_dealloc(slotIndex, 1);
528 				fAllocatedSwapSize -= B_PAGE_SIZE;
529 
530 				swapBlock->swap_slots[blockIndex] = SWAP_SLOT_NONE;
531 				if (--swapBlock->used == 0) {
532 					// All swap pages have been freed -- we can discard the swap
533 					// block.
534 					sSwapHashTable.RemoveUnchecked(swapBlock);
535 					object_cache_free(sSwapBlockCache, swapBlock,
536 						CACHE_DONT_WAIT_FOR_MEMORY
537 							| CACHE_DONT_LOCK_KERNEL_SPACE);
538 				}
539 			}
540 		}
541 	}
542 
543 	return VMCache::Resize(newSize, priority);
544 }
545 
546 
547 status_t
548 VMAnonymousCache::Commit(off_t size, int priority)
549 {
550 	TRACE("%p->VMAnonymousCache::Commit(%" B_PRIdOFF ")\n", this, size);
551 
552 	// If we can overcommit, we don't commit here, but in Fault(). We always
553 	// unreserve memory if we're asked to shrink our commitment, though.
554 	if (fCanOvercommit && size > committed_size) {
555 		if (fHasPrecommitted)
556 			return B_OK;
557 
558 		// pre-commit some pages to make a later failure less probable
559 		fHasPrecommitted = true;
560 		uint32 precommitted = fPrecommittedPages * B_PAGE_SIZE;
561 		if (size > precommitted)
562 			size = precommitted;
563 	}
564 
565 	return _Commit(size, priority);
566 }
567 
568 
569 bool
570 VMAnonymousCache::HasPage(off_t offset)
571 {
572 	if (_SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE)
573 		return true;
574 
575 	return false;
576 }
577 
578 
579 bool
580 VMAnonymousCache::DebugHasPage(off_t offset)
581 {
582 	off_t pageIndex = offset >> PAGE_SHIFT;
583 	swap_hash_key key = { this, pageIndex };
584 	swap_block* swap = sSwapHashTable.Lookup(key);
585 	if (swap == NULL)
586 		return false;
587 
588 	return swap->swap_slots[pageIndex & SWAP_BLOCK_MASK] != SWAP_SLOT_NONE;
589 }
590 
591 
592 status_t
593 VMAnonymousCache::Read(off_t offset, const generic_io_vec* vecs, size_t count,
594 	uint32 flags, generic_size_t* _numBytes)
595 {
596 	off_t pageIndex = offset >> PAGE_SHIFT;
597 
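	// Each pass of the outer loop below covers a run of vectors whose swap
	// slots are contiguous on disk, so that the whole run can be handed to
	// vfs_read_pages() in one go. Note that the slot arithmetic assumes one
	// page per vector.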
598 	for (uint32 i = 0, j = 0; i < count; i = j) {
599 		swap_addr_t startSlotIndex = _SwapBlockGetAddress(pageIndex + i);
600 		for (j = i + 1; j < count; j++) {
601 			swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + j);
602 			if (slotIndex != startSlotIndex + j - i)
603 				break;
604 		}
605 
606 		T(ReadPage(this, pageIndex, startSlotIndex));
607 			// TODO: Assumes that only one page is read.
608 
609 		swap_file* swapFile = find_swap_file(startSlotIndex);
610 
611 		off_t pos = (off_t)(startSlotIndex - swapFile->first_slot)
612 			* B_PAGE_SIZE;
613 
614 		status_t status = vfs_read_pages(swapFile->vnode, swapFile->cookie, pos,
615 			vecs + i, j - i, flags, _numBytes);
616 		if (status != B_OK)
617 			return status;
618 	}
619 
620 	return B_OK;
621 }
622 
623 
624 status_t
625 VMAnonymousCache::Write(off_t offset, const generic_io_vec* vecs, size_t count,
626 	uint32 flags, generic_size_t* _numBytes)
627 {
628 	off_t pageIndex = offset >> PAGE_SHIFT;
629 
630 	AutoLocker<VMCache> locker(this);
631 
632 	page_num_t totalPages = 0;
633 	for (uint32 i = 0; i < count; i++) {
634 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
635 		swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + totalPages);
636 		if (slotIndex != SWAP_SLOT_NONE) {
637 			swap_slot_dealloc(slotIndex, pageCount);
638 			_SwapBlockFree(pageIndex + totalPages, pageCount);
639 			fAllocatedSwapSize -= pageCount * B_PAGE_SIZE;
640 		}
641 
642 		totalPages += pageCount;
643 	}
644 
645 	off_t totalSize = totalPages * B_PAGE_SIZE;
646 	if (fAllocatedSwapSize + totalSize > fCommittedSwapSize)
647 		return B_ERROR;
648 
649 	fAllocatedSwapSize += totalSize;
650 	locker.Unlock();
651 
652 	page_num_t pagesLeft = totalPages;
653 	totalPages = 0;
654 
655 	for (uint32 i = 0; i < count; i++) {
656 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
657 
658 		generic_addr_t vectorBase = vecs[i].base;
659 		generic_size_t vectorLength = vecs[i].length;
660 		page_num_t n = pageCount;
661 
662 		for (page_num_t j = 0; j < pageCount; j += n) {
663 			swap_addr_t slotIndex;
664 			// try to allocate n slots; if that fails, halve n and retry
665 			while ((slotIndex = swap_slot_alloc(n)) == SWAP_SLOT_NONE && n >= 2)
666 				n >>= 1;
667 
668 			if (slotIndex == SWAP_SLOT_NONE)
669 				panic("VMAnonymousCache::Write(): can't allocate swap space\n");
670 
671 			T(WritePage(this, pageIndex, slotIndex));
672 				// TODO: Assumes that only one page is written.
673 
674 			swap_file* swapFile = find_swap_file(slotIndex);
675 
676 			off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
677 
678 			generic_size_t length = (phys_addr_t)n * B_PAGE_SIZE;
679 			generic_io_vec vector[1];
680 			vector->base = vectorBase;
681 			vector->length = length;
682 
683 			status_t status = vfs_write_pages(swapFile->vnode, swapFile->cookie,
684 				pos, vector, 1, flags, &length);
685 			if (status != B_OK) {
686 				locker.Lock();
687 				fAllocatedSwapSize -= (off_t)pagesLeft * B_PAGE_SIZE;
688 				locker.Unlock();
689 
690 				swap_slot_dealloc(slotIndex, n);
691 				return status;
692 			}
693 
694 			_SwapBlockBuild(pageIndex + totalPages, slotIndex, n);
695 			pagesLeft -= n;
696 
697 			if (n != pageCount) {
698 				vectorBase = vectorBase + n * B_PAGE_SIZE;
699 				vectorLength -= n * B_PAGE_SIZE;
700 			}
701 		}
702 
703 		totalPages += pageCount;
704 	}
705 
706 	ASSERT(pagesLeft == 0);
707 	return B_OK;
708 }
709 
710 
711 status_t
712 VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
713 	size_t count, generic_size_t numBytes, uint32 flags,
714 	AsyncIOCallback* _callback)
715 {
716 	// TODO: Currently this method is only used for single pages. Either make
717 	// more flexible use of it or change the interface!
718 	// This implementation relies on the current usage!
719 	ASSERT(count == 1);
720 	ASSERT(numBytes <= B_PAGE_SIZE);
721 
722 	page_num_t pageIndex = offset >> PAGE_SHIFT;
723 	swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex);
724 	bool newSlot = slotIndex == SWAP_SLOT_NONE;
725 
726 	// If the page doesn't have any swap space yet, allocate it.
727 	if (newSlot) {
728 		AutoLocker<VMCache> locker(this);
729 		if (fAllocatedSwapSize + B_PAGE_SIZE > fCommittedSwapSize) {
730 			_callback->IOFinished(B_ERROR, true, 0);
731 			return B_ERROR;
732 		}
733 
734 		fAllocatedSwapSize += B_PAGE_SIZE;
735 
736 		slotIndex = swap_slot_alloc(1);
737 	}
738 
739 	// create our callback
740 	WriteCallback* callback = (flags & B_VIP_IO_REQUEST) != 0
741 		? new(malloc_flags(HEAP_PRIORITY_VIP)) WriteCallback(this, _callback)
742 		: new(std::nothrow) WriteCallback(this, _callback);
743 	if (callback == NULL) {
744 		if (newSlot) {
745 			AutoLocker<VMCache> locker(this);
746 			fAllocatedSwapSize -= B_PAGE_SIZE;
747 			locker.Unlock();
748 
749 			swap_slot_dealloc(slotIndex, 1);
750 		}
751 		_callback->IOFinished(B_NO_MEMORY, true, 0);
752 		return B_NO_MEMORY;
753 	}
754 	// TODO: If the page already had swap space assigned, we wouldn't need our
755 	// own callback.
756 
757 	callback->SetTo(pageIndex, slotIndex, newSlot);
758 
759 	T(WritePage(this, pageIndex, slotIndex));
760 
761 	// write the page asynchronously
762 	swap_file* swapFile = find_swap_file(slotIndex);
763 	off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
764 
765 	return vfs_asynchronous_write_pages(swapFile->vnode, swapFile->cookie, pos,
766 		vecs, 1, numBytes, flags, callback);
767 }
768 
769 
770 bool
771 VMAnonymousCache::CanWritePage(off_t offset)
772 {
773 	// We can write the page, if we have not used all of our committed swap
774 	// space or the page already has a swap slot assigned.
775 	return fAllocatedSwapSize < fCommittedSwapSize
776 		|| _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
777 }
778 
779 
780 int32
781 VMAnonymousCache::MaxPagesPerAsyncWrite() const
782 {
783 	return 1;
784 }
785 
786 
787 status_t
788 VMAnonymousCache::Fault(struct VMAddressSpace* aspace, off_t offset)
789 {
790 	if (fGuardedSize > 0) {
791 		uint32 guardOffset;
792 
793 #ifdef STACK_GROWS_DOWNWARDS
794 		guardOffset = 0;
795 #elif defined(STACK_GROWS_UPWARDS)
796 		guardOffset = virtual_size - fGuardedSize;
797 #else
798 #	error Stack direction has not been defined in arch_config.h
799 #endif
800 		// report stack fault, guard page hit!
801 		if (offset >= guardOffset && offset < guardOffset + fGuardedSize) {
802 			TRACE(("stack overflow!\n"));
803 			return B_BAD_ADDRESS;
804 		}
805 	}
806 
807 	if (fCanOvercommit && LookupPage(offset) == NULL && !HasPage(offset)) {
808 		if (fPrecommittedPages == 0) {
809 			// never commit more than needed
810 			if (committed_size / B_PAGE_SIZE > page_count)
811 				return B_BAD_HANDLER;
812 
813 			// try to commit additional swap space/memory
814 			if (swap_space_reserve(B_PAGE_SIZE) == B_PAGE_SIZE) {
815 				fCommittedSwapSize += B_PAGE_SIZE;
816 			} else {
817 				int priority = aspace == VMAddressSpace::Kernel()
818 					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
819 				if (vm_try_reserve_memory(B_PAGE_SIZE, priority, 0) != B_OK) {
820 					dprintf("%p->VMAnonymousCache::Fault(): Failed to reserve "
821 						"%d bytes of RAM.\n", this, (int)B_PAGE_SIZE);
822 					return B_NO_MEMORY;
823 				}
824 			}
825 
826 			committed_size += B_PAGE_SIZE;
827 		} else
828 			fPrecommittedPages--;
829 	}
830 
831 	// This will cause vm_soft_fault() to handle the fault
832 	return B_BAD_HANDLER;
833 }
834 
835 
836 void
837 VMAnonymousCache::Merge(VMCache* _source)
838 {
839 	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
840 	if (source == NULL) {
841 		panic("VMAnonymousCache::MergeStore(): merge with incompatible cache "
842 			"%p requested", _source);
843 		return;
844 	}
845 
846 	// take over the source's committed size
847 	fCommittedSwapSize += source->fCommittedSwapSize;
848 	source->fCommittedSwapSize = 0;
849 	committed_size += source->committed_size;
850 	source->committed_size = 0;
851 
852 	off_t actualSize = virtual_end - virtual_base;
853 	if (committed_size > actualSize)
854 		_Commit(actualSize, VM_PRIORITY_USER);
855 
856 	// Move all non-shadowed swap pages from the source to the consumer cache.
857 	// Also remove all source pages that are shadowed by consumer swap pages.
858 	_MergeSwapPages(source);
859 
860 	// Move all non-shadowed pages from the source to the consumer cache.
861 	if (source->page_count < page_count)
862 		_MergePagesSmallerSource(source);
863 	else
864 		_MergePagesSmallerConsumer(source);
865 }
866 
867 
868 void
869 VMAnonymousCache::DeleteObject()
870 {
871 	object_cache_delete(gAnonymousCacheObjectCache, this);
872 }
873 
874 
875 void
876 VMAnonymousCache::_SwapBlockBuild(off_t startPageIndex,
877 	swap_addr_t startSlotIndex, uint32 count)
878 {
879 	WriteLocker locker(sSwapHashLock);
880 
881 	uint32 left = count;
882 	for (uint32 i = 0, j = 0; i < count; i += j) {
883 		off_t pageIndex = startPageIndex + i;
884 		swap_addr_t slotIndex = startSlotIndex + i;
885 
886 		swap_hash_key key = { this, pageIndex };
887 
888 		swap_block* swap = sSwapHashTable.Lookup(key);
889 		while (swap == NULL) {
890 			swap = (swap_block*)object_cache_alloc(sSwapBlockCache,
891 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
892 			if (swap == NULL) {
893 				// Wait a short time until memory is available again.
894 				locker.Unlock();
895 				snooze(10000);
896 				locker.Lock();
897 				swap = sSwapHashTable.Lookup(key);
898 				continue;
899 			}
900 
901 			swap->key.cache = this;
902 			swap->key.page_index = pageIndex & ~(off_t)SWAP_BLOCK_MASK;
903 			swap->used = 0;
904 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
905 				swap->swap_slots[i] = SWAP_SLOT_NONE;
906 
907 			sSwapHashTable.InsertUnchecked(swap);
908 		}
909 
910 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
911 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
912 			swap->swap_slots[blockIndex++] = slotIndex + j;
913 			left--;
914 		}
915 
916 		swap->used += j;
917 	}
918 }
919 
920 
921 void
922 VMAnonymousCache::_SwapBlockFree(off_t startPageIndex, uint32 count)
923 {
924 	WriteLocker locker(sSwapHashLock);
925 
926 	uint32 left = count;
927 	for (uint32 i = 0, j = 0; i < count; i += j) {
928 		off_t pageIndex = startPageIndex + i;
929 		swap_hash_key key = { this, pageIndex };
930 		swap_block* swap = sSwapHashTable.Lookup(key);
931 
932 		ASSERT(swap != NULL);
933 
934 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
935 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
936 			swap->swap_slots[blockIndex++] = SWAP_SLOT_NONE;
937 			left--;
938 		}
939 
940 		swap->used -= j;
941 		if (swap->used == 0) {
942 			sSwapHashTable.RemoveUnchecked(swap);
943 			object_cache_free(sSwapBlockCache, swap,
944 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
945 		}
946 	}
947 }
948 
949 
950 swap_addr_t
951 VMAnonymousCache::_SwapBlockGetAddress(off_t pageIndex)
952 {
953 	ReadLocker locker(sSwapHashLock);
954 
955 	swap_hash_key key = { this, pageIndex };
956 	swap_block* swap = sSwapHashTable.Lookup(key);
957 	swap_addr_t slotIndex = SWAP_SLOT_NONE;
958 
959 	if (swap != NULL) {
960 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
961 		slotIndex = swap->swap_slots[blockIndex];
962 	}
963 
964 	return slotIndex;
965 }
966 
967 
968 status_t
969 VMAnonymousCache::_Commit(off_t size, int priority)
970 {
971 	TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), already committed: "
972 		"%" B_PRIdOFF " (%" B_PRIdOFF " swap)\n", this, size, committed_size,
973 		fCommittedSwapSize);
974 
975 	// Basic strategy: reserve swap space first; only when running out of swap
976 	// space, reserve real memory.
977 
978 	off_t committedMemory = committed_size - fCommittedSwapSize;
979 
980 	// Regardless of whether we're asked to grow or shrink the commitment,
981 	// we always try to reserve as much as possible of the final commitment
982 	// in the swap space.
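	// (Hypothetical example: committing 10 pages when only 6 pages worth of
	// swap can still be reserved leaves fCommittedSwapSize at 6 pages; the
	// code further below then reserves the remaining 4 pages as real memory.)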
983 	if (size > fCommittedSwapSize) {
984 		fCommittedSwapSize += swap_space_reserve(size - fCommittedSwapSize);
985 		committed_size = fCommittedSwapSize + committedMemory;
986 		if (size > fCommittedSwapSize) {
987 			TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), reserved "
988 				"only %" B_PRIdOFF " swap\n", this, size, fCommittedSwapSize);
989 		}
990 	}
991 
992 	if (committed_size == size)
993 		return B_OK;
994 
995 	if (committed_size > size) {
996 		// The commitment shrinks -- unreserve real memory first.
997 		off_t toUnreserve = committed_size - size;
998 		if (committedMemory > 0) {
999 			off_t unreserved = min_c(toUnreserve, committedMemory);
1000 			vm_unreserve_memory(unreserved);
1001 			committedMemory -= unreserved;
1002 			committed_size -= unreserved;
1003 			toUnreserve -= unreserved;
1004 		}
1005 
1006 		// Unreserve swap space.
1007 		if (toUnreserve > 0) {
1008 			swap_space_unreserve(toUnreserve);
1009 			fCommittedSwapSize -= toUnreserve;
1010 			committed_size -= toUnreserve;
1011 		}
1012 
1013 		return B_OK;
1014 	}
1015 
1016 	// The commitment grows -- we have already tried to reserve swap space at
1017 	// the start of the method, so we try to reserve real memory, now.
1018 
1019 	off_t toReserve = size - committed_size;
1020 	if (vm_try_reserve_memory(toReserve, priority, 1000000) != B_OK) {
1021 		dprintf("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "): Failed to "
1022 			"reserve %" B_PRIdOFF " bytes of RAM\n", this, size, toReserve);
1023 		return B_NO_MEMORY;
1024 	}
1025 
1026 	committed_size = size;
1027 	return B_OK;
1028 }
1029 
1030 
1031 void
1032 VMAnonymousCache::_MergePagesSmallerSource(VMAnonymousCache* source)
1033 {
1034 	// The source cache has fewer pages than the consumer (this cache), so we
1035 	// iterate through the source's pages and move the ones that are not
1036 	// shadowed up to the consumer.
1037 
1038 	for (VMCachePagesTree::Iterator it = source->pages.GetIterator();
1039 			vm_page* page = it.Next();) {
1040 		// Note: Removing the current node while iterating through an
1041 		// IteratableSplayTree is safe.
1042 		vm_page* consumerPage = LookupPage(
1043 			(off_t)page->cache_offset << PAGE_SHIFT);
1044 		if (consumerPage == NULL) {
1045 			// the page is not yet in the consumer cache - move it upwards
1046 			ASSERT_PRINT(!page->busy, "page: %p", page);
1047 			MovePage(page);
1048 		}
1049 	}
1050 }
1051 
1052 
1053 void
1054 VMAnonymousCache::_MergePagesSmallerConsumer(VMAnonymousCache* source)
1055 {
1056 	// The consumer (this cache) has fewer pages than the source, so we move the
1057 	// consumer's pages to the source (freeing shadowed ones) and finally just
1058 	// all pages of the source back to the consumer.
1059 
1060 	for (VMCachePagesTree::Iterator it = pages.GetIterator();
1061 		vm_page* page = it.Next();) {
1062 		// If a source page is in the way, remove and free it.
1063 		vm_page* sourcePage = source->LookupPage(
1064 			(off_t)page->cache_offset << PAGE_SHIFT);
1065 		if (sourcePage != NULL) {
1066 			DEBUG_PAGE_ACCESS_START(sourcePage);
1067 			ASSERT_PRINT(!sourcePage->busy, "page: %p", sourcePage);
1068 			ASSERT_PRINT(sourcePage->WiredCount() == 0
1069 					&& sourcePage->mappings.IsEmpty(),
1070 				"sourcePage: %p, page: %p", sourcePage, page);
1071 			source->RemovePage(sourcePage);
1072 			vm_page_free(source, sourcePage);
1073 		}
1074 
1075 		// Note: Removing the current node while iterating through an
1076 		// IteratableSplayTree is safe.
1077 		source->MovePage(page);
1078 	}
1079 
1080 	MoveAllPages(source);
1081 }
1082 
1083 
1084 void
1085 VMAnonymousCache::_MergeSwapPages(VMAnonymousCache* source)
1086 {
1087 	// If neither source nor consumer have swap pages, we don't have to do
1088 	// anything.
1089 	if (source->fAllocatedSwapSize == 0 && fAllocatedSwapSize == 0)
1090 		return;
1091 
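	// Walk the source's offset range one swap block (SWAP_BLOCK_PAGES pages)
	// at a time, starting at the swap-block-aligned base offset.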
1092 	for (off_t offset = source->virtual_base
1093 		& ~(off_t)(B_PAGE_SIZE * SWAP_BLOCK_PAGES - 1);
1094 		offset < source->virtual_end;
1095 		offset += B_PAGE_SIZE * SWAP_BLOCK_PAGES) {
1096 
1097 		WriteLocker locker(sSwapHashLock);
1098 
1099 		off_t swapBlockPageIndex = offset >> PAGE_SHIFT;
1100 		swap_hash_key key = { source, swapBlockPageIndex };
1101 		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(key);
1102 
1103 		// remove the source swap block -- we will either take over the swap
1104 		// space (and the block) or free it
1105 		if (sourceSwapBlock != NULL)
1106 			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
1107 
1108 		key.cache = this;
1109 		swap_block* swapBlock = sSwapHashTable.Lookup(key);
1110 
1111 		locker.Unlock();
1112 
1113 		// remove all source pages that are shadowed by consumer swap pages
1114 		if (swapBlock != NULL) {
1115 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1116 				if (swapBlock->swap_slots[i] != SWAP_SLOT_NONE) {
1117 					vm_page* page = source->LookupPage(
1118 						(off_t)(swapBlockPageIndex + i) << PAGE_SHIFT);
1119 					if (page != NULL) {
1120 						DEBUG_PAGE_ACCESS_START(page);
1121 						ASSERT_PRINT(!page->busy, "page: %p", page);
1122 						source->RemovePage(page);
1123 						vm_page_free(source, page);
1124 					}
1125 				}
1126 			}
1127 		}
1128 
1129 		if (sourceSwapBlock == NULL)
1130 			continue;
1131 
1132 		for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1133 			off_t pageIndex = swapBlockPageIndex + i;
1134 			swap_addr_t sourceSlotIndex = sourceSwapBlock->swap_slots[i];
1135 
1136 			if (sourceSlotIndex == SWAP_SLOT_NONE)
1137 				continue;
1138 
1139 			if ((swapBlock != NULL
1140 					&& swapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1141 				|| LookupPage((off_t)pageIndex << PAGE_SHIFT) != NULL) {
1142 				// The consumer already has a page or a swapped out page
1143 				// at this index. So we can free the source swap space.
1144 				swap_slot_dealloc(sourceSlotIndex, 1);
1145 				sourceSwapBlock->swap_slots[i] = SWAP_SLOT_NONE;
1146 				sourceSwapBlock->used--;
1147 			}
1148 
1149 			// We've either freed the source swap page or are going to move it
1150 			// to the consumer. At any rate, the source cache doesn't own it
1151 			// anymore.
1152 			source->fAllocatedSwapSize -= B_PAGE_SIZE;
1153 		}
1154 
1155 		// All source swap pages that have not been freed yet are taken over by
1156 		// the consumer.
1157 		fAllocatedSwapSize += B_PAGE_SIZE * (off_t)sourceSwapBlock->used;
1158 
1159 		if (sourceSwapBlock->used == 0) {
1160 			// All swap pages have been freed -- we can discard the source swap
1161 			// block.
1162 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1163 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1164 		} else if (swapBlock == NULL) {
1165 			// We need to take over some of the source's swap pages and there's
1166 			// no swap block in the consumer cache. Just take over the source
1167 			// swap block.
1168 			sourceSwapBlock->key.cache = this;
1169 			locker.Lock();
1170 			sSwapHashTable.InsertUnchecked(sourceSwapBlock);
1171 			locker.Unlock();
1172 		} else {
1173 			// We need to take over some of the source's swap pages and there's
1174 			// already a swap block in the consumer cache. Copy the respective
1175 			// swap addresses and discard the source swap block.
1176 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1177 				if (sourceSwapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1178 					swapBlock->swap_slots[i] = sourceSwapBlock->swap_slots[i];
1179 			}
1180 
1181 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1182 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1183 		}
1184 	}
1185 }
1186 
1187 
1188 // #pragma mark -
1189 
1190 
1191 // TODO: This can be removed if we get BFS UUIDs
1192 struct VolumeInfo {
1193 	char name[B_FILE_NAME_LENGTH];
1194 	char device[B_FILE_NAME_LENGTH];
1195 	char filesystem[B_OS_NAME_LENGTH];
1196 	off_t capacity;
1197 };
1198 
1199 
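// Scores each partition against the configured swap volume info: a matching
// content name counts 4, a matching device path 3, a matching capacity 2 and
// a matching file system 1. Only candidates scoring at least 4 are kept, so
// a name match alone qualifies, while a device path match needs at least one
// further matching attribute.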
1200 class PartitionScorer : public KPartitionVisitor {
1201 public:
1202 	PartitionScorer(VolumeInfo& volumeInfo)
1203 		:
1204 		fBestPartition(NULL),
1205 		fBestScore(-1),
1206 		fVolumeInfo(volumeInfo)
1207 	{
1208 	}
1209 
1210 	virtual bool VisitPre(KPartition* partition)
1211 	{
1212 		if (!partition->ContainsFileSystem())
1213 			return false;
1214 
1215 		KPath path;
1216 		partition->GetPath(&path);
1217 
1218 		int score = 0;
1219 		if (strcmp(fVolumeInfo.name, partition->ContentName()) == 0)
1220 			score += 4;
1221 		if (strcmp(fVolumeInfo.device, path.Path()) == 0)
1222 			score += 3;
1223 		if (fVolumeInfo.capacity == partition->Size())
1224 			score += 2;
1225 		if (strcmp(fVolumeInfo.filesystem,
1226 			partition->DiskSystem()->ShortName()) == 0) {
1227 			score += 1;
1228 		}
1229 		if (score >= 4 && score > fBestScore) {
1230 			fBestPartition = partition;
1231 			fBestScore = score;
1232 		}
1233 
1234 		return false;
1235 	}
1236 
1237 	KPartition* fBestPartition;
1238 
1239 private:
1240 	int32		fBestScore;
1241 	VolumeInfo	fVolumeInfo;
1242 };
1243 
1244 
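//!	Derives a mount point path for \a partition from its content name (or,
//	lacking that, its name), replacing slashes with dashes and appending a
//	numeric suffix until an unused path is found.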
1245 status_t
1246 get_mount_point(KPartition* partition, KPath* mountPoint)
1247 {
1248 	if (!mountPoint || !partition->ContainsFileSystem())
1249 		return B_BAD_VALUE;
1250 
1251 	const char* volumeName = partition->ContentName();
1252 	if (!volumeName || strlen(volumeName) == 0)
1253 		volumeName = partition->Name();
1254 	if (!volumeName || strlen(volumeName) == 0)
1255 		volumeName = "unnamed volume";
1256 
1257 	char basePath[B_PATH_NAME_LENGTH];
1258 	int32 len = snprintf(basePath, sizeof(basePath), "/%s", volumeName);
1259 	for (int32 i = 1; i < len; i++)
1260 		if (basePath[i] == '/')
1261 			basePath[i] = '-';
1262 	char* path = mountPoint->LockBuffer();
1263 	int32 pathLen = mountPoint->BufferSize();
1264 	strncpy(path, basePath, pathLen);
1265 
1266 	struct stat dummy;
1267 	for (int i = 1; ; i++) {
1268 		if (stat(path, &dummy) != 0)
1269 			break;
1270 		snprintf(path, pathLen, "%s%d", basePath, i);
1271 	}
1272 
1273 	mountPoint->UnlockBuffer();
1274 	return B_OK;
1275 }
1276 
1277 
1278 status_t
1279 swap_file_add(const char* path)
1280 {
1281 	// open the file
1282 	int fd = open(path, O_RDWR | O_NOCACHE, S_IRUSR | S_IWUSR);
1283 	if (fd < 0)
1284 		return errno;
1285 
1286 	// fstat() it and check whether we can use it
1287 	struct stat st;
1288 	if (fstat(fd, &st) < 0) {
1289 		close(fd);
1290 		return errno;
1291 	}
1292 
1293 	if (!(S_ISREG(st.st_mode) || S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
1294 		close(fd);
1295 		return B_BAD_VALUE;
1296 	}
1297 
1298 	if (st.st_size < B_PAGE_SIZE) {
1299 		close(fd);
1300 		return B_BAD_VALUE;
1301 	}
1302 
1303 	// get file descriptor, vnode, and cookie
1304 	file_descriptor* descriptor = get_fd(get_current_io_context(true), fd);
1305 	put_fd(descriptor);
1306 
1307 	vnode* node = fd_vnode(descriptor);
1308 	if (node == NULL) {
1309 		close(fd);
1310 		return B_BAD_VALUE;
1311 	}
1312 
1313 	// do the allocations and prepare the swap_file structure
1314 	swap_file* swap = (swap_file*)malloc(sizeof(swap_file));
1315 	if (swap == NULL) {
1316 		close(fd);
1317 		return B_NO_MEMORY;
1318 	}
1319 
1320 	swap->fd = fd;
1321 	swap->vnode = node;
1322 	swap->cookie = descriptor->cookie;
1323 
1324 	uint32 pageCount = st.st_size >> PAGE_SHIFT;
1325 	swap->bmp = radix_bitmap_create(pageCount);
1326 	if (swap->bmp == NULL) {
1327 		free(swap);
1328 		close(fd);
1329 		return B_NO_MEMORY;
1330 	}
1331 
1332 	// set slot index and add this file to swap file list
1333 	mutex_lock(&sSwapFileListLock);
1334 	// TODO: Also check whether the swap file is already registered!
1335 	if (sSwapFileList.IsEmpty()) {
1336 		swap->first_slot = 0;
1337 		swap->last_slot = pageCount;
1338 	} else {
1339 		// leave a one-page gap between two swap files
1340 		swap->first_slot = sSwapFileList.Last()->last_slot + 1;
1341 		swap->last_slot = swap->first_slot + pageCount;
1342 	}
1343 	sSwapFileList.Add(swap);
1344 	sSwapFileCount++;
1345 	mutex_unlock(&sSwapFileListLock);
1346 
1347 	mutex_lock(&sAvailSwapSpaceLock);
1348 	sAvailSwapSpace += (off_t)pageCount * B_PAGE_SIZE;
1349 	mutex_unlock(&sAvailSwapSpaceLock);
1350 
1351 	return B_OK;
1352 }
1353 
1354 
1355 status_t
1356 swap_file_delete(const char* path)
1357 {
1358 	vnode* node = NULL;
1359 	status_t status = vfs_get_vnode_from_path(path, true, &node);
1360 	if (status != B_OK)
1361 		return status;
1362 
1363 	MutexLocker locker(sSwapFileListLock);
1364 
1365 	swap_file* swapFile = NULL;
1366 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1367 			(swapFile = it.Next()) != NULL;) {
1368 		if (swapFile->vnode == node)
1369 			break;
1370 	}
1371 
1372 	vfs_put_vnode(node);
1373 
1374 	if (swapFile == NULL)
1375 		return B_ERROR;
1376 
1377 	// if this file is currently used, we can't delete it
1378 	// TODO: mark this swap file as being deleted, and remove it after
1379 	// releasing all of its swap space
1380 	if (swapFile->bmp->free_slots < swapFile->last_slot - swapFile->first_slot)
1381 		return B_ERROR;
1382 
1383 	sSwapFileList.Remove(swapFile);
1384 	sSwapFileCount--;
1385 	locker.Unlock();
1386 
1387 	mutex_lock(&sAvailSwapSpaceLock);
1388 	sAvailSwapSpace -= (off_t)(swapFile->last_slot - swapFile->first_slot)
1389 		* B_PAGE_SIZE;
1390 	mutex_unlock(&sAvailSwapSpaceLock);
1391 
1392 	close(swapFile->fd);
1393 	radix_bitmap_destroy(swapFile->bmp);
1394 	free(swapFile);
1395 
1396 	return B_OK;
1397 }
1398 
1399 
1400 void
1401 swap_init(void)
1402 {
1403 	// create swap block cache
1404 	sSwapBlockCache = create_object_cache("swapblock", sizeof(swap_block),
1405 		sizeof(void*), NULL, NULL, NULL);
1406 	if (sSwapBlockCache == NULL)
1407 		panic("swap_init(): can't create object cache for swap blocks\n");
1408 
1409 	status_t error = object_cache_set_minimum_reserve(sSwapBlockCache,
1410 		MIN_SWAP_BLOCK_RESERVE);
1411 	if (error != B_OK) {
1412 		panic("swap_init(): object_cache_set_minimum_reserve() failed: %s",
1413 			strerror(error));
1414 	}
1415 
1416 	// init swap hash table
1417 	sSwapHashTable.Init(INITIAL_SWAP_HASH_SIZE);
1418 	rw_lock_init(&sSwapHashLock, "swaphash");
1419 
1420 	error = register_resource_resizer(swap_hash_resizer, NULL,
1421 		SWAP_HASH_RESIZE_INTERVAL);
1422 	if (error != B_OK) {
1423 		panic("swap_init(): Failed to register swap hash resizer: %s",
1424 			strerror(error));
1425 	}
1426 
1427 	// init swap file list
1428 	mutex_init(&sSwapFileListLock, "swaplist");
1429 	sSwapFileAlloc = NULL;
1430 	sSwapFileCount = 0;
1431 
1432 	// init available swap space
1433 	mutex_init(&sAvailSwapSpaceLock, "avail swap space");
1434 	sAvailSwapSpace = 0;
1435 
1436 	add_debugger_command_etc("swap", &dump_swap_info,
1437 		"Print infos about the swap usage",
1438 		"\n"
1439 		"Print infos about the swap usage.\n", 0);
1440 }
1441 
1442 
1443 void
1444 swap_init_post_modules()
1445 {
1446 	// Never try to create a swap file on a read-only device - when booting
1447 	// from CD, the write overlay is used.
1448 	if (gReadOnlyBootDevice)
1449 		return;
1450 
1451 	bool swapEnabled = true;
1452 	bool swapAutomatic = true;
1453 	off_t swapSize = 0;
1454 
1455 	dev_t swapDeviceID = -1;
1456 	VolumeInfo selectedVolume = {};
1457 
1458 	void* settings = load_driver_settings("virtual_memory");
1459 
1460 	if (settings != NULL) {
1461 		// We pass a lot of information about the swap device; this is mostly to
1462 		// ensure that we are dealing with the same device that was configured.
1463 
1464 		// TODO: Some kind of BFS uuid would be great here :)
1465 		const char* enabled = get_driver_parameter(settings, "vm", NULL, NULL);
1466 
1467 		if (enabled != NULL) {
1468 			swapEnabled = get_driver_boolean_parameter(settings, "vm",
1469 				true, false);
1470 			swapAutomatic = get_driver_boolean_parameter(settings, "swap_auto",
1471 				true, false);
1472 
1473 			if (swapEnabled && !swapAutomatic) {
1474 				const char* size = get_driver_parameter(settings, "swap_size",
1475 					NULL, NULL);
1476 				const char* volume = get_driver_parameter(settings,
1477 					"swap_volume_name", NULL, NULL);
1478 				const char* device = get_driver_parameter(settings,
1479 					"swap_volume_device", NULL, NULL);
1480 				const char* filesystem = get_driver_parameter(settings,
1481 					"swap_volume_filesystem", NULL, NULL);
1482 				const char* capacity = get_driver_parameter(settings,
1483 					"swap_volume_capacity", NULL, NULL);
1484 
1485 				if (size != NULL && device != NULL && volume != NULL
1486 					&& filesystem != NULL && capacity != NULL) {
1487 					// User specified a size / volume that seems valid
1488 					swapAutomatic = false;
1489 					swapSize = atoll(size);
1490 					strlcpy(selectedVolume.name, volume,
1491 						sizeof(selectedVolume.name));
1492 					strlcpy(selectedVolume.device, device,
1493 						sizeof(selectedVolume.device));
1494 					strlcpy(selectedVolume.filesystem, filesystem,
1495 						sizeof(selectedVolume.filesystem));
1496 					selectedVolume.capacity = atoll(capacity);
1497 				} else {
1498 					// Something isn't right with swap config, go auto
1499 					swapAutomatic = true;
1500 					dprintf("%s: virtual_memory configuration is invalid, "
1501 						"using automatic swap\n", __func__);
1502 				}
1503 			}
1504 		}
1505 		unload_driver_settings(settings);
1506 	}
1507 
1508 	if (swapAutomatic) {
1509 		swapSize = (off_t)vm_page_num_pages() * B_PAGE_SIZE;
1510 		if (swapSize <= (1024 * 1024 * 1024)) {
1511 			// 1 GB of memory or less? Double the swap
1512 			swapSize *= 2;
1513 		}
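		// (E.g. a machine with 512 MB of RAM thus gets a 1 GB swap file,
		// while one with 4 GB of RAM gets a 4 GB swap file.)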
1514 		// Automatic swap defaults to the boot device
1515 		swapDeviceID = gBootDevice;
1516 	}
1517 
1518 	if (!swapEnabled || swapSize < B_PAGE_SIZE) {
1519 		dprintf("%s: virtual_memory is disabled\n", __func__);
1520 		return;
1521 	}
1522 
1523 	if (!swapAutomatic && swapDeviceID < 0) {
1524 		// If user-specified swap, and no swap device has been chosen yet...
1525 		KDiskDeviceManager::CreateDefault();
1526 		KDiskDeviceManager* manager = KDiskDeviceManager::Default();
1527 		PartitionScorer visitor(selectedVolume);
1528 
1529 		KDiskDevice* device;
1530 		int32 cookie = 0;
1531 		while ((device = manager->NextDevice(&cookie)) != NULL) {
1532 			if (device->IsReadOnlyMedia() || device->IsWriteOnce()
1533 				|| device->IsRemovable()) {
1534 				continue;
1535 			}
1536 			device->VisitEachDescendant(&visitor);
1537 		}
1538 
1539 		if (!visitor.fBestPartition) {
1540 			dprintf("%s: Can't find configured swap partition '%s'\n",
1541 				__func__, selectedVolume.name);
1542 		} else {
1543 			if (visitor.fBestPartition->IsMounted())
1544 				swapDeviceID = visitor.fBestPartition->VolumeID();
1545 			else {
1546 				KPath devPath, mountPoint;
1547 				visitor.fBestPartition->GetPath(&devPath);
1548 				get_mount_point(visitor.fBestPartition, &mountPoint);
1549 				const char* mountPath = mountPoint.Path();
1550 				mkdir(mountPath, S_IRWXU | S_IRWXG | S_IRWXO);
1551 				swapDeviceID = _kern_mount(mountPath, devPath.Path(),
1552 					NULL, 0, NULL, 0);
1553 				if (swapDeviceID < 0) {
1554 					dprintf("%s: Can't mount configured swap partition '%s'\n",
1555 						__func__, selectedVolume.name);
1556 				}
1557 			}
1558 		}
1559 	}
1560 
1561 	if (swapDeviceID < 0)
1562 		swapDeviceID = gBootDevice;
1563 
1564 	// We now have a swapDeviceID, which is used for the swap file
1565 
1566 	KPath path;
1567 	struct fs_info info;
1568 	_kern_read_fs_info(swapDeviceID, &info);
1569 	if (swapDeviceID == gBootDevice)
1570 		path = kDefaultSwapPath;
1571 	else {
1572 		vfs_entry_ref_to_path(info.dev, info.root, ".", true, path.LockBuffer(),
1573 			path.BufferSize());
1574 		path.UnlockBuffer();
1575 		path.Append("swap");
1576 	}
1577 
1578 	const char* swapPath = path.Path();
1579 
1580 	// Swap size limits prevent oversized swap files
1581 	if (swapAutomatic) {
1582 		off_t existingSwapSize = 0;
1583 		struct stat existingSwapStat;
1584 		if (stat(swapPath, &existingSwapStat) == 0)
1585 			existingSwapSize = existingSwapStat.st_size;
1586 
1587 		off_t freeSpace = info.free_blocks * info.block_size + existingSwapSize;
1588 
1589 		// Adjust automatic swap to a maximum of 25% of the free space
1590 		if (swapSize > (freeSpace / 4))
1591 			swapSize = (freeSpace / 4);
1592 	}
1593 
1594 	// Create swap file
1595 	int fd = open(swapPath, O_RDWR | O_CREAT | O_NOCACHE, S_IRUSR | S_IWUSR);
1596 	if (fd < 0) {
1597 		dprintf("%s: Can't open/create %s: %s\n", __func__,
1598 			swapPath, strerror(errno));
1599 		return;
1600 	}
1601 
1602 	struct stat stat;
1603 	stat.st_size = swapSize;
1604 	status_t error = _kern_write_stat(fd, NULL, false, &stat,
1605 		sizeof(struct stat), B_STAT_SIZE | B_STAT_SIZE_INSECURE);
1606 	if (error != B_OK) {
1607 		dprintf("%s: Failed to resize %s to %" B_PRIdOFF " bytes: %s\n",
1608 			__func__, swapPath, swapSize, strerror(error));
1609 	}
1610 
1611 	close(fd);
1612 
1613 	error = swap_file_add(swapPath);
1614 	if (error != B_OK) {
1615 		dprintf("%s: Failed to add swap file %s: %s\n", __func__, swapPath,
1616 			strerror(error));
1617 	}
1618 }
1619 
1620 
1621 //! Used by the page daemon to free swap space.
1622 bool
1623 swap_free_page_swap_space(vm_page* page)
1624 {
1625 	VMAnonymousCache* cache = dynamic_cast<VMAnonymousCache*>(page->Cache());
1626 	if (cache == NULL)
1627 		return false;
1628 
1629 	swap_addr_t slotIndex = cache->_SwapBlockGetAddress(page->cache_offset);
1630 	if (slotIndex == SWAP_SLOT_NONE)
1631 		return false;
1632 
1633 	swap_slot_dealloc(slotIndex, 1);
1634 	cache->fAllocatedSwapSize -= B_PAGE_SIZE;
1635 	cache->_SwapBlockFree(page->cache_offset, 1);
1636 
1637 	return true;
1638 }
1639 
1640 
1641 uint32
1642 swap_available_pages()
1643 {
1644 	mutex_lock(&sAvailSwapSpaceLock);
1645 	uint32 avail = sAvailSwapSpace >> PAGE_SHIFT;
1646 	mutex_unlock(&sAvailSwapSpaceLock);
1647 
1648 	return avail;
1649 }
1650 
1651 
1652 uint32
1653 swap_total_swap_pages()
1654 {
1655 	mutex_lock(&sSwapFileListLock);
1656 
1657 	uint32 totalSwapSlots = 0;
1658 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1659 		swap_file* swapFile = it.Next();) {
1660 		totalSwapSlots += swapFile->last_slot - swapFile->first_slot;
1661 	}
1662 
1663 	mutex_unlock(&sSwapFileListLock);
1664 
1665 	return totalSwapSlots;
1666 }
1667 
1668 
1669 #endif	// ENABLE_SWAP_SUPPORT
1670 
1671 
1672 void
1673 swap_get_info(system_info* info)
1674 {
1675 #if ENABLE_SWAP_SUPPORT
1676 	info->max_swap_pages = swap_total_swap_pages();
1677 	info->free_swap_pages = swap_available_pages();
1678 #else
1679 	info->max_swap_pages = 0;
1680 	info->free_swap_pages = 0;
1681 #endif
1682 }
1683 
1684