xref: /haiku/src/system/kernel/vm/VMAnonymousCache.cpp (revision f8da8f3477d3c18142e59d17d05a545982faa5a8)
1 /*
2  * Copyright 2008, Zhao Shuai, upczhsh@163.com.
3  * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
4  * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
5  * Distributed under the terms of the MIT License.
6  *
7  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
8  * Distributed under the terms of the NewOS License.
9  *
10  * Copyright 2011-2012 Haiku, Inc. All rights reserved.
11  * Distributed under the terms of the MIT License.
12  *
13  * Authors:
14  *		Hamish Morrison, hamish@lavabit.com
15  *		Alexander von Gluck IV, kallisti5@unixzen.com
16  */
17 
18 
19 #include "VMAnonymousCache.h"
20 
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <unistd.h>
26 
27 #include <FindDirectory.h>
28 #include <KernelExport.h>
29 #include <NodeMonitor.h>
30 
31 #include <arch_config.h>
32 #include <boot_device.h>
33 #include <disk_device_manager/KDiskDevice.h>
34 #include <disk_device_manager/KDiskDeviceManager.h>
35 #include <disk_device_manager/KDiskSystem.h>
36 #include <disk_device_manager/KPartitionVisitor.h>
37 #include <driver_settings.h>
38 #include <fs/fd.h>
39 #include <fs/KPath.h>
40 #include <fs_info.h>
41 #include <fs_interface.h>
42 #include <heap.h>
43 #include <kernel_daemon.h>
44 #include <slab/Slab.h>
45 #include <syscalls.h>
46 #include <system_info.h>
47 #include <tracing.h>
48 #include <util/AutoLock.h>
49 #include <util/DoublyLinkedList.h>
50 #include <util/OpenHashTable.h>
51 #include <util/RadixBitmap.h>
52 #include <vfs.h>
53 #include <vm/vm.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_priv.h>
56 #include <vm/VMAddressSpace.h>
57 
58 #include "IORequest.h"
59 
60 
61 #if	ENABLE_SWAP_SUPPORT
62 
63 //#define TRACE_VM_ANONYMOUS_CACHE
64 #ifdef TRACE_VM_ANONYMOUS_CACHE
65 #	define TRACE(x...) dprintf(x)
66 #else
67 #	define TRACE(x...) do { } while (false)
68 #endif
69 
70 
// number of free swap blocks the object cache shall minimally have
#define MIN_SWAP_BLOCK_RESERVE	4096

// interval at which the hash resizer is triggered (in 0.1s)
#define SWAP_HASH_RESIZE_INTERVAL	5

#define INITIAL_SWAP_HASH_SIZE		1024

// marker for "no swap slot assigned"
#define SWAP_SLOT_NONE	RADIX_SLOT_NONE

// geometry of a swap_block: number of pages, its log2, and the index mask
#define SWAP_BLOCK_PAGES 32
#define SWAP_BLOCK_SHIFT 5		/* 1 << SWAP_BLOCK_SHIFT == SWAP_BLOCK_PAGES */
#define SWAP_BLOCK_MASK  (SWAP_BLOCK_PAGES - 1)


static const char* const kDefaultSwapPath = "/var/swap";
87 
// Describes one swap file; instances are linked into sSwapFileList.
struct swap_file : DoublyLinkedListLinkImpl<swap_file> {
	int				fd;
	struct vnode*	vnode;		// passed to vfs_{read,write}_pages()
	void*			cookie;		// passed along with the vnode
	swap_addr_t		first_slot;	// first global slot index of this file
	swap_addr_t		last_slot;	// one past the last global slot index
	radix_bitmap*	bmp;		// slot allocator; indices are relative to
								// first_slot
};
96 
// Key of the swap hash table. NOTE: initialized via aggregate initializers
// ({ cache, pageIndex }), so the member order must not change.
struct swap_hash_key {
	VMAnonymousCache	*cache;
	off_t				page_index;  // page index in the cache
};
101 
// Each swap block contains swap address information for
// SWAP_BLOCK_PAGES continuous pages from the same cache
struct swap_block {
	swap_block*		hash_link;	// hash table chaining (see GetLink())
	swap_hash_key	key;		// owning cache and block-aligned page index
	uint32			used;		// counter of assigned entries in swap_slots
	swap_addr_t		swap_slots[SWAP_BLOCK_PAGES];
		// slot per page, SWAP_SLOT_NONE if the page has no swap slot
};
110 
111 struct SwapHashTableDefinition {
112 	typedef swap_hash_key KeyType;
113 	typedef swap_block ValueType;
114 
115 	SwapHashTableDefinition() {}
116 
117 	size_t HashKey(const swap_hash_key& key) const
118 	{
119 		off_t blockIndex = key.page_index >> SWAP_BLOCK_SHIFT;
120 		VMAnonymousCache* cache = key.cache;
121 		return blockIndex ^ (size_t)(int*)cache;
122 	}
123 
124 	size_t Hash(const swap_block* value) const
125 	{
126 		return HashKey(value->key);
127 	}
128 
129 	bool Compare(const swap_hash_key& key, const swap_block* value) const
130 	{
131 		return (key.page_index & ~(off_t)SWAP_BLOCK_MASK)
132 				== (value->key.page_index & ~(off_t)SWAP_BLOCK_MASK)
133 			&& key.cache == value->key.cache;
134 	}
135 
136 	swap_block*& GetLink(swap_block* value) const
137 	{
138 		return value->hash_link;
139 	}
140 };
141 
typedef BOpenHashTable<SwapHashTableDefinition> SwapHashTable;
typedef DoublyLinkedList<swap_file> SwapFileList;

// maps (cache, page index) to swap blocks; accessed under sSwapHashLock
static SwapHashTable sSwapHashTable;
static rw_lock sSwapHashLock;

// all swap files; swap_slot_alloc()/swap_slot_dealloc() hold
// sSwapFileListLock while using the list
static SwapFileList sSwapFileList;
static mutex sSwapFileListLock;
static swap_file* sSwapFileAlloc = NULL; // allocate from here
static uint32 sSwapFileCount = 0;

// swap space (in bytes) not yet reserved; modified under
// sAvailSwapSpaceLock
static off_t sAvailSwapSpace = 0;
static mutex sAvailSwapSpaceLock;

// object cache the swap_block structures are allocated from
static object_cache* sSwapBlockCache;
157 
158 
#if SWAP_TRACING
namespace SwapTracing {

// Common base of all swap trace entries; remembers the affected cache.
class SwapTraceEntry : public AbstractTraceEntry {
public:
	SwapTraceEntry(VMAnonymousCache* cache)
		:
		fCache(cache)
	{
	}

protected:
	VMAnonymousCache*	fCache;
};


// Trace entry for a page being read from a swap slot.
class ReadPage : public SwapTraceEntry {
public:
	ReadPage(VMAnonymousCache* cache, page_num_t pageIndex,
		swap_addr_t swapSlotIndex)
		:
		SwapTraceEntry(cache),
		fPageIndex(pageIndex),
		fSwapSlotIndex(swapSlotIndex)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		// The page index is cast to off_t, so the format is correct
		// independent of the width of page_num_t; swap_addr_t is printed
		// with B_PRIu32 like in dump_swap_info().
		out.Print("swap read:  cache %p, page index: %" B_PRIdOFF
			" <- swap slot: %" B_PRIu32, fCache, (off_t)fPageIndex,
			fSwapSlotIndex);
	}

private:
	page_num_t		fPageIndex;
	swap_addr_t		fSwapSlotIndex;
};


// Trace entry for a page being written to a swap slot.
class WritePage : public SwapTraceEntry {
public:
	WritePage(VMAnonymousCache* cache, page_num_t pageIndex,
		swap_addr_t swapSlotIndex)
		:
		SwapTraceEntry(cache),
		fPageIndex(pageIndex),
		fSwapSlotIndex(swapSlotIndex)
	{
		Initialized();
	}

	virtual void AddDump(TraceOutput& out)
	{
		// See ReadPage::AddDump() regarding the format specifiers.
		out.Print("swap write: cache %p, page index: %" B_PRIdOFF
			" -> swap slot: %" B_PRIu32, fCache, (off_t)fPageIndex,
			fSwapSlotIndex);
	}

private:
	page_num_t		fPageIndex;
	swap_addr_t		fSwapSlotIndex;
};

}	// namespace SwapTracing

#	define T(x) new(std::nothrow) SwapTracing::x;
#else
#	define T(x) ;
#endif
228 
229 
230 static int
231 dump_swap_info(int argc, char** argv)
232 {
233 	swap_addr_t totalSwapPages = 0;
234 	swap_addr_t freeSwapPages = 0;
235 
236 	kprintf("swap files:\n");
237 
238 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
239 		swap_file* file = it.Next();) {
240 		swap_addr_t total = file->last_slot - file->first_slot;
241 		kprintf("  vnode: %p, pages: total: %" B_PRIu32 ", free: %" B_PRIu32
242 			"\n", file->vnode, total, file->bmp->free_slots);
243 
244 		totalSwapPages += total;
245 		freeSwapPages += file->bmp->free_slots;
246 	}
247 
248 	kprintf("\n");
249 	kprintf("swap space in pages:\n");
250 	kprintf("total:     %9" B_PRIu32 "\n", totalSwapPages);
251 	kprintf("available: %9" B_PRIdOFF "\n", sAvailSwapSpace / B_PAGE_SIZE);
252 	kprintf("reserved:  %9" B_PRIdOFF "\n",
253 		totalSwapPages - sAvailSwapSpace / B_PAGE_SIZE);
254 	kprintf("used:      %9" B_PRIu32 "\n", totalSwapPages - freeSwapPages);
255 	kprintf("free:      %9" B_PRIu32 "\n", freeSwapPages);
256 
257 	return 0;
258 }
259 
260 
261 static swap_addr_t
262 swap_slot_alloc(uint32 count)
263 {
264 	mutex_lock(&sSwapFileListLock);
265 
266 	if (sSwapFileList.IsEmpty()) {
267 		mutex_unlock(&sSwapFileListLock);
268 		panic("swap_slot_alloc(): no swap file in the system\n");
269 		return SWAP_SLOT_NONE;
270 	}
271 
272 	// since radix bitmap could not handle more than 32 pages, we return
273 	// SWAP_SLOT_NONE, this forces Write() adjust allocation amount
274 	if (count > BITMAP_RADIX) {
275 		mutex_unlock(&sSwapFileListLock);
276 		return SWAP_SLOT_NONE;
277 	}
278 
279 	swap_addr_t j, addr = SWAP_SLOT_NONE;
280 	for (j = 0; j < sSwapFileCount; j++) {
281 		if (sSwapFileAlloc == NULL)
282 			sSwapFileAlloc = sSwapFileList.First();
283 
284 		addr = radix_bitmap_alloc(sSwapFileAlloc->bmp, count);
285 		if (addr != SWAP_SLOT_NONE) {
286 			addr += sSwapFileAlloc->first_slot;
287 			break;
288 		}
289 
290 		// this swap_file is full, find another
291 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
292 	}
293 
294 	if (j == sSwapFileCount) {
295 		mutex_unlock(&sSwapFileListLock);
296 		panic("swap_slot_alloc: swap space exhausted!\n");
297 		return SWAP_SLOT_NONE;
298 	}
299 
300 	// if this swap file has used more than 90% percent of its space
301 	// switch to another
302 	if (sSwapFileAlloc->bmp->free_slots
303 		< (sSwapFileAlloc->last_slot - sSwapFileAlloc->first_slot) / 10) {
304 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
305 	}
306 
307 	mutex_unlock(&sSwapFileListLock);
308 
309 	return addr;
310 }
311 
312 
313 static swap_file*
314 find_swap_file(swap_addr_t slotIndex)
315 {
316 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
317 		swap_file* swapFile = it.Next();) {
318 		if (slotIndex >= swapFile->first_slot
319 			&& slotIndex < swapFile->last_slot) {
320 			return swapFile;
321 		}
322 	}
323 
324 	panic("find_swap_file(): can't find swap file for slot %" B_PRIu32 "\n",
325 		slotIndex);
326 	return NULL;
327 }
328 
329 
330 static void
331 swap_slot_dealloc(swap_addr_t slotIndex, uint32 count)
332 {
333 	if (slotIndex == SWAP_SLOT_NONE)
334 		return;
335 
336 	mutex_lock(&sSwapFileListLock);
337 	swap_file* swapFile = find_swap_file(slotIndex);
338 	slotIndex -= swapFile->first_slot;
339 	radix_bitmap_dealloc(swapFile->bmp, slotIndex, count);
340 	mutex_unlock(&sSwapFileListLock);
341 }
342 
343 
344 static off_t
345 swap_space_reserve(off_t amount)
346 {
347 	mutex_lock(&sAvailSwapSpaceLock);
348 	if (sAvailSwapSpace >= amount)
349 		sAvailSwapSpace -= amount;
350 	else {
351 		amount = sAvailSwapSpace;
352 		sAvailSwapSpace = 0;
353 	}
354 	mutex_unlock(&sAvailSwapSpaceLock);
355 
356 	return amount;
357 }
358 
359 
360 static void
361 swap_space_unreserve(off_t amount)
362 {
363 	mutex_lock(&sAvailSwapSpaceLock);
364 	sAvailSwapSpace += amount;
365 	mutex_unlock(&sAvailSwapSpaceLock);
366 }
367 
368 
369 static void
370 swap_hash_resizer(void*, int)
371 {
372 	WriteLocker locker(sSwapHashLock);
373 
374 	size_t size;
375 	void* allocation;
376 
377 	do {
378 		size = sSwapHashTable.ResizeNeeded();
379 		if (size == 0)
380 			return;
381 
382 		locker.Unlock();
383 
384 		allocation = malloc(size);
385 		if (allocation == NULL)
386 			return;
387 
388 		locker.Lock();
389 
390 	} while (!sSwapHashTable.Resize(allocation, size));
391 }
392 
393 
394 // #pragma mark -
395 
396 
397 class VMAnonymousCache::WriteCallback : public StackableAsyncIOCallback {
398 public:
399 	WriteCallback(VMAnonymousCache* cache, AsyncIOCallback* callback)
400 		:
401 		StackableAsyncIOCallback(callback),
402 		fCache(cache)
403 	{
404 	}
405 
406 	void SetTo(page_num_t pageIndex, swap_addr_t slotIndex, bool newSlot)
407 	{
408 		fPageIndex = pageIndex;
409 		fSlotIndex = slotIndex;
410 		fNewSlot = newSlot;
411 	}
412 
413 	virtual void IOFinished(status_t status, bool partialTransfer,
414 		generic_size_t bytesTransferred)
415 	{
416 		if (fNewSlot) {
417 			if (status == B_OK) {
418 				fCache->_SwapBlockBuild(fPageIndex, fSlotIndex, 1);
419 			} else {
420 				AutoLocker<VMCache> locker(fCache);
421 				fCache->fAllocatedSwapSize -= B_PAGE_SIZE;
422 				locker.Unlock();
423 
424 				swap_slot_dealloc(fSlotIndex, 1);
425 			}
426 		}
427 
428 		fNextCallback->IOFinished(status, partialTransfer, bytesTransferred);
429 
430 		delete this;
431 	}
432 
433 private:
434 	VMAnonymousCache*	fCache;
435 	page_num_t			fPageIndex;
436 	swap_addr_t			fSlotIndex;
437 	bool				fNewSlot;
438 };
439 
440 
441 // #pragma mark -
442 
443 
444 VMAnonymousCache::~VMAnonymousCache()
445 {
446 	// free allocated swap space and swap block
447 	for (off_t offset = virtual_base, toFree = fAllocatedSwapSize;
448 		offset < virtual_end && toFree > 0; offset += B_PAGE_SIZE) {
449 		swap_addr_t slotIndex = _SwapBlockGetAddress(offset >> PAGE_SHIFT);
450 		if (slotIndex == SWAP_SLOT_NONE)
451 			continue;
452 
453 		swap_slot_dealloc(slotIndex, 1);
454 		_SwapBlockFree(offset >> PAGE_SHIFT, 1);
455 		toFree -= B_PAGE_SIZE;
456 	}
457 
458 	swap_space_unreserve(fCommittedSwapSize);
459 	if (committed_size > fCommittedSwapSize)
460 		vm_unreserve_memory(committed_size - fCommittedSwapSize);
461 }
462 
463 
464 status_t
465 VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
466 	int32 numGuardPages, uint32 allocationFlags)
467 {
468 	TRACE("%p->VMAnonymousCache::Init(canOvercommit = %s, "
469 		"numPrecommittedPages = %" B_PRId32 ", numGuardPages = %" B_PRId32
470 		")\n", this, canOvercommit ? "yes" : "no", numPrecommittedPages,
471 		numGuardPages);
472 
473 	status_t error = VMCache::Init(CACHE_TYPE_RAM, allocationFlags);
474 	if (error != B_OK)
475 		return error;
476 
477 	fCanOvercommit = canOvercommit;
478 	fHasPrecommitted = false;
479 	fPrecommittedPages = min_c(numPrecommittedPages, 255);
480 	fGuardedSize = numGuardPages * B_PAGE_SIZE;
481 	fCommittedSwapSize = 0;
482 	fAllocatedSwapSize = 0;
483 
484 	return B_OK;
485 }
486 
487 
488 status_t
489 VMAnonymousCache::Resize(off_t newSize, int priority)
490 {
491 	// If the cache size shrinks, drop all swap pages beyond the new size.
492 	if (fAllocatedSwapSize > 0) {
493 		off_t oldPageCount = (virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
494 		swap_block* swapBlock = NULL;
495 
496 		for (off_t pageIndex = (newSize + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
497 			pageIndex < oldPageCount && fAllocatedSwapSize > 0; pageIndex++) {
498 
499 			WriteLocker locker(sSwapHashLock);
500 
501 			// Get the swap slot index for the page.
502 			swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
503 			if (swapBlock == NULL || blockIndex == 0) {
504 				swap_hash_key key = { this, pageIndex };
505 				swapBlock = sSwapHashTable.Lookup(key);
506 
507 				if (swapBlock == NULL) {
508 					pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES);
509 					continue;
510 				}
511 			}
512 
513 			swap_addr_t slotIndex = swapBlock->swap_slots[blockIndex];
514 			vm_page* page;
515 			if (slotIndex != SWAP_SLOT_NONE
516 				&& ((page = LookupPage((off_t)pageIndex * B_PAGE_SIZE)) == NULL
517 					|| !page->busy)) {
518 					// TODO: We skip (i.e. leak) swap space of busy pages, since
519 					// there could be I/O going on (paging in/out). Waiting is
520 					// not an option as 1. unlocking the cache means that new
521 					// swap pages could be added in a range we've already
522 					// cleared (since the cache still has the old size) and 2.
523 					// we'd risk a deadlock in case we come from the file cache
524 					// and the FS holds the node's write-lock. We should mark
525 					// the page invalid and let the one responsible clean up.
526 					// There's just no such mechanism yet.
527 				swap_slot_dealloc(slotIndex, 1);
528 				fAllocatedSwapSize -= B_PAGE_SIZE;
529 
530 				swapBlock->swap_slots[blockIndex] = SWAP_SLOT_NONE;
531 				if (--swapBlock->used == 0) {
532 					// All swap pages have been freed -- we can discard the swap
533 					// block.
534 					sSwapHashTable.RemoveUnchecked(swapBlock);
535 					object_cache_free(sSwapBlockCache, swapBlock,
536 						CACHE_DONT_WAIT_FOR_MEMORY
537 							| CACHE_DONT_LOCK_KERNEL_SPACE);
538 				}
539 			}
540 		}
541 	}
542 
543 	return VMCache::Resize(newSize, priority);
544 }
545 
546 
547 status_t
548 VMAnonymousCache::Commit(off_t size, int priority)
549 {
550 	TRACE("%p->VMAnonymousCache::Commit(%" B_PRIdOFF ")\n", this, size);
551 
552 	// If we can overcommit, we don't commit here, but in Fault(). We always
553 	// unreserve memory, if we're asked to shrink our commitment, though.
554 	if (fCanOvercommit && size > committed_size) {
555 		if (fHasPrecommitted)
556 			return B_OK;
557 
558 		// pre-commit some pages to make a later failure less probable
559 		fHasPrecommitted = true;
560 		uint32 precommitted = fPrecommittedPages * B_PAGE_SIZE;
561 		if (size > precommitted)
562 			size = precommitted;
563 	}
564 
565 	return _Commit(size, priority);
566 }
567 
568 
569 bool
570 VMAnonymousCache::HasPage(off_t offset)
571 {
572 	if (_SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE)
573 		return true;
574 
575 	return false;
576 }
577 
578 
579 bool
580 VMAnonymousCache::DebugHasPage(off_t offset)
581 {
582 	off_t pageIndex = offset >> PAGE_SHIFT;
583 	swap_hash_key key = { this, pageIndex };
584 	swap_block* swap = sSwapHashTable.Lookup(key);
585 	if (swap == NULL)
586 		return false;
587 
588 	return swap->swap_slots[pageIndex & SWAP_BLOCK_MASK] != SWAP_SLOT_NONE;
589 }
590 
591 
592 status_t
593 VMAnonymousCache::Read(off_t offset, const generic_io_vec* vecs, size_t count,
594 	uint32 flags, generic_size_t* _numBytes)
595 {
596 	off_t pageIndex = offset >> PAGE_SHIFT;
597 
598 	for (uint32 i = 0, j = 0; i < count; i = j) {
599 		swap_addr_t startSlotIndex = _SwapBlockGetAddress(pageIndex + i);
600 		for (j = i + 1; j < count; j++) {
601 			swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + j);
602 			if (slotIndex != startSlotIndex + j - i)
603 				break;
604 		}
605 
606 		T(ReadPage(this, pageIndex, startSlotIndex));
607 			// TODO: Assumes that only one page is read.
608 
609 		swap_file* swapFile = find_swap_file(startSlotIndex);
610 
611 		off_t pos = (off_t)(startSlotIndex - swapFile->first_slot)
612 			* B_PAGE_SIZE;
613 
614 		status_t status = vfs_read_pages(swapFile->vnode, swapFile->cookie, pos,
615 			vecs + i, j - i, flags, _numBytes);
616 		if (status != B_OK)
617 			return status;
618 	}
619 
620 	return B_OK;
621 }
622 
623 
624 status_t
625 VMAnonymousCache::Write(off_t offset, const generic_io_vec* vecs, size_t count,
626 	uint32 flags, generic_size_t* _numBytes)
627 {
628 	off_t pageIndex = offset >> PAGE_SHIFT;
629 
630 	AutoLocker<VMCache> locker(this);
631 
632 	page_num_t totalPages = 0;
633 	for (uint32 i = 0; i < count; i++) {
634 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
635 		swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + totalPages);
636 		if (slotIndex != SWAP_SLOT_NONE) {
637 			swap_slot_dealloc(slotIndex, pageCount);
638 			_SwapBlockFree(pageIndex + totalPages, pageCount);
639 			fAllocatedSwapSize -= pageCount * B_PAGE_SIZE;
640 		}
641 
642 		totalPages += pageCount;
643 	}
644 
645 	off_t totalSize = totalPages * B_PAGE_SIZE;
646 	if (fAllocatedSwapSize + totalSize > fCommittedSwapSize)
647 		return B_ERROR;
648 
649 	fAllocatedSwapSize += totalSize;
650 	locker.Unlock();
651 
652 	page_num_t pagesLeft = totalPages;
653 	totalPages = 0;
654 
655 	for (uint32 i = 0; i < count; i++) {
656 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
657 
658 		generic_addr_t vectorBase = vecs[i].base;
659 		generic_size_t vectorLength = vecs[i].length;
660 		page_num_t n = pageCount;
661 
662 		for (page_num_t j = 0; j < pageCount; j += n) {
663 			swap_addr_t slotIndex;
664 			// try to allocate n slots, if fail, try to allocate n/2
665 			while ((slotIndex = swap_slot_alloc(n)) == SWAP_SLOT_NONE && n >= 2)
666 				n >>= 1;
667 
668 			if (slotIndex == SWAP_SLOT_NONE)
669 				panic("VMAnonymousCache::Write(): can't allocate swap space\n");
670 
671 			T(WritePage(this, pageIndex, slotIndex));
672 				// TODO: Assumes that only one page is written.
673 
674 			swap_file* swapFile = find_swap_file(slotIndex);
675 
676 			off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
677 
678 			generic_size_t length = (phys_addr_t)n * B_PAGE_SIZE;
679 			generic_io_vec vector[1];
680 			vector->base = vectorBase;
681 			vector->length = length;
682 
683 			status_t status = vfs_write_pages(swapFile->vnode, swapFile->cookie,
684 				pos, vector, 1, flags, &length);
685 			if (status != B_OK) {
686 				locker.Lock();
687 				fAllocatedSwapSize -= (off_t)pagesLeft * B_PAGE_SIZE;
688 				locker.Unlock();
689 
690 				swap_slot_dealloc(slotIndex, n);
691 				return status;
692 			}
693 
694 			_SwapBlockBuild(pageIndex + totalPages, slotIndex, n);
695 			pagesLeft -= n;
696 
697 			if (n != pageCount) {
698 				vectorBase = vectorBase + n * B_PAGE_SIZE;
699 				vectorLength -= n * B_PAGE_SIZE;
700 			}
701 		}
702 
703 		totalPages += pageCount;
704 	}
705 
706 	ASSERT(pagesLeft == 0);
707 	return B_OK;
708 }
709 
710 
711 status_t
712 VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
713 	size_t count, generic_size_t numBytes, uint32 flags,
714 	AsyncIOCallback* _callback)
715 {
716 	// TODO: Currently this method is only used for single pages. Either make
717 	// more flexible use of it or change the interface!
718 	// This implementation relies on the current usage!
719 	ASSERT(count == 1);
720 	ASSERT(numBytes <= B_PAGE_SIZE);
721 
722 	page_num_t pageIndex = offset >> PAGE_SHIFT;
723 	swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex);
724 	bool newSlot = slotIndex == SWAP_SLOT_NONE;
725 
726 	// If the page doesn't have any swap space yet, allocate it.
727 	if (newSlot) {
728 		AutoLocker<VMCache> locker(this);
729 		if (fAllocatedSwapSize + B_PAGE_SIZE > fCommittedSwapSize) {
730 			_callback->IOFinished(B_ERROR, true, 0);
731 			return B_ERROR;
732 		}
733 
734 		fAllocatedSwapSize += B_PAGE_SIZE;
735 
736 		slotIndex = swap_slot_alloc(1);
737 	}
738 
739 	// create our callback
740 	WriteCallback* callback = (flags & B_VIP_IO_REQUEST) != 0
741 		? new(malloc_flags(HEAP_PRIORITY_VIP)) WriteCallback(this, _callback)
742 		: new(std::nothrow) WriteCallback(this, _callback);
743 	if (callback == NULL) {
744 		if (newSlot) {
745 			AutoLocker<VMCache> locker(this);
746 			fAllocatedSwapSize -= B_PAGE_SIZE;
747 			locker.Unlock();
748 
749 			swap_slot_dealloc(slotIndex, 1);
750 		}
751 		_callback->IOFinished(B_NO_MEMORY, true, 0);
752 		return B_NO_MEMORY;
753 	}
754 	// TODO: If the page already had swap space assigned, we don't need an own
755 	// callback.
756 
757 	callback->SetTo(pageIndex, slotIndex, newSlot);
758 
759 	T(WritePage(this, pageIndex, slotIndex));
760 
761 	// write the page asynchrounously
762 	swap_file* swapFile = find_swap_file(slotIndex);
763 	off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
764 
765 	return vfs_asynchronous_write_pages(swapFile->vnode, swapFile->cookie, pos,
766 		vecs, 1, numBytes, flags, callback);
767 }
768 
769 
770 bool
771 VMAnonymousCache::CanWritePage(off_t offset)
772 {
773 	// We can write the page, if we have not used all of our committed swap
774 	// space or the page already has a swap slot assigned.
775 	return fAllocatedSwapSize < fCommittedSwapSize
776 		|| _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
777 }
778 
779 
780 int32
781 VMAnonymousCache::MaxPagesPerAsyncWrite() const
782 {
783 	return 1;
784 }
785 
786 
787 status_t
788 VMAnonymousCache::Fault(struct VMAddressSpace* aspace, off_t offset)
789 {
790 	if (fGuardedSize > 0) {
791 		uint32 guardOffset;
792 
793 #ifdef STACK_GROWS_DOWNWARDS
794 		guardOffset = 0;
795 #elif defined(STACK_GROWS_UPWARDS)
796 		guardOffset = virtual_size - fGuardedSize;
797 #else
798 #	error Stack direction has not been defined in arch_config.h
799 #endif
800 		// report stack fault, guard page hit!
801 		if (offset >= guardOffset && offset < guardOffset + fGuardedSize) {
802 			TRACE(("stack overflow!\n"));
803 			return B_BAD_ADDRESS;
804 		}
805 	}
806 
807 	if (fCanOvercommit && LookupPage(offset) == NULL && !HasPage(offset)) {
808 		if (fPrecommittedPages == 0) {
809 			// never commit more than needed
810 			if (committed_size / B_PAGE_SIZE > page_count)
811 				return B_BAD_HANDLER;
812 
813 			// try to commit additional swap space/memory
814 			if (swap_space_reserve(B_PAGE_SIZE) == B_PAGE_SIZE) {
815 				fCommittedSwapSize += B_PAGE_SIZE;
816 			} else {
817 				int priority = aspace == VMAddressSpace::Kernel()
818 					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
819 				if (vm_try_reserve_memory(B_PAGE_SIZE, priority, 0) != B_OK) {
820 					dprintf("%p->VMAnonymousCache::Fault(): Failed to reserve "
821 						"%d bytes of RAM.\n", this, (int)B_PAGE_SIZE);
822 					return B_NO_MEMORY;
823 				}
824 			}
825 
826 			committed_size += B_PAGE_SIZE;
827 		} else
828 			fPrecommittedPages--;
829 	}
830 
831 	// This will cause vm_soft_fault() to handle the fault
832 	return B_BAD_HANDLER;
833 }
834 
835 
836 void
837 VMAnonymousCache::Merge(VMCache* _source)
838 {
839 	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
840 	if (source == NULL) {
841 		panic("VMAnonymousCache::MergeStore(): merge with incompatible cache "
842 			"%p requested", _source);
843 		return;
844 	}
845 
846 	// take over the source' committed size
847 	fCommittedSwapSize += source->fCommittedSwapSize;
848 	source->fCommittedSwapSize = 0;
849 	committed_size += source->committed_size;
850 	source->committed_size = 0;
851 
852 	off_t actualSize = virtual_end - virtual_base;
853 	if (committed_size > actualSize)
854 		_Commit(actualSize, VM_PRIORITY_USER);
855 
856 	// Move all not shadowed swap pages from the source to the consumer cache.
857 	// Also remove all source pages that are shadowed by consumer swap pages.
858 	_MergeSwapPages(source);
859 
860 	// Move all not shadowed pages from the source to the consumer cache.
861 	if (source->page_count < page_count)
862 		_MergePagesSmallerSource(source);
863 	else
864 		_MergePagesSmallerConsumer(source);
865 }
866 
867 
868 void
869 VMAnonymousCache::DeleteObject()
870 {
871 	object_cache_delete(gAnonymousCacheObjectCache, this);
872 }
873 
874 
875 void
876 VMAnonymousCache::_SwapBlockBuild(off_t startPageIndex,
877 	swap_addr_t startSlotIndex, uint32 count)
878 {
879 	WriteLocker locker(sSwapHashLock);
880 
881 	uint32 left = count;
882 	for (uint32 i = 0, j = 0; i < count; i += j) {
883 		off_t pageIndex = startPageIndex + i;
884 		swap_addr_t slotIndex = startSlotIndex + i;
885 
886 		swap_hash_key key = { this, pageIndex };
887 
888 		swap_block* swap = sSwapHashTable.Lookup(key);
889 		while (swap == NULL) {
890 			swap = (swap_block*)object_cache_alloc(sSwapBlockCache,
891 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
892 			if (swap == NULL) {
893 				// Wait a short time until memory is available again.
894 				locker.Unlock();
895 				snooze(10000);
896 				locker.Lock();
897 				swap = sSwapHashTable.Lookup(key);
898 				continue;
899 			}
900 
901 			swap->key.cache = this;
902 			swap->key.page_index = pageIndex & ~(off_t)SWAP_BLOCK_MASK;
903 			swap->used = 0;
904 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
905 				swap->swap_slots[i] = SWAP_SLOT_NONE;
906 
907 			sSwapHashTable.InsertUnchecked(swap);
908 		}
909 
910 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
911 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
912 			swap->swap_slots[blockIndex++] = slotIndex + j;
913 			left--;
914 		}
915 
916 		swap->used += j;
917 	}
918 }
919 
920 
921 void
922 VMAnonymousCache::_SwapBlockFree(off_t startPageIndex, uint32 count)
923 {
924 	WriteLocker locker(sSwapHashLock);
925 
926 	uint32 left = count;
927 	for (uint32 i = 0, j = 0; i < count; i += j) {
928 		off_t pageIndex = startPageIndex + i;
929 		swap_hash_key key = { this, pageIndex };
930 		swap_block* swap = sSwapHashTable.Lookup(key);
931 
932 		ASSERT(swap != NULL);
933 
934 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
935 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
936 			swap->swap_slots[blockIndex++] = SWAP_SLOT_NONE;
937 			left--;
938 		}
939 
940 		swap->used -= j;
941 		if (swap->used == 0) {
942 			sSwapHashTable.RemoveUnchecked(swap);
943 			object_cache_free(sSwapBlockCache, swap,
944 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
945 		}
946 	}
947 }
948 
949 
950 swap_addr_t
951 VMAnonymousCache::_SwapBlockGetAddress(off_t pageIndex)
952 {
953 	ReadLocker locker(sSwapHashLock);
954 
955 	swap_hash_key key = { this, pageIndex };
956 	swap_block* swap = sSwapHashTable.Lookup(key);
957 	swap_addr_t slotIndex = SWAP_SLOT_NONE;
958 
959 	if (swap != NULL) {
960 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
961 		slotIndex = swap->swap_slots[blockIndex];
962 	}
963 
964 	return slotIndex;
965 }
966 
967 
968 status_t
969 VMAnonymousCache::_Commit(off_t size, int priority)
970 {
971 	TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), already committed: "
972 		"%" B_PRIdOFF " (%" B_PRIdOFF " swap)\n", this, size, committed_size,
973 		fCommittedSwapSize);
974 
975 	// Basic strategy: reserve swap space first, only when running out of swap
976 	// space, reserve real memory.
977 
978 	off_t committedMemory = committed_size - fCommittedSwapSize;
979 
980 	// Regardless of whether we're asked to grow or shrink the commitment,
981 	// we always try to reserve as much as possible of the final commitment
982 	// in the swap space.
983 	if (size > fCommittedSwapSize) {
984 		fCommittedSwapSize += swap_space_reserve(size - fCommittedSwapSize);
985 		committed_size = fCommittedSwapSize + committedMemory;
986 		if (size > fCommittedSwapSize) {
987 			TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), reserved "
988 				"only %" B_PRIdOFF " swap\n", this, size, fCommittedSwapSize);
989 		}
990 	}
991 
992 	if (committed_size == size)
993 		return B_OK;
994 
995 	if (committed_size > size) {
996 		// The commitment shrinks -- unreserve real memory first.
997 		off_t toUnreserve = committed_size - size;
998 		if (committedMemory > 0) {
999 			off_t unreserved = min_c(toUnreserve, committedMemory);
1000 			vm_unreserve_memory(unreserved);
1001 			committedMemory -= unreserved;
1002 			committed_size -= unreserved;
1003 			toUnreserve -= unreserved;
1004 		}
1005 
1006 		// Unreserve swap space.
1007 		if (toUnreserve > 0) {
1008 			swap_space_unreserve(toUnreserve);
1009 			fCommittedSwapSize -= toUnreserve;
1010 			committed_size -= toUnreserve;
1011 		}
1012 
1013 		return B_OK;
1014 	}
1015 
1016 	// The commitment grows -- we have already tried to reserve swap space at
1017 	// the start of the method, so we try to reserve real memory, now.
1018 
1019 	off_t toReserve = size - committed_size;
1020 	if (vm_try_reserve_memory(toReserve, priority, 1000000) != B_OK) {
1021 		dprintf("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "): Failed to "
1022 			"reserve %" B_PRIdOFF " bytes of RAM\n", this, size, toReserve);
1023 		return B_NO_MEMORY;
1024 	}
1025 
1026 	committed_size = size;
1027 	return B_OK;
1028 }
1029 
1030 
1031 void
1032 VMAnonymousCache::_MergePagesSmallerSource(VMAnonymousCache* source)
1033 {
1034 	// The source cache has less pages than the consumer (this cache), so we
1035 	// iterate through the source's pages and move the ones that are not
1036 	// shadowed up to the consumer.
1037 
1038 	for (VMCachePagesTree::Iterator it = source->pages.GetIterator();
1039 			vm_page* page = it.Next();) {
1040 		// Note: Removing the current node while iterating through a
1041 		// IteratableSplayTree is safe.
1042 		vm_page* consumerPage = LookupPage(
1043 			(off_t)page->cache_offset << PAGE_SHIFT);
1044 		if (consumerPage == NULL) {
1045 			// the page is not yet in the consumer cache - move it upwards
1046 			ASSERT_PRINT(!page->busy, "page: %p", page);
1047 			MovePage(page);
1048 		}
1049 	}
1050 }
1051 
1052 
1053 void
1054 VMAnonymousCache::_MergePagesSmallerConsumer(VMAnonymousCache* source)
1055 {
1056 	// The consumer (this cache) has less pages than the source, so we move the
1057 	// consumer's pages to the source (freeing shadowed ones) and finally just
1058 	// all pages of the source back to the consumer.
1059 
1060 	for (VMCachePagesTree::Iterator it = pages.GetIterator();
1061 		vm_page* page = it.Next();) {
1062 		// If a source page is in the way, remove and free it.
1063 		vm_page* sourcePage = source->LookupPage(
1064 			(off_t)page->cache_offset << PAGE_SHIFT);
1065 		if (sourcePage != NULL) {
1066 			DEBUG_PAGE_ACCESS_START(sourcePage);
1067 			ASSERT_PRINT(!sourcePage->busy, "page: %p", sourcePage);
1068 			source->RemovePage(sourcePage);
1069 			vm_page_free(source, sourcePage);
1070 		}
1071 
1072 		// Note: Removing the current node while iterating through a
1073 		// IteratableSplayTree is safe.
1074 		source->MovePage(page);
1075 	}
1076 
1077 	MoveAllPages(source);
1078 }
1079 
1080 
1081 void
1082 VMAnonymousCache::_MergeSwapPages(VMAnonymousCache* source)
1083 {
1084 	// If neither source nor consumer have swap pages, we don't have to do
1085 	// anything.
1086 	if (source->fAllocatedSwapSize == 0 && fAllocatedSwapSize == 0)
1087 		return;
1088 
1089 	for (off_t offset = source->virtual_base
1090 		& ~(off_t)(B_PAGE_SIZE * SWAP_BLOCK_PAGES - 1);
1091 		offset < source->virtual_end;
1092 		offset += B_PAGE_SIZE * SWAP_BLOCK_PAGES) {
1093 
1094 		WriteLocker locker(sSwapHashLock);
1095 
1096 		off_t swapBlockPageIndex = offset >> PAGE_SHIFT;
1097 		swap_hash_key key = { source, swapBlockPageIndex };
1098 		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(key);
1099 
1100 		// remove the source swap block -- we will either take over the swap
1101 		// space (and the block) or free it
1102 		if (sourceSwapBlock != NULL)
1103 			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
1104 
1105 		key.cache = this;
1106 		swap_block* swapBlock = sSwapHashTable.Lookup(key);
1107 
1108 		locker.Unlock();
1109 
1110 		// remove all source pages that are shadowed by consumer swap pages
1111 		if (swapBlock != NULL) {
1112 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1113 				if (swapBlock->swap_slots[i] != SWAP_SLOT_NONE) {
1114 					vm_page* page = source->LookupPage(
1115 						(off_t)(swapBlockPageIndex + i) << PAGE_SHIFT);
1116 					if (page != NULL) {
1117 						DEBUG_PAGE_ACCESS_START(page);
1118 						ASSERT_PRINT(!page->busy, "page: %p", page);
1119 						source->RemovePage(page);
1120 						vm_page_free(source, page);
1121 					}
1122 				}
1123 			}
1124 		}
1125 
1126 		if (sourceSwapBlock == NULL)
1127 			continue;
1128 
1129 		for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1130 			off_t pageIndex = swapBlockPageIndex + i;
1131 			swap_addr_t sourceSlotIndex = sourceSwapBlock->swap_slots[i];
1132 
1133 			if (sourceSlotIndex == SWAP_SLOT_NONE)
1134 				continue;
1135 
1136 			if ((swapBlock != NULL
1137 					&& swapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1138 				|| LookupPage((off_t)pageIndex << PAGE_SHIFT) != NULL) {
1139 				// The consumer already has a page or a swapped out page
1140 				// at this index. So we can free the source swap space.
1141 				swap_slot_dealloc(sourceSlotIndex, 1);
1142 				sourceSwapBlock->swap_slots[i] = SWAP_SLOT_NONE;
1143 				sourceSwapBlock->used--;
1144 			}
1145 
1146 			// We've either freed the source swap page or are going to move it
1147 			// to the consumer. At any rate, the source cache doesn't own it
1148 			// anymore.
1149 			source->fAllocatedSwapSize -= B_PAGE_SIZE;
1150 		}
1151 
1152 		// All source swap pages that have not been freed yet are taken over by
1153 		// the consumer.
1154 		fAllocatedSwapSize += B_PAGE_SIZE * (off_t)sourceSwapBlock->used;
1155 
1156 		if (sourceSwapBlock->used == 0) {
1157 			// All swap pages have been freed -- we can discard the source swap
1158 			// block.
1159 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1160 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1161 		} else if (swapBlock == NULL) {
1162 			// We need to take over some of the source's swap pages and there's
1163 			// no swap block in the consumer cache. Just take over the source
1164 			// swap block.
1165 			sourceSwapBlock->key.cache = this;
1166 			locker.Lock();
1167 			sSwapHashTable.InsertUnchecked(sourceSwapBlock);
1168 			locker.Unlock();
1169 		} else {
1170 			// We need to take over some of the source's swap pages and there's
1171 			// already a swap block in the consumer cache. Copy the respective
1172 			// swap addresses and discard the source swap block.
1173 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1174 				if (sourceSwapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1175 					swapBlock->swap_slots[i] = sourceSwapBlock->swap_slots[i];
1176 			}
1177 
1178 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1179 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1180 		}
1181 	}
1182 }
1183 
1184 
1185 // #pragma mark -
1186 
1187 
1188 // TODO: This can be removed if we get BFS uuid's
1189 struct VolumeInfo {
1190 	char name[B_FILE_NAME_LENGTH];
1191 	char device[B_FILE_NAME_LENGTH];
1192 	char filesystem[B_OS_NAME_LENGTH];
1193 	off_t capacity;
1194 };
1195 
1196 
1197 class PartitionScorer : public KPartitionVisitor {
1198 public:
1199 	PartitionScorer(VolumeInfo& volumeInfo)
1200 		:
1201 		fBestPartition(NULL),
1202 		fBestScore(-1),
1203 		fVolumeInfo(volumeInfo)
1204 	{
1205 	}
1206 
1207 	virtual bool VisitPre(KPartition* partition)
1208 	{
1209 		if (!partition->ContainsFileSystem())
1210 			return false;
1211 
1212 		KPath path;
1213 		partition->GetPath(&path);
1214 
1215 		int score = 0;
1216 		if (strcmp(fVolumeInfo.name, partition->ContentName()) == 0)
1217 			score += 4;
1218 		if (strcmp(fVolumeInfo.device, path.Path()) == 0)
1219 			score += 3;
1220 		if (fVolumeInfo.capacity == partition->Size())
1221 			score += 2;
1222 		if (strcmp(fVolumeInfo.filesystem,
1223 			partition->DiskSystem()->ShortName()) == 0) {
1224 			score += 1;
1225 		}
1226 		if (score >= 4 && score > fBestScore) {
1227 			fBestPartition = partition;
1228 			fBestScore = score;
1229 		}
1230 
1231 		return false;
1232 	}
1233 
1234 	KPartition* fBestPartition;
1235 
1236 private:
1237 	int32		fBestScore;
1238 	VolumeInfo	fVolumeInfo;
1239 };
1240 
1241 
1242 status_t
1243 get_mount_point(KPartition* partition, KPath* mountPoint)
1244 {
1245 	if (!mountPoint || !partition->ContainsFileSystem())
1246 		return B_BAD_VALUE;
1247 
1248 	const char* volumeName = partition->ContentName();
1249 	if (!volumeName || strlen(volumeName) == 0)
1250 		volumeName = partition->Name();
1251 	if (!volumeName || strlen(volumeName) == 0)
1252 		volumeName = "unnamed volume";
1253 
1254 	char basePath[B_PATH_NAME_LENGTH];
1255 	int32 len = snprintf(basePath, sizeof(basePath), "/%s", volumeName);
1256 	for (int32 i = 1; i < len; i++)
1257 		if (basePath[i] == '/')
1258 		basePath[i] = '-';
1259 	char* path = mountPoint->LockBuffer();
1260 	int32 pathLen = mountPoint->BufferSize();
1261 	strncpy(path, basePath, pathLen);
1262 
1263 	struct stat dummy;
1264 	for (int i = 1; ; i++) {
1265 		if (stat(path, &dummy) != 0)
1266 			break;
1267 		snprintf(path, pathLen, "%s%d", basePath, i);
1268 	}
1269 
1270 	mountPoint->UnlockBuffer();
1271 	return B_OK;
1272 }
1273 
1274 
1275 status_t
1276 swap_file_add(const char* path)
1277 {
1278 	// open the file
1279 	int fd = open(path, O_RDWR | O_NOCACHE, S_IRUSR | S_IWUSR);
1280 	if (fd < 0)
1281 		return errno;
1282 
1283 	// fstat() it and check whether we can use it
1284 	struct stat st;
1285 	if (fstat(fd, &st) < 0) {
1286 		close(fd);
1287 		return errno;
1288 	}
1289 
1290 	if (!(S_ISREG(st.st_mode) || S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
1291 		close(fd);
1292 		return B_BAD_VALUE;
1293 	}
1294 
1295 	if (st.st_size < B_PAGE_SIZE) {
1296 		close(fd);
1297 		return B_BAD_VALUE;
1298 	}
1299 
1300 	// get file descriptor, vnode, and cookie
1301 	file_descriptor* descriptor = get_fd(get_current_io_context(true), fd);
1302 	put_fd(descriptor);
1303 
1304 	vnode* node = fd_vnode(descriptor);
1305 	if (node == NULL) {
1306 		close(fd);
1307 		return B_BAD_VALUE;
1308 	}
1309 
1310 	// do the allocations and prepare the swap_file structure
1311 	swap_file* swap = (swap_file*)malloc(sizeof(swap_file));
1312 	if (swap == NULL) {
1313 		close(fd);
1314 		return B_NO_MEMORY;
1315 	}
1316 
1317 	swap->fd = fd;
1318 	swap->vnode = node;
1319 	swap->cookie = descriptor->cookie;
1320 
1321 	uint32 pageCount = st.st_size >> PAGE_SHIFT;
1322 	swap->bmp = radix_bitmap_create(pageCount);
1323 	if (swap->bmp == NULL) {
1324 		free(swap);
1325 		close(fd);
1326 		return B_NO_MEMORY;
1327 	}
1328 
1329 	// set slot index and add this file to swap file list
1330 	mutex_lock(&sSwapFileListLock);
1331 	// TODO: Also check whether the swap file is already registered!
1332 	if (sSwapFileList.IsEmpty()) {
1333 		swap->first_slot = 0;
1334 		swap->last_slot = pageCount;
1335 	} else {
1336 		// leave one page gap between two swap files
1337 		swap->first_slot = sSwapFileList.Last()->last_slot + 1;
1338 		swap->last_slot = swap->first_slot + pageCount;
1339 	}
1340 	sSwapFileList.Add(swap);
1341 	sSwapFileCount++;
1342 	mutex_unlock(&sSwapFileListLock);
1343 
1344 	mutex_lock(&sAvailSwapSpaceLock);
1345 	sAvailSwapSpace += (off_t)pageCount * B_PAGE_SIZE;
1346 	mutex_unlock(&sAvailSwapSpaceLock);
1347 
1348 	return B_OK;
1349 }
1350 
1351 
1352 status_t
1353 swap_file_delete(const char* path)
1354 {
1355 	vnode* node = NULL;
1356 	status_t status = vfs_get_vnode_from_path(path, true, &node);
1357 	if (status != B_OK)
1358 		return status;
1359 
1360 	MutexLocker locker(sSwapFileListLock);
1361 
1362 	swap_file* swapFile = NULL;
1363 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1364 			(swapFile = it.Next()) != NULL;) {
1365 		if (swapFile->vnode == node)
1366 			break;
1367 	}
1368 
1369 	vfs_put_vnode(node);
1370 
1371 	if (swapFile == NULL)
1372 		return B_ERROR;
1373 
1374 	// if this file is currently used, we can't delete
1375 	// TODO: mark this swap file deleting, and remove it after releasing
1376 	// all the swap space
1377 	if (swapFile->bmp->free_slots < swapFile->last_slot - swapFile->first_slot)
1378 		return B_ERROR;
1379 
1380 	sSwapFileList.Remove(swapFile);
1381 	sSwapFileCount--;
1382 	locker.Unlock();
1383 
1384 	mutex_lock(&sAvailSwapSpaceLock);
1385 	sAvailSwapSpace -= (off_t)(swapFile->last_slot - swapFile->first_slot)
1386 		* PAGE_SIZE;
1387 	mutex_unlock(&sAvailSwapSpaceLock);
1388 
1389 	close(swapFile->fd);
1390 	radix_bitmap_destroy(swapFile->bmp);
1391 	free(swapFile);
1392 
1393 	return B_OK;
1394 }
1395 
1396 
1397 void
1398 swap_init(void)
1399 {
1400 	// create swap block cache
1401 	sSwapBlockCache = create_object_cache("swapblock", sizeof(swap_block),
1402 		sizeof(void*), NULL, NULL, NULL);
1403 	if (sSwapBlockCache == NULL)
1404 		panic("swap_init(): can't create object cache for swap blocks\n");
1405 
1406 	status_t error = object_cache_set_minimum_reserve(sSwapBlockCache,
1407 		MIN_SWAP_BLOCK_RESERVE);
1408 	if (error != B_OK) {
1409 		panic("swap_init(): object_cache_set_minimum_reserve() failed: %s",
1410 			strerror(error));
1411 	}
1412 
1413 	// init swap hash table
1414 	sSwapHashTable.Init(INITIAL_SWAP_HASH_SIZE);
1415 	rw_lock_init(&sSwapHashLock, "swaphash");
1416 
1417 	error = register_resource_resizer(swap_hash_resizer, NULL,
1418 		SWAP_HASH_RESIZE_INTERVAL);
1419 	if (error != B_OK) {
1420 		panic("swap_init(): Failed to register swap hash resizer: %s",
1421 			strerror(error));
1422 	}
1423 
1424 	// init swap file list
1425 	mutex_init(&sSwapFileListLock, "swaplist");
1426 	sSwapFileAlloc = NULL;
1427 	sSwapFileCount = 0;
1428 
1429 	// init available swap space
1430 	mutex_init(&sAvailSwapSpaceLock, "avail swap space");
1431 	sAvailSwapSpace = 0;
1432 
1433 	add_debugger_command_etc("swap", &dump_swap_info,
1434 		"Print infos about the swap usage",
1435 		"\n"
1436 		"Print infos about the swap usage.\n", 0);
1437 }
1438 
1439 
1440 void
1441 swap_init_post_modules()
1442 {
1443 	// Never try to create a swap file on a read-only device - when booting
1444 	// from CD, the write overlay is used.
1445 	if (gReadOnlyBootDevice)
1446 		return;
1447 
1448 	bool swapEnabled = true;
1449 	bool swapAutomatic = true;
1450 	off_t swapSize = 0;
1451 
1452 	dev_t swapDeviceID = -1;
1453 	VolumeInfo selectedVolume = {};
1454 
1455 	void* settings = load_driver_settings("virtual_memory");
1456 
1457 	if (settings != NULL) {
1458 		// We pass a lot of information on the swap device, this is mostly to
1459 		// ensure that we are dealing with the same device that was configured.
1460 
1461 		// TODO: Some kind of BFS uuid would be great here :)
1462 		const char* enabled = get_driver_parameter(settings, "vm", NULL, NULL);
1463 
1464 		if (enabled != NULL) {
1465 			swapEnabled = get_driver_boolean_parameter(settings, "vm",
1466 				true, false);
1467 			swapAutomatic = get_driver_boolean_parameter(settings, "swap_auto",
1468 				true, false);
1469 
1470 			if (swapEnabled && !swapAutomatic) {
1471 				const char* size = get_driver_parameter(settings, "swap_size",
1472 					NULL, NULL);
1473 				const char* volume = get_driver_parameter(settings,
1474 					"swap_volume_name", NULL, NULL);
1475 				const char* device = get_driver_parameter(settings,
1476 					"swap_volume_device", NULL, NULL);
1477 				const char* filesystem = get_driver_parameter(settings,
1478 					"swap_volume_filesystem", NULL, NULL);
1479 				const char* capacity = get_driver_parameter(settings,
1480 					"swap_volume_capacity", NULL, NULL);
1481 
1482 				if (size != NULL && device != NULL && volume != NULL
1483 					&& filesystem != NULL && capacity != NULL) {
1484 					// User specified a size / volume that seems valid
1485 					swapAutomatic = false;
1486 					swapSize = atoll(size);
1487 					strlcpy(selectedVolume.name, volume,
1488 						sizeof(selectedVolume.name));
1489 					strlcpy(selectedVolume.device, device,
1490 						sizeof(selectedVolume.device));
1491 					strlcpy(selectedVolume.filesystem, filesystem,
1492 						sizeof(selectedVolume.filesystem));
1493 					selectedVolume.capacity = atoll(capacity);
1494 				} else {
1495 					// Something isn't right with swap config, go auto
1496 					swapAutomatic = true;
1497 					dprintf("%s: virtual_memory configuration is invalid, "
1498 						"using automatic swap\n", __func__);
1499 				}
1500 			}
1501 		}
1502 		unload_driver_settings(settings);
1503 	}
1504 
1505 	if (swapAutomatic) {
1506 		swapSize = (off_t)vm_page_num_pages() * B_PAGE_SIZE;
1507 		if (swapSize <= (1024 * 1024 * 1024)) {
1508 			// Memory under 1GB? double the swap
1509 			swapSize *= 2;
1510 		}
1511 		// Automatic swap defaults to the boot device
1512 		swapDeviceID = gBootDevice;
1513 	}
1514 
1515 	if (!swapEnabled || swapSize < B_PAGE_SIZE) {
1516 		dprintf("%s: virtual_memory is disabled\n", __func__);
1517 		return;
1518 	}
1519 
1520 	if (!swapAutomatic && swapDeviceID < 0) {
1521 		// If user-specified swap, and no swap device has been chosen yet...
1522 		KDiskDeviceManager::CreateDefault();
1523 		KDiskDeviceManager* manager = KDiskDeviceManager::Default();
1524 		PartitionScorer visitor(selectedVolume);
1525 
1526 		KDiskDevice* device;
1527 		int32 cookie = 0;
1528 		while ((device = manager->NextDevice(&cookie)) != NULL) {
1529 			if (device->IsReadOnlyMedia() || device->IsWriteOnce()
1530 				|| device->IsRemovable()) {
1531 				continue;
1532 			}
1533 			device->VisitEachDescendant(&visitor);
1534 		}
1535 
1536 		if (!visitor.fBestPartition) {
1537 			dprintf("%s: Can't find configured swap partition '%s'\n",
1538 				__func__, selectedVolume.name);
1539 		} else {
1540 			if (visitor.fBestPartition->IsMounted())
1541 				swapDeviceID = visitor.fBestPartition->VolumeID();
1542 			else {
1543 				KPath devPath, mountPoint;
1544 				visitor.fBestPartition->GetPath(&devPath);
1545 				get_mount_point(visitor.fBestPartition, &mountPoint);
1546 				const char* mountPath = mountPoint.Path();
1547 				mkdir(mountPath, S_IRWXU | S_IRWXG | S_IRWXO);
1548 				swapDeviceID = _kern_mount(mountPath, devPath.Path(),
1549 					NULL, 0, NULL, 0);
1550 				if (swapDeviceID < 0) {
1551 					dprintf("%s: Can't mount configured swap partition '%s'\n",
1552 						__func__, selectedVolume.name);
1553 				}
1554 			}
1555 		}
1556 	}
1557 
1558 	if (swapDeviceID < 0)
1559 		swapDeviceID = gBootDevice;
1560 
1561 	// We now have a swapDeviceID which is used for the swap file
1562 
1563 	KPath path;
1564 	struct fs_info info;
1565 	_kern_read_fs_info(swapDeviceID, &info);
1566 	if (swapDeviceID == gBootDevice)
1567 		path = kDefaultSwapPath;
1568 	else {
1569 		vfs_entry_ref_to_path(info.dev, info.root, ".", true, path.LockBuffer(),
1570 			path.BufferSize());
1571 		path.UnlockBuffer();
1572 		path.Append("swap");
1573 	}
1574 
1575 	const char* swapPath = path.Path();
1576 
1577 	// Swap size limits prevent oversized swap files
1578 	if (swapAutomatic) {
1579 		off_t existingSwapSize = 0;
1580 		struct stat existingSwapStat;
1581 		if (stat(swapPath, &existingSwapStat) == 0)
1582 			existingSwapSize = existingSwapStat.st_size;
1583 
1584 		off_t freeSpace = info.free_blocks * info.block_size + existingSwapSize;
1585 
1586 		// Adjust automatic swap to a maximum of 25% of the free space
1587 		if (swapSize > (freeSpace / 4))
1588 			swapSize = (freeSpace / 4);
1589 	}
1590 
1591 	// Create swap file
1592 	int fd = open(swapPath, O_RDWR | O_CREAT | O_NOCACHE, S_IRUSR | S_IWUSR);
1593 	if (fd < 0) {
1594 		dprintf("%s: Can't open/create %s: %s\n", __func__,
1595 			swapPath, strerror(errno));
1596 		return;
1597 	}
1598 
1599 	struct stat stat;
1600 	stat.st_size = swapSize;
1601 	status_t error = _kern_write_stat(fd, NULL, false, &stat,
1602 		sizeof(struct stat), B_STAT_SIZE | B_STAT_SIZE_INSECURE);
1603 	if (error != B_OK) {
1604 		dprintf("%s: Failed to resize %s to %" B_PRIdOFF " bytes: %s\n",
1605 			__func__, swapPath, swapSize, strerror(error));
1606 	}
1607 
1608 	close(fd);
1609 
1610 	error = swap_file_add(swapPath);
1611 	if (error != B_OK) {
1612 		dprintf("%s: Failed to add swap file %s: %s\n", __func__, swapPath,
1613 			strerror(error));
1614 	}
1615 }
1616 
1617 
1618 //! Used by page daemon to free swap space.
1619 bool
1620 swap_free_page_swap_space(vm_page* page)
1621 {
1622 	VMAnonymousCache* cache = dynamic_cast<VMAnonymousCache*>(page->Cache());
1623 	if (cache == NULL)
1624 		return false;
1625 
1626 	swap_addr_t slotIndex = cache->_SwapBlockGetAddress(page->cache_offset);
1627 	if (slotIndex == SWAP_SLOT_NONE)
1628 		return false;
1629 
1630 	swap_slot_dealloc(slotIndex, 1);
1631 	cache->fAllocatedSwapSize -= B_PAGE_SIZE;
1632 	cache->_SwapBlockFree(page->cache_offset, 1);
1633 
1634 	return true;
1635 }
1636 
1637 
1638 uint32
1639 swap_available_pages()
1640 {
1641 	mutex_lock(&sAvailSwapSpaceLock);
1642 	uint32 avail = sAvailSwapSpace >> PAGE_SHIFT;
1643 	mutex_unlock(&sAvailSwapSpaceLock);
1644 
1645 	return avail;
1646 }
1647 
1648 
1649 uint32
1650 swap_total_swap_pages()
1651 {
1652 	mutex_lock(&sSwapFileListLock);
1653 
1654 	uint32 totalSwapSlots = 0;
1655 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1656 		swap_file* swapFile = it.Next();) {
1657 		totalSwapSlots += swapFile->last_slot - swapFile->first_slot;
1658 	}
1659 
1660 	mutex_unlock(&sSwapFileListLock);
1661 
1662 	return totalSwapSlots;
1663 }
1664 
1665 
1666 #endif	// ENABLE_SWAP_SUPPORT
1667 
1668 
1669 void
1670 swap_get_info(struct system_memory_info* info)
1671 {
1672 #if ENABLE_SWAP_SUPPORT
1673 	info->max_swap_space = (uint64)swap_total_swap_pages() * B_PAGE_SIZE;
1674 	info->free_swap_space = (uint64)swap_available_pages() * B_PAGE_SIZE;
1675 #else
1676 	info->max_swap_space = 0;
1677 	info->free_swap_space = 0;
1678 #endif
1679 }
1680 
1681