xref: /haiku/src/system/kernel/vm/VMAnonymousCache.cpp (revision 909af08f4328301fbdef1ffb41f566c3b5bec0c7)
1 /*
2  * Copyright 2008, Zhao Shuai, upczhsh@163.com.
3  * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
4  * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
5  * Distributed under the terms of the MIT License.
6  *
7  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
8  * Distributed under the terms of the NewOS License.
9  *
10  * Copyright 2011-2012 Haiku, Inc. All rights reserved.
11  * Distributed under the terms of the MIT License.
12  *
13  * Authors:
14  *		Hamish Morrison, hamish@lavabit.com
15  *		Alexander von Gluck IV, kallisti5@unixzen.com
16  */
17 
18 
19 #include "VMAnonymousCache.h"
20 
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <unistd.h>
26 
27 #include <FindDirectory.h>
28 #include <KernelExport.h>
29 #include <NodeMonitor.h>
30 
31 #include <arch_config.h>
32 #include <boot_device.h>
33 #include <disk_device_manager/KDiskDevice.h>
34 #include <disk_device_manager/KDiskDeviceManager.h>
35 #include <disk_device_manager/KDiskSystem.h>
36 #include <disk_device_manager/KPartitionVisitor.h>
37 #include <driver_settings.h>
38 #include <fs/fd.h>
39 #include <fs/KPath.h>
40 #include <fs_info.h>
41 #include <fs_interface.h>
42 #include <heap.h>
43 #include <kernel_daemon.h>
44 #include <slab/Slab.h>
45 #include <syscalls.h>
46 #include <system_info.h>
47 #include <thread.h>
48 #include <tracing.h>
49 #include <util/AutoLock.h>
50 #include <util/Bitmap.h>
51 #include <util/DoublyLinkedList.h>
52 #include <util/OpenHashTable.h>
53 #include <util/RadixBitmap.h>
54 #include <vfs.h>
55 #include <vm/vm.h>
56 #include <vm/vm_page.h>
57 #include <vm/vm_priv.h>
58 #include <vm/VMAddressSpace.h>
59 
60 #include "IORequest.h"
61 #include "VMUtils.h"
62 
63 
64 #if	ENABLE_SWAP_SUPPORT
65 
66 //#define TRACE_VM_ANONYMOUS_CACHE
67 #ifdef TRACE_VM_ANONYMOUS_CACHE
68 #	define TRACE(x...) dprintf(x)
69 #else
70 #	define TRACE(x...) do { } while (false)
71 #endif
72 
73 
74 // minimum number of free swap blocks the object cache shall keep reserved
75 #define MIN_SWAP_BLOCK_RESERVE	4096
76 
77 // interval at which the hash resizer is triggered (in units of 0.1s)
78 #define SWAP_HASH_RESIZE_INTERVAL	5
79 
80 #define INITIAL_SWAP_HASH_SIZE		1024
81 
82 #define SWAP_SLOT_NONE	RADIX_SLOT_NONE
83 
84 #define SWAP_BLOCK_PAGES 32
85 #define SWAP_BLOCK_SHIFT 5		/* 1 << SWAP_BLOCK_SHIFT == SWAP_BLOCK_PAGES */
86 #define SWAP_BLOCK_MASK  (SWAP_BLOCK_PAGES - 1)
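// A page's swap block within its cache is pageIndex >> SWAP_BLOCK_SHIFT;
// its slot within that block is pageIndex & SWAP_BLOCK_MASK.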
87 
88 
89 static const char* const kDefaultSwapPath = "/var/swap";
90 
91 struct swap_file : DoublyLinkedListLinkImpl<swap_file> {
92 	int				fd;
93 	struct vnode*	vnode;
94 	void*			cookie;
95 	swap_addr_t		first_slot;
96 	swap_addr_t		last_slot;
97 	radix_bitmap*	bmp;
98 };
99 
100 struct swap_hash_key {
101 	VMAnonymousCache	*cache;
102 	off_t				page_index;  // page index in the cache
103 };
104 
105 // Each swap block contains swap address information for
106 // SWAP_BLOCK_PAGES consecutive pages from the same cache
107 struct swap_block {
108 	swap_block*		hash_link;
109 	swap_hash_key	key;
110 	uint32			used;
111 	swap_addr_t		swap_slots[SWAP_BLOCK_PAGES];
112 };
113 
114 struct SwapHashTableDefinition {
115 	typedef swap_hash_key KeyType;
116 	typedef swap_block ValueType;
117 
118 	SwapHashTableDefinition() {}
119 
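	// Keys hash by swap block: the block index is XORed with the cache
	// pointer, so all pages belonging to the same swap block of a cache
	// hash to the same value.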
120 	size_t HashKey(const swap_hash_key& key) const
121 	{
122 		off_t blockIndex = key.page_index >> SWAP_BLOCK_SHIFT;
123 		VMAnonymousCache* cache = key.cache;
124 		return blockIndex ^ (size_t)(int*)cache;
125 	}
126 
127 	size_t Hash(const swap_block* value) const
128 	{
129 		return HashKey(value->key);
130 	}
131 
132 	bool Compare(const swap_hash_key& key, const swap_block* value) const
133 	{
134 		return (key.page_index & ~(off_t)SWAP_BLOCK_MASK)
135 				== (value->key.page_index & ~(off_t)SWAP_BLOCK_MASK)
136 			&& key.cache == value->key.cache;
137 	}
138 
139 	swap_block*& GetLink(swap_block* value) const
140 	{
141 		return value->hash_link;
142 	}
143 };
144 
145 typedef BOpenHashTable<SwapHashTableDefinition> SwapHashTable;
146 typedef DoublyLinkedList<swap_file> SwapFileList;
147 
148 static SwapHashTable sSwapHashTable;
149 static rw_lock sSwapHashLock;
150 
151 static SwapFileList sSwapFileList;
152 static mutex sSwapFileListLock;
153 static swap_file* sSwapFileAlloc = NULL; // allocate from here
154 static uint32 sSwapFileCount = 0;
155 
156 static off_t sAvailSwapSpace = 0;
157 static mutex sAvailSwapSpaceLock;
158 
159 static object_cache* sSwapBlockCache;
160 
161 
162 #if SWAP_TRACING
163 namespace SwapTracing {
164 
165 class SwapTraceEntry : public AbstractTraceEntry {
166 public:
167 	SwapTraceEntry(VMAnonymousCache* cache)
168 		:
169 		fCache(cache)
170 	{
171 	}
172 
173 protected:
174 	VMAnonymousCache*	fCache;
175 };
176 
177 
178 class ReadPage : public SwapTraceEntry {
179 public:
180 	ReadPage(VMAnonymousCache* cache, page_num_t pageIndex,
181 		swap_addr_t swapSlotIndex)
182 		:
183 		SwapTraceEntry(cache),
184 		fPageIndex(pageIndex),
185 		fSwapSlotIndex(swapSlotIndex)
186 	{
187 		Initialized();
188 	}
189 
190 	virtual void AddDump(TraceOutput& out)
191 	{
192 		out.Print("swap read:  cache %p, page index: %lu <- swap slot: %lu",
193 			fCache, fPageIndex, fSwapSlotIndex);
194 	}
195 
196 private:
197 	page_num_t		fPageIndex;
198 	swap_addr_t		fSwapSlotIndex;
199 };
200 
201 
202 class WritePage : public SwapTraceEntry {
203 public:
204 	WritePage(VMAnonymousCache* cache, page_num_t pageIndex,
205 		swap_addr_t swapSlotIndex)
206 		:
207 		SwapTraceEntry(cache),
208 		fPageIndex(pageIndex),
209 		fSwapSlotIndex(swapSlotIndex)
210 	{
211 		Initialized();
212 	}
213 
214 	virtual void AddDump(TraceOutput& out)
215 	{
216 		out.Print("swap write: cache %p, page index: %lu -> swap slot: %lu",
217 			fCache, fPageIndex, fSwapSlotIndex);
218 	}
219 
220 private:
221 	page_num_t		fPageIndex;
222 	swap_addr_t		fSwapSlotIndex;
223 };
224 
225 }	// namespace SwapTracing
226 
227 #	define T(x) new(std::nothrow) SwapTracing::x;
228 #else
229 #	define T(x) ;
230 #endif
231 
232 
233 static int
234 dump_swap_info(int argc, char** argv)
235 {
236 	swap_addr_t totalSwapPages = 0;
237 	swap_addr_t freeSwapPages = 0;
238 
239 	kprintf("swap files:\n");
240 
241 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
242 		swap_file* file = it.Next();) {
243 		swap_addr_t total = file->last_slot - file->first_slot;
244 		kprintf("  vnode: %p, pages: total: %" B_PRIu32 ", free: %" B_PRIu32
245 			"\n", file->vnode, total, file->bmp->free_slots);
246 
247 		totalSwapPages += total;
248 		freeSwapPages += file->bmp->free_slots;
249 	}
250 
251 	kprintf("\n");
252 	kprintf("swap space in pages:\n");
253 	kprintf("total:     %9" B_PRIu32 "\n", totalSwapPages);
254 	kprintf("available: %9" B_PRIdOFF "\n", sAvailSwapSpace / B_PAGE_SIZE);
255 	kprintf("reserved:  %9" B_PRIdOFF "\n",
256 		totalSwapPages - sAvailSwapSpace / B_PAGE_SIZE);
257 	kprintf("used:      %9" B_PRIu32 "\n", totalSwapPages - freeSwapPages);
258 	kprintf("free:      %9" B_PRIu32 "\n", freeSwapPages);
259 
260 	return 0;
261 }
262 
263 
264 static swap_addr_t
265 swap_slot_alloc(uint32 count)
266 {
267 	mutex_lock(&sSwapFileListLock);
268 
269 	if (sSwapFileList.IsEmpty()) {
270 		mutex_unlock(&sSwapFileListLock);
271 		panic("swap_slot_alloc(): no swap file in the system\n");
272 		return SWAP_SLOT_NONE;
273 	}
274 
275 	// Since the radix bitmap cannot handle more than 32 pages at once, we
276 	// return SWAP_SLOT_NONE; this forces Write() to adjust the allocation amount.
277 	if (count > BITMAP_RADIX) {
278 		mutex_unlock(&sSwapFileListLock);
279 		return SWAP_SLOT_NONE;
280 	}
281 
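	// Try each swap file at most once, in round-robin order, starting with
	// the file we last allocated from.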
282 	swap_addr_t j, addr = SWAP_SLOT_NONE;
283 	for (j = 0; j < sSwapFileCount; j++) {
284 		if (sSwapFileAlloc == NULL)
285 			sSwapFileAlloc = sSwapFileList.First();
286 
287 		addr = radix_bitmap_alloc(sSwapFileAlloc->bmp, count);
288 		if (addr != SWAP_SLOT_NONE) {
289 			addr += sSwapFileAlloc->first_slot;
290 			break;
291 		}
292 
293 		// this swap_file is full, find another
294 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
295 	}
296 
297 	if (j == sSwapFileCount) {
298 		mutex_unlock(&sSwapFileListLock);
299 		panic("swap_slot_alloc: swap space exhausted!\n");
300 		return SWAP_SLOT_NONE;
301 	}
302 
303 	// If this swap file has used more than 90% of its space,
304 	// switch to another.
305 	if (sSwapFileAlloc->bmp->free_slots
306 		< (sSwapFileAlloc->last_slot - sSwapFileAlloc->first_slot) / 10) {
307 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
308 	}
309 
310 	mutex_unlock(&sSwapFileListLock);
311 
312 	return addr;
313 }
314 
315 
316 static swap_file*
317 find_swap_file(swap_addr_t slotIndex)
318 {
319 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
320 		swap_file* swapFile = it.Next();) {
321 		if (slotIndex >= swapFile->first_slot
322 			&& slotIndex < swapFile->last_slot) {
323 			return swapFile;
324 		}
325 	}
326 
327 	panic("find_swap_file(): can't find swap file for slot %" B_PRIu32 "\n",
328 		slotIndex);
329 	return NULL;
330 }
331 
332 
333 static void
334 swap_slot_dealloc(swap_addr_t slotIndex, uint32 count)
335 {
336 	if (slotIndex == SWAP_SLOT_NONE)
337 		return;
338 
339 	mutex_lock(&sSwapFileListLock);
340 	swap_file* swapFile = find_swap_file(slotIndex);
341 	slotIndex -= swapFile->first_slot;
342 	radix_bitmap_dealloc(swapFile->bmp, slotIndex, count);
343 	mutex_unlock(&sSwapFileListLock);
344 }
345 
346 
347 static off_t
348 swap_space_reserve(off_t amount)
349 {
350 	mutex_lock(&sAvailSwapSpaceLock);
351 	if (sAvailSwapSpace >= amount)
352 		sAvailSwapSpace -= amount;
353 	else {
354 		amount = sAvailSwapSpace;
355 		sAvailSwapSpace = 0;
356 	}
357 	mutex_unlock(&sAvailSwapSpaceLock);
358 
359 	return amount;
360 }
361 
362 
363 static void
364 swap_space_unreserve(off_t amount)
365 {
366 	mutex_lock(&sAvailSwapSpaceLock);
367 	sAvailSwapSpace += amount;
368 	mutex_unlock(&sAvailSwapSpaceLock);
369 }
370 
371 
372 static void
373 swap_hash_resizer(void*, int)
374 {
375 	WriteLocker locker(sSwapHashLock);
376 
377 	size_t size;
378 	void* allocation;
379 
380 	do {
381 		size = sSwapHashTable.ResizeNeeded();
382 		if (size == 0)
383 			return;
384 
385 		locker.Unlock();
386 
387 		allocation = malloc(size);
388 		if (allocation == NULL)
389 			return;
390 
391 		locker.Lock();
392 
393 	} while (!sSwapHashTable.Resize(allocation, size));
394 }
395 
396 
397 // #pragma mark -
398 
399 
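// Wraps the caller's AsyncIOCallback for asynchronous swap writes. If the
// write went to a newly allocated slot, IOFinished() either records the slot
// in the cache's swap block on success, or rolls back the swap accounting and
// frees the slot on failure.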
400 class VMAnonymousCache::WriteCallback : public StackableAsyncIOCallback {
401 public:
402 	WriteCallback(VMAnonymousCache* cache, AsyncIOCallback* callback)
403 		:
404 		StackableAsyncIOCallback(callback),
405 		fCache(cache)
406 	{
407 	}
408 
409 	void SetTo(page_num_t pageIndex, swap_addr_t slotIndex, bool newSlot)
410 	{
411 		fPageIndex = pageIndex;
412 		fSlotIndex = slotIndex;
413 		fNewSlot = newSlot;
414 	}
415 
416 	virtual void IOFinished(status_t status, bool partialTransfer,
417 		generic_size_t bytesTransferred)
418 	{
419 		if (fNewSlot) {
420 			if (status == B_OK) {
421 				fCache->_SwapBlockBuild(fPageIndex, fSlotIndex, 1);
422 			} else {
423 				AutoLocker<VMCache> locker(fCache);
424 				fCache->fAllocatedSwapSize -= B_PAGE_SIZE;
425 				locker.Unlock();
426 
427 				swap_slot_dealloc(fSlotIndex, 1);
428 			}
429 		}
430 
431 		fNextCallback->IOFinished(status, partialTransfer, bytesTransferred);
432 		delete this;
433 	}
434 
435 private:
436 	VMAnonymousCache*	fCache;
437 	page_num_t			fPageIndex;
438 	swap_addr_t			fSlotIndex;
439 	bool				fNewSlot;
440 };
441 
442 
443 // #pragma mark -
444 
445 
446 VMAnonymousCache::~VMAnonymousCache()
447 {
448 	delete fNoSwapPages;
449 	fNoSwapPages = NULL;
450 
451 	_FreeSwapPageRange(virtual_base, virtual_end, false);
452 	swap_space_unreserve(fCommittedSwapSize);
453 	if (committed_size > fCommittedSwapSize)
454 		vm_unreserve_memory(committed_size - fCommittedSwapSize);
455 }
456 
457 
458 status_t
459 VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
460 	int32 numGuardPages, uint32 allocationFlags)
461 {
462 	TRACE("%p->VMAnonymousCache::Init(canOvercommit = %s, "
463 		"numPrecommittedPages = %" B_PRId32 ", numGuardPages = %" B_PRId32
464 		")\n", this, canOvercommit ? "yes" : "no", numPrecommittedPages,
465 		numGuardPages);
466 
467 	status_t error = VMCache::Init(CACHE_TYPE_RAM, allocationFlags);
468 	if (error != B_OK)
469 		return error;
470 
471 	fCanOvercommit = canOvercommit;
472 	fHasPrecommitted = false;
473 	fPrecommittedPages = min_c(numPrecommittedPages, 255);
474 	fNoSwapPages = NULL;
475 	fGuardedSize = numGuardPages * B_PAGE_SIZE;
476 	fCommittedSwapSize = 0;
477 	fAllocatedSwapSize = 0;
478 
479 	return B_OK;
480 }
481 
482 
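/*!	Marks the pages in the given range as swappable or non-swappable by
	clearing or setting the corresponding bits in the fNoSwapPages bitmap.
	The bitmap is created lazily and discarded again once no page is marked
	non-swappable anymore.
*/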
483 status_t
484 VMAnonymousCache::SetCanSwapPages(off_t base, size_t size, bool canSwap)
485 {
486 	const page_num_t first = base >> PAGE_SHIFT;
487 	const size_t count = PAGE_ALIGN(size + ((first << PAGE_SHIFT) - base)) >> PAGE_SHIFT;
488 
489 	if (count == 0)
490 		return B_OK;
491 	if (canSwap && fNoSwapPages == NULL)
492 		return B_OK;
493 
494 	if (fNoSwapPages == NULL)
495 		fNoSwapPages = new(std::nothrow) Bitmap(0);
496 	if (fNoSwapPages == NULL)
497 		return B_NO_MEMORY;
498 
499 	const page_num_t pageCount = PAGE_ALIGN(virtual_end) >> PAGE_SHIFT;
500 
501 	if (fNoSwapPages->Resize(pageCount) != B_OK)
502 		return B_NO_MEMORY;
503 
504 	for (size_t i = 0; i < count; i++) {
505 		if (canSwap)
506 			fNoSwapPages->Clear(first + i);
507 		else
508 			fNoSwapPages->Set(first + i);
509 	}
510 
511 	if (fNoSwapPages->GetHighestSet() < 0) {
512 		delete fNoSwapPages;
513 		fNoSwapPages = NULL;
514 	}
515 	return B_OK;
516 }
517 
518 
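/*!	Frees the swap slots assigned to the pages in the range [fromOffset,
	toOffset) and adjusts fAllocatedSwapSize accordingly. If skipBusyPages is
	true, slots belonging to busy pages are left untouched (and thus leaked),
	since I/O may still be in progress on them.
*/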
519 void
520 VMAnonymousCache::_FreeSwapPageRange(off_t fromOffset, off_t toOffset,
521 	bool skipBusyPages)
522 {
523 	swap_block* swapBlock = NULL;
524 	off_t toIndex = toOffset >> PAGE_SHIFT;
525 	for (off_t pageIndex = fromOffset >> PAGE_SHIFT;
526 		pageIndex < toIndex && fAllocatedSwapSize > 0; pageIndex++) {
527 
528 		WriteLocker locker(sSwapHashLock);
529 
530 		// Get the swap slot index for the page.
531 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
532 		if (swapBlock == NULL || blockIndex == 0) {
533 			swap_hash_key key = { this, pageIndex };
534 			swapBlock = sSwapHashTable.Lookup(key);
535 
536 			if (swapBlock == NULL) {
537 				pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1;
538 				continue;
539 			}
540 		}
541 
542 		swap_addr_t slotIndex = swapBlock->swap_slots[blockIndex];
543 		if (slotIndex == SWAP_SLOT_NONE)
544 			continue;
545 
546 		if (skipBusyPages) {
547 			vm_page* page = LookupPage(pageIndex * B_PAGE_SIZE);
548 			if (page != NULL && page->busy) {
549 				// TODO: We skip (i.e. leak) swap space of busy pages, since
550 				// there could be I/O going on (paging in/out). Waiting is
551 				// not an option as 1. unlocking the cache means that new
552 				// swap pages could be added in a range we've already
553 				// cleared (since the cache still has the old size) and 2.
554 				// we'd risk a deadlock in case we come from the file cache
555 				// and the FS holds the node's write-lock. We should mark
556 				// the page invalid and let the one responsible clean up.
557 				// There's just no such mechanism yet.
558 				continue;
559 			}
560 		}
561 
562 		swap_slot_dealloc(slotIndex, 1);
563 		fAllocatedSwapSize -= B_PAGE_SIZE;
564 
565 		swapBlock->swap_slots[blockIndex] = SWAP_SLOT_NONE;
566 		if (--swapBlock->used == 0) {
567 			// All swap pages have been freed -- we can discard the swap block.
568 			sSwapHashTable.RemoveUnchecked(swapBlock);
569 			object_cache_free(sSwapBlockCache, swapBlock,
570 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
571 
572 			// There are no swap pages for possibly remaining pages, skip to the
573 			// next block.
574 			pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1;
575 			swapBlock = NULL;
576 		}
577 	}
578 }
579 
580 
581 status_t
582 VMAnonymousCache::Resize(off_t newSize, int priority)
583 {
584 	if (fNoSwapPages != NULL) {
585 		if (fNoSwapPages->Resize(PAGE_ALIGN(newSize) >> PAGE_SHIFT) != B_OK)
586 			return B_NO_MEMORY;
587 	}
588 
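	// Free the swap slots of all pages that lie entirely beyond the new size.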
589 	_FreeSwapPageRange(newSize + B_PAGE_SIZE - 1,
590 		virtual_end + B_PAGE_SIZE - 1);
591 	return VMCache::Resize(newSize, priority);
592 }
593 
594 
595 status_t
596 VMAnonymousCache::Rebase(off_t newBase, int priority)
597 {
598 	if (fNoSwapPages != NULL) {
599 		const ssize_t sizeDifference = (newBase >> PAGE_SHIFT) - (virtual_base >> PAGE_SHIFT);
600 		fNoSwapPages->Shift(sizeDifference);
601 	}
602 
603 	_FreeSwapPageRange(virtual_base, newBase);
604 	return VMCache::Rebase(newBase, priority);
605 }
606 
607 
608 status_t
609 VMAnonymousCache::Discard(off_t offset, off_t size)
610 {
611 	_FreeSwapPageRange(offset, offset + size);
612 	return VMCache::Discard(offset, size);
613 }
614 
615 
616 /*!	Moves the swap pages for the given range from the source cache into this
617 	cache. Both caches must be locked.
618 */
619 status_t
620 VMAnonymousCache::Adopt(VMCache* _source, off_t offset, off_t size,
621 	off_t newOffset)
622 {
623 	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
624 	if (source == NULL) {
625 		panic("VMAnonymousCache::Adopt(): adopt from incompatible cache %p "
626 			"requested", _source);
627 		return B_ERROR;
628 	}
629 
630 	off_t pageIndex = newOffset >> PAGE_SHIFT;
631 	off_t sourcePageIndex = offset >> PAGE_SHIFT;
632 	off_t sourceEndPageIndex = (offset + size + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
633 	swap_block* swapBlock = NULL;
634 
635 	WriteLocker locker(sSwapHashLock);
636 
637 	while (sourcePageIndex < sourceEndPageIndex
638 			&& source->fAllocatedSwapSize > 0) {
639 		swap_addr_t left
640 			= SWAP_BLOCK_PAGES - (sourcePageIndex & SWAP_BLOCK_MASK);
641 
642 		swap_hash_key sourceKey = { source, sourcePageIndex };
643 		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(sourceKey);
644 		if (sourceSwapBlock == NULL || sourceSwapBlock->used == 0) {
645 			sourcePageIndex += left;
646 			pageIndex += left;
647 			swapBlock = NULL;
648 			continue;
649 		}
650 
651 		for (; left > 0 && sourceSwapBlock->used > 0;
652 				left--, sourcePageIndex++, pageIndex++) {
653 
654 			swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
655 			if (swapBlock == NULL || blockIndex == 0) {
656 				swap_hash_key key = { this, pageIndex };
657 				swapBlock = sSwapHashTable.Lookup(key);
658 
659 				if (swapBlock == NULL) {
660 					swapBlock = (swap_block*)object_cache_alloc(sSwapBlockCache,
661 						CACHE_DONT_WAIT_FOR_MEMORY
662 							| CACHE_DONT_LOCK_KERNEL_SPACE);
663 					if (swapBlock == NULL)
664 						return B_NO_MEMORY;
665 
666 					swapBlock->key.cache = this;
667 					swapBlock->key.page_index
668 						= pageIndex & ~(off_t)SWAP_BLOCK_MASK;
669 					swapBlock->used = 0;
670 					for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
671 						swapBlock->swap_slots[i] = SWAP_SLOT_NONE;
672 
673 					sSwapHashTable.InsertUnchecked(swapBlock);
674 				}
675 			}
676 
677 			swap_addr_t sourceBlockIndex = sourcePageIndex & SWAP_BLOCK_MASK;
678 			swap_addr_t slotIndex
679 				= sourceSwapBlock->swap_slots[sourceBlockIndex];
680 			if (slotIndex == SWAP_SLOT_NONE)
681 				continue;
682 
683 			ASSERT(swapBlock->swap_slots[blockIndex] == SWAP_SLOT_NONE);
684 
685 			swapBlock->swap_slots[blockIndex] = slotIndex;
686 			swapBlock->used++;
687 			fAllocatedSwapSize += B_PAGE_SIZE;
688 
689 			sourceSwapBlock->swap_slots[sourceBlockIndex] = SWAP_SLOT_NONE;
690 			sourceSwapBlock->used--;
691 			source->fAllocatedSwapSize -= B_PAGE_SIZE;
692 
693 			TRACE("adopted slot %#" B_PRIx32 " from %p at page %" B_PRIdOFF
694 				" to %p at page %" B_PRIdOFF "\n", slotIndex, source,
695 				sourcePageIndex, this, pageIndex);
696 		}
697 
698 		if (left > 0) {
699 			sourcePageIndex += left;
700 			pageIndex += left;
701 			swapBlock = NULL;
702 		}
703 
704 		if (sourceSwapBlock->used == 0) {
705 			// All swap pages have been adopted, we can discard the swap block.
706 			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
707 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
708 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
709 		}
710 	}
711 
712 	locker.Unlock();
713 
714 	return VMCache::Adopt(source, offset, size, newOffset);
715 }
716 
717 
718 status_t
719 VMAnonymousCache::Commit(off_t size, int priority)
720 {
721 	TRACE("%p->VMAnonymousCache::Commit(%" B_PRIdOFF ")\n", this, size);
722 
723 	// If we can overcommit, we don't commit here, but in Fault(). We always
724 	// unreserve memory, if we're asked to shrink our commitment, though.
725 	if (fCanOvercommit && size > committed_size) {
726 		if (fHasPrecommitted)
727 			return B_OK;
728 
729 		// pre-commit some pages to make a later failure less probable
730 		fHasPrecommitted = true;
731 		uint32 precommitted = fPrecommittedPages * B_PAGE_SIZE;
732 		if (size > precommitted)
733 			size = precommitted;
734 	}
735 
736 	return _Commit(size, priority);
737 }
738 
739 
740 bool
741 VMAnonymousCache::HasPage(off_t offset)
742 {
743 	if (_SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE)
744 		return true;
745 
746 	return false;
747 }
748 
749 
750 bool
751 VMAnonymousCache::DebugHasPage(off_t offset)
752 {
753 	off_t pageIndex = offset >> PAGE_SHIFT;
754 	swap_hash_key key = { this, pageIndex };
755 	swap_block* swap = sSwapHashTable.Lookup(key);
756 	if (swap == NULL)
757 		return false;
758 
759 	return swap->swap_slots[pageIndex & SWAP_BLOCK_MASK] != SWAP_SLOT_NONE;
760 }
761 
762 
763 status_t
764 VMAnonymousCache::Read(off_t offset, const generic_io_vec* vecs, size_t count,
765 	uint32 flags, generic_size_t* _numBytes)
766 {
767 	off_t pageIndex = offset >> PAGE_SHIFT;
768 
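	// Group vecs whose swap slots are consecutive on disk, so that each group
	// can be read with a single vfs_read_pages() call.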
769 	for (uint32 i = 0, j = 0; i < count; i = j) {
770 		swap_addr_t startSlotIndex = _SwapBlockGetAddress(pageIndex + i);
771 		for (j = i + 1; j < count; j++) {
772 			swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + j);
773 			if (slotIndex != startSlotIndex + j - i)
774 				break;
775 		}
776 
777 		T(ReadPage(this, pageIndex, startSlotIndex));
778 			// TODO: Assumes that only one page is read.
779 
780 		swap_file* swapFile = find_swap_file(startSlotIndex);
781 
782 		off_t pos = (off_t)(startSlotIndex - swapFile->first_slot)
783 			* B_PAGE_SIZE;
784 
785 		status_t status = vfs_read_pages(swapFile->vnode, swapFile->cookie, pos,
786 			vecs + i, j - i, flags, _numBytes);
787 		if (status != B_OK)
788 			return status;
789 	}
790 
791 	return B_OK;
792 }
793 
794 
795 status_t
796 VMAnonymousCache::Write(off_t offset, const generic_io_vec* vecs, size_t count,
797 	uint32 flags, generic_size_t* _numBytes)
798 {
799 	off_t pageIndex = offset >> PAGE_SHIFT;
800 
801 	AutoLocker<VMCache> locker(this);
802 
803 	page_num_t totalPages = 0;
804 	for (uint32 i = 0; i < count; i++) {
805 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
806 		swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + totalPages);
807 		if (slotIndex != SWAP_SLOT_NONE) {
808 			swap_slot_dealloc(slotIndex, pageCount);
809 			_SwapBlockFree(pageIndex + totalPages, pageCount);
810 			fAllocatedSwapSize -= pageCount * B_PAGE_SIZE;
811 		}
812 
813 		totalPages += pageCount;
814 	}
815 
816 	off_t totalSize = totalPages * B_PAGE_SIZE;
817 	if (fAllocatedSwapSize + totalSize > fCommittedSwapSize)
818 		return B_ERROR;
819 
820 	fAllocatedSwapSize += totalSize;
821 	locker.Unlock();
822 
823 	page_num_t pagesLeft = totalPages;
824 	totalPages = 0;
825 
826 	for (uint32 i = 0; i < count; i++) {
827 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
828 
829 		generic_addr_t vectorBase = vecs[i].base;
830 		generic_size_t vectorLength = vecs[i].length;
831 		page_num_t n = pageCount;
832 
833 		for (page_num_t j = 0; j < pageCount; j += n) {
834 			swap_addr_t slotIndex;
835 			// try to allocate n slots; if that fails, halve n and retry
836 			while ((slotIndex = swap_slot_alloc(n)) == SWAP_SLOT_NONE && n >= 2)
837 				n >>= 1;
838 
839 			if (slotIndex == SWAP_SLOT_NONE)
840 				panic("VMAnonymousCache::Write(): can't allocate swap space\n");
841 
842 			T(WritePage(this, pageIndex, slotIndex));
843 				// TODO: Assumes that only one page is written.
844 
845 			swap_file* swapFile = find_swap_file(slotIndex);
846 
847 			off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
848 
849 			generic_size_t length = (phys_addr_t)n * B_PAGE_SIZE;
850 			generic_io_vec vector[1];
851 			vector->base = vectorBase;
852 			vector->length = length;
853 
854 			status_t status = vfs_write_pages(swapFile->vnode, swapFile->cookie,
855 				pos, vector, 1, flags, &length);
856 			if (status != B_OK) {
857 				locker.Lock();
858 				fAllocatedSwapSize -= (off_t)pagesLeft * B_PAGE_SIZE;
859 				locker.Unlock();
860 
861 				swap_slot_dealloc(slotIndex, n);
862 				return status;
863 			}
864 
865 			_SwapBlockBuild(pageIndex + totalPages, slotIndex, n);
866 			pagesLeft -= n;
867 
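			// Only n of this vec's pages were written; advance base and
			// length to the part that still needs to be written.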
868 			if (n != pageCount) {
869 				vectorBase = vectorBase + n * B_PAGE_SIZE;
870 				vectorLength -= n * B_PAGE_SIZE;
871 			}
872 		}
873 
874 		totalPages += pageCount;
875 	}
876 
877 	ASSERT(pagesLeft == 0);
878 	return B_OK;
879 }
880 
881 
882 status_t
883 VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
884 	size_t count, generic_size_t numBytes, uint32 flags,
885 	AsyncIOCallback* _callback)
886 {
887 	// TODO: Currently this method is only used for single pages. Either make
888 	// more flexible use of it or change the interface!
889 	// This implementation relies on the current usage!
890 	ASSERT(count == 1);
891 	ASSERT(numBytes <= B_PAGE_SIZE);
892 
893 	page_num_t pageIndex = offset >> PAGE_SHIFT;
894 	swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex);
895 	bool newSlot = slotIndex == SWAP_SLOT_NONE;
896 
897 	// If the page doesn't have any swap space yet, allocate it.
898 	if (newSlot) {
899 		AutoLocker<VMCache> locker(this);
900 		if (fAllocatedSwapSize + B_PAGE_SIZE > fCommittedSwapSize) {
901 			_callback->IOFinished(B_ERROR, true, 0);
902 			return B_ERROR;
903 		}
904 
905 		fAllocatedSwapSize += B_PAGE_SIZE;
906 
907 		slotIndex = swap_slot_alloc(1);
908 	}
909 
910 	// create our callback
911 	WriteCallback* callback = (flags & B_VIP_IO_REQUEST) != 0
912 		? new(malloc_flags(HEAP_PRIORITY_VIP)) WriteCallback(this, _callback)
913 		: new(std::nothrow) WriteCallback(this, _callback);
914 	if (callback == NULL) {
915 		if (newSlot) {
916 			AutoLocker<VMCache> locker(this);
917 			fAllocatedSwapSize -= B_PAGE_SIZE;
918 			locker.Unlock();
919 
920 			swap_slot_dealloc(slotIndex, 1);
921 		}
922 		_callback->IOFinished(B_NO_MEMORY, true, 0);
923 		return B_NO_MEMORY;
924 	}
925 	// TODO: If the page already had swap space assigned, we wouldn't need our
926 	// own callback.
927 
928 	callback->SetTo(pageIndex, slotIndex, newSlot);
929 
930 	T(WritePage(this, pageIndex, slotIndex));
931 
932 	// write the page asynchronously
933 	swap_file* swapFile = find_swap_file(slotIndex);
934 	off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
935 
936 	return vfs_asynchronous_write_pages(swapFile->vnode, swapFile->cookie, pos,
937 		vecs, 1, numBytes, flags, callback);
938 }
939 
940 
941 bool
942 VMAnonymousCache::CanWritePage(off_t offset)
943 {
944 	const off_t pageIndex = offset >> PAGE_SHIFT;
945 	if (fNoSwapPages != NULL && fNoSwapPages->Get(pageIndex))
946 		return false;
947 	// We can write the page if we have not used all of our committed swap
948 	// space, or if the page already has a swap slot assigned.
949 	// space or the page already has a swap slot assigned.
950 	return fAllocatedSwapSize < fCommittedSwapSize
951 		|| _SwapBlockGetAddress(pageIndex) != SWAP_SLOT_NONE;
952 }
953 
954 
955 int32
956 VMAnonymousCache::MaxPagesPerAsyncWrite() const
957 {
958 	return 1;
959 }
960 
961 
962 status_t
963 VMAnonymousCache::Fault(struct VMAddressSpace* aspace, off_t offset)
964 {
965 	if (fGuardedSize > 0) {
966 		uint32 guardOffset;
967 
968 #ifdef STACK_GROWS_DOWNWARDS
969 		guardOffset = 0;
970 #elif defined(STACK_GROWS_UPWARDS)
971 		guardOffset = virtual_size - fGuardedSize;
972 #else
973 #	error Stack direction has not been defined in arch_config.h
974 #endif
975 		// report stack fault, guard page hit!
976 		if (offset >= guardOffset && offset < guardOffset + fGuardedSize) {
977 			TRACE(("stack overflow!\n"));
978 			return B_BAD_ADDRESS;
979 		}
980 	}
981 
982 	if (fCanOvercommit && LookupPage(offset) == NULL && !HasPage(offset)) {
983 		if (fPrecommittedPages == 0) {
984 			// never commit more than needed
985 			if (committed_size / B_PAGE_SIZE > page_count)
986 				return B_BAD_HANDLER;
987 
988 			// try to commit additional swap space/memory
989 			if (swap_space_reserve(B_PAGE_SIZE) == B_PAGE_SIZE) {
990 				fCommittedSwapSize += B_PAGE_SIZE;
991 			} else {
992 				int priority = aspace == VMAddressSpace::Kernel()
993 					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
994 				if (vm_try_reserve_memory(B_PAGE_SIZE, priority, 0) != B_OK) {
995 					dprintf("%p->VMAnonymousCache::Fault(): Failed to reserve "
996 						"%d bytes of RAM.\n", this, (int)B_PAGE_SIZE);
997 					return B_NO_MEMORY;
998 				}
999 			}
1000 
1001 			committed_size += B_PAGE_SIZE;
1002 		} else
1003 			fPrecommittedPages--;
1004 	}
1005 
1006 	// This will cause vm_soft_fault() to handle the fault
1007 	return B_BAD_HANDLER;
1008 }
1009 
1010 
1011 void
1012 VMAnonymousCache::Merge(VMCache* _source)
1013 {
1014 	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
1015 	if (source == NULL) {
1016 		panic("VMAnonymousCache::Merge(): merge with incompatible cache "
1017 			"%p requested", _source);
1018 		return;
1019 	}
1020 
1021 	// take over the source's committed size
1022 	fCommittedSwapSize += source->fCommittedSwapSize;
1023 	source->fCommittedSwapSize = 0;
1024 	committed_size += source->committed_size;
1025 	source->committed_size = 0;
1026 
1027 	off_t actualSize = virtual_end - virtual_base;
1028 	if (committed_size > actualSize)
1029 		_Commit(actualSize, VM_PRIORITY_USER);
1030 
1031 	// Move all non-shadowed swap pages from the source to the consumer cache.
1032 	// Also remove all source pages that are shadowed by consumer swap pages.
1033 	_MergeSwapPages(source);
1034 
1035 	// Move all non-shadowed pages from the source to the consumer cache.
1036 	if (source->page_count < page_count)
1037 		_MergePagesSmallerSource(source);
1038 	else
1039 		_MergePagesSmallerConsumer(source);
1040 }
1041 
1042 
1043 void
1044 VMAnonymousCache::DeleteObject()
1045 {
1046 	object_cache_delete(gAnonymousCacheObjectCache, this);
1047 }
1048 
1049 
1050 void
1051 VMAnonymousCache::_SwapBlockBuild(off_t startPageIndex,
1052 	swap_addr_t startSlotIndex, uint32 count)
1053 {
1054 	WriteLocker locker(sSwapHashLock);
1055 
1056 	uint32 left = count;
1057 	for (uint32 i = 0, j = 0; i < count; i += j) {
1058 		off_t pageIndex = startPageIndex + i;
1059 		swap_addr_t slotIndex = startSlotIndex + i;
1060 
1061 		swap_hash_key key = { this, pageIndex };
1062 
1063 		swap_block* swap = sSwapHashTable.Lookup(key);
1064 		while (swap == NULL) {
1065 			swap = (swap_block*)object_cache_alloc(sSwapBlockCache,
1066 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1067 			if (swap == NULL) {
1068 				// Wait a short time until memory is available again.
1069 				locker.Unlock();
1070 				snooze(10000);
1071 				locker.Lock();
1072 				swap = sSwapHashTable.Lookup(key);
1073 				continue;
1074 			}
1075 
1076 			swap->key.cache = this;
1077 			swap->key.page_index = pageIndex & ~(off_t)SWAP_BLOCK_MASK;
1078 			swap->used = 0;
1079 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
1080 				swap->swap_slots[i] = SWAP_SLOT_NONE;
1081 
1082 			sSwapHashTable.InsertUnchecked(swap);
1083 		}
1084 
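		// Fill consecutive slot indices into this swap block; anything that
		// doesn't fit spills over into the following block on the next
		// iteration of the outer loop.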
1085 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1086 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
1087 			swap->swap_slots[blockIndex++] = slotIndex + j;
1088 			left--;
1089 		}
1090 
1091 		swap->used += j;
1092 	}
1093 }
1094 
1095 
1096 void
1097 VMAnonymousCache::_SwapBlockFree(off_t startPageIndex, uint32 count)
1098 {
1099 	WriteLocker locker(sSwapHashLock);
1100 
1101 	uint32 left = count;
1102 	for (uint32 i = 0, j = 0; i < count; i += j) {
1103 		off_t pageIndex = startPageIndex + i;
1104 		swap_hash_key key = { this, pageIndex };
1105 		swap_block* swap = sSwapHashTable.Lookup(key);
1106 
1107 		ASSERT(swap != NULL);
1108 
1109 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1110 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
1111 			swap->swap_slots[blockIndex++] = SWAP_SLOT_NONE;
1112 			left--;
1113 		}
1114 
1115 		swap->used -= j;
1116 		if (swap->used == 0) {
1117 			sSwapHashTable.RemoveUnchecked(swap);
1118 			object_cache_free(sSwapBlockCache, swap,
1119 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1120 		}
1121 	}
1122 }
1123 
1124 
1125 swap_addr_t
1126 VMAnonymousCache::_SwapBlockGetAddress(off_t pageIndex)
1127 {
1128 	ReadLocker locker(sSwapHashLock);
1129 
1130 	swap_hash_key key = { this, pageIndex };
1131 	swap_block* swap = sSwapHashTable.Lookup(key);
1132 	swap_addr_t slotIndex = SWAP_SLOT_NONE;
1133 
1134 	if (swap != NULL) {
1135 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1136 		slotIndex = swap->swap_slots[blockIndex];
1137 	}
1138 
1139 	return slotIndex;
1140 }
1141 
1142 
1143 status_t
1144 VMAnonymousCache::_Commit(off_t size, int priority)
1145 {
1146 	TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), already committed: "
1147 		"%" B_PRIdOFF " (%" B_PRIdOFF " swap)\n", this, size, committed_size,
1148 		fCommittedSwapSize);
1149 
1150 	// Basic strategy: reserve swap space first, only when running out of swap
1151 	// space, reserve real memory.
1152 
1153 	off_t committedMemory = committed_size - fCommittedSwapSize;
1154 
1155 	// Regardless of whether we're asked to grow or shrink the commitment,
1156 	// we always try to reserve as much as possible of the final commitment
1157 	// in the swap space.
1158 	if (size > fCommittedSwapSize) {
1159 		fCommittedSwapSize += swap_space_reserve(size - fCommittedSwapSize);
1160 		committed_size = fCommittedSwapSize + committedMemory;
1161 		if (size > fCommittedSwapSize) {
1162 			TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), reserved "
1163 				"only %" B_PRIdOFF " swap\n", this, size, fCommittedSwapSize);
1164 		}
1165 	}
1166 
1167 	if (committed_size == size)
1168 		return B_OK;
1169 
1170 	if (committed_size > size) {
1171 		// The commitment shrinks -- unreserve real memory first.
1172 		off_t toUnreserve = committed_size - size;
1173 		if (committedMemory > 0) {
1174 			off_t unreserved = min_c(toUnreserve, committedMemory);
1175 			vm_unreserve_memory(unreserved);
1176 			committedMemory -= unreserved;
1177 			committed_size -= unreserved;
1178 			toUnreserve -= unreserved;
1179 		}
1180 
1181 		// Unreserve swap space.
1182 		if (toUnreserve > 0) {
1183 			swap_space_unreserve(toUnreserve);
1184 			fCommittedSwapSize -= toUnreserve;
1185 			committed_size -= toUnreserve;
1186 		}
1187 
1188 		return B_OK;
1189 	}
1190 
1191 	// The commitment grows -- we have already tried to reserve swap space at
1192 	// the start of the method, so we try to reserve real memory, now.
1193 
1194 	off_t toReserve = size - committed_size;
1195 	if (vm_try_reserve_memory(toReserve, priority, 1000000) != B_OK) {
1196 		dprintf("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "): Failed to "
1197 			"reserve %" B_PRIdOFF " bytes of RAM\n", this, size, toReserve);
1198 		return B_NO_MEMORY;
1199 	}
1200 
1201 	committed_size = size;
1202 	return B_OK;
1203 }
1204 
1205 
1206 void
1207 VMAnonymousCache::_MergePagesSmallerSource(VMAnonymousCache* source)
1208 {
1209 	// The source cache has fewer pages than the consumer (this cache), so we
1210 	// iterate through the source's pages and move the ones that are not
1211 	// shadowed up to the consumer.
1212 
1213 	for (VMCachePagesTree::Iterator it = source->pages.GetIterator();
1214 			vm_page* page = it.Next();) {
1215 		// Note: Removing the current node while iterating through an
1216 		// IteratableSplayTree is safe.
1217 		vm_page* consumerPage = LookupPage(
1218 			(off_t)page->cache_offset << PAGE_SHIFT);
1219 		if (consumerPage == NULL) {
1220 			// the page is not yet in the consumer cache - move it upwards
1221 			ASSERT_PRINT(!page->busy, "page: %p", page);
1222 			MovePage(page);
1223 		}
1224 	}
1225 }
1226 
1227 
1228 void
1229 VMAnonymousCache::_MergePagesSmallerConsumer(VMAnonymousCache* source)
1230 {
1231 	// The consumer (this cache) has fewer pages than the source, so we move the
1232 	// consumer's pages to the source (freeing shadowed ones) and finally move
1233 	// all pages of the source back to the consumer.
1234 
1235 	for (VMCachePagesTree::Iterator it = pages.GetIterator();
1236 		vm_page* page = it.Next();) {
1237 		// If a source page is in the way, remove and free it.
1238 		vm_page* sourcePage = source->LookupPage(
1239 			(off_t)page->cache_offset << PAGE_SHIFT);
1240 		if (sourcePage != NULL) {
1241 			DEBUG_PAGE_ACCESS_START(sourcePage);
1242 			ASSERT_PRINT(!sourcePage->busy, "page: %p", sourcePage);
1243 			ASSERT_PRINT(sourcePage->WiredCount() == 0
1244 					&& sourcePage->mappings.IsEmpty(),
1245 				"sourcePage: %p, page: %p", sourcePage, page);
1246 			source->RemovePage(sourcePage);
1247 			vm_page_free(source, sourcePage);
1248 		}
1249 
1250 		// Note: Removing the current node while iterating through an
1251 		// IteratableSplayTree is safe.
1252 		source->MovePage(page);
1253 	}
1254 
1255 	MoveAllPages(source);
1256 }
1257 
1258 
1259 void
1260 VMAnonymousCache::_MergeSwapPages(VMAnonymousCache* source)
1261 {
1262 	// If neither source nor consumer have swap pages, we don't have to do
1263 	// anything.
1264 	if (source->fAllocatedSwapSize == 0 && fAllocatedSwapSize == 0)
1265 		return;
1266 
1267 	for (off_t offset = source->virtual_base
1268 		& ~(off_t)(B_PAGE_SIZE * SWAP_BLOCK_PAGES - 1);
1269 		offset < source->virtual_end;
1270 		offset += B_PAGE_SIZE * SWAP_BLOCK_PAGES) {
1271 
1272 		WriteLocker locker(sSwapHashLock);
1273 
1274 		off_t swapBlockPageIndex = offset >> PAGE_SHIFT;
1275 		swap_hash_key key = { source, swapBlockPageIndex };
1276 		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(key);
1277 
1278 		// remove the source swap block -- we will either take over the swap
1279 		// space (and the block) or free it
1280 		if (sourceSwapBlock != NULL)
1281 			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
1282 
1283 		key.cache = this;
1284 		swap_block* swapBlock = sSwapHashTable.Lookup(key);
1285 
1286 		locker.Unlock();
1287 
1288 		// remove all source pages that are shadowed by consumer swap pages
1289 		if (swapBlock != NULL) {
1290 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1291 				if (swapBlock->swap_slots[i] != SWAP_SLOT_NONE) {
1292 					vm_page* page = source->LookupPage(
1293 						(off_t)(swapBlockPageIndex + i) << PAGE_SHIFT);
1294 					if (page != NULL) {
1295 						DEBUG_PAGE_ACCESS_START(page);
1296 						ASSERT_PRINT(!page->busy, "page: %p", page);
1297 						source->RemovePage(page);
1298 						vm_page_free(source, page);
1299 					}
1300 				}
1301 			}
1302 		}
1303 
1304 		if (sourceSwapBlock == NULL)
1305 			continue;
1306 
1307 		for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1308 			off_t pageIndex = swapBlockPageIndex + i;
1309 			swap_addr_t sourceSlotIndex = sourceSwapBlock->swap_slots[i];
1310 
1311 			if (sourceSlotIndex == SWAP_SLOT_NONE)
1312 				continue;
1313 
1314 			if ((swapBlock != NULL
1315 					&& swapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1316 				|| LookupPage((off_t)pageIndex << PAGE_SHIFT) != NULL) {
1317 				// The consumer already has a page or a swapped out page
1318 				// at this index. So we can free the source swap space.
1319 				swap_slot_dealloc(sourceSlotIndex, 1);
1320 				sourceSwapBlock->swap_slots[i] = SWAP_SLOT_NONE;
1321 				sourceSwapBlock->used--;
1322 			}
1323 
1324 			// We've either freed the source swap page or are going to move it
1325 			// to the consumer. At any rate, the source cache doesn't own it
1326 			// anymore.
1327 			source->fAllocatedSwapSize -= B_PAGE_SIZE;
1328 		}
1329 
1330 		// All source swap pages that have not been freed yet are taken over by
1331 		// the consumer.
1332 		fAllocatedSwapSize += B_PAGE_SIZE * (off_t)sourceSwapBlock->used;
1333 
1334 		if (sourceSwapBlock->used == 0) {
1335 			// All swap pages have been freed -- we can discard the source swap
1336 			// block.
1337 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1338 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1339 		} else if (swapBlock == NULL) {
1340 			// We need to take over some of the source's swap pages and there's
1341 			// no swap block in the consumer cache. Just take over the source
1342 			// swap block.
1343 			sourceSwapBlock->key.cache = this;
1344 			locker.Lock();
1345 			sSwapHashTable.InsertUnchecked(sourceSwapBlock);
1346 			locker.Unlock();
1347 		} else {
1348 			// We need to take over some of the source's swap pages and there's
1349 			// already a swap block in the consumer cache. Copy the respective
1350 			// swap addresses and discard the source swap block.
1351 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1352 				if (sourceSwapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1353 					swapBlock->swap_slots[i] = sourceSwapBlock->swap_slots[i];
1354 			}
1355 
1356 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1357 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1358 		}
1359 	}
1360 }
1361 
1362 
1363 // #pragma mark -
1364 
1365 
1366 // TODO: This can be removed if we get BFS UUIDs
1367 struct VolumeInfo {
1368 	char name[B_FILE_NAME_LENGTH];
1369 	char device[B_FILE_NAME_LENGTH];
1370 	char filesystem[B_OS_NAME_LENGTH];
1371 	off_t capacity;
1372 };
1373 
1374 
1375 class PartitionScorer : public KPartitionVisitor {
1376 public:
1377 	PartitionScorer(VolumeInfo& volumeInfo)
1378 		:
1379 		fBestPartition(NULL),
1380 		fBestScore(-1),
1381 		fVolumeInfo(volumeInfo)
1382 	{
1383 	}
1384 
1385 	virtual bool VisitPre(KPartition* partition)
1386 	{
1387 		if (!partition->ContainsFileSystem())
1388 			return false;
1389 
1390 		KPath path;
1391 		partition->GetPath(&path);
1392 
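		// Score how well this partition matches the configured swap volume:
		// content name (4), device path (3), capacity (2) and file system (1).
		// Only partitions reaching a score of at least 4 are considered.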
1393 		int score = 0;
1394 		if (strcmp(fVolumeInfo.name, partition->ContentName()) == 0)
1395 			score += 4;
1396 		if (strcmp(fVolumeInfo.device, path.Path()) == 0)
1397 			score += 3;
1398 		if (fVolumeInfo.capacity == partition->Size())
1399 			score += 2;
1400 		if (strcmp(fVolumeInfo.filesystem,
1401 			partition->DiskSystem()->ShortName()) == 0) {
1402 			score += 1;
1403 		}
1404 		if (score >= 4 && score > fBestScore) {
1405 			fBestPartition = partition;
1406 			fBestScore = score;
1407 		}
1408 
1409 		return false;
1410 	}
1411 
1412 	KPartition* fBestPartition;
1413 
1414 private:
1415 	int32		fBestScore;
1416 	VolumeInfo&	fVolumeInfo;
1417 };
1418 
1419 
1420 status_t
1421 swap_file_add(const char* path)
1422 {
1423 	// open the file
1424 	int fd = open(path, O_RDWR | O_NOCACHE, S_IRUSR | S_IWUSR);
1425 	if (fd < 0)
1426 		return errno;
1427 
1428 	// fstat() it and check whether we can use it
1429 	struct stat st;
1430 	if (fstat(fd, &st) < 0) {
1431 		close(fd);
1432 		return errno;
1433 	}
1434 
1435 	if (!(S_ISREG(st.st_mode) || S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
1436 		close(fd);
1437 		return B_BAD_VALUE;
1438 	}
1439 
1440 	if (st.st_size < B_PAGE_SIZE) {
1441 		close(fd);
1442 		return B_BAD_VALUE;
1443 	}
1444 
1445 	// get file descriptor, vnode, and cookie
1446 	file_descriptor* descriptor = get_fd(get_current_io_context(true), fd);
1447 	put_fd(descriptor);
1448 
1449 	vnode* node = fd_vnode(descriptor);
1450 	if (node == NULL) {
1451 		close(fd);
1452 		return B_BAD_VALUE;
1453 	}
1454 
1455 	// do the allocations and prepare the swap_file structure
1456 	swap_file* swap = new(std::nothrow) swap_file;
1457 	if (swap == NULL) {
1458 		close(fd);
1459 		return B_NO_MEMORY;
1460 	}
1461 
1462 	swap->fd = fd;
1463 	swap->vnode = node;
1464 	swap->cookie = descriptor->cookie;
1465 
1466 	uint32 pageCount = st.st_size >> PAGE_SHIFT;
1467 	swap->bmp = radix_bitmap_create(pageCount);
1468 	if (swap->bmp == NULL) {
1469 		delete swap;
1470 		close(fd);
1471 		return B_NO_MEMORY;
1472 	}
1473 
1474 	// set slot index and add this file to swap file list
1475 	mutex_lock(&sSwapFileListLock);
1476 	// TODO: Also check whether the swap file is already registered!
1477 	if (sSwapFileList.IsEmpty()) {
1478 		swap->first_slot = 0;
1479 		swap->last_slot = pageCount;
1480 	} else {
1481 		// leave a one-page gap between swap files
1482 		swap->first_slot = sSwapFileList.Last()->last_slot + 1;
1483 		swap->last_slot = swap->first_slot + pageCount;
1484 	}
1485 	sSwapFileList.Add(swap);
1486 	sSwapFileCount++;
1487 	mutex_unlock(&sSwapFileListLock);
1488 
1489 	mutex_lock(&sAvailSwapSpaceLock);
1490 	sAvailSwapSpace += (off_t)pageCount * B_PAGE_SIZE;
1491 	mutex_unlock(&sAvailSwapSpaceLock);
1492 
1493 	return B_OK;
1494 }
1495 
1496 
1497 status_t
1498 swap_file_delete(const char* path)
1499 {
1500 	vnode* node = NULL;
1501 	status_t status = vfs_get_vnode_from_path(path, true, &node);
1502 	if (status != B_OK)
1503 		return status;
1504 
1505 	MutexLocker locker(sSwapFileListLock);
1506 
1507 	swap_file* swapFile = NULL;
1508 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1509 			(swapFile = it.Next()) != NULL;) {
1510 		if (swapFile->vnode == node)
1511 			break;
1512 	}
1513 
1514 	vfs_put_vnode(node);
1515 
1516 	if (swapFile == NULL)
1517 		return B_ERROR;
1518 
1519 	// if this file is currently in use, we can't delete it
1520 	// TODO: mark this swap file as being deleted, and remove it after
1521 	// releasing all of its swap space
1522 	if (swapFile->bmp->free_slots < swapFile->last_slot - swapFile->first_slot)
1523 		return B_ERROR;
1524 
1525 	sSwapFileList.Remove(swapFile);
1526 	sSwapFileCount--;
1527 	locker.Unlock();
1528 
1529 	mutex_lock(&sAvailSwapSpaceLock);
1530 	sAvailSwapSpace -= (off_t)(swapFile->last_slot - swapFile->first_slot)
1531 		* B_PAGE_SIZE;
1532 	mutex_unlock(&sAvailSwapSpaceLock);
1533 
1534 	close(swapFile->fd);
1535 	radix_bitmap_destroy(swapFile->bmp);
1536 	delete swapFile;
1537 
1538 	return B_OK;
1539 }
1540 
1541 
1542 void
1543 swap_init(void)
1544 {
1545 	// create swap block cache
1546 	sSwapBlockCache = create_object_cache("swapblock", sizeof(swap_block),
1547 		sizeof(void*), NULL, NULL, NULL);
1548 	if (sSwapBlockCache == NULL)
1549 		panic("swap_init(): can't create object cache for swap blocks\n");
1550 
1551 	status_t error = object_cache_set_minimum_reserve(sSwapBlockCache,
1552 		MIN_SWAP_BLOCK_RESERVE);
1553 	if (error != B_OK) {
1554 		panic("swap_init(): object_cache_set_minimum_reserve() failed: %s",
1555 			strerror(error));
1556 	}
1557 
1558 	// init swap hash table
1559 	sSwapHashTable.Init(INITIAL_SWAP_HASH_SIZE);
1560 	rw_lock_init(&sSwapHashLock, "swaphash");
1561 
1562 	error = register_resource_resizer(swap_hash_resizer, NULL,
1563 		SWAP_HASH_RESIZE_INTERVAL);
1564 	if (error != B_OK) {
1565 		panic("swap_init(): Failed to register swap hash resizer: %s",
1566 			strerror(error));
1567 	}
1568 
1569 	// init swap file list
1570 	mutex_init(&sSwapFileListLock, "swaplist");
1571 	sSwapFileAlloc = NULL;
1572 	sSwapFileCount = 0;
1573 
1574 	// init available swap space
1575 	mutex_init(&sAvailSwapSpaceLock, "avail swap space");
1576 	sAvailSwapSpace = 0;
1577 
1578 	add_debugger_command_etc("swap", &dump_swap_info,
1579 		"Print info about swap usage",
1580 		"\n"
1581 		"Print info about swap usage.\n", 0);
1582 }
1583 
1584 
1585 void
1586 swap_init_post_modules()
1587 {
1588 	// Never try to create a swap file on a read-only device - when booting
1589 	// from CD, the write overlay is used.
1590 	if (gReadOnlyBootDevice)
1591 		return;
1592 
1593 	bool swapEnabled = true;
1594 	bool swapAutomatic = true;
1595 	off_t swapSize = 0;
1596 
1597 	dev_t swapDeviceID = -1;
1598 	VolumeInfo selectedVolume = {};
1599 
1600 	void* settings = load_driver_settings("virtual_memory");
1601 
1602 	if (settings != NULL) {
1603 		// We pass a lot of information about the swap device; this is mostly to
1604 		// ensure that we are dealing with the same device that was configured.
1605 
1606 		// TODO: Some kind of BFS uuid would be great here :)
1607 		const char* enabled = get_driver_parameter(settings, "vm", NULL, NULL);
1608 
1609 		if (enabled != NULL) {
1610 			swapEnabled = get_driver_boolean_parameter(settings, "vm",
1611 				true, false);
1612 			swapAutomatic = get_driver_boolean_parameter(settings, "swap_auto",
1613 				true, false);
1614 
1615 			if (swapEnabled && !swapAutomatic) {
1616 				const char* size = get_driver_parameter(settings, "swap_size",
1617 					NULL, NULL);
1618 				const char* volume = get_driver_parameter(settings,
1619 					"swap_volume_name", NULL, NULL);
1620 				const char* device = get_driver_parameter(settings,
1621 					"swap_volume_device", NULL, NULL);
1622 				const char* filesystem = get_driver_parameter(settings,
1623 					"swap_volume_filesystem", NULL, NULL);
1624 				const char* capacity = get_driver_parameter(settings,
1625 					"swap_volume_capacity", NULL, NULL);
1626 
1627 				if (size != NULL && device != NULL && volume != NULL
1628 					&& filesystem != NULL && capacity != NULL) {
1629 					// User specified a size / volume that seems valid
1630 					swapAutomatic = false;
1631 					swapSize = atoll(size);
1632 					strlcpy(selectedVolume.name, volume,
1633 						sizeof(selectedVolume.name));
1634 					strlcpy(selectedVolume.device, device,
1635 						sizeof(selectedVolume.device));
1636 					strlcpy(selectedVolume.filesystem, filesystem,
1637 						sizeof(selectedVolume.filesystem));
1638 					selectedVolume.capacity = atoll(capacity);
1639 				} else {
1640 					// Something isn't right with swap config, go auto
1641 					swapAutomatic = true;
1642 					dprintf("%s: virtual_memory configuration is invalid, "
1643 						"using automatic swap\n", __func__);
1644 				}
1645 			}
1646 		}
1647 		unload_driver_settings(settings);
1648 	}
1649 
1650 	if (swapAutomatic) {
1651 		swapSize = (off_t)vm_page_num_pages() * B_PAGE_SIZE;
1652 		if (swapSize <= (1024 * 1024 * 1024)) {
1653 			// 1 GB of memory or less? Double the swap.
1654 			swapSize *= 2;
1655 		}
1656 		// Automatic swap defaults to the boot device
1657 		swapDeviceID = gBootDevice;
1658 	}
1659 
1660 	if (!swapEnabled || swapSize < B_PAGE_SIZE) {
1661 		dprintf("%s: virtual_memory is disabled\n", __func__);
1662 		return;
1663 	}
1664 
1665 	if (!swapAutomatic && swapDeviceID < 0) {
1666 		// If the user specified a swap volume and no swap device has been chosen yet...
1667 		KDiskDeviceManager::CreateDefault();
1668 		KDiskDeviceManager* manager = KDiskDeviceManager::Default();
1669 		PartitionScorer visitor(selectedVolume);
1670 
1671 		KDiskDevice* device;
1672 		int32 cookie = 0;
1673 		while ((device = manager->NextDevice(&cookie)) != NULL) {
1674 			if (device->IsReadOnlyMedia() || device->IsWriteOnce()
1675 				|| device->IsRemovable()) {
1676 				continue;
1677 			}
1678 			device->VisitEachDescendant(&visitor);
1679 		}
1680 
1681 		if (!visitor.fBestPartition) {
1682 			dprintf("%s: Can't find configured swap partition '%s'\n",
1683 				__func__, selectedVolume.name);
1684 		} else {
1685 			if (visitor.fBestPartition->IsMounted())
1686 				swapDeviceID = visitor.fBestPartition->VolumeID();
1687 			else {
1688 				KPath devPath, mountPoint;
1689 				visitor.fBestPartition->GetPath(&devPath);
1690 				get_mount_point(visitor.fBestPartition, &mountPoint);
1691 				const char* mountPath = mountPoint.Path();
1692 				mkdir(mountPath, S_IRWXU | S_IRWXG | S_IRWXO);
1693 				swapDeviceID = _kern_mount(mountPath, devPath.Path(),
1694 					NULL, 0, NULL, 0);
1695 				if (swapDeviceID < 0) {
1696 					dprintf("%s: Can't mount configured swap partition '%s'\n",
1697 						__func__, selectedVolume.name);
1698 				}
1699 			}
1700 		}
1701 	}
1702 
1703 	if (swapDeviceID < 0)
1704 		swapDeviceID = gBootDevice;
1705 
1706 	// We now have a swapDeviceID which is used for the swap file
1707 
1708 	KPath path;
1709 	struct fs_info info;
1710 	_kern_read_fs_info(swapDeviceID, &info);
1711 	if (swapDeviceID == gBootDevice)
1712 		path = kDefaultSwapPath;
1713 	else {
1714 		vfs_entry_ref_to_path(info.dev, info.root, ".", true, path.LockBuffer(),
1715 			path.BufferSize());
1716 		path.UnlockBuffer();
1717 		path.Append("swap");
1718 	}
1719 
1720 	const char* swapPath = path.Path();
1721 
1722 	// Swap size limits prevent oversized swap files
1723 	if (swapAutomatic) {
1724 		off_t existingSwapSize = 0;
1725 		struct stat existingSwapStat;
1726 		if (stat(swapPath, &existingSwapStat) == 0)
1727 			existingSwapSize = existingSwapStat.st_size;
1728 
1729 		off_t freeSpace = info.free_blocks * info.block_size + existingSwapSize;
1730 
1731 		// Adjust automatic swap to a maximum of 25% of the free space
1732 		if (swapSize > (freeSpace / 4))
1733 			swapSize = (freeSpace / 4);
1734 	}
1735 
1736 	// Create swap file
1737 	int fd = open(swapPath, O_RDWR | O_CREAT | O_NOCACHE, S_IRUSR | S_IWUSR);
1738 	if (fd < 0) {
1739 		dprintf("%s: Can't open/create %s: %s\n", __func__,
1740 			swapPath, strerror(errno));
1741 		return;
1742 	}
1743 
1744 	struct stat stat;
1745 	stat.st_size = swapSize;
1746 	status_t error = _kern_write_stat(fd, NULL, false, &stat,
1747 		sizeof(struct stat), B_STAT_SIZE | B_STAT_SIZE_INSECURE);
1748 	if (error != B_OK) {
1749 		dprintf("%s: Failed to resize %s to %" B_PRIdOFF " bytes: %s\n",
1750 			__func__, swapPath, swapSize, strerror(error));
1751 	}
1752 
1753 	close(fd);
1754 
1755 	error = swap_file_add(swapPath);
1756 	if (error != B_OK) {
1757 		dprintf("%s: Failed to add swap file %s: %s\n", __func__, swapPath,
1758 			strerror(error));
1759 	}
1760 }
1761 
1762 
1763 //! Used by the page daemon to free swap space.
1764 bool
1765 swap_free_page_swap_space(vm_page* page)
1766 {
1767 	VMAnonymousCache* cache = dynamic_cast<VMAnonymousCache*>(page->Cache());
1768 	if (cache == NULL)
1769 		return false;
1770 
1771 	swap_addr_t slotIndex = cache->_SwapBlockGetAddress(page->cache_offset);
1772 	if (slotIndex == SWAP_SLOT_NONE)
1773 		return false;
1774 
1775 	swap_slot_dealloc(slotIndex, 1);
1776 	cache->fAllocatedSwapSize -= B_PAGE_SIZE;
1777 	cache->_SwapBlockFree(page->cache_offset, 1);
1778 
1779 	return true;
1780 }
1781 
1782 
1783 uint32
1784 swap_available_pages()
1785 {
1786 	mutex_lock(&sAvailSwapSpaceLock);
1787 	uint32 avail = sAvailSwapSpace >> PAGE_SHIFT;
1788 	mutex_unlock(&sAvailSwapSpaceLock);
1789 
1790 	return avail;
1791 }
1792 
1793 
1794 uint32
1795 swap_total_swap_pages()
1796 {
1797 	mutex_lock(&sSwapFileListLock);
1798 
1799 	uint32 totalSwapSlots = 0;
1800 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1801 		swap_file* swapFile = it.Next();) {
1802 		totalSwapSlots += swapFile->last_slot - swapFile->first_slot;
1803 	}
1804 
1805 	mutex_unlock(&sSwapFileListLock);
1806 
1807 	return totalSwapSlots;
1808 }
1809 
1810 
1811 #endif	// ENABLE_SWAP_SUPPORT
1812 
1813 
1814 void
1815 swap_get_info(system_info* info)
1816 {
1817 #if ENABLE_SWAP_SUPPORT
1818 	MutexLocker locker(sSwapFileListLock);
1819 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1820 		swap_file* swapFile = it.Next();) {
1821 		info->max_swap_pages += swapFile->last_slot - swapFile->first_slot;
1822 		info->free_swap_pages += swapFile->bmp->free_slots;
1823 	}
1824 #else
1825 	info->max_swap_pages = 0;
1826 	info->free_swap_pages = 0;
1827 #endif
1828 }
1829 
1830