xref: /haiku/src/system/kernel/vm/VMAnonymousCache.cpp (revision 64b46b706b02d969629415c9a44c394a2a5e4993)
1 /*
2  * Copyright 2008, Zhao Shuai, upczhsh@163.com.
3  * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
4  * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
5  * Distributed under the terms of the MIT License.
6  *
7  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
8  * Distributed under the terms of the NewOS License.
9  *
10  * Copyright 2011-2012 Haiku, Inc. All rights reserved.
11  * Distributed under the terms of the MIT License.
12  *
13  * Authors:
14  *		Hamish Morrison, hamish@lavabit.com
15  *		Alexander von Gluck IV, kallisti5@unixzen.com
16  */
17 
18 
19 #include "VMAnonymousCache.h"
20 
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <unistd.h>
26 
27 #include <FindDirectory.h>
28 #include <KernelExport.h>
29 #include <NodeMonitor.h>
30 
31 #include <arch_config.h>
32 #include <boot_device.h>
33 #include <disk_device_manager/KDiskDevice.h>
34 #include <disk_device_manager/KDiskDeviceManager.h>
35 #include <disk_device_manager/KDiskSystem.h>
36 #include <disk_device_manager/KPartitionVisitor.h>
37 #include <driver_settings.h>
38 #include <fs/fd.h>
39 #include <fs/KPath.h>
40 #include <fs_info.h>
41 #include <fs_interface.h>
42 #include <heap.h>
43 #include <kernel_daemon.h>
44 #include <slab/Slab.h>
45 #include <syscalls.h>
46 #include <system_info.h>
47 #include <tracing.h>
48 #include <util/AutoLock.h>
49 #include <util/DoublyLinkedList.h>
50 #include <util/OpenHashTable.h>
51 #include <util/RadixBitmap.h>
52 #include <vfs.h>
53 #include <vm/vm.h>
54 #include <vm/vm_page.h>
55 #include <vm/vm_priv.h>
56 #include <vm/VMAddressSpace.h>
57 
58 #include "IORequest.h"
59 #include "VMUtils.h"
60 
61 
62 #if	ENABLE_SWAP_SUPPORT
63 
64 //#define TRACE_VM_ANONYMOUS_CACHE
65 #ifdef TRACE_VM_ANONYMOUS_CACHE
66 #	define TRACE(x...) dprintf(x)
67 #else
68 #	define TRACE(x...) do { } while (false)
69 #endif
70 
71 
72 // number of free swap blocks the object cache shall minimally have
73 #define MIN_SWAP_BLOCK_RESERVE	4096
74 
75 // interval at which the hash resizer is triggered (in units of 0.1s)
76 #define SWAP_HASH_RESIZE_INTERVAL	5
77 
78 #define INITIAL_SWAP_HASH_SIZE		1024
79 
80 #define SWAP_SLOT_NONE	RADIX_SLOT_NONE
81 
82 #define SWAP_BLOCK_PAGES 32
83 #define SWAP_BLOCK_SHIFT 5		/* 1 << SWAP_BLOCK_SHIFT == SWAP_BLOCK_PAGES */
84 #define SWAP_BLOCK_MASK  (SWAP_BLOCK_PAGES - 1)
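// Illustrative example of the block arithmetic: page index 37 belongs to
// swap block 1 (37 >> SWAP_BLOCK_SHIFT) and occupies in-block slot 5
// (37 & SWAP_BLOCK_MASK).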
85 
86 
87 static const char* const kDefaultSwapPath = "/var/swap";
88 
89 struct swap_file : DoublyLinkedListLinkImpl<swap_file> {
90 	int				fd;
91 	struct vnode*	vnode;
92 	void*			cookie;
93 	swap_addr_t		first_slot;
94 	swap_addr_t		last_slot;
95 	radix_bitmap*	bmp;
96 };
97 
98 struct swap_hash_key {
99 	VMAnonymousCache	*cache;
100 	off_t				page_index;  // page index in the cache
101 };
102 
103 // Each swap block contains swap address information for
104 // SWAP_BLOCK_PAGES contiguous pages from the same cache
105 struct swap_block {
106 	swap_block*		hash_link;
107 	swap_hash_key	key;
108 	uint32			used;
109 	swap_addr_t		swap_slots[SWAP_BLOCK_PAGES];
110 };
111 
112 struct SwapHashTableDefinition {
113 	typedef swap_hash_key KeyType;
114 	typedef swap_block ValueType;
115 
116 	SwapHashTableDefinition() {}
117 
118 	size_t HashKey(const swap_hash_key& key) const
119 	{
120 		off_t blockIndex = key.page_index >> SWAP_BLOCK_SHIFT;
121 		VMAnonymousCache* cache = key.cache;
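		// Fold the cache pointer into the block index, so that blocks with
		// equal indices but from different caches hash to different buckets.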
122 		return blockIndex ^ (size_t)(int*)cache;
123 	}
124 
125 	size_t Hash(const swap_block* value) const
126 	{
127 		return HashKey(value->key);
128 	}
129 
130 	bool Compare(const swap_hash_key& key, const swap_block* value) const
131 	{
132 		return (key.page_index & ~(off_t)SWAP_BLOCK_MASK)
133 				== (value->key.page_index & ~(off_t)SWAP_BLOCK_MASK)
134 			&& key.cache == value->key.cache;
135 	}
136 
137 	swap_block*& GetLink(swap_block* value) const
138 	{
139 		return value->hash_link;
140 	}
141 };
142 
143 typedef BOpenHashTable<SwapHashTableDefinition> SwapHashTable;
144 typedef DoublyLinkedList<swap_file> SwapFileList;
145 
146 static SwapHashTable sSwapHashTable;
147 static rw_lock sSwapHashLock;
148 
149 static SwapFileList sSwapFileList;
150 static mutex sSwapFileListLock;
151 static swap_file* sSwapFileAlloc = NULL; // allocate from here
152 static uint32 sSwapFileCount = 0;
153 
154 static off_t sAvailSwapSpace = 0;
155 static mutex sAvailSwapSpaceLock;
156 
157 static object_cache* sSwapBlockCache;
158 
159 
160 #if SWAP_TRACING
161 namespace SwapTracing {
162 
163 class SwapTraceEntry : public AbstractTraceEntry {
164 public:
165 	SwapTraceEntry(VMAnonymousCache* cache)
166 		:
167 		fCache(cache)
168 	{
169 	}
170 
171 protected:
172 	VMAnonymousCache*	fCache;
173 };
174 
175 
176 class ReadPage : public SwapTraceEntry {
177 public:
178 	ReadPage(VMAnonymousCache* cache, page_num_t pageIndex,
179 		swap_addr_t swapSlotIndex)
180 		:
181 		SwapTraceEntry(cache),
182 		fPageIndex(pageIndex),
183 		fSwapSlotIndex(swapSlotIndex)
184 	{
185 		Initialized();
186 	}
187 
188 	virtual void AddDump(TraceOutput& out)
189 	{
190 		out.Print("swap read:  cache %p, page index: %lu <- swap slot: %lu",
191 			fCache, fPageIndex, fSwapSlotIndex);
192 	}
193 
194 private:
195 	page_num_t		fPageIndex;
196 	swap_addr_t		fSwapSlotIndex;
197 };
198 
199 
200 class WritePage : public SwapTraceEntry {
201 public:
202 	WritePage(VMAnonymousCache* cache, page_num_t pageIndex,
203 		swap_addr_t swapSlotIndex)
204 		:
205 		SwapTraceEntry(cache),
206 		fPageIndex(pageIndex),
207 		fSwapSlotIndex(swapSlotIndex)
208 	{
209 		Initialized();
210 	}
211 
212 	virtual void AddDump(TraceOutput& out)
213 	{
214 		out.Print("swap write: cache %p, page index: %lu -> swap slot: %lu",
215 			fCache, fPageIndex, fSwapSlotIndex);
216 	}
217 
218 private:
219 	page_num_t		fPageIndex;
220 	swap_addr_t		fSwapSlotIndex;
221 };
222 
223 }	// namespace SwapTracing
224 
225 #	define T(x) new(std::nothrow) SwapTracing::x;
226 #else
227 #	define T(x) ;
228 #endif
229 
230 
231 static int
232 dump_swap_info(int argc, char** argv)
233 {
234 	swap_addr_t totalSwapPages = 0;
235 	swap_addr_t freeSwapPages = 0;
236 
237 	kprintf("swap files:\n");
238 
239 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
240 		swap_file* file = it.Next();) {
241 		swap_addr_t total = file->last_slot - file->first_slot;
242 		kprintf("  vnode: %p, pages: total: %" B_PRIu32 ", free: %" B_PRIu32
243 			"\n", file->vnode, total, file->bmp->free_slots);
244 
245 		totalSwapPages += total;
246 		freeSwapPages += file->bmp->free_slots;
247 	}
248 
249 	kprintf("\n");
250 	kprintf("swap space in pages:\n");
251 	kprintf("total:     %9" B_PRIu32 "\n", totalSwapPages);
252 	kprintf("available: %9" B_PRIdOFF "\n", sAvailSwapSpace / B_PAGE_SIZE);
253 	kprintf("reserved:  %9" B_PRIdOFF "\n",
254 		totalSwapPages - sAvailSwapSpace / B_PAGE_SIZE);
255 	kprintf("used:      %9" B_PRIu32 "\n", totalSwapPages - freeSwapPages);
256 	kprintf("free:      %9" B_PRIu32 "\n", freeSwapPages);
257 
258 	return 0;
259 }
260 
261 
262 static swap_addr_t
263 swap_slot_alloc(uint32 count)
264 {
265 	mutex_lock(&sSwapFileListLock);
266 
267 	if (sSwapFileList.IsEmpty()) {
268 		mutex_unlock(&sSwapFileListLock);
269 		panic("swap_slot_alloc(): no swap file in the system\n");
270 		return SWAP_SLOT_NONE;
271 	}
272 
273 	// The radix bitmap cannot allocate more than BITMAP_RADIX pages at once;
274 	// returning SWAP_SLOT_NONE forces Write() to adjust the allocation amount.
275 	if (count > BITMAP_RADIX) {
276 		mutex_unlock(&sSwapFileListLock);
277 		return SWAP_SLOT_NONE;
278 	}
279 
280 	swap_addr_t j, addr = SWAP_SLOT_NONE;
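	// Try each swap file in round-robin order, starting with the one we
	// last allocated from (sSwapFileAlloc), and take the first free run.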
281 	for (j = 0; j < sSwapFileCount; j++) {
282 		if (sSwapFileAlloc == NULL)
283 			sSwapFileAlloc = sSwapFileList.First();
284 
285 		addr = radix_bitmap_alloc(sSwapFileAlloc->bmp, count);
286 		if (addr != SWAP_SLOT_NONE) {
287 			addr += sSwapFileAlloc->first_slot;
288 			break;
289 		}
290 
291 		// this swap_file is full, find another
292 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
293 	}
294 
295 	if (j == sSwapFileCount) {
296 		mutex_unlock(&sSwapFileListLock);
297 		panic("swap_slot_alloc: swap space exhausted!\n");
298 		return SWAP_SLOT_NONE;
299 	}
300 
301 	// if this swap file has used more than 90% of its space,
302 	// switch to another one
303 	if (sSwapFileAlloc->bmp->free_slots
304 		< (sSwapFileAlloc->last_slot - sSwapFileAlloc->first_slot) / 10) {
305 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
306 	}
307 
308 	mutex_unlock(&sSwapFileListLock);
309 
310 	return addr;
311 }
312 
313 
314 static swap_file*
315 find_swap_file(swap_addr_t slotIndex)
316 {
317 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
318 		swap_file* swapFile = it.Next();) {
319 		if (slotIndex >= swapFile->first_slot
320 			&& slotIndex < swapFile->last_slot) {
321 			return swapFile;
322 		}
323 	}
324 
325 	panic("find_swap_file(): can't find swap file for slot %" B_PRIu32 "\n",
326 		slotIndex);
327 	return NULL;
328 }
329 
330 
331 static void
332 swap_slot_dealloc(swap_addr_t slotIndex, uint32 count)
333 {
334 	if (slotIndex == SWAP_SLOT_NONE)
335 		return;
336 
337 	mutex_lock(&sSwapFileListLock);
338 	swap_file* swapFile = find_swap_file(slotIndex);
339 	slotIndex -= swapFile->first_slot;
340 	radix_bitmap_dealloc(swapFile->bmp, slotIndex, count);
341 	mutex_unlock(&sSwapFileListLock);
342 }
343 
344 
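// Reserves up to "amount" bytes of swap space and returns how much was
// actually reserved, which may be less than requested when swap is nearly
// exhausted.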
345 static off_t
346 swap_space_reserve(off_t amount)
347 {
348 	mutex_lock(&sAvailSwapSpaceLock);
349 	if (sAvailSwapSpace >= amount)
350 		sAvailSwapSpace -= amount;
351 	else {
352 		amount = sAvailSwapSpace;
353 		sAvailSwapSpace = 0;
354 	}
355 	mutex_unlock(&sAvailSwapSpaceLock);
356 
357 	return amount;
358 }
359 
360 
361 static void
362 swap_space_unreserve(off_t amount)
363 {
364 	mutex_lock(&sAvailSwapSpaceLock);
365 	sAvailSwapSpace += amount;
366 	mutex_unlock(&sAvailSwapSpaceLock);
367 }
368 
369 
370 static void
371 swap_hash_resizer(void*, int)
372 {
373 	WriteLocker locker(sSwapHashLock);
374 
375 	size_t size;
376 	void* allocation;
377 
378 	do {
379 		size = sSwapHashTable.ResizeNeeded();
380 		if (size == 0)
381 			return;
382 
383 		locker.Unlock();
384 
385 		allocation = malloc(size);
386 		if (allocation == NULL)
387 			return;
388 
389 		locker.Lock();
390 
391 	} while (!sSwapHashTable.Resize(allocation, size));
392 }
393 
394 
395 // #pragma mark -
396 
397 
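// Completion callback for asynchronous swap writes: on success the page's
// new swap slot is recorded in the swap hash table; on failure the slot is
// released again, if it was freshly allocated for this write.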
398 class VMAnonymousCache::WriteCallback : public StackableAsyncIOCallback {
399 public:
400 	WriteCallback(VMAnonymousCache* cache, AsyncIOCallback* callback)
401 		:
402 		StackableAsyncIOCallback(callback),
403 		fCache(cache)
404 	{
405 	}
406 
407 	void SetTo(page_num_t pageIndex, swap_addr_t slotIndex, bool newSlot)
408 	{
409 		fPageIndex = pageIndex;
410 		fSlotIndex = slotIndex;
411 		fNewSlot = newSlot;
412 	}
413 
414 	virtual void IOFinished(status_t status, bool partialTransfer,
415 		generic_size_t bytesTransferred)
416 	{
417 		if (fNewSlot) {
418 			if (status == B_OK) {
419 				fCache->_SwapBlockBuild(fPageIndex, fSlotIndex, 1);
420 			} else {
421 				AutoLocker<VMCache> locker(fCache);
422 				fCache->fAllocatedSwapSize -= B_PAGE_SIZE;
423 				locker.Unlock();
424 
425 				swap_slot_dealloc(fSlotIndex, 1);
426 			}
427 		}
428 
429 		fNextCallback->IOFinished(status, partialTransfer, bytesTransferred);
430 
431 		delete this;
432 	}
433 
434 private:
435 	VMAnonymousCache*	fCache;
436 	page_num_t			fPageIndex;
437 	swap_addr_t			fSlotIndex;
438 	bool				fNewSlot;
439 };
440 
441 
442 // #pragma mark -
443 
444 
445 VMAnonymousCache::~VMAnonymousCache()
446 {
447 	_FreeSwapPageRange(virtual_base, virtual_end, false);
448 	swap_space_unreserve(fCommittedSwapSize);
449 	if (committed_size > fCommittedSwapSize)
450 		vm_unreserve_memory(committed_size - fCommittedSwapSize);
451 }
452 
453 
454 status_t
455 VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
456 	int32 numGuardPages, uint32 allocationFlags)
457 {
458 	TRACE("%p->VMAnonymousCache::Init(canOvercommit = %s, "
459 		"numPrecommittedPages = %" B_PRId32 ", numGuardPages = %" B_PRId32
460 		")\n", this, canOvercommit ? "yes" : "no", numPrecommittedPages,
461 		numGuardPages);
462 
463 	status_t error = VMCache::Init(CACHE_TYPE_RAM, allocationFlags);
464 	if (error != B_OK)
465 		return error;
466 
467 	fCanOvercommit = canOvercommit;
468 	fHasPrecommitted = false;
469 	fPrecommittedPages = min_c(numPrecommittedPages, 255);
470 	fGuardedSize = numGuardPages * B_PAGE_SIZE;
471 	fCommittedSwapSize = 0;
472 	fAllocatedSwapSize = 0;
473 
474 	return B_OK;
475 }
476 
477 
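/*!	Frees the swap slots assigned to the pages in the range
	[fromOffset, toOffset). If \a skipBusyPages is \c true, the slots of
	busy pages are left untouched (and thus leaked -- see the TODO below).
*/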
478 void
479 VMAnonymousCache::_FreeSwapPageRange(off_t fromOffset, off_t toOffset,
480 	bool skipBusyPages)
481 {
482 	swap_block* swapBlock = NULL;
483 	off_t toIndex = toOffset >> PAGE_SHIFT;
484 	for (off_t pageIndex = fromOffset >> PAGE_SHIFT;
485 		pageIndex < toIndex && fAllocatedSwapSize > 0; pageIndex++) {
486 
487 		WriteLocker locker(sSwapHashLock);
488 
489 		// Get the swap slot index for the page.
490 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
491 		if (swapBlock == NULL || blockIndex == 0) {
492 			swap_hash_key key = { this, pageIndex };
493 			swapBlock = sSwapHashTable.Lookup(key);
494 
495 			if (swapBlock == NULL) {
496 				pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1;
497 				continue;
498 			}
499 		}
500 
501 		swap_addr_t slotIndex = swapBlock->swap_slots[blockIndex];
502 		if (slotIndex == SWAP_SLOT_NONE)
503 			continue;
504 
505 		if (skipBusyPages) {
506 			vm_page* page = LookupPage(pageIndex * B_PAGE_SIZE);
507 			if (page != NULL && page->busy) {
508 				// TODO: We skip (i.e. leak) swap space of busy pages, since
509 				// there could be I/O going on (paging in/out). Waiting is
510 				// not an option as 1. unlocking the cache means that new
511 				// swap pages could be added in a range we've already
512 				// cleared (since the cache still has the old size) and 2.
513 				// we'd risk a deadlock in case we come from the file cache
514 				// and the FS holds the node's write-lock. We should mark
515 				// the page invalid and let the one responsible clean up.
516 				// There's just no such mechanism yet.
517 				continue;
518 			}
519 		}
520 
521 		swap_slot_dealloc(slotIndex, 1);
522 		fAllocatedSwapSize -= B_PAGE_SIZE;
523 
524 		swapBlock->swap_slots[blockIndex] = SWAP_SLOT_NONE;
525 		if (--swapBlock->used == 0) {
526 			// All swap pages have been freed -- we can discard the swap block.
527 			sSwapHashTable.RemoveUnchecked(swapBlock);
528 			object_cache_free(sSwapBlockCache, swapBlock,
529 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
530 
531 			// Any pages remaining in this block have no swap slots; skip to
532 			// the next block.
533 			pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1;
534 			swapBlock = NULL;
535 		}
536 	}
537 }
538 
539 
540 status_t
541 VMAnonymousCache::Resize(off_t newSize, int priority)
542 {
543 	_FreeSwapPageRange(newSize + B_PAGE_SIZE - 1,
544 		virtual_end + B_PAGE_SIZE - 1);
545 	return VMCache::Resize(newSize, priority);
546 }
547 
548 
549 status_t
550 VMAnonymousCache::Rebase(off_t newBase, int priority)
551 {
552 	_FreeSwapPageRange(virtual_base, newBase);
553 	return VMCache::Rebase(newBase, priority);
554 }
555 
556 
557 status_t
558 VMAnonymousCache::Discard(off_t offset, off_t size)
559 {
560 	_FreeSwapPageRange(offset, offset + size);
561 	return VMCache::Discard(offset, size);
562 }
563 
564 
565 /*!	Moves the swap pages for the given range from the source cache into this
566 	cache. Both caches must be locked.
567 */
568 status_t
569 VMAnonymousCache::Adopt(VMCache* _source, off_t offset, off_t size,
570 	off_t newOffset)
571 {
572 	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
573 	if (source == NULL) {
574 		panic("VMAnonymousCache::Adopt(): adopt from incompatible cache %p "
575 			"requested", _source);
576 		return B_ERROR;
577 	}
578 
579 	off_t pageIndex = newOffset >> PAGE_SHIFT;
580 	off_t sourcePageIndex = offset >> PAGE_SHIFT;
581 	off_t sourceEndPageIndex = (offset + size + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
582 	swap_block* swapBlock = NULL;
583 
584 	WriteLocker locker(sSwapHashLock);
585 
586 	while (sourcePageIndex < sourceEndPageIndex
587 			&& source->fAllocatedSwapSize > 0) {
588 		swap_addr_t left
589 			= SWAP_BLOCK_PAGES - (sourcePageIndex & SWAP_BLOCK_MASK);
590 
591 		swap_hash_key sourceKey = { source, sourcePageIndex };
592 		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(sourceKey);
593 		if (sourceSwapBlock == NULL || sourceSwapBlock->used == 0) {
594 			sourcePageIndex += left;
595 			pageIndex += left;
596 			swapBlock = NULL;
597 			continue;
598 		}
599 
600 		for (; left > 0 && sourceSwapBlock->used > 0;
601 				left--, sourcePageIndex++, pageIndex++) {
602 
603 			swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
604 			if (swapBlock == NULL || blockIndex == 0) {
605 				swap_hash_key key = { this, pageIndex };
606 				swapBlock = sSwapHashTable.Lookup(key);
607 
608 				if (swapBlock == NULL) {
609 					swapBlock = (swap_block*)object_cache_alloc(sSwapBlockCache,
610 						CACHE_DONT_WAIT_FOR_MEMORY
611 							| CACHE_DONT_LOCK_KERNEL_SPACE);
612 					if (swapBlock == NULL)
613 						return B_NO_MEMORY;
614 
615 					swapBlock->key.cache = this;
616 					swapBlock->key.page_index
617 						= pageIndex & ~(off_t)SWAP_BLOCK_MASK;
618 					swapBlock->used = 0;
619 					for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
620 						swapBlock->swap_slots[i] = SWAP_SLOT_NONE;
621 
622 					sSwapHashTable.InsertUnchecked(swapBlock);
623 				}
624 			}
625 
626 			swap_addr_t sourceBlockIndex = sourcePageIndex & SWAP_BLOCK_MASK;
627 			swap_addr_t slotIndex
628 				= sourceSwapBlock->swap_slots[sourceBlockIndex];
629 			if (slotIndex == SWAP_SLOT_NONE)
630 				continue;
631 
632 			ASSERT(swapBlock->swap_slots[blockIndex] == SWAP_SLOT_NONE);
633 
634 			swapBlock->swap_slots[blockIndex] = slotIndex;
635 			swapBlock->used++;
636 			fAllocatedSwapSize += B_PAGE_SIZE;
637 
638 			sourceSwapBlock->swap_slots[sourceBlockIndex] = SWAP_SLOT_NONE;
639 			sourceSwapBlock->used--;
640 			source->fAllocatedSwapSize -= B_PAGE_SIZE;
641 
642 			TRACE("adopted slot %#" B_PRIx32 " from %p at page %" B_PRIdOFF
643 				" to %p at page %" B_PRIdOFF "\n", slotIndex, source,
644 				sourcePageIndex, this, pageIndex);
645 		}
646 
647 		if (left > 0) {
648 			sourcePageIndex += left;
649 			pageIndex += left;
650 			swapBlock = NULL;
651 		}
652 
653 		if (sourceSwapBlock->used == 0) {
654 			// All swap pages have been adopted, we can discard the swap block.
655 			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
656 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
657 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
658 		}
659 	}
660 
661 	locker.Unlock();
662 
663 	return VMCache::Adopt(source, offset, size, newOffset);
664 }
665 
666 
667 status_t
668 VMAnonymousCache::Commit(off_t size, int priority)
669 {
670 	TRACE("%p->VMAnonymousCache::Commit(%" B_PRIdOFF ")\n", this, size);
671 
672 	// If we can overcommit, we don't commit here, but in Fault(). We always
673 	// unreserve memory if we're asked to shrink our commitment, though.
674 	if (fCanOvercommit && size > committed_size) {
675 		if (fHasPrecommitted)
676 			return B_OK;
677 
678 		// pre-commit some pages to make a later failure less probable
679 		fHasPrecommitted = true;
680 		uint32 precommitted = fPrecommittedPages * B_PAGE_SIZE;
681 		if (size > precommitted)
682 			size = precommitted;
683 	}
684 
685 	return _Commit(size, priority);
686 }
687 
688 
689 bool
690 VMAnonymousCache::HasPage(off_t offset)
691 {
692 	if (_SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE)
693 		return true;
694 
695 	return false;
696 }
697 
698 
699 bool
700 VMAnonymousCache::DebugHasPage(off_t offset)
701 {
702 	off_t pageIndex = offset >> PAGE_SHIFT;
703 	swap_hash_key key = { this, pageIndex };
704 	swap_block* swap = sSwapHashTable.Lookup(key);
705 	if (swap == NULL)
706 		return false;
707 
708 	return swap->swap_slots[pageIndex & SWAP_BLOCK_MASK] != SWAP_SLOT_NONE;
709 }
710 
711 
712 status_t
713 VMAnonymousCache::Read(off_t offset, const generic_io_vec* vecs, size_t count,
714 	uint32 flags, generic_size_t* _numBytes)
715 {
716 	off_t pageIndex = offset >> PAGE_SHIFT;
717 
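	// Coalesce adjacent vecs whose swap slots are contiguous on disk, so
	// that each run can be read with a single vfs_read_pages() call.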
718 	for (uint32 i = 0, j = 0; i < count; i = j) {
719 		swap_addr_t startSlotIndex = _SwapBlockGetAddress(pageIndex + i);
720 		for (j = i + 1; j < count; j++) {
721 			swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + j);
722 			if (slotIndex != startSlotIndex + j - i)
723 				break;
724 		}
725 
726 		T(ReadPage(this, pageIndex, startSlotIndex));
727 			// TODO: Assumes that only one page is read.
728 
729 		swap_file* swapFile = find_swap_file(startSlotIndex);
730 
731 		off_t pos = (off_t)(startSlotIndex - swapFile->first_slot)
732 			* B_PAGE_SIZE;
733 
734 		status_t status = vfs_read_pages(swapFile->vnode, swapFile->cookie, pos,
735 			vecs + i, j - i, flags, _numBytes);
736 		if (status != B_OK)
737 			return status;
738 	}
739 
740 	return B_OK;
741 }
742 
743 
744 status_t
745 VMAnonymousCache::Write(off_t offset, const generic_io_vec* vecs, size_t count,
746 	uint32 flags, generic_size_t* _numBytes)
747 {
748 	off_t pageIndex = offset >> PAGE_SHIFT;
749 
750 	AutoLocker<VMCache> locker(this);
751 
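	// First pass: free any swap slots previously assigned to these pages;
	// fresh slots are allocated in the write loop below.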
752 	page_num_t totalPages = 0;
753 	for (uint32 i = 0; i < count; i++) {
754 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
755 		swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + totalPages);
756 		if (slotIndex != SWAP_SLOT_NONE) {
757 			swap_slot_dealloc(slotIndex, pageCount);
758 			_SwapBlockFree(pageIndex + totalPages, pageCount);
759 			fAllocatedSwapSize -= pageCount * B_PAGE_SIZE;
760 		}
761 
762 		totalPages += pageCount;
763 	}
764 
765 	off_t totalSize = totalPages * B_PAGE_SIZE;
766 	if (fAllocatedSwapSize + totalSize > fCommittedSwapSize)
767 		return B_ERROR;
768 
769 	fAllocatedSwapSize += totalSize;
770 	locker.Unlock();
771 
772 	page_num_t pagesLeft = totalPages;
773 	totalPages = 0;
774 
775 	for (uint32 i = 0; i < count; i++) {
776 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
777 
778 		generic_addr_t vectorBase = vecs[i].base;
779 		generic_size_t vectorLength = vecs[i].length;
780 		page_num_t n = pageCount;
781 
782 		for (page_num_t j = 0; j < pageCount; j += n) {
783 			swap_addr_t slotIndex;
784 			// try to allocate n slots; if that fails, retry with n/2, n/4, ...
785 			while ((slotIndex = swap_slot_alloc(n)) == SWAP_SLOT_NONE && n >= 2)
786 				n >>= 1;
787 
788 			if (slotIndex == SWAP_SLOT_NONE)
789 				panic("VMAnonymousCache::Write(): can't allocate swap space\n");
790 
791 			T(WritePage(this, pageIndex, slotIndex));
792 				// TODO: Assumes that only one page is written.
793 
794 			swap_file* swapFile = find_swap_file(slotIndex);
795 
796 			off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
797 
798 			generic_size_t length = (phys_addr_t)n * B_PAGE_SIZE;
799 			generic_io_vec vector[1];
800 			vector->base = vectorBase;
801 			vector->length = length;
802 
803 			status_t status = vfs_write_pages(swapFile->vnode, swapFile->cookie,
804 				pos, vector, 1, flags, &length);
805 			if (status != B_OK) {
806 				locker.Lock();
807 				fAllocatedSwapSize -= (off_t)pagesLeft * B_PAGE_SIZE;
808 				locker.Unlock();
809 
810 				swap_slot_dealloc(slotIndex, n);
811 				return status;
812 			}
813 
814 			_SwapBlockBuild(pageIndex + totalPages, slotIndex, n);
815 			pagesLeft -= n;
816 
817 			if (n != pageCount) {
818 				vectorBase = vectorBase + n * B_PAGE_SIZE;
819 				vectorLength -= n * B_PAGE_SIZE;
820 			}
821 		}
822 
823 		totalPages += pageCount;
824 	}
825 
826 	ASSERT(pagesLeft == 0);
827 	return B_OK;
828 }
829 
830 
831 status_t
832 VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
833 	size_t count, generic_size_t numBytes, uint32 flags,
834 	AsyncIOCallback* _callback)
835 {
836 	// TODO: Currently this method is only used for single pages. Either make
837 	// more flexible use of it or change the interface!
838 	// This implementation relies on the current usage!
839 	ASSERT(count == 1);
840 	ASSERT(numBytes <= B_PAGE_SIZE);
841 
842 	page_num_t pageIndex = offset >> PAGE_SHIFT;
843 	swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex);
844 	bool newSlot = slotIndex == SWAP_SLOT_NONE;
845 
846 	// If the page doesn't have any swap space yet, allocate it.
847 	if (newSlot) {
848 		AutoLocker<VMCache> locker(this);
849 		if (fAllocatedSwapSize + B_PAGE_SIZE > fCommittedSwapSize) {
850 			_callback->IOFinished(B_ERROR, true, 0);
851 			return B_ERROR;
852 		}
853 
854 		fAllocatedSwapSize += B_PAGE_SIZE;
855 
856 		slotIndex = swap_slot_alloc(1);
857 	}
858 
859 	// create our callback
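	// VIP requests may be issued under memory pressure (e.g. by the page
	// writer), so the callback is then allocated from the VIP heap.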
860 	WriteCallback* callback = (flags & B_VIP_IO_REQUEST) != 0
861 		? new(malloc_flags(HEAP_PRIORITY_VIP)) WriteCallback(this, _callback)
862 		: new(std::nothrow) WriteCallback(this, _callback);
863 	if (callback == NULL) {
864 		if (newSlot) {
865 			AutoLocker<VMCache> locker(this);
866 			fAllocatedSwapSize -= B_PAGE_SIZE;
867 			locker.Unlock();
868 
869 			swap_slot_dealloc(slotIndex, 1);
870 		}
871 		_callback->IOFinished(B_NO_MEMORY, true, 0);
872 		return B_NO_MEMORY;
873 	}
874 	// TODO: If the page already had swap space assigned, we wouldn't need a
875 	// callback of our own.
876 
877 	callback->SetTo(pageIndex, slotIndex, newSlot);
878 
879 	T(WritePage(this, pageIndex, slotIndex));
880 
881 	// write the page asynchronously
882 	swap_file* swapFile = find_swap_file(slotIndex);
883 	off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
884 
885 	return vfs_asynchronous_write_pages(swapFile->vnode, swapFile->cookie, pos,
886 		vecs, 1, numBytes, flags, callback);
887 }
888 
889 
890 bool
891 VMAnonymousCache::CanWritePage(off_t offset)
892 {
893 	// We can write the page if we have not used all of our committed swap
894 	// space, or if the page already has a swap slot assigned.
895 	return fAllocatedSwapSize < fCommittedSwapSize
896 		|| _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
897 }
898 
899 
900 int32
901 VMAnonymousCache::MaxPagesPerAsyncWrite() const
902 {
903 	return 1;
904 }
905 
906 
907 status_t
908 VMAnonymousCache::Fault(struct VMAddressSpace* aspace, off_t offset)
909 {
910 	if (fGuardedSize > 0) {
911 		uint32 guardOffset;
912 
913 #ifdef STACK_GROWS_DOWNWARDS
914 		guardOffset = 0;
915 #elif defined(STACK_GROWS_UPWARDS)
916 		guardOffset = virtual_size - fGuardedSize;
917 #else
918 #	error Stack direction has not been defined in arch_config.h
919 #endif
920 		// report stack fault, guard page hit!
921 		if (offset >= guardOffset && offset < guardOffset + fGuardedSize) {
922 			TRACE(("stack overflow!\n"));
923 			return B_BAD_ADDRESS;
924 		}
925 	}
926 
927 	if (fCanOvercommit && LookupPage(offset) == NULL && !HasPage(offset)) {
928 		if (fPrecommittedPages == 0) {
929 			// never commit more than needed
930 			if (committed_size / B_PAGE_SIZE > page_count)
931 				return B_BAD_HANDLER;
932 
933 			// try to commit additional swap space/memory
934 			if (swap_space_reserve(B_PAGE_SIZE) == B_PAGE_SIZE) {
935 				fCommittedSwapSize += B_PAGE_SIZE;
936 			} else {
937 				int priority = aspace == VMAddressSpace::Kernel()
938 					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
939 				if (vm_try_reserve_memory(B_PAGE_SIZE, priority, 0) != B_OK) {
940 					dprintf("%p->VMAnonymousCache::Fault(): Failed to reserve "
941 						"%d bytes of RAM.\n", this, (int)B_PAGE_SIZE);
942 					return B_NO_MEMORY;
943 				}
944 			}
945 
946 			committed_size += B_PAGE_SIZE;
947 		} else
948 			fPrecommittedPages--;
949 	}
950 
951 	// This will cause vm_soft_fault() to handle the fault
952 	return B_BAD_HANDLER;
953 }
954 
955 
956 void
957 VMAnonymousCache::Merge(VMCache* _source)
958 {
959 	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
960 	if (source == NULL) {
961 		panic("VMAnonymousCache::Merge(): merge with incompatible cache "
962 			"%p requested", _source);
963 		return;
964 	}
965 
966 	// take over the source's committed size
967 	fCommittedSwapSize += source->fCommittedSwapSize;
968 	source->fCommittedSwapSize = 0;
969 	committed_size += source->committed_size;
970 	source->committed_size = 0;
971 
972 	off_t actualSize = virtual_end - virtual_base;
973 	if (committed_size > actualSize)
974 		_Commit(actualSize, VM_PRIORITY_USER);
975 
976 	// Move all non-shadowed swap pages from the source to the consumer cache.
977 	// Also remove all source pages that are shadowed by consumer swap pages.
978 	_MergeSwapPages(source);
979 
980 	// Move all non-shadowed pages from the source to the consumer cache.
981 	if (source->page_count < page_count)
982 		_MergePagesSmallerSource(source);
983 	else
984 		_MergePagesSmallerConsumer(source);
985 }
986 
987 
988 void
989 VMAnonymousCache::DeleteObject()
990 {
991 	object_cache_delete(gAnonymousCacheObjectCache, this);
992 }
993 
994 
995 void
996 VMAnonymousCache::_SwapBlockBuild(off_t startPageIndex,
997 	swap_addr_t startSlotIndex, uint32 count)
998 {
999 	WriteLocker locker(sSwapHashLock);
1000 
1001 	uint32 left = count;
1002 	for (uint32 i = 0, j = 0; i < count; i += j) {
1003 		off_t pageIndex = startPageIndex + i;
1004 		swap_addr_t slotIndex = startSlotIndex + i;
1005 
1006 		swap_hash_key key = { this, pageIndex };
1007 
1008 		swap_block* swap = sSwapHashTable.Lookup(key);
1009 		while (swap == NULL) {
1010 			swap = (swap_block*)object_cache_alloc(sSwapBlockCache,
1011 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1012 			if (swap == NULL) {
1013 				// Wait a short time until memory is available again.
1014 				locker.Unlock();
1015 				snooze(10000);
1016 				locker.Lock();
1017 				swap = sSwapHashTable.Lookup(key);
1018 				continue;
1019 			}
1020 
1021 			swap->key.cache = this;
1022 			swap->key.page_index = pageIndex & ~(off_t)SWAP_BLOCK_MASK;
1023 			swap->used = 0;
1024 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
1025 				swap->swap_slots[i] = SWAP_SLOT_NONE;
1026 
1027 			sSwapHashTable.InsertUnchecked(swap);
1028 		}
1029 
1030 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1031 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
1032 			swap->swap_slots[blockIndex++] = slotIndex + j;
1033 			left--;
1034 		}
1035 
1036 		swap->used += j;
1037 	}
1038 }
1039 
1040 
1041 void
1042 VMAnonymousCache::_SwapBlockFree(off_t startPageIndex, uint32 count)
1043 {
1044 	WriteLocker locker(sSwapHashLock);
1045 
1046 	uint32 left = count;
1047 	for (uint32 i = 0, j = 0; i < count; i += j) {
1048 		off_t pageIndex = startPageIndex + i;
1049 		swap_hash_key key = { this, pageIndex };
1050 		swap_block* swap = sSwapHashTable.Lookup(key);
1051 
1052 		ASSERT(swap != NULL);
1053 
1054 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1055 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
1056 			swap->swap_slots[blockIndex++] = SWAP_SLOT_NONE;
1057 			left--;
1058 		}
1059 
1060 		swap->used -= j;
1061 		if (swap->used == 0) {
1062 			sSwapHashTable.RemoveUnchecked(swap);
1063 			object_cache_free(sSwapBlockCache, swap,
1064 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1065 		}
1066 	}
1067 }
1068 
1069 
1070 swap_addr_t
1071 VMAnonymousCache::_SwapBlockGetAddress(off_t pageIndex)
1072 {
1073 	ReadLocker locker(sSwapHashLock);
1074 
1075 	swap_hash_key key = { this, pageIndex };
1076 	swap_block* swap = sSwapHashTable.Lookup(key);
1077 	swap_addr_t slotIndex = SWAP_SLOT_NONE;
1078 
1079 	if (swap != NULL) {
1080 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1081 		slotIndex = swap->swap_slots[blockIndex];
1082 	}
1083 
1084 	return slotIndex;
1085 }
1086 
1087 
1088 status_t
1089 VMAnonymousCache::_Commit(off_t size, int priority)
1090 {
1091 	TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), already committed: "
1092 		"%" B_PRIdOFF " (%" B_PRIdOFF " swap)\n", this, size, committed_size,
1093 		fCommittedSwapSize);
1094 
1095 	// Basic strategy: reserve swap space first, only when running out of swap
1096 	// space, reserve real memory.
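	//
	// Worked example: committing 16 pages while only 10 pages of swap are
	// available yields fCommittedSwapSize == 10 pages, with the remaining
	// 6 pages reserved as real memory below.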
1097 
1098 	off_t committedMemory = committed_size - fCommittedSwapSize;
1099 
1100 	// Regardless of whether we're asked to grow or shrink the commitment,
1101 	// we always try to reserve as much as possible of the final commitment
1102 	// in the swap space.
1103 	if (size > fCommittedSwapSize) {
1104 		fCommittedSwapSize += swap_space_reserve(size - fCommittedSwapSize);
1105 		committed_size = fCommittedSwapSize + committedMemory;
1106 		if (size > fCommittedSwapSize) {
1107 			TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), reserved "
1108 				"only %" B_PRIdOFF " swap\n", this, size, fCommittedSwapSize);
1109 		}
1110 	}
1111 
1112 	if (committed_size == size)
1113 		return B_OK;
1114 
1115 	if (committed_size > size) {
1116 		// The commitment shrinks -- unreserve real memory first.
1117 		off_t toUnreserve = committed_size - size;
1118 		if (committedMemory > 0) {
1119 			off_t unreserved = min_c(toUnreserve, committedMemory);
1120 			vm_unreserve_memory(unreserved);
1121 			committedMemory -= unreserved;
1122 			committed_size -= unreserved;
1123 			toUnreserve -= unreserved;
1124 		}
1125 
1126 		// Unreserve swap space.
1127 		if (toUnreserve > 0) {
1128 			swap_space_unreserve(toUnreserve);
1129 			fCommittedSwapSize -= toUnreserve;
1130 			committed_size -= toUnreserve;
1131 		}
1132 
1133 		return B_OK;
1134 	}
1135 
1136 	// The commitment grows -- we have already tried to reserve swap space at
1137 	// the start of the method, so we try to reserve real memory now.
1138 
1139 	off_t toReserve = size - committed_size;
1140 	if (vm_try_reserve_memory(toReserve, priority, 1000000) != B_OK) {
1141 		dprintf("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "): Failed to "
1142 			"reserve %" B_PRIdOFF " bytes of RAM\n", this, size, toReserve);
1143 		return B_NO_MEMORY;
1144 	}
1145 
1146 	committed_size = size;
1147 	return B_OK;
1148 }
1149 
1150 
1151 void
1152 VMAnonymousCache::_MergePagesSmallerSource(VMAnonymousCache* source)
1153 {
1154 	// The source cache has fewer pages than the consumer (this cache), so we
1155 	// iterate through the source's pages and move the ones that are not
1156 	// shadowed up to the consumer.
1157 
1158 	for (VMCachePagesTree::Iterator it = source->pages.GetIterator();
1159 			vm_page* page = it.Next();) {
1160 		// Note: Removing the current node while iterating through an
1161 		// IteratableSplayTree is safe.
1162 		vm_page* consumerPage = LookupPage(
1163 			(off_t)page->cache_offset << PAGE_SHIFT);
1164 		if (consumerPage == NULL) {
1165 			// the page is not yet in the consumer cache - move it upwards
1166 			ASSERT_PRINT(!page->busy, "page: %p", page);
1167 			MovePage(page);
1168 		}
1169 	}
1170 }
1171 
1172 
1173 void
1174 VMAnonymousCache::_MergePagesSmallerConsumer(VMAnonymousCache* source)
1175 {
1176 	// The consumer (this cache) has less pages than the source, so we move the
1177 	// consumer's pages to the source (freeing shadowed ones) and finally just
1178 	// all pages of the source back to the consumer.
1179 
1180 	for (VMCachePagesTree::Iterator it = pages.GetIterator();
1181 		vm_page* page = it.Next();) {
1182 		// If a source page is in the way, remove and free it.
1183 		vm_page* sourcePage = source->LookupPage(
1184 			(off_t)page->cache_offset << PAGE_SHIFT);
1185 		if (sourcePage != NULL) {
1186 			DEBUG_PAGE_ACCESS_START(sourcePage);
1187 			ASSERT_PRINT(!sourcePage->busy, "page: %p", sourcePage);
1188 			ASSERT_PRINT(sourcePage->WiredCount() == 0
1189 					&& sourcePage->mappings.IsEmpty(),
1190 				"sourcePage: %p, page: %p", sourcePage, page);
1191 			source->RemovePage(sourcePage);
1192 			vm_page_free(source, sourcePage);
1193 		}
1194 
1195 		// Note: Removing the current node while iterating through an
1196 		// IteratableSplayTree is safe.
1197 		source->MovePage(page);
1198 	}
1199 
1200 	MoveAllPages(source);
1201 }
1202 
1203 
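/*!	Moves the source cache's swap pages into this (the consumer) cache and
	frees any source swap space that is shadowed by a consumer page or
	consumer swap page. Both caches must be locked.
*/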
1204 void
1205 VMAnonymousCache::_MergeSwapPages(VMAnonymousCache* source)
1206 {
1207 	// If neither source nor consumer have swap pages, we don't have to do
1208 	// anything.
1209 	if (source->fAllocatedSwapSize == 0 && fAllocatedSwapSize == 0)
1210 		return;
1211 
1212 	for (off_t offset = source->virtual_base
1213 		& ~(off_t)(B_PAGE_SIZE * SWAP_BLOCK_PAGES - 1);
1214 		offset < source->virtual_end;
1215 		offset += B_PAGE_SIZE * SWAP_BLOCK_PAGES) {
1216 
1217 		WriteLocker locker(sSwapHashLock);
1218 
1219 		off_t swapBlockPageIndex = offset >> PAGE_SHIFT;
1220 		swap_hash_key key = { source, swapBlockPageIndex };
1221 		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(key);
1222 
1223 		// remove the source swap block -- we will either take over the swap
1224 		// space (and the block) or free it
1225 		if (sourceSwapBlock != NULL)
1226 			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
1227 
1228 		key.cache = this;
1229 		swap_block* swapBlock = sSwapHashTable.Lookup(key);
1230 
1231 		locker.Unlock();
1232 
1233 		// remove all source pages that are shadowed by consumer swap pages
1234 		if (swapBlock != NULL) {
1235 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1236 				if (swapBlock->swap_slots[i] != SWAP_SLOT_NONE) {
1237 					vm_page* page = source->LookupPage(
1238 						(off_t)(swapBlockPageIndex + i) << PAGE_SHIFT);
1239 					if (page != NULL) {
1240 						DEBUG_PAGE_ACCESS_START(page);
1241 						ASSERT_PRINT(!page->busy, "page: %p", page);
1242 						source->RemovePage(page);
1243 						vm_page_free(source, page);
1244 					}
1245 				}
1246 			}
1247 		}
1248 
1249 		if (sourceSwapBlock == NULL)
1250 			continue;
1251 
1252 		for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1253 			off_t pageIndex = swapBlockPageIndex + i;
1254 			swap_addr_t sourceSlotIndex = sourceSwapBlock->swap_slots[i];
1255 
1256 			if (sourceSlotIndex == SWAP_SLOT_NONE)
1257 				continue;
1258 
1259 			if ((swapBlock != NULL
1260 					&& swapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1261 				|| LookupPage((off_t)pageIndex << PAGE_SHIFT) != NULL) {
1262 				// The consumer already has a page or a swapped out page
1263 				// at this index. So we can free the source swap space.
1264 				swap_slot_dealloc(sourceSlotIndex, 1);
1265 				sourceSwapBlock->swap_slots[i] = SWAP_SLOT_NONE;
1266 				sourceSwapBlock->used--;
1267 			}
1268 
1269 			// We've either freed the source swap page or are going to move it
1270 			// to the consumer. At any rate, the source cache doesn't own it
1271 			// anymore.
1272 			source->fAllocatedSwapSize -= B_PAGE_SIZE;
1273 		}
1274 
1275 		// All source swap pages that have not been freed yet are taken over by
1276 		// the consumer.
1277 		fAllocatedSwapSize += B_PAGE_SIZE * (off_t)sourceSwapBlock->used;
1278 
1279 		if (sourceSwapBlock->used == 0) {
1280 			// All swap pages have been freed -- we can discard the source swap
1281 			// block.
1282 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1283 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1284 		} else if (swapBlock == NULL) {
1285 			// We need to take over some of the source's swap pages and there's
1286 			// no swap block in the consumer cache. Just take over the source
1287 			// swap block.
1288 			sourceSwapBlock->key.cache = this;
1289 			locker.Lock();
1290 			sSwapHashTable.InsertUnchecked(sourceSwapBlock);
1291 			locker.Unlock();
1292 		} else {
1293 			// We need to take over some of the source's swap pages and there's
1294 			// already a swap block in the consumer cache. Copy the respective
1295 			// swap addresses and discard the source swap block.
1296 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1297 				if (sourceSwapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1298 					swapBlock->swap_slots[i] = sourceSwapBlock->swap_slots[i];
1299 			}
1300 
1301 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1302 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1303 		}
1304 	}
1305 }
1306 
1307 
1308 // #pragma mark -
1309 
1310 
1311 // TODO: This can be removed once we get BFS UUIDs
1312 struct VolumeInfo {
1313 	char name[B_FILE_NAME_LENGTH];
1314 	char device[B_FILE_NAME_LENGTH];
1315 	char filesystem[B_OS_NAME_LENGTH];
1316 	off_t capacity;
1317 };
1318 
1319 
1320 class PartitionScorer : public KPartitionVisitor {
1321 public:
1322 	PartitionScorer(VolumeInfo& volumeInfo)
1323 		:
1324 		fBestPartition(NULL),
1325 		fBestScore(-1),
1326 		fVolumeInfo(volumeInfo)
1327 	{
1328 	}
1329 
1330 	virtual bool VisitPre(KPartition* partition)
1331 	{
1332 		if (!partition->ContainsFileSystem())
1333 			return false;
1334 
1335 		KPath path;
1336 		partition->GetPath(&path);
1337 
1338 		int score = 0;
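		// Score how well this partition matches the configured volume: the
		// name counts 4, the device path 3, the capacity 2, and the file
		// system 1. Only a score of at least 4 is accepted, i.e. at least
		// the name, or the device path plus another attribute, must match.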
1339 		if (strcmp(fVolumeInfo.name, partition->ContentName()) == 0)
1340 			score += 4;
1341 		if (strcmp(fVolumeInfo.device, path.Path()) == 0)
1342 			score += 3;
1343 		if (fVolumeInfo.capacity == partition->Size())
1344 			score += 2;
1345 		if (strcmp(fVolumeInfo.filesystem,
1346 			partition->DiskSystem()->ShortName()) == 0) {
1347 			score += 1;
1348 		}
1349 		if (score >= 4 && score > fBestScore) {
1350 			fBestPartition = partition;
1351 			fBestScore = score;
1352 		}
1353 
1354 		return false;
1355 	}
1356 
1357 	KPartition* fBestPartition;
1358 
1359 private:
1360 	int32		fBestScore;
1361 	VolumeInfo&	fVolumeInfo;
1362 };
1363 
1364 
1365 status_t
1366 swap_file_add(const char* path)
1367 {
1368 	// open the file
1369 	int fd = open(path, O_RDWR | O_NOCACHE, S_IRUSR | S_IWUSR);
1370 	if (fd < 0)
1371 		return errno;
1372 
1373 	// fstat() it and check whether we can use it
1374 	struct stat st;
1375 	if (fstat(fd, &st) < 0) {
1376 		close(fd);
1377 		return errno;
1378 	}
1379 
1380 	if (!(S_ISREG(st.st_mode) || S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
1381 		close(fd);
1382 		return B_BAD_VALUE;
1383 	}
1384 
1385 	if (st.st_size < B_PAGE_SIZE) {
1386 		close(fd);
1387 		return B_BAD_VALUE;
1388 	}
1389 
1390 	// get file descriptor, vnode, and cookie
1391 	file_descriptor* descriptor = get_fd(get_current_io_context(true), fd);
1392 	put_fd(descriptor);
1393 
1394 	vnode* node = fd_vnode(descriptor);
1395 	if (node == NULL) {
1396 		close(fd);
1397 		return B_BAD_VALUE;
1398 	}
1399 
1400 	// do the allocations and prepare the swap_file structure
1401 	swap_file* swap = (swap_file*)malloc(sizeof(swap_file));
1402 	if (swap == NULL) {
1403 		close(fd);
1404 		return B_NO_MEMORY;
1405 	}
1406 
1407 	swap->fd = fd;
1408 	swap->vnode = node;
1409 	swap->cookie = descriptor->cookie;
1410 
1411 	uint32 pageCount = st.st_size >> PAGE_SHIFT;
1412 	swap->bmp = radix_bitmap_create(pageCount);
1413 	if (swap->bmp == NULL) {
1414 		free(swap);
1415 		close(fd);
1416 		return B_NO_MEMORY;
1417 	}
1418 
1419 	// set the slot range and add this file to the swap file list
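	// Slot indices form a single global namespace across all swap files:
	// each file owns the half-open range [first_slot, last_slot), which is
	// what find_swap_file() uses to map a slot index back to its file.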
1420 	mutex_lock(&sSwapFileListLock);
1421 	// TODO: Also check whether the swap file is already registered!
1422 	if (sSwapFileList.IsEmpty()) {
1423 		swap->first_slot = 0;
1424 		swap->last_slot = pageCount;
1425 	} else {
1426 		// leave a one-page gap between two swap files
1427 		swap->first_slot = sSwapFileList.Last()->last_slot + 1;
1428 		swap->last_slot = swap->first_slot + pageCount;
1429 	}
1430 	sSwapFileList.Add(swap);
1431 	sSwapFileCount++;
1432 	mutex_unlock(&sSwapFileListLock);
1433 
1434 	mutex_lock(&sAvailSwapSpaceLock);
1435 	sAvailSwapSpace += (off_t)pageCount * B_PAGE_SIZE;
1436 	mutex_unlock(&sAvailSwapSpaceLock);
1437 
1438 	return B_OK;
1439 }
1440 
1441 
1442 status_t
1443 swap_file_delete(const char* path)
1444 {
1445 	vnode* node = NULL;
1446 	status_t status = vfs_get_vnode_from_path(path, true, &node);
1447 	if (status != B_OK)
1448 		return status;
1449 
1450 	MutexLocker locker(sSwapFileListLock);
1451 
1452 	swap_file* swapFile = NULL;
1453 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1454 			(swapFile = it.Next()) != NULL;) {
1455 		if (swapFile->vnode == node)
1456 			break;
1457 	}
1458 
1459 	vfs_put_vnode(node);
1460 
1461 	if (swapFile == NULL)
1462 		return B_ERROR;
1463 
1464 	// if this file is currently in use, we can't delete it
1465 	// TODO: mark this swap file as deleting, and remove it once all of
1466 	// its swap space has been released
1467 	if (swapFile->bmp->free_slots < swapFile->last_slot - swapFile->first_slot)
1468 		return B_ERROR;
1469 
1470 	sSwapFileList.Remove(swapFile);
1471 	sSwapFileCount--;
1472 	locker.Unlock();
1473 
1474 	mutex_lock(&sAvailSwapSpaceLock);
1475 	sAvailSwapSpace -= (off_t)(swapFile->last_slot - swapFile->first_slot)
1476 		* B_PAGE_SIZE;
1477 	mutex_unlock(&sAvailSwapSpaceLock);
1478 
1479 	close(swapFile->fd);
1480 	radix_bitmap_destroy(swapFile->bmp);
1481 	free(swapFile);
1482 
1483 	return B_OK;
1484 }
1485 
1486 
1487 void
1488 swap_init(void)
1489 {
1490 	// create swap block cache
1491 	sSwapBlockCache = create_object_cache("swapblock", sizeof(swap_block),
1492 		sizeof(void*), NULL, NULL, NULL);
1493 	if (sSwapBlockCache == NULL)
1494 		panic("swap_init(): can't create object cache for swap blocks\n");
1495 
1496 	status_t error = object_cache_set_minimum_reserve(sSwapBlockCache,
1497 		MIN_SWAP_BLOCK_RESERVE);
1498 	if (error != B_OK) {
1499 		panic("swap_init(): object_cache_set_minimum_reserve() failed: %s",
1500 			strerror(error));
1501 	}
1502 
1503 	// init swap hash table
1504 	sSwapHashTable.Init(INITIAL_SWAP_HASH_SIZE);
1505 	rw_lock_init(&sSwapHashLock, "swaphash");
1506 
1507 	error = register_resource_resizer(swap_hash_resizer, NULL,
1508 		SWAP_HASH_RESIZE_INTERVAL);
1509 	if (error != B_OK) {
1510 		panic("swap_init(): Failed to register swap hash resizer: %s",
1511 			strerror(error));
1512 	}
1513 
1514 	// init swap file list
1515 	mutex_init(&sSwapFileListLock, "swaplist");
1516 	sSwapFileAlloc = NULL;
1517 	sSwapFileCount = 0;
1518 
1519 	// init available swap space
1520 	mutex_init(&sAvailSwapSpaceLock, "avail swap space");
1521 	sAvailSwapSpace = 0;
1522 
1523 	add_debugger_command_etc("swap", &dump_swap_info,
1524 		"Print info about the swap usage",
1525 		"\n"
1526 		"Print info about the swap usage.\n", 0);
1527 }
1528 
1529 
1530 void
1531 swap_init_post_modules()
1532 {
1533 	// Never try to create a swap file on a read-only device -- when booting
1534 	// from CD, the write overlay is used.
1535 	if (gReadOnlyBootDevice)
1536 		return;
1537 
1538 	bool swapEnabled = true;
1539 	bool swapAutomatic = true;
1540 	off_t swapSize = 0;
1541 
1542 	dev_t swapDeviceID = -1;
1543 	VolumeInfo selectedVolume = {};
1544 
1545 	void* settings = load_driver_settings("virtual_memory");
1546 
1547 	if (settings != NULL) {
1548 		// We pass a lot of information on the swap device; this is mostly to
1549 		// ensure that we are dealing with the same device that was configured.
1550 
1551 		// TODO: Some kind of BFS uuid would be great here :)
1552 		const char* enabled = get_driver_parameter(settings, "vm", NULL, NULL);
1553 
1554 		if (enabled != NULL) {
1555 			swapEnabled = get_driver_boolean_parameter(settings, "vm",
1556 				true, false);
1557 			swapAutomatic = get_driver_boolean_parameter(settings, "swap_auto",
1558 				true, false);
1559 
1560 			if (swapEnabled && !swapAutomatic) {
1561 				const char* size = get_driver_parameter(settings, "swap_size",
1562 					NULL, NULL);
1563 				const char* volume = get_driver_parameter(settings,
1564 					"swap_volume_name", NULL, NULL);
1565 				const char* device = get_driver_parameter(settings,
1566 					"swap_volume_device", NULL, NULL);
1567 				const char* filesystem = get_driver_parameter(settings,
1568 					"swap_volume_filesystem", NULL, NULL);
1569 				const char* capacity = get_driver_parameter(settings,
1570 					"swap_volume_capacity", NULL, NULL);
1571 
1572 				if (size != NULL && device != NULL && volume != NULL
1573 					&& filesystem != NULL && capacity != NULL) {
1574 					// User specified a size / volume that seems valid
1575 					swapAutomatic = false;
1576 					swapSize = atoll(size);
1577 					strlcpy(selectedVolume.name, volume,
1578 						sizeof(selectedVolume.name));
1579 					strlcpy(selectedVolume.device, device,
1580 						sizeof(selectedVolume.device));
1581 					strlcpy(selectedVolume.filesystem, filesystem,
1582 						sizeof(selectedVolume.filesystem));
1583 					selectedVolume.capacity = atoll(capacity);
1584 				} else {
1585 					// Something isn't right with the swap config; fall back to auto
1586 					swapAutomatic = true;
1587 					dprintf("%s: virtual_memory configuration is invalid, "
1588 						"using automatic swap\n", __func__);
1589 				}
1590 			}
1591 		}
1592 		unload_driver_settings(settings);
1593 	}
1594 
1595 	if (swapAutomatic) {
1596 		swapSize = (off_t)vm_page_num_pages() * B_PAGE_SIZE;
1597 		if (swapSize <= (1024 * 1024 * 1024)) {
1598 			// Less than 1 GB of memory? Double the swap.
1599 			swapSize *= 2;
1600 		}
1601 		// Automatic swap defaults to the boot device
1602 		swapDeviceID = gBootDevice;
1603 	}
1604 
1605 	if (!swapEnabled || swapSize < B_PAGE_SIZE) {
1606 		dprintf("%s: virtual_memory is disabled\n", __func__);
1607 		return;
1608 	}
1609 
1610 	if (!swapAutomatic && swapDeviceID < 0) {
1611 		// Swap was configured manually, but no swap device has been chosen yet.
1612 		KDiskDeviceManager::CreateDefault();
1613 		KDiskDeviceManager* manager = KDiskDeviceManager::Default();
1614 		PartitionScorer visitor(selectedVolume);
1615 
1616 		KDiskDevice* device;
1617 		int32 cookie = 0;
1618 		while ((device = manager->NextDevice(&cookie)) != NULL) {
1619 			if (device->IsReadOnlyMedia() || device->IsWriteOnce()
1620 				|| device->IsRemovable()) {
1621 				continue;
1622 			}
1623 			device->VisitEachDescendant(&visitor);
1624 		}
1625 
1626 		if (!visitor.fBestPartition) {
1627 			dprintf("%s: Can't find configured swap partition '%s'\n",
1628 				__func__, selectedVolume.name);
1629 		} else {
1630 			if (visitor.fBestPartition->IsMounted())
1631 				swapDeviceID = visitor.fBestPartition->VolumeID();
1632 			else {
1633 				KPath devPath, mountPoint;
1634 				visitor.fBestPartition->GetPath(&devPath);
1635 				get_mount_point(visitor.fBestPartition, &mountPoint);
1636 				const char* mountPath = mountPoint.Path();
1637 				mkdir(mountPath, S_IRWXU | S_IRWXG | S_IRWXO);
1638 				swapDeviceID = _kern_mount(mountPath, devPath.Path(),
1639 					NULL, 0, NULL, 0);
1640 				if (swapDeviceID < 0) {
1641 					dprintf("%s: Can't mount configured swap partition '%s'\n",
1642 						__func__, selectedVolume.name);
1643 				}
1644 			}
1645 		}
1646 	}
1647 
1648 	if (swapDeviceID < 0)
1649 		swapDeviceID = gBootDevice;
1650 
1651 	// We now have the swapDeviceID on which the swap file will be created
1652 
1653 	KPath path;
1654 	struct fs_info info;
1655 	_kern_read_fs_info(swapDeviceID, &info);
1656 	if (swapDeviceID == gBootDevice)
1657 		path = kDefaultSwapPath;
1658 	else {
1659 		vfs_entry_ref_to_path(info.dev, info.root, ".", true, path.LockBuffer(),
1660 			path.BufferSize());
1661 		path.UnlockBuffer();
1662 		path.Append("swap");
1663 	}
1664 
1665 	const char* swapPath = path.Path();
1666 
1667 	// Swap size limits prevent oversized swap files
1668 	if (swapAutomatic) {
1669 		off_t existingSwapSize = 0;
1670 		struct stat existingSwapStat;
1671 		if (stat(swapPath, &existingSwapStat) == 0)
1672 			existingSwapSize = existingSwapStat.st_size;
1673 
1674 		off_t freeSpace = info.free_blocks * info.block_size + existingSwapSize;
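		// An already existing swap file counts towards the free space, since
		// it is recreated with the new size below.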
1675 
1676 		// Adjust automatic swap to a maximum of 25% of the free space
1677 		if (swapSize > (freeSpace / 4))
1678 			swapSize = (freeSpace / 4);
1679 	}
1680 
1681 	// Create swap file
1682 	int fd = open(swapPath, O_RDWR | O_CREAT | O_NOCACHE, S_IRUSR | S_IWUSR);
1683 	if (fd < 0) {
1684 		dprintf("%s: Can't open/create %s: %s\n", __func__,
1685 			swapPath, strerror(errno));
1686 		return;
1687 	}
1688 
1689 	struct stat stat;
1690 	stat.st_size = swapSize;
1691 	status_t error = _kern_write_stat(fd, NULL, false, &stat,
1692 		sizeof(struct stat), B_STAT_SIZE | B_STAT_SIZE_INSECURE);
1693 	if (error != B_OK) {
1694 		dprintf("%s: Failed to resize %s to %" B_PRIdOFF " bytes: %s\n",
1695 			__func__, swapPath, swapSize, strerror(error));
1696 	}
1697 
1698 	close(fd);
1699 
1700 	error = swap_file_add(swapPath);
1701 	if (error != B_OK) {
1702 		dprintf("%s: Failed to add swap file %s: %s\n", __func__, swapPath,
1703 			strerror(error));
1704 	}
1705 }
1706 
1707 
1708 //! Used by the page daemon to free swap space.
1709 bool
1710 swap_free_page_swap_space(vm_page* page)
1711 {
1712 	VMAnonymousCache* cache = dynamic_cast<VMAnonymousCache*>(page->Cache());
1713 	if (cache == NULL)
1714 		return false;
1715 
1716 	swap_addr_t slotIndex = cache->_SwapBlockGetAddress(page->cache_offset);
1717 	if (slotIndex == SWAP_SLOT_NONE)
1718 		return false;
1719 
1720 	swap_slot_dealloc(slotIndex, 1);
1721 	cache->fAllocatedSwapSize -= B_PAGE_SIZE;
1722 	cache->_SwapBlockFree(page->cache_offset, 1);
1723 
1724 	return true;
1725 }
1726 
1727 
1728 uint32
1729 swap_available_pages()
1730 {
1731 	mutex_lock(&sAvailSwapSpaceLock);
1732 	uint32 avail = sAvailSwapSpace >> PAGE_SHIFT;
1733 	mutex_unlock(&sAvailSwapSpaceLock);
1734 
1735 	return avail;
1736 }
1737 
1738 
1739 uint32
1740 swap_total_swap_pages()
1741 {
1742 	mutex_lock(&sSwapFileListLock);
1743 
1744 	uint32 totalSwapSlots = 0;
1745 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1746 		swap_file* swapFile = it.Next();) {
1747 		totalSwapSlots += swapFile->last_slot - swapFile->first_slot;
1748 	}
1749 
1750 	mutex_unlock(&sSwapFileListLock);
1751 
1752 	return totalSwapSlots;
1753 }
1754 
1755 
1756 #endif	// ENABLE_SWAP_SUPPORT
1757 
1758 
1759 void
1760 swap_get_info(system_info* info)
1761 {
1762 #if ENABLE_SWAP_SUPPORT
1763 	MutexLocker locker(sSwapFileListLock);
1764 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1765 		swap_file* swapFile = it.Next();) {
1766 		info->max_swap_pages += swapFile->last_slot - swapFile->first_slot;
1767 		info->free_swap_pages += swapFile->bmp->free_slots;
1768 	}
1769 #else
1770 	info->max_swap_pages = 0;
1771 	info->free_swap_pages = 0;
1772 #endif
1773 }
1774 
1775