xref: /haiku/src/system/kernel/vm/VMAnonymousCache.cpp (revision 02354704729d38c3b078c696adc1bbbd33cbcf72)
1 /*
2  * Copyright 2008, Zhao Shuai, upczhsh@163.com.
3  * Copyright 2008-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
4  * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
5  * Distributed under the terms of the MIT License.
6  *
7  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
8  * Distributed under the terms of the NewOS License.
9  *
10  * Copyright 2011-2012 Haiku, Inc. All rights reserved.
11  * Distributed under the terms of the MIT License.
12  *
13  * Authors:
14  *		Hamish Morrison, hamish@lavabit.com
15  *		Alexander von Gluck IV, kallisti5@unixzen.com
16  */
17 
18 
19 #include "VMAnonymousCache.h"
20 
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <unistd.h>
26 
27 #include <FindDirectory.h>
28 #include <KernelExport.h>
29 #include <NodeMonitor.h>
30 
31 #include <arch_config.h>
32 #include <boot_device.h>
33 #include <disk_device_manager/KDiskDevice.h>
34 #include <disk_device_manager/KDiskDeviceManager.h>
35 #include <disk_device_manager/KDiskSystem.h>
36 #include <disk_device_manager/KPartitionVisitor.h>
37 #include <driver_settings.h>
38 #include <fs/fd.h>
39 #include <fs/KPath.h>
40 #include <fs_info.h>
41 #include <fs_interface.h>
42 #include <heap.h>
43 #include <kernel_daemon.h>
44 #include <slab/Slab.h>
45 #include <syscalls.h>
46 #include <system_info.h>
47 #include <thread.h>
48 #include <tracing.h>
49 #include <util/AutoLock.h>
50 #include <util/DoublyLinkedList.h>
51 #include <util/OpenHashTable.h>
52 #include <util/RadixBitmap.h>
53 #include <vfs.h>
54 #include <vm/vm.h>
55 #include <vm/vm_page.h>
56 #include <vm/vm_priv.h>
57 #include <vm/VMAddressSpace.h>
58 
59 #include "IORequest.h"
60 #include "VMUtils.h"
61 
62 
63 #if	ENABLE_SWAP_SUPPORT
64 
65 //#define TRACE_VM_ANONYMOUS_CACHE
66 #ifdef TRACE_VM_ANONYMOUS_CACHE
67 #	define TRACE(x...) dprintf(x)
68 #else
69 #	define TRACE(x...) do { } while (false)
70 #endif
71 
72 
73 // number of free swap blocks the object cache shall minimally have
74 #define MIN_SWAP_BLOCK_RESERVE	4096
75 
76 // interval at which the hash resizer is triggered (in units of 0.1s)
77 #define SWAP_HASH_RESIZE_INTERVAL	5
78 
79 #define INITIAL_SWAP_HASH_SIZE		1024
80 
81 #define SWAP_SLOT_NONE	RADIX_SLOT_NONE
82 
83 #define SWAP_BLOCK_PAGES 32
84 #define SWAP_BLOCK_SHIFT 5		/* 1 << SWAP_BLOCK_SHIFT == SWAP_BLOCK_PAGES */
85 #define SWAP_BLOCK_MASK  (SWAP_BLOCK_PAGES - 1)
86 
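// For illustration (not from the original source): a cache page index is
// decomposed by these constants as
//
//   off_t blockIndex = pageIndex >> SWAP_BLOCK_SHIFT;  // which swap_block
//   uint32 slot = pageIndex & SWAP_BLOCK_MASK;         // index into swap_slots[]
//
// so, e.g., page index 37 falls into swap block 1, slot 5.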
87 
88 static const char* const kDefaultSwapPath = "/var/swap";
89 
90 struct swap_file : DoublyLinkedListLinkImpl<swap_file> {
91 	int				fd;
92 	struct vnode*	vnode;
93 	void*			cookie;
94 	swap_addr_t		first_slot;
95 	swap_addr_t		last_slot;
96 	radix_bitmap*	bmp;
97 };
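
// first_slot and last_slot position a file's pages inside the single global
// swap slot namespace; bmp tracks free slots relative to first_slot (see
// swap_slot_alloc()/swap_slot_dealloc(), which add/subtract first_slot).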
98 
99 struct swap_hash_key {
100 	VMAnonymousCache	*cache;
101 	off_t				page_index;  // page index in the cache
102 };
103 
104 // Each swap block contains swap address information for
105 // SWAP_BLOCK_PAGES contiguous pages from the same cache
106 struct swap_block {
107 	swap_block*		hash_link;
108 	swap_hash_key	key;
109 	uint32			used;
110 	swap_addr_t		swap_slots[SWAP_BLOCK_PAGES];
111 };
112 
113 struct SwapHashTableDefinition {
114 	typedef swap_hash_key KeyType;
115 	typedef swap_block ValueType;
116 
117 	SwapHashTableDefinition() {}
118 
119 	size_t HashKey(const swap_hash_key& key) const
120 	{
121 		off_t blockIndex = key.page_index >> SWAP_BLOCK_SHIFT;
122 		VMAnonymousCache* cache = key.cache;
123 		return blockIndex ^ (size_t)(int*)cache;
124 	}
125 
126 	size_t Hash(const swap_block* value) const
127 	{
128 		return HashKey(value->key);
129 	}
130 
131 	bool Compare(const swap_hash_key& key, const swap_block* value) const
132 	{
133 		return (key.page_index & ~(off_t)SWAP_BLOCK_MASK)
134 				== (value->key.page_index & ~(off_t)SWAP_BLOCK_MASK)
135 			&& key.cache == value->key.cache;
136 	}
137 
138 	swap_block*& GetLink(swap_block* value) const
139 	{
140 		return value->hash_link;
141 	}
142 };
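
// Note that HashKey() and Compare() work at swap-block granularity: every
// page index within the same SWAP_BLOCK_PAGES-aligned run of a cache hashes
// and compares equal, so a single swap_block entry answers lookups for all
// 32 of its pages.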
143 
144 typedef BOpenHashTable<SwapHashTableDefinition> SwapHashTable;
145 typedef DoublyLinkedList<swap_file> SwapFileList;
146 
147 static SwapHashTable sSwapHashTable;
148 static rw_lock sSwapHashLock;
149 
150 static SwapFileList sSwapFileList;
151 static mutex sSwapFileListLock;
152 static swap_file* sSwapFileAlloc = NULL; // allocate from here
153 static uint32 sSwapFileCount = 0;
154 
155 static off_t sAvailSwapSpace = 0;
156 static mutex sAvailSwapSpaceLock;
157 
158 static object_cache* sSwapBlockCache;
159 
160 
161 #if SWAP_TRACING
162 namespace SwapTracing {
163 
164 class SwapTraceEntry : public AbstractTraceEntry {
165 public:
166 	SwapTraceEntry(VMAnonymousCache* cache)
167 		:
168 		fCache(cache)
169 	{
170 	}
171 
172 protected:
173 	VMAnonymousCache*	fCache;
174 };
175 
176 
177 class ReadPage : public SwapTraceEntry {
178 public:
179 	ReadPage(VMAnonymousCache* cache, page_num_t pageIndex,
180 		swap_addr_t swapSlotIndex)
181 		:
182 		SwapTraceEntry(cache),
183 		fPageIndex(pageIndex),
184 		fSwapSlotIndex(swapSlotIndex)
185 	{
186 		Initialized();
187 	}
188 
189 	virtual void AddDump(TraceOutput& out)
190 	{
191 		out.Print("swap read:  cache %p, page index: %lu <- swap slot: %lu",
192 			fCache, fPageIndex, fSwapSlotIndex);
193 	}
194 
195 private:
196 	page_num_t		fPageIndex;
197 	swap_addr_t		fSwapSlotIndex;
198 };
199 
200 
201 class WritePage : public SwapTraceEntry {
202 public:
203 	WritePage(VMAnonymousCache* cache, page_num_t pageIndex,
204 		swap_addr_t swapSlotIndex)
205 		:
206 		SwapTraceEntry(cache),
207 		fPageIndex(pageIndex),
208 		fSwapSlotIndex(swapSlotIndex)
209 	{
210 		Initialized();
211 	}
212 
213 	virtual void AddDump(TraceOutput& out)
214 	{
215 		out.Print("swap write: cache %p, page index: %lu -> swap slot: %lu",
216 			fCache, fPageIndex, fSwapSlotIndex);
217 	}
218 
219 private:
220 	page_num_t		fPageIndex;
221 	swap_addr_t		fSwapSlotIndex;
222 };
223 
224 }	// namespace SwapTracing
225 
226 #	define T(x) new(std::nothrow) SwapTracing::x;
227 #else
228 #	define T(x) ;
229 #endif
230 
231 
232 static int
233 dump_swap_info(int argc, char** argv)
234 {
235 	swap_addr_t totalSwapPages = 0;
236 	swap_addr_t freeSwapPages = 0;
237 
238 	kprintf("swap files:\n");
239 
240 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
241 		swap_file* file = it.Next();) {
242 		swap_addr_t total = file->last_slot - file->first_slot;
243 		kprintf("  vnode: %p, pages: total: %" B_PRIu32 ", free: %" B_PRIu32
244 			"\n", file->vnode, total, file->bmp->free_slots);
245 
246 		totalSwapPages += total;
247 		freeSwapPages += file->bmp->free_slots;
248 	}
249 
250 	kprintf("\n");
251 	kprintf("swap space in pages:\n");
252 	kprintf("total:     %9" B_PRIu32 "\n", totalSwapPages);
253 	kprintf("available: %9" B_PRIdOFF "\n", sAvailSwapSpace / B_PAGE_SIZE);
254 	kprintf("reserved:  %9" B_PRIdOFF "\n",
255 		totalSwapPages - sAvailSwapSpace / B_PAGE_SIZE);
256 	kprintf("used:      %9" B_PRIu32 "\n", totalSwapPages - freeSwapPages);
257 	kprintf("free:      %9" B_PRIu32 "\n", freeSwapPages);
258 
259 	return 0;
260 }
261 
262 
263 static swap_addr_t
264 swap_slot_alloc(uint32 count)
265 {
266 	mutex_lock(&sSwapFileListLock);
267 
268 	if (sSwapFileList.IsEmpty()) {
269 		mutex_unlock(&sSwapFileListLock);
270 		panic("swap_slot_alloc(): no swap file in the system\n");
271 		return SWAP_SLOT_NONE;
272 	}
273 
274 	// Since the radix bitmap cannot handle more than 32 pages, we return
275 	// SWAP_SLOT_NONE; this forces Write() to adjust the allocation amount.
276 	if (count > BITMAP_RADIX) {
277 		mutex_unlock(&sSwapFileListLock);
278 		return SWAP_SLOT_NONE;
279 	}
280 
281 	swap_addr_t j, addr = SWAP_SLOT_NONE;
282 	for (j = 0; j < sSwapFileCount; j++) {
283 		if (sSwapFileAlloc == NULL)
284 			sSwapFileAlloc = sSwapFileList.First();
285 
286 		addr = radix_bitmap_alloc(sSwapFileAlloc->bmp, count);
287 		if (addr != SWAP_SLOT_NONE) {
288 			addr += sSwapFileAlloc->first_slot;
289 			break;
290 		}
291 
292 		// this swap_file is full, find another
293 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
294 	}
295 
296 	if (j == sSwapFileCount) {
297 		mutex_unlock(&sSwapFileListLock);
298 		panic("swap_slot_alloc: swap space exhausted!\n");
299 		return SWAP_SLOT_NONE;
300 	}
301 
302 	// If this swap file has used more than 90 percent of its space,
303 	// switch to another.
304 	if (sSwapFileAlloc->bmp->free_slots
305 		< (sSwapFileAlloc->last_slot - sSwapFileAlloc->first_slot) / 10) {
306 		sSwapFileAlloc = sSwapFileList.GetNext(sSwapFileAlloc);
307 	}
308 
309 	mutex_unlock(&sSwapFileListLock);
310 
311 	return addr;
312 }
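
// For illustration, a hypothetical caller (writeFailed is a stand-in, not a
// real variable): allocate a run of slots, map it back to its file, and roll
// the allocation back on error -- this mirrors what Write() does further below.
//
//   swap_addr_t slot = swap_slot_alloc(n);
//   if (slot != SWAP_SLOT_NONE) {
//       swap_file* file = find_swap_file(slot);
//       off_t pos = (off_t)(slot - file->first_slot) * B_PAGE_SIZE;
//       // ... write n pages to file->vnode at pos ...
//       if (writeFailed)
//           swap_slot_dealloc(slot, n);
//   }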
313 
314 
315 static swap_file*
316 find_swap_file(swap_addr_t slotIndex)
317 {
318 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
319 		swap_file* swapFile = it.Next();) {
320 		if (slotIndex >= swapFile->first_slot
321 			&& slotIndex < swapFile->last_slot) {
322 			return swapFile;
323 		}
324 	}
325 
326 	panic("find_swap_file(): can't find swap file for slot %" B_PRIu32 "\n",
327 		slotIndex);
328 	return NULL;
329 }
330 
331 
332 static void
333 swap_slot_dealloc(swap_addr_t slotIndex, uint32 count)
334 {
335 	if (slotIndex == SWAP_SLOT_NONE)
336 		return;
337 
338 	mutex_lock(&sSwapFileListLock);
339 	swap_file* swapFile = find_swap_file(slotIndex);
340 	slotIndex -= swapFile->first_slot;
341 	radix_bitmap_dealloc(swapFile->bmp, slotIndex, count);
342 	mutex_unlock(&sSwapFileListLock);
343 }
344 
345 
346 static off_t
347 swap_space_reserve(off_t amount)
348 {
349 	mutex_lock(&sAvailSwapSpaceLock);
350 	if (sAvailSwapSpace >= amount)
351 		sAvailSwapSpace -= amount;
352 	else {
353 		amount = sAvailSwapSpace;
354 		sAvailSwapSpace = 0;
355 	}
356 	mutex_unlock(&sAvailSwapSpaceLock);
357 
358 	return amount;
359 }
360 
361 
362 static void
363 swap_space_unreserve(off_t amount)
364 {
365 	mutex_lock(&sAvailSwapSpaceLock);
366 	sAvailSwapSpace += amount;
367 	mutex_unlock(&sAvailSwapSpaceLock);
368 }
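
// Note that swap_space_reserve() may grant less than requested: it returns
// the amount actually reserved, which callers such as _Commit() compare
// against the requested size.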
369 
370 
371 static void
372 swap_hash_resizer(void*, int)
373 {
374 	WriteLocker locker(sSwapHashLock);
375 
376 	size_t size;
377 	void* allocation;
378 
379 	do {
380 		size = sSwapHashTable.ResizeNeeded();
381 		if (size == 0)
382 			return;
383 
384 		locker.Unlock();
385 
386 		allocation = malloc(size);
387 		if (allocation == NULL)
388 			return;
389 
390 		locker.Lock();
391 
392 	} while (!sSwapHashTable.Resize(allocation, size));
393 }
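
// The resizer drops sSwapHashLock around malloc(), since allocating may
// block; because the table can keep growing while unlocked, Resize() is
// retried in a loop until the allocation is still large enough.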
394 
395 
396 // #pragma mark -
397 
398 
399 class VMAnonymousCache::WriteCallback : public StackableAsyncIOCallback {
400 public:
401 	WriteCallback(VMAnonymousCache* cache, AsyncIOCallback* callback)
402 		:
403 		StackableAsyncIOCallback(callback),
404 		fCache(cache)
405 	{
406 	}
407 
408 	void SetTo(page_num_t pageIndex, swap_addr_t slotIndex, bool newSlot)
409 	{
410 		fPageIndex = pageIndex;
411 		fSlotIndex = slotIndex;
412 		fNewSlot = newSlot;
413 	}
414 
415 	virtual void IOFinished(status_t status, bool partialTransfer,
416 		generic_size_t bytesTransferred)
417 	{
418 		if (fNewSlot) {
419 			if (status == B_OK) {
420 				fCache->_SwapBlockBuild(fPageIndex, fSlotIndex, 1);
421 			} else {
422 				AutoLocker<VMCache> locker(fCache);
423 				fCache->fAllocatedSwapSize -= B_PAGE_SIZE;
424 				locker.Unlock();
425 
426 				swap_slot_dealloc(fSlotIndex, 1);
427 			}
428 		}
429 
430 		fNextCallback->IOFinished(status, partialTransfer, bytesTransferred);
431 
432 		delete this;
433 	}
434 
435 private:
436 	VMAnonymousCache*	fCache;
437 	page_num_t			fPageIndex;
438 	swap_addr_t			fSlotIndex;
439 	bool				fNewSlot;
440 };
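
// On success, IOFinished() above records the freshly allocated slot via
// _SwapBlockBuild(); on failure it rolls back both the cache's
// fAllocatedSwapSize accounting and the slot allocation before forwarding
// the result to the next callback in the stack.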
441 
442 
443 // #pragma mark -
444 
445 
446 VMAnonymousCache::~VMAnonymousCache()
447 {
448 	_FreeSwapPageRange(virtual_base, virtual_end, false);
449 	swap_space_unreserve(fCommittedSwapSize);
450 	if (committed_size > fCommittedSwapSize)
451 		vm_unreserve_memory(committed_size - fCommittedSwapSize);
452 }
453 
454 
455 status_t
456 VMAnonymousCache::Init(bool canOvercommit, int32 numPrecommittedPages,
457 	int32 numGuardPages, uint32 allocationFlags)
458 {
459 	TRACE("%p->VMAnonymousCache::Init(canOvercommit = %s, "
460 		"numPrecommittedPages = %" B_PRId32 ", numGuardPages = %" B_PRId32
461 		")\n", this, canOvercommit ? "yes" : "no", numPrecommittedPages,
462 		numGuardPages);
463 
464 	status_t error = VMCache::Init(CACHE_TYPE_RAM, allocationFlags);
465 	if (error != B_OK)
466 		return error;
467 
468 	fCanOvercommit = canOvercommit;
469 	fHasPrecommitted = false;
470 	fPrecommittedPages = min_c(numPrecommittedPages, 255);
471 	fGuardedSize = numGuardPages * B_PAGE_SIZE;
472 	fCommittedSwapSize = 0;
473 	fAllocatedSwapSize = 0;
474 
475 	return B_OK;
476 }
477 
478 
479 void
480 VMAnonymousCache::_FreeSwapPageRange(off_t fromOffset, off_t toOffset,
481 	bool skipBusyPages)
482 {
483 	swap_block* swapBlock = NULL;
484 	off_t toIndex = toOffset >> PAGE_SHIFT;
485 	for (off_t pageIndex = fromOffset >> PAGE_SHIFT;
486 		pageIndex < toIndex && fAllocatedSwapSize > 0; pageIndex++) {
487 
488 		WriteLocker locker(sSwapHashLock);
489 
490 		// Get the swap slot index for the page.
491 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
492 		if (swapBlock == NULL || blockIndex == 0) {
493 			swap_hash_key key = { this, pageIndex };
494 			swapBlock = sSwapHashTable.Lookup(key);
495 
496 			if (swapBlock == NULL) {
497 				pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1;
498 				continue;
499 			}
500 		}
501 
502 		swap_addr_t slotIndex = swapBlock->swap_slots[blockIndex];
503 		if (slotIndex == SWAP_SLOT_NONE)
504 			continue;
505 
506 		if (skipBusyPages) {
507 			vm_page* page = LookupPage(pageIndex * B_PAGE_SIZE);
508 			if (page != NULL && page->busy) {
509 				// TODO: We skip (i.e. leak) swap space of busy pages, since
510 				// there could be I/O going on (paging in/out). Waiting is
511 				// not an option as 1. unlocking the cache means that new
512 				// swap pages could be added in a range we've already
513 				// cleared (since the cache still has the old size) and 2.
514 				// we'd risk a deadlock in case we come from the file cache
515 				// and the FS holds the node's write-lock. We should mark
516 				// the page invalid and let the one responsible clean up.
517 				// There's just no such mechanism yet.
518 				continue;
519 			}
520 		}
521 
522 		swap_slot_dealloc(slotIndex, 1);
523 		fAllocatedSwapSize -= B_PAGE_SIZE;
524 
525 		swapBlock->swap_slots[blockIndex] = SWAP_SLOT_NONE;
526 		if (--swapBlock->used == 0) {
527 			// All swap pages have been freed -- we can discard the swap block.
528 			sSwapHashTable.RemoveUnchecked(swapBlock);
529 			object_cache_free(sSwapBlockCache, swapBlock,
530 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
531 
532 			// There are no swap pages for any remaining pages in this block;
533 			// skip to the next block.
534 			pageIndex = ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1;
535 			swapBlock = NULL;
536 		}
537 	}
538 }
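
// The ROUNDUP(pageIndex + 1, SWAP_BLOCK_PAGES) - 1 idiom used above advances
// pageIndex to the last page of the current swap block, so that the loop's
// pageIndex++ lands on the first page of the next block.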
539 
540 
541 status_t
542 VMAnonymousCache::Resize(off_t newSize, int priority)
543 {
544 	_FreeSwapPageRange(newSize + B_PAGE_SIZE - 1,
545 		virtual_end + B_PAGE_SIZE - 1);
546 	return VMCache::Resize(newSize, priority);
547 }
548 
549 
550 status_t
551 VMAnonymousCache::Rebase(off_t newBase, int priority)
552 {
553 	_FreeSwapPageRange(virtual_base, newBase);
554 	return VMCache::Rebase(newBase, priority);
555 }
556 
557 
558 status_t
559 VMAnonymousCache::Discard(off_t offset, off_t size)
560 {
561 	_FreeSwapPageRange(offset, offset + size);
562 	return VMCache::Discard(offset, size);
563 }
564 
565 
566 /*!	Moves the swap pages for the given range from the source cache into this
567 	cache. Both caches must be locked.
568 */
569 status_t
570 VMAnonymousCache::Adopt(VMCache* _source, off_t offset, off_t size,
571 	off_t newOffset)
572 {
573 	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
574 	if (source == NULL) {
575 		panic("VMAnonymousCache::Adopt(): adopt from incompatible cache %p "
576 			"requested", _source);
577 		return B_ERROR;
578 	}
579 
580 	off_t pageIndex = newOffset >> PAGE_SHIFT;
581 	off_t sourcePageIndex = offset >> PAGE_SHIFT;
582 	off_t sourceEndPageIndex = (offset + size + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
583 	swap_block* swapBlock = NULL;
584 
585 	WriteLocker locker(sSwapHashLock);
586 
587 	while (sourcePageIndex < sourceEndPageIndex
588 			&& source->fAllocatedSwapSize > 0) {
589 		swap_addr_t left
590 			= SWAP_BLOCK_PAGES - (sourcePageIndex & SWAP_BLOCK_MASK);
591 
592 		swap_hash_key sourceKey = { source, sourcePageIndex };
593 		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(sourceKey);
594 		if (sourceSwapBlock == NULL || sourceSwapBlock->used == 0) {
595 			sourcePageIndex += left;
596 			pageIndex += left;
597 			swapBlock = NULL;
598 			continue;
599 		}
600 
601 		for (; left > 0 && sourceSwapBlock->used > 0;
602 				left--, sourcePageIndex++, pageIndex++) {
603 
604 			swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
605 			if (swapBlock == NULL || blockIndex == 0) {
606 				swap_hash_key key = { this, pageIndex };
607 				swapBlock = sSwapHashTable.Lookup(key);
608 
609 				if (swapBlock == NULL) {
610 					swapBlock = (swap_block*)object_cache_alloc(sSwapBlockCache,
611 						CACHE_DONT_WAIT_FOR_MEMORY
612 							| CACHE_DONT_LOCK_KERNEL_SPACE);
613 					if (swapBlock == NULL)
614 						return B_NO_MEMORY;
615 
616 					swapBlock->key.cache = this;
617 					swapBlock->key.page_index
618 						= pageIndex & ~(off_t)SWAP_BLOCK_MASK;
619 					swapBlock->used = 0;
620 					for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
621 						swapBlock->swap_slots[i] = SWAP_SLOT_NONE;
622 
623 					sSwapHashTable.InsertUnchecked(swapBlock);
624 				}
625 			}
626 
627 			swap_addr_t sourceBlockIndex = sourcePageIndex & SWAP_BLOCK_MASK;
628 			swap_addr_t slotIndex
629 				= sourceSwapBlock->swap_slots[sourceBlockIndex];
630 			if (slotIndex == SWAP_SLOT_NONE)
631 				continue;
632 
633 			ASSERT(swapBlock->swap_slots[blockIndex] == SWAP_SLOT_NONE);
634 
635 			swapBlock->swap_slots[blockIndex] = slotIndex;
636 			swapBlock->used++;
637 			fAllocatedSwapSize += B_PAGE_SIZE;
638 
639 			sourceSwapBlock->swap_slots[sourceBlockIndex] = SWAP_SLOT_NONE;
640 			sourceSwapBlock->used--;
641 			source->fAllocatedSwapSize -= B_PAGE_SIZE;
642 
643 			TRACE("adopted slot %#" B_PRIx32 " from %p at page %" B_PRIdOFF
644 				" to %p at page %" B_PRIdOFF "\n", slotIndex, source,
645 				sourcePageIndex, this, pageIndex);
646 		}
647 
648 		if (left > 0) {
649 			sourcePageIndex += left;
650 			pageIndex += left;
651 			swapBlock = NULL;
652 		}
653 
654 		if (sourceSwapBlock->used == 0) {
655 			// All swap pages have been adopted, we can discard the swap block.
656 			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
657 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
658 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
659 		}
660 	}
661 
662 	locker.Unlock();
663 
664 	return VMCache::Adopt(source, offset, size, newOffset);
665 }
666 
667 
668 status_t
669 VMAnonymousCache::Commit(off_t size, int priority)
670 {
671 	TRACE("%p->VMAnonymousCache::Commit(%" B_PRIdOFF ")\n", this, size);
672 
673 	// If we can overcommit, we don't commit here, but in Fault(). We always
674 	// unreserve memory if we're asked to shrink our commitment, though.
675 	if (fCanOvercommit && size > committed_size) {
676 		if (fHasPrecommitted)
677 			return B_OK;
678 
679 		// pre-commit some pages to make a later failure less probable
680 		fHasPrecommitted = true;
681 		uint32 precommitted = fPrecommittedPages * B_PAGE_SIZE;
682 		if (size > precommitted)
683 			size = precommitted;
684 	}
685 
686 	return _Commit(size, priority);
687 }
688 
689 
690 bool
691 VMAnonymousCache::HasPage(off_t offset)
692 {
693 	if (_SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE)
694 		return true;
695 
696 	return false;
697 }
698 
699 
700 bool
701 VMAnonymousCache::DebugHasPage(off_t offset)
702 {
703 	off_t pageIndex = offset >> PAGE_SHIFT;
704 	swap_hash_key key = { this, pageIndex };
705 	swap_block* swap = sSwapHashTable.Lookup(key);
706 	if (swap == NULL)
707 		return false;
708 
709 	return swap->swap_slots[pageIndex & SWAP_BLOCK_MASK] != SWAP_SLOT_NONE;
710 }
711 
712 
713 status_t
714 VMAnonymousCache::Read(off_t offset, const generic_io_vec* vecs, size_t count,
715 	uint32 flags, generic_size_t* _numBytes)
716 {
717 	off_t pageIndex = offset >> PAGE_SHIFT;
718 
719 	for (uint32 i = 0, j = 0; i < count; i = j) {
720 		swap_addr_t startSlotIndex = _SwapBlockGetAddress(pageIndex + i);
721 		for (j = i + 1; j < count; j++) {
722 			swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + j);
723 			if (slotIndex != startSlotIndex + j - i)
724 				break;
725 		}
726 
727 		T(ReadPage(this, pageIndex, startSlotIndex));
728 			// TODO: Assumes that only one page is read.
729 
730 		swap_file* swapFile = find_swap_file(startSlotIndex);
731 
732 		off_t pos = (off_t)(startSlotIndex - swapFile->first_slot)
733 			* B_PAGE_SIZE;
734 
735 		status_t status = vfs_read_pages(swapFile->vnode, swapFile->cookie, pos,
736 			vecs + i, j - i, flags, _numBytes);
737 		if (status != B_OK)
738 			return status;
739 	}
740 
741 	return B_OK;
742 }
743 
744 
745 status_t
746 VMAnonymousCache::Write(off_t offset, const generic_io_vec* vecs, size_t count,
747 	uint32 flags, generic_size_t* _numBytes)
748 {
749 	off_t pageIndex = offset >> PAGE_SHIFT;
750 
751 	AutoLocker<VMCache> locker(this);
752 
753 	page_num_t totalPages = 0;
754 	for (uint32 i = 0; i < count; i++) {
755 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
756 		swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex + totalPages);
757 		if (slotIndex != SWAP_SLOT_NONE) {
758 			swap_slot_dealloc(slotIndex, pageCount);
759 			_SwapBlockFree(pageIndex + totalPages, pageCount);
760 			fAllocatedSwapSize -= pageCount * B_PAGE_SIZE;
761 		}
762 
763 		totalPages += pageCount;
764 	}
765 
766 	off_t totalSize = totalPages * B_PAGE_SIZE;
767 	if (fAllocatedSwapSize + totalSize > fCommittedSwapSize)
768 		return B_ERROR;
769 
770 	fAllocatedSwapSize += totalSize;
771 	locker.Unlock();
772 
773 	page_num_t pagesLeft = totalPages;
774 	totalPages = 0;
775 
776 	for (uint32 i = 0; i < count; i++) {
777 		page_num_t pageCount = (vecs[i].length + B_PAGE_SIZE - 1) >> PAGE_SHIFT;
778 
779 		generic_addr_t vectorBase = vecs[i].base;
780 		generic_size_t vectorLength = vecs[i].length;
781 		page_num_t n = pageCount;
782 
783 		for (page_num_t j = 0; j < pageCount; j += n) {
784 			swap_addr_t slotIndex;
785 			// try to allocate n slots; on failure, halve n and retry
786 			while ((slotIndex = swap_slot_alloc(n)) == SWAP_SLOT_NONE && n >= 2)
787 				n >>= 1;
788 
789 			if (slotIndex == SWAP_SLOT_NONE)
790 				panic("VMAnonymousCache::Write(): can't allocate swap space\n");
791 
792 			T(WritePage(this, pageIndex, slotIndex));
793 				// TODO: Assumes that only one page is written.
794 
795 			swap_file* swapFile = find_swap_file(slotIndex);
796 
797 			off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
798 
799 			generic_size_t length = (phys_addr_t)n * B_PAGE_SIZE;
800 			generic_io_vec vector[1];
801 			vector->base = vectorBase;
802 			vector->length = length;
803 
804 			status_t status = vfs_write_pages(swapFile->vnode, swapFile->cookie,
805 				pos, vector, 1, flags, &length);
806 			if (status != B_OK) {
807 				locker.Lock();
808 				fAllocatedSwapSize -= (off_t)pagesLeft * B_PAGE_SIZE;
809 				locker.Unlock();
810 
811 				swap_slot_dealloc(slotIndex, n);
812 				return status;
813 			}
814 
815 			_SwapBlockBuild(pageIndex + totalPages, slotIndex, n);
816 			pagesLeft -= n;
817 
818 			if (n != pageCount) {
819 				vectorBase = vectorBase + n * B_PAGE_SIZE;
820 				vectorLength -= n * B_PAGE_SIZE;
821 			}
822 		}
823 
824 		totalPages += pageCount;
825 	}
826 
827 	ASSERT(pagesLeft == 0);
828 	return B_OK;
829 }
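
// Write() above degrades gracefully under swap fragmentation: if no
// contiguous run of n slots is available, the allocation size is halved
// until one is found, so a vector may be written as several smaller chunks.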
830 
831 
832 status_t
833 VMAnonymousCache::WriteAsync(off_t offset, const generic_io_vec* vecs,
834 	size_t count, generic_size_t numBytes, uint32 flags,
835 	AsyncIOCallback* _callback)
836 {
837 	// TODO: Currently this method is only used for single pages. Either make
838 	// more flexible use of it or change the interface!
839 	// This implementation relies on the current usage!
840 	ASSERT(count == 1);
841 	ASSERT(numBytes <= B_PAGE_SIZE);
842 
843 	page_num_t pageIndex = offset >> PAGE_SHIFT;
844 	swap_addr_t slotIndex = _SwapBlockGetAddress(pageIndex);
845 	bool newSlot = slotIndex == SWAP_SLOT_NONE;
846 
847 	// If the page doesn't have any swap space yet, allocate it.
848 	if (newSlot) {
849 		AutoLocker<VMCache> locker(this);
850 		if (fAllocatedSwapSize + B_PAGE_SIZE > fCommittedSwapSize) {
851 			_callback->IOFinished(B_ERROR, true, 0);
852 			return B_ERROR;
853 		}
854 
855 		fAllocatedSwapSize += B_PAGE_SIZE;
856 
857 		slotIndex = swap_slot_alloc(1);
858 	}
859 
860 	// create our callback
861 	WriteCallback* callback = (flags & B_VIP_IO_REQUEST) != 0
862 		? new(malloc_flags(HEAP_PRIORITY_VIP)) WriteCallback(this, _callback)
863 		: new(std::nothrow) WriteCallback(this, _callback);
864 	if (callback == NULL) {
865 		if (newSlot) {
866 			AutoLocker<VMCache> locker(this);
867 			fAllocatedSwapSize -= B_PAGE_SIZE;
868 			locker.Unlock();
869 
870 			swap_slot_dealloc(slotIndex, 1);
871 		}
872 		_callback->IOFinished(B_NO_MEMORY, true, 0);
873 		return B_NO_MEMORY;
874 	}
875 	// TODO: If the page already had swap space assigned, we don't need a
876 	// callback of our own.
877 
878 	callback->SetTo(pageIndex, slotIndex, newSlot);
879 
880 	T(WritePage(this, pageIndex, slotIndex));
881 
882 	// write the page asynchronously
883 	swap_file* swapFile = find_swap_file(slotIndex);
884 	off_t pos = (off_t)(slotIndex - swapFile->first_slot) * B_PAGE_SIZE;
885 
886 	return vfs_asynchronous_write_pages(swapFile->vnode, swapFile->cookie, pos,
887 		vecs, 1, numBytes, flags, callback);
888 }
889 
890 
891 bool
892 VMAnonymousCache::CanWritePage(off_t offset)
893 {
894 	// We can write the page if we have not used all of our committed swap
895 	// space, or if the page already has a swap slot assigned.
896 	return fAllocatedSwapSize < fCommittedSwapSize
897 		|| _SwapBlockGetAddress(offset >> PAGE_SHIFT) != SWAP_SLOT_NONE;
898 }
899 
900 
901 int32
902 VMAnonymousCache::MaxPagesPerAsyncWrite() const
903 {
904 	return 1;
905 }
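
// Returning 1 here matches the ASSERT(count == 1) in WriteAsync() above:
// this cache is handed at most one page per asynchronous write request.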
906 
907 
908 status_t
909 VMAnonymousCache::Fault(struct VMAddressSpace* aspace, off_t offset)
910 {
911 	if (fGuardedSize > 0) {
912 		uint32 guardOffset;
913 
914 #ifdef STACK_GROWS_DOWNWARDS
915 		guardOffset = 0;
916 #elif defined(STACK_GROWS_UPWARDS)
917 		guardOffset = virtual_size - fGuardedSize;
918 #else
919 #	error Stack direction has not been defined in arch_config.h
920 #endif
921 		// report a stack fault if a guard page was hit
922 		if (offset >= guardOffset && offset < guardOffset + fGuardedSize) {
923 			TRACE(("stack overflow!\n"));
924 			return B_BAD_ADDRESS;
925 		}
926 	}
927 
928 	if (fCanOvercommit && LookupPage(offset) == NULL && !HasPage(offset)) {
929 		if (fPrecommittedPages == 0) {
930 			// never commit more than needed
931 			if (committed_size / B_PAGE_SIZE > page_count)
932 				return B_BAD_HANDLER;
933 
934 			// try to commit additional swap space/memory
935 			if (swap_space_reserve(B_PAGE_SIZE) == B_PAGE_SIZE) {
936 				fCommittedSwapSize += B_PAGE_SIZE;
937 			} else {
938 				int priority = aspace == VMAddressSpace::Kernel()
939 					? VM_PRIORITY_SYSTEM : VM_PRIORITY_USER;
940 				if (vm_try_reserve_memory(B_PAGE_SIZE, priority, 0) != B_OK) {
941 					dprintf("%p->VMAnonymousCache::Fault(): Failed to reserve "
942 						"%d bytes of RAM.\n", this, (int)B_PAGE_SIZE);
943 					return B_NO_MEMORY;
944 				}
945 			}
946 
947 			committed_size += B_PAGE_SIZE;
948 		} else
949 			fPrecommittedPages--;
950 	}
951 
952 	// This will cause vm_soft_fault() to handle the fault
953 	return B_BAD_HANDLER;
954 }
955 
956 
957 void
958 VMAnonymousCache::Merge(VMCache* _source)
959 {
960 	VMAnonymousCache* source = dynamic_cast<VMAnonymousCache*>(_source);
961 	if (source == NULL) {
962 		panic("VMAnonymousCache::Merge(): merge with incompatible cache "
963 			"%p requested", _source);
964 		return;
965 	}
966 
967 	// take over the source's committed size
968 	fCommittedSwapSize += source->fCommittedSwapSize;
969 	source->fCommittedSwapSize = 0;
970 	committed_size += source->committed_size;
971 	source->committed_size = 0;
972 
973 	off_t actualSize = virtual_end - virtual_base;
974 	if (committed_size > actualSize)
975 		_Commit(actualSize, VM_PRIORITY_USER);
976 
977 	// Move all non-shadowed swap pages from the source to the consumer cache.
978 	// Also remove all source pages that are shadowed by consumer swap pages.
979 	_MergeSwapPages(source);
980 
981 	// Move all non-shadowed pages from the source to the consumer cache.
982 	if (source->page_count < page_count)
983 		_MergePagesSmallerSource(source);
984 	else
985 		_MergePagesSmallerConsumer(source);
986 }
987 
988 
989 void
990 VMAnonymousCache::DeleteObject()
991 {
992 	object_cache_delete(gAnonymousCacheObjectCache, this);
993 }
994 
995 
996 void
997 VMAnonymousCache::_SwapBlockBuild(off_t startPageIndex,
998 	swap_addr_t startSlotIndex, uint32 count)
999 {
1000 	WriteLocker locker(sSwapHashLock);
1001 
1002 	uint32 left = count;
1003 	for (uint32 i = 0, j = 0; i < count; i += j) {
1004 		off_t pageIndex = startPageIndex + i;
1005 		swap_addr_t slotIndex = startSlotIndex + i;
1006 
1007 		swap_hash_key key = { this, pageIndex };
1008 
1009 		swap_block* swap = sSwapHashTable.Lookup(key);
1010 		while (swap == NULL) {
1011 			swap = (swap_block*)object_cache_alloc(sSwapBlockCache,
1012 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1013 			if (swap == NULL) {
1014 				// Wait a short time until memory is available again.
1015 				locker.Unlock();
1016 				snooze(10000);
1017 				locker.Lock();
1018 				swap = sSwapHashTable.Lookup(key);
1019 				continue;
1020 			}
1021 
1022 			swap->key.cache = this;
1023 			swap->key.page_index = pageIndex & ~(off_t)SWAP_BLOCK_MASK;
1024 			swap->used = 0;
1025 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++)
1026 				swap->swap_slots[i] = SWAP_SLOT_NONE;
1027 
1028 			sSwapHashTable.InsertUnchecked(swap);
1029 		}
1030 
1031 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1032 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
1033 			swap->swap_slots[blockIndex++] = slotIndex + j;
1034 			left--;
1035 		}
1036 
1037 		swap->used += j;
1038 	}
1039 }
1040 
1041 
1042 void
1043 VMAnonymousCache::_SwapBlockFree(off_t startPageIndex, uint32 count)
1044 {
1045 	WriteLocker locker(sSwapHashLock);
1046 
1047 	uint32 left = count;
1048 	for (uint32 i = 0, j = 0; i < count; i += j) {
1049 		off_t pageIndex = startPageIndex + i;
1050 		swap_hash_key key = { this, pageIndex };
1051 		swap_block* swap = sSwapHashTable.Lookup(key);
1052 
1053 		ASSERT(swap != NULL);
1054 
1055 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1056 		for (j = 0; blockIndex < SWAP_BLOCK_PAGES && left > 0; j++) {
1057 			swap->swap_slots[blockIndex++] = SWAP_SLOT_NONE;
1058 			left--;
1059 		}
1060 
1061 		swap->used -= j;
1062 		if (swap->used == 0) {
1063 			sSwapHashTable.RemoveUnchecked(swap);
1064 			object_cache_free(sSwapBlockCache, swap,
1065 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1066 		}
1067 	}
1068 }
1069 
1070 
1071 swap_addr_t
1072 VMAnonymousCache::_SwapBlockGetAddress(off_t pageIndex)
1073 {
1074 	ReadLocker locker(sSwapHashLock);
1075 
1076 	swap_hash_key key = { this, pageIndex };
1077 	swap_block* swap = sSwapHashTable.Lookup(key);
1078 	swap_addr_t slotIndex = SWAP_SLOT_NONE;
1079 
1080 	if (swap != NULL) {
1081 		swap_addr_t blockIndex = pageIndex & SWAP_BLOCK_MASK;
1082 		slotIndex = swap->swap_slots[blockIndex];
1083 	}
1084 
1085 	return slotIndex;
1086 }
1087 
1088 
1089 status_t
1090 VMAnonymousCache::_Commit(off_t size, int priority)
1091 {
1092 	TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), already committed: "
1093 		"%" B_PRIdOFF " (%" B_PRIdOFF " swap)\n", this, size, committed_size,
1094 		fCommittedSwapSize);
1095 
1096 	// Basic strategy: reserve swap space first; only when running out of swap
1097 	// space, reserve real memory.
1098 
1099 	off_t committedMemory = committed_size - fCommittedSwapSize;
1100 
1101 	// Regardless of whether we're asked to grow or shrink the commitment,
1102 	// we always try to reserve as much as possible of the final commitment
1103 	// in the swap space.
1104 	if (size > fCommittedSwapSize) {
1105 		fCommittedSwapSize += swap_space_reserve(size - fCommittedSwapSize);
1106 		committed_size = fCommittedSwapSize + committedMemory;
1107 		if (size > fCommittedSwapSize) {
1108 			TRACE("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "), reserved "
1109 				"only %" B_PRIdOFF " swap\n", this, size, fCommittedSwapSize);
1110 		}
1111 	}
1112 
1113 	if (committed_size == size)
1114 		return B_OK;
1115 
1116 	if (committed_size > size) {
1117 		// The commitment shrinks -- unreserve real memory first.
1118 		off_t toUnreserve = committed_size - size;
1119 		if (committedMemory > 0) {
1120 			off_t unreserved = min_c(toUnreserve, committedMemory);
1121 			vm_unreserve_memory(unreserved);
1122 			committedMemory -= unreserved;
1123 			committed_size -= unreserved;
1124 			toUnreserve -= unreserved;
1125 		}
1126 
1127 		// Unreserve swap space.
1128 		if (toUnreserve > 0) {
1129 			swap_space_unreserve(toUnreserve);
1130 			fCommittedSwapSize -= toUnreserve;
1131 			committed_size -= toUnreserve;
1132 		}
1133 
1134 		return B_OK;
1135 	}
1136 
1137 	// The commitment grows -- we have already tried to reserve swap space at
1138 	// the start of the method, so we try to reserve real memory now.
1139 
1140 	off_t toReserve = size - committed_size;
1141 	if (vm_try_reserve_memory(toReserve, priority, 1000000) != B_OK) {
1142 		dprintf("%p->VMAnonymousCache::_Commit(%" B_PRIdOFF "): Failed to "
1143 			"reserve %" B_PRIdOFF " bytes of RAM\n", this, size, toReserve);
1144 		return B_NO_MEMORY;
1145 	}
1146 
1147 	committed_size = size;
1148 	return B_OK;
1149 }
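
// In short: a commitment is backed by swap first (fCommittedSwapSize) and
// only the remainder by RAM (committed_size - fCommittedSwapSize); growing
// reserves swap before RAM, while shrinking releases RAM before swap.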
1150 
1151 
1152 void
1153 VMAnonymousCache::_MergePagesSmallerSource(VMAnonymousCache* source)
1154 {
1155 	// The source cache has fewer pages than the consumer (this cache), so we
1156 	// iterate through the source's pages and move the ones that are not
1157 	// shadowed up to the consumer.
1158 
1159 	for (VMCachePagesTree::Iterator it = source->pages.GetIterator();
1160 			vm_page* page = it.Next();) {
1161 		// Note: Removing the current node while iterating through an
1162 		// IteratableSplayTree is safe.
1163 		vm_page* consumerPage = LookupPage(
1164 			(off_t)page->cache_offset << PAGE_SHIFT);
1165 		if (consumerPage == NULL) {
1166 			// the page is not yet in the consumer cache - move it upwards
1167 			ASSERT_PRINT(!page->busy, "page: %p", page);
1168 			MovePage(page);
1169 		}
1170 	}
1171 }
1172 
1173 
1174 void
1175 VMAnonymousCache::_MergePagesSmallerConsumer(VMAnonymousCache* source)
1176 {
1177 	// The consumer (this cache) has fewer pages than the source, so we move
1178 	// the consumer's pages to the source (freeing shadowed ones) and finally
1179 	// move all pages of the source back to the consumer.
1180 
1181 	for (VMCachePagesTree::Iterator it = pages.GetIterator();
1182 		vm_page* page = it.Next();) {
1183 		// If a source page is in the way, remove and free it.
1184 		vm_page* sourcePage = source->LookupPage(
1185 			(off_t)page->cache_offset << PAGE_SHIFT);
1186 		if (sourcePage != NULL) {
1187 			DEBUG_PAGE_ACCESS_START(sourcePage);
1188 			ASSERT_PRINT(!sourcePage->busy, "page: %p", sourcePage);
1189 			ASSERT_PRINT(sourcePage->WiredCount() == 0
1190 					&& sourcePage->mappings.IsEmpty(),
1191 				"sourcePage: %p, page: %p", sourcePage, page);
1192 			source->RemovePage(sourcePage);
1193 			vm_page_free(source, sourcePage);
1194 		}
1195 
1196 		// Note: Removing the current node while iterating through an
1197 		// IteratableSplayTree is safe.
1198 		source->MovePage(page);
1199 	}
1200 
1201 	MoveAllPages(source);
1202 }
1203 
1204 
1205 void
1206 VMAnonymousCache::_MergeSwapPages(VMAnonymousCache* source)
1207 {
1208 	// If neither the source nor the consumer has any swap pages, we don't
1209 	// have to do anything.
1210 	if (source->fAllocatedSwapSize == 0 && fAllocatedSwapSize == 0)
1211 		return;
1212 
1213 	for (off_t offset = source->virtual_base
1214 		& ~(off_t)(B_PAGE_SIZE * SWAP_BLOCK_PAGES - 1);
1215 		offset < source->virtual_end;
1216 		offset += B_PAGE_SIZE * SWAP_BLOCK_PAGES) {
1217 
1218 		WriteLocker locker(sSwapHashLock);
1219 
1220 		off_t swapBlockPageIndex = offset >> PAGE_SHIFT;
1221 		swap_hash_key key = { source, swapBlockPageIndex };
1222 		swap_block* sourceSwapBlock = sSwapHashTable.Lookup(key);
1223 
1224 		// remove the source swap block -- we will either take over the swap
1225 		// space (and the block) or free it
1226 		if (sourceSwapBlock != NULL)
1227 			sSwapHashTable.RemoveUnchecked(sourceSwapBlock);
1228 
1229 		key.cache = this;
1230 		swap_block* swapBlock = sSwapHashTable.Lookup(key);
1231 
1232 		locker.Unlock();
1233 
1234 		// remove all source pages that are shadowed by consumer swap pages
1235 		if (swapBlock != NULL) {
1236 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1237 				if (swapBlock->swap_slots[i] != SWAP_SLOT_NONE) {
1238 					vm_page* page = source->LookupPage(
1239 						(off_t)(swapBlockPageIndex + i) << PAGE_SHIFT);
1240 					if (page != NULL) {
1241 						DEBUG_PAGE_ACCESS_START(page);
1242 						ASSERT_PRINT(!page->busy, "page: %p", page);
1243 						source->RemovePage(page);
1244 						vm_page_free(source, page);
1245 					}
1246 				}
1247 			}
1248 		}
1249 
1250 		if (sourceSwapBlock == NULL)
1251 			continue;
1252 
1253 		for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1254 			off_t pageIndex = swapBlockPageIndex + i;
1255 			swap_addr_t sourceSlotIndex = sourceSwapBlock->swap_slots[i];
1256 
1257 			if (sourceSlotIndex == SWAP_SLOT_NONE)
1258 				continue;
1259 
1260 			if ((swapBlock != NULL
1261 					&& swapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1262 				|| LookupPage((off_t)pageIndex << PAGE_SHIFT) != NULL) {
1263 				// The consumer already has a page or a swapped out page
1264 				// at this index. So we can free the source swap space.
1265 				swap_slot_dealloc(sourceSlotIndex, 1);
1266 				sourceSwapBlock->swap_slots[i] = SWAP_SLOT_NONE;
1267 				sourceSwapBlock->used--;
1268 			}
1269 
1270 			// We've either freed the source swap page or are going to move it
1271 			// to the consumer. At any rate, the source cache doesn't own it
1272 			// anymore.
1273 			source->fAllocatedSwapSize -= B_PAGE_SIZE;
1274 		}
1275 
1276 		// All source swap pages that have not been freed yet are taken over by
1277 		// the consumer.
1278 		fAllocatedSwapSize += B_PAGE_SIZE * (off_t)sourceSwapBlock->used;
1279 
1280 		if (sourceSwapBlock->used == 0) {
1281 			// All swap pages have been freed -- we can discard the source swap
1282 			// block.
1283 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1284 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1285 		} else if (swapBlock == NULL) {
1286 			// We need to take over some of the source's swap pages and there's
1287 			// no swap block in the consumer cache. Just take over the source
1288 			// swap block.
1289 			sourceSwapBlock->key.cache = this;
1290 			locker.Lock();
1291 			sSwapHashTable.InsertUnchecked(sourceSwapBlock);
1292 			locker.Unlock();
1293 		} else {
1294 			// We need to take over some of the source's swap pages and there's
1295 			// already a swap block in the consumer cache. Copy the respective
1296 			// swap addresses and discard the source swap block.
1297 			for (uint32 i = 0; i < SWAP_BLOCK_PAGES; i++) {
1298 				if (sourceSwapBlock->swap_slots[i] != SWAP_SLOT_NONE)
1299 					swapBlock->swap_slots[i] = sourceSwapBlock->swap_slots[i];
1300 			}
1301 
1302 			object_cache_free(sSwapBlockCache, sourceSwapBlock,
1303 				CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1304 		}
1305 	}
1306 }
1307 
1308 
1309 // #pragma mark -
1310 
1311 
1312 // TODO: This can be removed if we get BFS UUIDs
1313 struct VolumeInfo {
1314 	char name[B_FILE_NAME_LENGTH];
1315 	char device[B_FILE_NAME_LENGTH];
1316 	char filesystem[B_OS_NAME_LENGTH];
1317 	off_t capacity;
1318 };
1319 
1320 
1321 class PartitionScorer : public KPartitionVisitor {
1322 public:
1323 	PartitionScorer(VolumeInfo& volumeInfo)
1324 		:
1325 		fBestPartition(NULL),
1326 		fBestScore(-1),
1327 		fVolumeInfo(volumeInfo)
1328 	{
1329 	}
1330 
1331 	virtual bool VisitPre(KPartition* partition)
1332 	{
1333 		if (!partition->ContainsFileSystem())
1334 			return false;
1335 
1336 		KPath path;
1337 		partition->GetPath(&path);
1338 
1339 		int score = 0;
1340 		if (strcmp(fVolumeInfo.name, partition->ContentName()) == 0)
1341 			score += 4;
1342 		if (strcmp(fVolumeInfo.device, path.Path()) == 0)
1343 			score += 3;
1344 		if (fVolumeInfo.capacity == partition->Size())
1345 			score += 2;
1346 		if (strcmp(fVolumeInfo.filesystem,
1347 			partition->DiskSystem()->ShortName()) == 0) {
1348 			score += 1;
1349 		}
1350 		if (score >= 4 && score > fBestScore) {
1351 			fBestPartition = partition;
1352 			fBestScore = score;
1353 		}
1354 
1355 		return false;
1356 	}
1357 
1358 	KPartition* fBestPartition;
1359 
1360 private:
1361 	int32		fBestScore;
1362 	VolumeInfo&	fVolumeInfo;
1363 };
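
// Scoring sketch: a partition qualifies once it reaches 4 points, so a
// matching volume name (4) suffices on its own, while the device path (3),
// capacity (2) and file system (1) must combine to reach the threshold.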
1364 
1365 
1366 status_t
1367 swap_file_add(const char* path)
1368 {
1369 	// open the file
1370 	int fd = open(path, O_RDWR | O_NOCACHE, S_IRUSR | S_IWUSR);
1371 	if (fd < 0)
1372 		return errno;
1373 
1374 	// fstat() it and check whether we can use it
1375 	struct stat st;
1376 	if (fstat(fd, &st) < 0) {
1377 		close(fd);
1378 		return errno;
1379 	}
1380 
1381 	if (!(S_ISREG(st.st_mode) || S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) {
1382 		close(fd);
1383 		return B_BAD_VALUE;
1384 	}
1385 
1386 	if (st.st_size < B_PAGE_SIZE) {
1387 		close(fd);
1388 		return B_BAD_VALUE;
1389 	}
1390 
1391 	// get file descriptor, vnode, and cookie
1392 	file_descriptor* descriptor = get_fd(get_current_io_context(true), fd);
1393 	put_fd(descriptor);
1394 
1395 	vnode* node = fd_vnode(descriptor);
1396 	if (node == NULL) {
1397 		close(fd);
1398 		return B_BAD_VALUE;
1399 	}
1400 
1401 	// do the allocations and prepare the swap_file structure
1402 	swap_file* swap = (swap_file*)malloc(sizeof(swap_file));
1403 	if (swap == NULL) {
1404 		close(fd);
1405 		return B_NO_MEMORY;
1406 	}
1407 
1408 	swap->fd = fd;
1409 	swap->vnode = node;
1410 	swap->cookie = descriptor->cookie;
1411 
1412 	uint32 pageCount = st.st_size >> PAGE_SHIFT;
1413 	swap->bmp = radix_bitmap_create(pageCount);
1414 	if (swap->bmp == NULL) {
1415 		free(swap);
1416 		close(fd);
1417 		return B_NO_MEMORY;
1418 	}
1419 
1420 	// set slot index and add this file to swap file list
1421 	mutex_lock(&sSwapFileListLock);
1422 	// TODO: Also check whether the swap file is already registered!
1423 	if (sSwapFileList.IsEmpty()) {
1424 		swap->first_slot = 0;
1425 		swap->last_slot = pageCount;
1426 	} else {
1427 		// leave a one-page gap between two swap files
1428 		swap->first_slot = sSwapFileList.Last()->last_slot + 1;
1429 		swap->last_slot = swap->first_slot + pageCount;
1430 	}
1431 	sSwapFileList.Add(swap);
1432 	sSwapFileCount++;
1433 	mutex_unlock(&sSwapFileListLock);
1434 
1435 	mutex_lock(&sAvailSwapSpaceLock);
1436 	sAvailSwapSpace += (off_t)pageCount * B_PAGE_SIZE;
1437 	mutex_unlock(&sAvailSwapSpaceLock);
1438 
1439 	return B_OK;
1440 }
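
// Usage sketch: swap_file_add("/var/swap") opens the file, assigns its pages
// a range in the global slot namespace (continuing one slot past the previous
// file's last_slot), and adds the new capacity to sAvailSwapSpace.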
1441 
1442 
1443 status_t
1444 swap_file_delete(const char* path)
1445 {
1446 	vnode* node = NULL;
1447 	status_t status = vfs_get_vnode_from_path(path, true, &node);
1448 	if (status != B_OK)
1449 		return status;
1450 
1451 	MutexLocker locker(sSwapFileListLock);
1452 
1453 	swap_file* swapFile = NULL;
1454 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1455 			(swapFile = it.Next()) != NULL;) {
1456 		if (swapFile->vnode == node)
1457 			break;
1458 	}
1459 
1460 	vfs_put_vnode(node);
1461 
1462 	if (swapFile == NULL)
1463 		return B_ERROR;
1464 
1465 	// if this file is currently in use, we can't delete it
1466 	// TODO: mark this swap file as being deleted, and remove it once all of
1467 	// its swap space has been released
1468 	if (swapFile->bmp->free_slots < swapFile->last_slot - swapFile->first_slot)
1469 		return B_ERROR;
1470 
1471 	sSwapFileList.Remove(swapFile);
1472 	sSwapFileCount--;
1473 	locker.Unlock();
1474 
1475 	mutex_lock(&sAvailSwapSpaceLock);
1476 	sAvailSwapSpace -= (off_t)(swapFile->last_slot - swapFile->first_slot)
1477 		* B_PAGE_SIZE;
1478 	mutex_unlock(&sAvailSwapSpaceLock);
1479 
1480 	close(swapFile->fd);
1481 	radix_bitmap_destroy(swapFile->bmp);
1482 	free(swapFile);
1483 
1484 	return B_OK;
1485 }
1486 
1487 
1488 void
1489 swap_init(void)
1490 {
1491 	// create swap block cache
1492 	sSwapBlockCache = create_object_cache("swapblock", sizeof(swap_block),
1493 		sizeof(void*), NULL, NULL, NULL);
1494 	if (sSwapBlockCache == NULL)
1495 		panic("swap_init(): can't create object cache for swap blocks\n");
1496 
1497 	status_t error = object_cache_set_minimum_reserve(sSwapBlockCache,
1498 		MIN_SWAP_BLOCK_RESERVE);
1499 	if (error != B_OK) {
1500 		panic("swap_init(): object_cache_set_minimum_reserve() failed: %s",
1501 			strerror(error));
1502 	}
1503 
1504 	// init swap hash table
1505 	sSwapHashTable.Init(INITIAL_SWAP_HASH_SIZE);
1506 	rw_lock_init(&sSwapHashLock, "swaphash");
1507 
1508 	error = register_resource_resizer(swap_hash_resizer, NULL,
1509 		SWAP_HASH_RESIZE_INTERVAL);
1510 	if (error != B_OK) {
1511 		panic("swap_init(): Failed to register swap hash resizer: %s",
1512 			strerror(error));
1513 	}
1514 
1515 	// init swap file list
1516 	mutex_init(&sSwapFileListLock, "swaplist");
1517 	sSwapFileAlloc = NULL;
1518 	sSwapFileCount = 0;
1519 
1520 	// init available swap space
1521 	mutex_init(&sAvailSwapSpaceLock, "avail swap space");
1522 	sAvailSwapSpace = 0;
1523 
1524 	add_debugger_command_etc("swap", &dump_swap_info,
1525 		"Print information about the swap usage",
1526 		"\n"
1527 		"Print information about the swap usage.\n", 0);
1528 }
1529 
1530 
1531 void
1532 swap_init_post_modules()
1533 {
1534 	// Never try to create a swap file on a read-only device - when booting
1535 	// from CD, the write overlay is used.
1536 	if (gReadOnlyBootDevice)
1537 		return;
1538 
1539 	bool swapEnabled = true;
1540 	bool swapAutomatic = true;
1541 	off_t swapSize = 0;
1542 
1543 	dev_t swapDeviceID = -1;
1544 	VolumeInfo selectedVolume = {};
1545 
1546 	void* settings = load_driver_settings("virtual_memory");
1547 
1548 	if (settings != NULL) {
1549 		// We pass a lot of information about the swap device; this is mostly to
1550 		// ensure that we are dealing with the same device that was configured.
1551 
1552 		// TODO: Some kind of BFS uuid would be great here :)
1553 		const char* enabled = get_driver_parameter(settings, "vm", NULL, NULL);
1554 
1555 		if (enabled != NULL) {
1556 			swapEnabled = get_driver_boolean_parameter(settings, "vm",
1557 				true, false);
1558 			swapAutomatic = get_driver_boolean_parameter(settings, "swap_auto",
1559 				true, false);
1560 
1561 			if (swapEnabled && !swapAutomatic) {
1562 				const char* size = get_driver_parameter(settings, "swap_size",
1563 					NULL, NULL);
1564 				const char* volume = get_driver_parameter(settings,
1565 					"swap_volume_name", NULL, NULL);
1566 				const char* device = get_driver_parameter(settings,
1567 					"swap_volume_device", NULL, NULL);
1568 				const char* filesystem = get_driver_parameter(settings,
1569 					"swap_volume_filesystem", NULL, NULL);
1570 				const char* capacity = get_driver_parameter(settings,
1571 					"swap_volume_capacity", NULL, NULL);
1572 
1573 				if (size != NULL && device != NULL && volume != NULL
1574 					&& filesystem != NULL && capacity != NULL) {
1575 					// User specified a size / volume that seems valid
1576 					swapAutomatic = false;
1577 					swapSize = atoll(size);
1578 					strlcpy(selectedVolume.name, volume,
1579 						sizeof(selectedVolume.name));
1580 					strlcpy(selectedVolume.device, device,
1581 						sizeof(selectedVolume.device));
1582 					strlcpy(selectedVolume.filesystem, filesystem,
1583 						sizeof(selectedVolume.filesystem));
1584 					selectedVolume.capacity = atoll(capacity);
1585 				} else {
1586 					// Something isn't right with swap config, go auto
1587 					swapAutomatic = true;
1588 					dprintf("%s: virtual_memory configuration is invalid, "
1589 						"using automatic swap\n", __func__);
1590 				}
1591 			}
1592 		}
1593 		unload_driver_settings(settings);
1594 	}
1595 
1596 	if (swapAutomatic) {
1597 		swapSize = (off_t)vm_page_num_pages() * B_PAGE_SIZE;
1598 		if (swapSize <= (1024 * 1024 * 1024)) {
1599 			// Memory under 1 GB? Double the swap.
1600 			swapSize *= 2;
1601 		}
1602 		// Automatic swap defaults to the boot device
1603 		swapDeviceID = gBootDevice;
1604 	}
1605 
1606 	if (!swapEnabled || swapSize < B_PAGE_SIZE) {
1607 		dprintf("%s: virtual_memory is disabled\n", __func__);
1608 		return;
1609 	}
1610 
1611 	if (!swapAutomatic && swapDeviceID < 0) {
1612 		// If the user specified a swap volume and no swap device has been chosen yet...
1613 		KDiskDeviceManager::CreateDefault();
1614 		KDiskDeviceManager* manager = KDiskDeviceManager::Default();
1615 		PartitionScorer visitor(selectedVolume);
1616 
1617 		KDiskDevice* device;
1618 		int32 cookie = 0;
1619 		while ((device = manager->NextDevice(&cookie)) != NULL) {
1620 			if (device->IsReadOnlyMedia() || device->IsWriteOnce()
1621 				|| device->IsRemovable()) {
1622 				continue;
1623 			}
1624 			device->VisitEachDescendant(&visitor);
1625 		}
1626 
1627 		if (!visitor.fBestPartition) {
1628 			dprintf("%s: Can't find configured swap partition '%s'\n",
1629 				__func__, selectedVolume.name);
1630 		} else {
1631 			if (visitor.fBestPartition->IsMounted())
1632 				swapDeviceID = visitor.fBestPartition->VolumeID();
1633 			else {
1634 				KPath devPath, mountPoint;
1635 				visitor.fBestPartition->GetPath(&devPath);
1636 				get_mount_point(visitor.fBestPartition, &mountPoint);
1637 				const char* mountPath = mountPoint.Path();
1638 				mkdir(mountPath, S_IRWXU | S_IRWXG | S_IRWXO);
1639 				swapDeviceID = _kern_mount(mountPath, devPath.Path(),
1640 					NULL, 0, NULL, 0);
1641 				if (swapDeviceID < 0) {
1642 					dprintf("%s: Can't mount configured swap partition '%s'\n",
1643 						__func__, selectedVolume.name);
1644 				}
1645 			}
1646 		}
1647 	}
1648 
1649 	if (swapDeviceID < 0)
1650 		swapDeviceID = gBootDevice;
1651 
1652 	// We now have a swapDeviceID on which the swap file will be created
1653 
1654 	KPath path;
1655 	struct fs_info info;
1656 	_kern_read_fs_info(swapDeviceID, &info);
1657 	if (swapDeviceID == gBootDevice)
1658 		path = kDefaultSwapPath;
1659 	else {
1660 		vfs_entry_ref_to_path(info.dev, info.root, ".", true, path.LockBuffer(),
1661 			path.BufferSize());
1662 		path.UnlockBuffer();
1663 		path.Append("swap");
1664 	}
1665 
1666 	const char* swapPath = path.Path();
1667 
1668 	// Swap size limits prevent oversized swap files
1669 	if (swapAutomatic) {
1670 		off_t existingSwapSize = 0;
1671 		struct stat existingSwapStat;
1672 		if (stat(swapPath, &existingSwapStat) == 0)
1673 			existingSwapSize = existingSwapStat.st_size;
1674 
1675 		off_t freeSpace = info.free_blocks * info.block_size + existingSwapSize;
1676 
1677 		// Adjust automatic swap to a maximum of 25% of the free space
1678 		if (swapSize > (freeSpace / 4))
1679 			swapSize = (freeSpace / 4);
1680 	}
1681 
1682 	// Create swap file
1683 	int fd = open(swapPath, O_RDWR | O_CREAT | O_NOCACHE, S_IRUSR | S_IWUSR);
1684 	if (fd < 0) {
1685 		dprintf("%s: Can't open/create %s: %s\n", __func__,
1686 			swapPath, strerror(errno));
1687 		return;
1688 	}
1689 
1690 	struct stat stat;
1691 	stat.st_size = swapSize;
1692 	status_t error = _kern_write_stat(fd, NULL, false, &stat,
1693 		sizeof(struct stat), B_STAT_SIZE | B_STAT_SIZE_INSECURE);
1694 	if (error != B_OK) {
1695 		dprintf("%s: Failed to resize %s to %" B_PRIdOFF " bytes: %s\n",
1696 			__func__, swapPath, swapSize, strerror(error));
1697 	}
1698 
1699 	close(fd);
1700 
1701 	error = swap_file_add(swapPath);
1702 	if (error != B_OK) {
1703 		dprintf("%s: Failed to add swap file %s: %s\n", __func__, swapPath,
1704 			strerror(error));
1705 	}
1706 }
1707 
1708 
1709 //! Used by page daemon to free swap space.
1710 bool
1711 swap_free_page_swap_space(vm_page* page)
1712 {
1713 	VMAnonymousCache* cache = dynamic_cast<VMAnonymousCache*>(page->Cache());
1714 	if (cache == NULL)
1715 		return false;
1716 
1717 	swap_addr_t slotIndex = cache->_SwapBlockGetAddress(page->cache_offset);
1718 	if (slotIndex == SWAP_SLOT_NONE)
1719 		return false;
1720 
1721 	swap_slot_dealloc(slotIndex, 1);
1722 	cache->fAllocatedSwapSize -= B_PAGE_SIZE;
1723 	cache->_SwapBlockFree(page->cache_offset, 1);
1724 
1725 	return true;
1726 }
1727 
1728 
1729 uint32
1730 swap_available_pages()
1731 {
1732 	mutex_lock(&sAvailSwapSpaceLock);
1733 	uint32 avail = sAvailSwapSpace >> PAGE_SHIFT;
1734 	mutex_unlock(&sAvailSwapSpaceLock);
1735 
1736 	return avail;
1737 }
1738 
1739 
1740 uint32
1741 swap_total_swap_pages()
1742 {
1743 	mutex_lock(&sSwapFileListLock);
1744 
1745 	uint32 totalSwapSlots = 0;
1746 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1747 		swap_file* swapFile = it.Next();) {
1748 		totalSwapSlots += swapFile->last_slot - swapFile->first_slot;
1749 	}
1750 
1751 	mutex_unlock(&sSwapFileListLock);
1752 
1753 	return totalSwapSlots;
1754 }
1755 
1756 
1757 #endif	// ENABLE_SWAP_SUPPORT
1758 
1759 
1760 void
1761 swap_get_info(system_info* info)
1762 {
1763 #if ENABLE_SWAP_SUPPORT
1764 	MutexLocker locker(sSwapFileListLock);
1765 	for (SwapFileList::Iterator it = sSwapFileList.GetIterator();
1766 		swap_file* swapFile = it.Next();) {
1767 		info->max_swap_pages += swapFile->last_slot - swapFile->first_slot;
1768 		info->free_swap_pages += swapFile->bmp->free_slots;
1769 	}
1770 #else
1771 	info->max_swap_pages = 0;
1772 	info->free_swap_pages = 0;
1773 #endif
1774 }
1775 
1776