xref: /haiku/src/system/kernel/vm/vm_page.cpp (revision 62f5ba006a08b0df30631375878effaf67ae5dbc)
1 /*
2  * Copyright 2010, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 #include <signal.h>
12 #include <string.h>
13 #include <stdlib.h>
14 
15 #include <algorithm>
16 
17 #include <KernelExport.h>
18 #include <OS.h>
19 
20 #include <AutoDeleter.h>
21 
22 #include <arch/cpu.h>
23 #include <arch/vm_translation_map.h>
24 #include <block_cache.h>
25 #include <boot/kernel_args.h>
26 #include <condition_variable.h>
27 #include <heap.h>
28 #include <kernel.h>
29 #include <low_resource_manager.h>
30 #include <thread.h>
31 #include <tracing.h>
32 #include <util/AutoLock.h>
33 #include <vfs.h>
34 #include <vm/vm.h>
35 #include <vm/vm_priv.h>
36 #include <vm/vm_page.h>
37 #include <vm/VMAddressSpace.h>
38 #include <vm/VMArea.h>
39 #include <vm/VMCache.h>
40 
41 #include "IORequest.h"
42 #include "PageCacheLocker.h"
43 #include "VMAnonymousCache.h"
44 #include "VMPageQueue.h"
45 
46 
47 //#define TRACE_VM_PAGE
48 #ifdef TRACE_VM_PAGE
49 #	define TRACE(x) dprintf x
50 #else
51 #	define TRACE(x) ;
52 #endif
53 
54 //#define TRACE_VM_DAEMONS
55 #ifdef TRACE_VM_DAEMONS
56 #define TRACE_DAEMON(x...) dprintf(x)
57 #else
58 #define TRACE_DAEMON(x...) do {} while (false)
59 #endif
60 
61 //#define TRACK_PAGE_USAGE_STATS	1
62 
63 #define PAGE_ASSERT(page, condition)	\
64 	ASSERT_PRINT((condition), "page: %p", (page))
65 
66 #define SCRUB_SIZE 16
67 	// this many pages will be cleared at once in the page scrubber thread
68 
69 #define MAX_PAGE_WRITER_IO_PRIORITY				B_URGENT_DISPLAY_PRIORITY
70 	// maximum I/O priority of the page writer
71 #define MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD	10000
72 	// the maximum I/O priority shall be reached when this many pages need to
73 	// be written
74 
75 
76 // The page reserve that an allocation of a given priority must not touch.
77 static const size_t kPageReserveForPriority[] = {
78 	VM_PAGE_RESERVE_USER,		// user
79 	VM_PAGE_RESERVE_SYSTEM,		// system
80 	0							// VIP
81 };
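// For instance, with this table a system-priority allocation may dig into the
// pages protected for user allocations but must still leave
// VM_PAGE_RESERVE_SYSTEM pages untouched, while a VIP allocation may use the
// reserve completely (its entry is 0).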
82 
83 // Minimum number of free pages the page daemon will try to achieve.
84 static uint32 sFreePagesTarget;
85 static uint32 sFreeOrCachedPagesTarget;
86 static uint32 sInactivePagesTarget;
87 
88 // Wait interval between page daemon runs.
89 static const bigtime_t kIdleScanWaitInterval = 1000000LL;	// 1 sec
90 static const bigtime_t kBusyScanWaitInterval = 500000LL;	// 0.5 sec
91 
92 // Number of idle runs after which we want to have processed the full active
93 // queue.
94 static const uint32 kIdleRunsForFullQueue = 20;
95 
96 // Maximum limit for the vm_page::usage_count.
97 static const int32 kPageUsageMax = 64;
98 // The vm_page::usage_count increase an accessed page receives in a scan.
99 static const int32 kPageUsageAdvance = 3;
100 // The vm_page::usage_count decrease an unaccessed page receives in a scan.
101 static const int32 kPageUsageDecline = 1;
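// Roughly speaking, a page that keeps getting accessed climbs towards
// kPageUsageMax in steps of kPageUsageAdvance (0 -> 3 -> 6 -> ...), while an
// idle page drains by kPageUsageDecline per scan, so it takes many more idle
// scans than active ones before the page becomes a reclaim candidate.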
102 
103 int32 gMappedPagesCount;
104 
105 static VMPageQueue sPageQueues[PAGE_STATE_COUNT];
106 
107 static VMPageQueue& sFreePageQueue = sPageQueues[PAGE_STATE_FREE];
108 static VMPageQueue& sClearPageQueue = sPageQueues[PAGE_STATE_CLEAR];
109 static VMPageQueue& sModifiedPageQueue = sPageQueues[PAGE_STATE_MODIFIED];
110 static VMPageQueue& sInactivePageQueue = sPageQueues[PAGE_STATE_INACTIVE];
111 static VMPageQueue& sActivePageQueue = sPageQueues[PAGE_STATE_ACTIVE];
112 static VMPageQueue& sCachedPageQueue = sPageQueues[PAGE_STATE_CACHED];
113 
114 static vm_page *sPages;
115 static addr_t sPhysicalPageOffset;
116 static size_t sNumPages;
117 static vint32 sUnreservedFreePages;
118 static vint32 sUnsatisfiedPageReservations;
119 static vint32 sModifiedTemporaryPages;
120 
121 static ConditionVariable sFreePageCondition;
122 static mutex sPageDeficitLock = MUTEX_INITIALIZER("page deficit");
123 
124 static rw_lock sFreePageQueuesLock
125 	= RW_LOCK_INITIALIZER("free/clear page queues");
126 
127 #ifdef TRACK_PAGE_USAGE_STATS
128 static page_num_t sPageUsageArrays[512];
129 static page_num_t* sPageUsage = sPageUsageArrays;
130 static page_num_t sPageUsagePageCount;
131 static page_num_t* sNextPageUsage = sPageUsageArrays + 256;
132 static page_num_t sNextPageUsagePageCount;
133 #endif
134 
135 
136 struct page_stats {
137 	int32	totalFreePages;
138 	int32	unsatisfiedReservations;
139 	int32	cachedPages;
140 };
141 
142 
143 struct PageReservationWaiter
144 		: public DoublyLinkedListLinkImpl<PageReservationWaiter> {
145 	struct thread*	thread;
146 	uint32			dontTouch;		// reserve not to touch
147 	uint32			missing;		// pages missing for the reservation
148 	int32			threadPriority;
149 
150 	bool operator<(const PageReservationWaiter& other) const
151 	{
152 		// Implies an order by descending VM priority (ascending dontTouch)
153 		// and (secondarily) descending thread priority.
154 		if (dontTouch != other.dontTouch)
155 			return dontTouch < other.dontTouch;
156 		return threadPriority > other.threadPriority;
157 	}
158 };
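// Example ordering: a VIP waiter (dontTouch == 0) is served before a system
// waiter, which is served before a user waiter; among waiters with the same
// dontTouch value, the one with the higher thread priority comes first.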
159 
160 typedef DoublyLinkedList<PageReservationWaiter> PageReservationWaiterList;
161 static PageReservationWaiterList sPageReservationWaiters;
162 
163 
164 struct DaemonCondition {
165 	void Init(const char* name)
166 	{
167 		mutex_init(&fLock, "daemon condition");
168 		fCondition.Init(this, name);
169 		fActivated = false;
170 	}
171 
172 	bool Lock()
173 	{
174 		return mutex_lock(&fLock) == B_OK;
175 	}
176 
177 	void Unlock()
178 	{
179 		mutex_unlock(&fLock);
180 	}
181 
182 	bool Wait(bigtime_t timeout, bool clearActivated)
183 	{
184 		MutexLocker locker(fLock);
185 		if (clearActivated)
186 			fActivated = false;
187 		else if (fActivated)
188 			return true;
189 
190 		ConditionVariableEntry entry;
191 		fCondition.Add(&entry);
192 
193 		locker.Unlock();
194 
195 		return entry.Wait(B_RELATIVE_TIMEOUT, timeout) == B_OK;
196 	}
197 
198 	void WakeUp()
199 	{
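		// Unsynchronized fast path: if the condition is already activated
		// there is nothing to do. A wakeup lost to a racing Wait()/
		// ClearActivated() is tolerable, since the daemons also wake up on a
		// timeout.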
200 		if (fActivated)
201 			return;
202 
203 		MutexLocker locker(fLock);
204 		fActivated = true;
205 		fCondition.NotifyOne();
206 	}
207 
208 	void ClearActivated()
209 	{
210 		MutexLocker locker(fLock);
211 		fActivated = false;
212 	}
213 
214 private:
215 	mutex				fLock;
216 	ConditionVariable	fCondition;
217 	bool				fActivated;
218 };
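// Typical use (see page_writer() below): the daemon thread blocks in Wait()
// with a timeout, and other threads call WakeUp() when there is work to do.
// WakeUp() is cheap to call often, since it only takes the lock when the
// condition isn't already activated.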
219 
220 
221 static DaemonCondition sPageWriterCondition;
222 static DaemonCondition sPageDaemonCondition;
223 
224 
225 #if PAGE_ALLOCATION_TRACING
226 
227 namespace PageAllocationTracing {
228 
229 class ReservePages : public AbstractTraceEntry {
230 	public:
231 		ReservePages(uint32 count)
232 			:
233 			fCount(count)
234 		{
235 			Initialized();
236 		}
237 
238 		virtual void AddDump(TraceOutput& out)
239 		{
240 			out.Print("page reserve:   %lu", fCount);
241 		}
242 
243 	private:
244 		uint32		fCount;
245 };
246 
247 
248 class UnreservePages : public AbstractTraceEntry {
249 	public:
250 		UnreservePages(uint32 count)
251 			:
252 			fCount(count)
253 		{
254 			Initialized();
255 		}
256 
257 		virtual void AddDump(TraceOutput& out)
258 		{
259 			out.Print("page unreserve: %lu", fCount);
260 		}
261 
262 	private:
263 		uint32		fCount;
264 };
265 
266 
267 class AllocatePage : public AbstractTraceEntry {
268 	public:
269 		AllocatePage()
270 		{
271 			Initialized();
272 		}
273 
274 		virtual void AddDump(TraceOutput& out)
275 		{
276 			out.Print("page alloc");
277 		}
278 };
279 
280 
281 class AllocatePageRun : public AbstractTraceEntry {
282 	public:
283 		AllocatePageRun(uint32 length)
284 			:
285 			fLength(length)
286 		{
287 			Initialized();
288 		}
289 
290 		virtual void AddDump(TraceOutput& out)
291 		{
292 			out.Print("page alloc run: length: %ld", fLength);
293 		}
294 
295 	private:
296 		uint32		fLength;
297 };
298 
299 
300 class FreePage : public AbstractTraceEntry {
301 	public:
302 		FreePage()
303 		{
304 			Initialized();
305 		}
306 
307 		virtual void AddDump(TraceOutput& out)
308 		{
309 			out.Print("page free");
310 		}
311 };
312 
313 
314 class ScrubbingPages : public AbstractTraceEntry {
315 	public:
316 		ScrubbingPages(uint32 count)
317 			:
318 			fCount(count)
319 		{
320 			Initialized();
321 		}
322 
323 		virtual void AddDump(TraceOutput& out)
324 		{
325 			out.Print("page scrubbing: %lu", fCount);
326 		}
327 
328 	private:
329 		uint32		fCount;
330 };
331 
332 
333 class ScrubbedPages : public AbstractTraceEntry {
334 	public:
335 		ScrubbedPages(uint32 count)
336 			:
337 			fCount(count)
338 		{
339 			Initialized();
340 		}
341 
342 		virtual void AddDump(TraceOutput& out)
343 		{
344 			out.Print("page scrubbed:  %lu", fCount);
345 		}
346 
347 	private:
348 		uint32		fCount;
349 };
350 
351 
352 class StolenPage : public AbstractTraceEntry {
353 	public:
354 		StolenPage()
355 		{
356 			Initialized();
357 		}
358 
359 		virtual void AddDump(TraceOutput& out)
360 		{
361 			out.Print("page stolen");
362 		}
363 };
364 
365 }	// namespace PageAllocationTracing
366 
367 #	define TA(x)	new(std::nothrow) PageAllocationTracing::x
368 
369 #else
370 #	define TA(x)
371 #endif	// PAGE_ALLOCATION_TRACING
372 
373 
374 #if PAGE_DAEMON_TRACING
375 
376 namespace PageDaemonTracing {
377 
378 class ActivatePage : public AbstractTraceEntry {
379 	public:
380 		ActivatePage(vm_page* page)
381 			:
382 			fCache(page->cache),
383 			fPage(page)
384 		{
385 			Initialized();
386 		}
387 
388 		virtual void AddDump(TraceOutput& out)
389 		{
390 			out.Print("page activated:   %p, cache: %p", fPage, fCache);
391 		}
392 
393 	private:
394 		VMCache*	fCache;
395 		vm_page*	fPage;
396 };
397 
398 
399 class DeactivatePage : public AbstractTraceEntry {
400 	public:
401 		DeactivatePage(vm_page* page)
402 			:
403 			fCache(page->cache),
404 			fPage(page)
405 		{
406 			Initialized();
407 		}
408 
409 		virtual void AddDump(TraceOutput& out)
410 		{
411 			out.Print("page deactivated: %p, cache: %p", fPage, fCache);
412 		}
413 
414 	private:
415 		VMCache*	fCache;
416 		vm_page*	fPage;
417 };
418 
419 
420 class FreedPageSwap : public AbstractTraceEntry {
421 	public:
422 		FreedPageSwap(vm_page* page)
423 			:
424 			fCache(page->cache),
425 			fPage(page)
426 		{
427 			Initialized();
428 		}
429 
430 		virtual void AddDump(TraceOutput& out)
431 		{
432 			out.Print("page swap freed:  %p, cache: %p", fPage, fCache);
433 		}
434 
435 	private:
436 		VMCache*	fCache;
437 		vm_page*	fPage;
438 };
439 
440 }	// namespace PageDaemonTracing
441 
442 #	define TD(x)	new(std::nothrow) PageDaemonTracing::x
443 
444 #else
445 #	define TD(x)
446 #endif	// PAGE_DAEMON_TRACING
447 
448 
449 #if PAGE_WRITER_TRACING
450 
451 namespace PageWriterTracing {
452 
453 class WritePage : public AbstractTraceEntry {
454 	public:
455 		WritePage(vm_page* page)
456 			:
457 			fCache(page->Cache()),
458 			fPage(page)
459 		{
460 			Initialized();
461 		}
462 
463 		virtual void AddDump(TraceOutput& out)
464 		{
465 			out.Print("page write: %p, cache: %p", fPage, fCache);
466 		}
467 
468 	private:
469 		VMCache*	fCache;
470 		vm_page*	fPage;
471 };
472 
473 }	// namespace PageWriterTracing
474 
475 #	define TPW(x)	new(std::nothrow) PageWriterTracing::x
476 
477 #else
478 #	define TPW(x)
479 #endif	// PAGE_WRITER_TRACING
480 
481 
482 #if PAGE_STATE_TRACING
483 
484 namespace PageStateTracing {
485 
486 class SetPageState : public AbstractTraceEntry {
487 	public:
488 		SetPageState(vm_page* page, uint8 newState)
489 			:
490 			fPage(page),
491 			fOldState(page->State()),
492 			fNewState(newState),
493 			fBusy(page->busy),
494 			fWired(page->wired_count > 0),
495 			fMapped(!page->mappings.IsEmpty()),
496 			fAccessed(page->accessed),
497 			fModified(page->modified)
498 		{
499 #if PAGE_STATE_TRACING_STACK_TRACE
500 			fStackTrace = capture_tracing_stack_trace(
501 				PAGE_STATE_TRACING_STACK_TRACE, 0, true);
502 				// Don't capture userland stack trace to avoid potential
503 				// deadlocks.
504 #endif
505 			Initialized();
506 		}
507 
508 #if PAGE_STATE_TRACING_STACK_TRACE
509 		virtual void DumpStackTrace(TraceOutput& out)
510 		{
511 			out.PrintStackTrace(fStackTrace);
512 		}
513 #endif
514 
515 		virtual void AddDump(TraceOutput& out)
516 		{
517 			out.Print("page set state: %p (%c%c%c%c%c): %s -> %s", fPage,
518 				fBusy ? 'b' : '-',
519 				fWired ? 'w' : '-',
520 				fMapped ? 'm' : '-',
521 				fAccessed ? 'a' : '-',
522 				fModified ? 'm' : '-',
523 				page_state_to_string(fOldState),
524 				page_state_to_string(fNewState));
525 		}
526 
527 	private:
528 		vm_page*	fPage;
529 #if PAGE_STATE_TRACING_STACK_TRACE
530 		tracing_stack_trace* fStackTrace;
531 #endif
532 		uint8		fOldState;
533 		uint8		fNewState;
534 		bool		fBusy : 1;
535 		bool		fWired : 1;
536 		bool		fMapped : 1;
537 		bool		fAccessed : 1;
538 		bool		fModified : 1;
539 };
540 
541 }	// namespace PageStateTracing
542 
543 #	define TPS(x)	new(std::nothrow) PageStateTracing::x
544 
545 #else
546 #	define TPS(x)
547 #endif	// PAGE_STATE_TRACING
548 
549 
550 static int
551 find_page(int argc, char **argv)
552 {
553 	struct vm_page *page;
554 	addr_t address;
555 	int32 index = 1;
556 	int i;
557 
558 	struct {
559 		const char*	name;
560 		VMPageQueue*	queue;
561 	} pageQueueInfos[] = {
562 		{ "free",		&sFreePageQueue },
563 		{ "clear",		&sClearPageQueue },
564 		{ "modified",	&sModifiedPageQueue },
565 		{ "active",		&sActivePageQueue },
566 		{ "inactive",	&sInactivePageQueue },
567 		{ "cached",		&sCachedPageQueue },
568 		{ NULL, NULL }
569 	};
570 
571 	if (argc < 2
572 		|| strlen(argv[index]) <= 2
573 		|| argv[index][0] != '0'
574 		|| argv[index][1] != 'x') {
575 		kprintf("usage: find_page <address>\n");
576 		return 0;
577 	}
578 
579 	address = strtoul(argv[index], NULL, 0);
580 	page = (vm_page*)address;
581 
582 	for (i = 0; pageQueueInfos[i].name; i++) {
583 		VMPageQueue::Iterator it = pageQueueInfos[i].queue->GetIterator();
584 		while (vm_page* p = it.Next()) {
585 			if (p == page) {
586 				kprintf("found page %p in queue %p (%s)\n", page,
587 					pageQueueInfos[i].queue, pageQueueInfos[i].name);
588 				return 0;
589 			}
590 		}
591 	}
592 
593 	kprintf("page %p isn't in any queue\n", page);
594 
595 	return 0;
596 }
597 
598 
599 const char *
600 page_state_to_string(int state)
601 {
602 	switch(state) {
603 		case PAGE_STATE_ACTIVE:
604 			return "active";
605 		case PAGE_STATE_INACTIVE:
606 			return "inactive";
607 		case PAGE_STATE_MODIFIED:
608 			return "modified";
609 		case PAGE_STATE_CACHED:
610 			return "cached";
611 		case PAGE_STATE_FREE:
612 			return "free";
613 		case PAGE_STATE_CLEAR:
614 			return "clear";
615 		case PAGE_STATE_WIRED:
616 			return "wired";
617 		case PAGE_STATE_UNUSED:
618 			return "unused";
619 		default:
620 			return "unknown";
621 	}
622 }
623 
624 
625 static int
626 dump_page(int argc, char **argv)
627 {
628 	bool addressIsPointer = true;
629 	bool physical = false;
630 	bool searchMappings = false;
631 	int32 index = 1;
632 
633 	while (index < argc) {
634 		if (argv[index][0] != '-')
635 			break;
636 
637 		if (!strcmp(argv[index], "-p")) {
638 			addressIsPointer = false;
639 			physical = true;
640 		} else if (!strcmp(argv[index], "-v")) {
641 			addressIsPointer = false;
642 		} else if (!strcmp(argv[index], "-m")) {
643 			searchMappings = true;
644 		} else {
645 			print_debugger_command_usage(argv[0]);
646 			return 0;
647 		}
648 
649 		index++;
650 	}
651 
652 	if (index + 1 != argc) {
653 		print_debugger_command_usage(argv[0]);
654 		return 0;
655 	}
656 
657 	uint64 value;
658 	if (!evaluate_debug_expression(argv[index], &value, false))
659 		return 0;
660 
661 	addr_t pageAddress = (addr_t)value;
662 	struct vm_page* page;
663 
664 	if (addressIsPointer) {
665 		page = (struct vm_page *)pageAddress;
666 	} else {
667 		if (!physical) {
668 			VMAddressSpace *addressSpace = VMAddressSpace::Kernel();
669 
670 			if (debug_get_debugged_thread()->team->address_space != NULL)
671 				addressSpace = debug_get_debugged_thread()->team->address_space;
672 
673 			uint32 flags = 0;
674 			if (addressSpace->TranslationMap()->QueryInterrupt(pageAddress,
675 					&pageAddress, &flags) != B_OK
676 				|| (flags & PAGE_PRESENT) == 0) {
677 				kprintf("Virtual address not mapped to a physical page in this "
678 					"address space.\n");
679 				return 0;
680 			}
681 		}
682 
683 		page = vm_lookup_page(pageAddress / B_PAGE_SIZE);
684 	}
685 
686 	kprintf("PAGE: %p\n", page);
687 	kprintf("queue_next,prev: %p, %p\n", page->queue_link.next,
688 		page->queue_link.previous);
689 	kprintf("physical_number: %#lx\n", page->physical_page_number);
690 	kprintf("cache:           %p\n", page->Cache());
691 	kprintf("cache_offset:    %ld\n", page->cache_offset);
692 	kprintf("cache_next:      %p\n", page->cache_next);
693 	kprintf("state:           %s\n", page_state_to_string(page->State()));
694 	kprintf("wired_count:     %d\n", page->wired_count);
695 	kprintf("usage_count:     %d\n", page->usage_count);
696 	kprintf("busy:            %d\n", page->busy);
697 	kprintf("busy_writing:    %d\n", page->busy_writing);
698 	kprintf("accessed:        %d\n", page->accessed);
699 	kprintf("modified:        %d\n", page->modified);
700 	#if DEBUG_PAGE_QUEUE
701 		kprintf("queue:           %p\n", page->queue);
702 	#endif
703 	#if DEBUG_PAGE_ACCESS
704 		kprintf("accessor:        %" B_PRId32 "\n", page->accessing_thread);
705 	#endif
706 	kprintf("area mappings:\n");
707 
708 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
709 	vm_page_mapping *mapping;
710 	while ((mapping = iterator.Next()) != NULL) {
711 		kprintf("  %p (%" B_PRId32 ")\n", mapping->area, mapping->area->id);
713 	}
714 
715 	if (searchMappings) {
716 		kprintf("all mappings:\n");
717 		VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
718 		while (addressSpace != NULL) {
719 			size_t pageCount = addressSpace->Size() / B_PAGE_SIZE;
720 			for (addr_t address = addressSpace->Base(); pageCount != 0;
721 					address += B_PAGE_SIZE, pageCount--) {
722 				addr_t physicalAddress;
723 				uint32 flags = 0;
724 				if (addressSpace->TranslationMap()->QueryInterrupt(address,
725 						&physicalAddress, &flags) == B_OK
726 					&& (flags & PAGE_PRESENT) != 0
727 					&& physicalAddress / B_PAGE_SIZE
728 						== page->physical_page_number) {
729 					VMArea* area = addressSpace->LookupArea(address);
730 					kprintf("  aspace %ld, area %ld: %#" B_PRIxADDR
731 						" (%c%c%s%s)\n", addressSpace->ID(),
732 						area != NULL ? area->id : -1, address,
733 						(flags & B_KERNEL_READ_AREA) != 0 ? 'r' : '-',
734 						(flags & B_KERNEL_WRITE_AREA) != 0 ? 'w' : '-',
735 						(flags & PAGE_MODIFIED) != 0 ? " modified" : "",
736 						(flags & PAGE_ACCESSED) != 0 ? " accessed" : "");
737 				}
738 			}
739 			addressSpace = VMAddressSpace::DebugNext(addressSpace);
740 		}
741 	}
742 
743 	set_debug_variable("_cache", (addr_t)page->Cache());
744 	#if DEBUG_PAGE_ACCESS
745 		set_debug_variable("_accessor", page->accessing_thread);
746 	#endif
747 
748 	return 0;
749 }
750 
751 
752 static int
753 dump_page_queue(int argc, char **argv)
754 {
755 	struct VMPageQueue *queue;
756 
757 	if (argc < 2) {
758 		kprintf("usage: page_queue <address/name> [list]\n");
759 		return 0;
760 	}
761 
762 	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
763 		queue = (VMPageQueue*)strtoul(argv[1], NULL, 16);
764 	else if (!strcmp(argv[1], "free"))
765 		queue = &sFreePageQueue;
766 	else if (!strcmp(argv[1], "clear"))
767 		queue = &sClearPageQueue;
768 	else if (!strcmp(argv[1], "modified"))
769 		queue = &sModifiedPageQueue;
770 	else if (!strcmp(argv[1], "active"))
771 		queue = &sActivePageQueue;
772 	else if (!strcmp(argv[1], "inactive"))
773 		queue = &sInactivePageQueue;
774 	else if (!strcmp(argv[1], "cached"))
775 		queue = &sCachedPageQueue;
776 	else {
777 		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
778 		return 0;
779 	}
780 
781 	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %ld\n",
782 		queue, queue->Head(), queue->Tail(), queue->Count());
783 
784 	if (argc == 3) {
785 		struct vm_page *page = queue->Head();
786 		int i;
787 
788 		kprintf("page        cache       type       state  wired  usage\n");
789 		for (i = 0; page; i++, page = queue->Next(page)) {
790 			const char *type = "none";
791 			if (page->Cache() != NULL) {
792 				switch (page->Cache()->type) {
793 					case CACHE_TYPE_RAM:
794 						type = "RAM";
795 						break;
796 					case CACHE_TYPE_DEVICE:
797 						type = "device";
798 						break;
799 					case CACHE_TYPE_VNODE:
800 						type = "vnode";
801 						break;
802 					case CACHE_TYPE_NULL:
803 						type = "null";
804 						break;
805 					default:
806 						type = "???";
807 						break;
808 				}
809 			}
810 			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->Cache(),
811 				type, page_state_to_string(page->State()),
812 				page->wired_count, page->usage_count);
813 		}
814 	}
815 	}
816 	return 0;
817 }
818 
819 
820 static int
821 dump_page_stats(int argc, char **argv)
822 {
823 	page_num_t swappableModified = 0;
824 	page_num_t swappableModifiedInactive = 0;
825 	size_t counter[8];
826 	size_t busyCounter[8];
827 	addr_t i;
828 
829 	memset(counter, 0, sizeof(counter));
830 	memset(busyCounter, 0, sizeof(busyCounter));
831 
832 	for (i = 0; i < sNumPages; i++) {
833 		if (sPages[i].State() > 7)
834 			panic("page %li at %p has invalid state!\n", i, &sPages[i]);
835 
836 		counter[sPages[i].State()]++;
837 		if (sPages[i].busy)
838 			busyCounter[sPages[i].State()]++;
839 
840 		if (sPages[i].State() == PAGE_STATE_MODIFIED
841 			&& sPages[i].Cache() != NULL
842 			&& sPages[i].Cache()->temporary && sPages[i].wired_count == 0) {
843 			swappableModified++;
844 			if (sPages[i].usage_count == 0)
845 				swappableModifiedInactive++;
846 		}
847 	}
848 
849 	kprintf("page stats:\n");
850 	kprintf("total: %lu\n", sNumPages);
851 
852 	kprintf("active: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
853 		counter[PAGE_STATE_ACTIVE], busyCounter[PAGE_STATE_ACTIVE]);
854 	kprintf("inactive: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
855 		counter[PAGE_STATE_INACTIVE], busyCounter[PAGE_STATE_INACTIVE]);
856 	kprintf("cached: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
857 		counter[PAGE_STATE_CACHED], busyCounter[PAGE_STATE_CACHED]);
858 	kprintf("unused: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
859 		counter[PAGE_STATE_UNUSED], busyCounter[PAGE_STATE_UNUSED]);
860 	kprintf("wired: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
861 		counter[PAGE_STATE_WIRED], busyCounter[PAGE_STATE_WIRED]);
862 	kprintf("modified: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
863 		counter[PAGE_STATE_MODIFIED], busyCounter[PAGE_STATE_MODIFIED]);
864 	kprintf("free: %" B_PRIuSIZE "\n", counter[PAGE_STATE_FREE]);
865 	kprintf("clear: %" B_PRIuSIZE "\n", counter[PAGE_STATE_CLEAR]);
866 
867 	kprintf("unreserved free pages: %" B_PRId32 "\n", sUnreservedFreePages);
868 	kprintf("unsatisfied page reservations: %" B_PRId32 "\n",
869 		sUnsatisfiedPageReservations);
870 	kprintf("mapped pages: %lu\n", gMappedPagesCount);
871 
872 	kprintf("waiting threads:\n");
873 	for (PageReservationWaiterList::Iterator it
874 			= sPageReservationWaiters.GetIterator();
875 		PageReservationWaiter* waiter = it.Next();) {
876 		kprintf("  %6" B_PRId32 ": missing: %6" B_PRIu32
877 			", don't touch: %6" B_PRIu32 "\n", waiter->thread->id,
878 			waiter->missing, waiter->dontTouch);
879 	}
880 
881 	kprintf("\nfree queue: %p, count = %ld\n", &sFreePageQueue,
882 		sFreePageQueue.Count());
883 	kprintf("clear queue: %p, count = %ld\n", &sClearPageQueue,
884 		sClearPageQueue.Count());
885 	kprintf("modified queue: %p, count = %ld (%ld temporary, %lu swappable, "
886 		"inactive: %lu)\n", &sModifiedPageQueue, sModifiedPageQueue.Count(),
887 		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
888 	kprintf("active queue: %p, count = %ld\n", &sActivePageQueue,
889 		sActivePageQueue.Count());
890 	kprintf("inactive queue: %p, count = %ld\n", &sInactivePageQueue,
891 		sInactivePageQueue.Count());
892 	kprintf("cached queue: %p, count = %ld\n", &sCachedPageQueue,
893 		sCachedPageQueue.Count());
894 	return 0;
895 }
896 
897 
898 #ifdef TRACK_PAGE_USAGE_STATS
899 
900 static void
901 track_page_usage(vm_page* page)
902 {
903 	if (page->wired_count == 0) {
904 		sNextPageUsage[(int32)page->usage_count + 128]++;
905 		sNextPageUsagePageCount++;
906 	}
907 }
908 
909 
910 static void
911 update_page_usage_stats()
912 {
913 	std::swap(sPageUsage, sNextPageUsage);
914 	sPageUsagePageCount = sNextPageUsagePageCount;
915 
916 	memset(sNextPageUsage, 0, sizeof(page_num_t) * 256);
917 	sNextPageUsagePageCount = 0;
918 
919 	// compute average
920 	if (sPageUsagePageCount > 0) {
921 		int64 sum = 0;
922 		for (int32 i = 0; i < 256; i++)
923 			sum += (int64)sPageUsage[i] * (i - 128);
924 
925 		TRACE_DAEMON("average page usage: %f (%lu pages)\n",
926 			(float)sum / sPageUsagePageCount, sPageUsagePageCount);
927 	}
928 }
929 
930 
931 static int
932 dump_page_usage_stats(int argc, char** argv)
933 {
934 	kprintf("distribution of page usage counts (%lu pages):",
935 		sPageUsagePageCount);
936 
937 	int64 sum = 0;
938 	for (int32 i = 0; i < 256; i++) {
939 		if (i % 8 == 0)
940 			kprintf("\n%4ld:", i - 128);
941 
942 		int64 count = sPageUsage[i];
943 		sum += count * (i - 128);
944 
945 		kprintf("  %9llu", count);
946 	}
947 
948 	kprintf("\n\n");
949 
950 	kprintf("average usage count: %f\n",
951 		sPageUsagePageCount > 0 ? (float)sum / sPageUsagePageCount : 0);
952 
953 	return 0;
954 }
955 
956 #endif	// TRACK_PAGE_USAGE_STATS
957 
958 
959 // #pragma mark - vm_page
960 
961 
962 inline void
963 vm_page::InitState(uint8 newState)
964 {
965 	state = newState;
966 }
967 
968 
969 inline void
970 vm_page::SetState(uint8 newState)
971 {
972 	TPS(SetPageState(this, newState));
973 
974 	state = newState;
975 }
976 
977 
978 // #pragma mark -
979 
980 
981 static void
982 get_page_stats(page_stats& _pageStats)
983 {
984 	_pageStats.totalFreePages = sUnreservedFreePages;
985 	_pageStats.cachedPages = sCachedPageQueue.Count();
986 	_pageStats.unsatisfiedReservations = sUnsatisfiedPageReservations;
987 	// TODO: We don't get an actual snapshot here!
988 }
989 
990 
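/*!	Returns whether the system should be considered actively paging, i.e.
	whether free plus cached pages no longer cover the unsatisfied reservations
	plus the free-or-cached target. Illustrative numbers: 1000 free + 200
	cached pages against 500 unsatisfied reservations + a target of 1024 gives
	1200 < 1524, so active paging would be reported.
*/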
991 static bool
992 do_active_paging(const page_stats& pageStats)
993 {
994 	return pageStats.totalFreePages + pageStats.cachedPages
995 		< pageStats.unsatisfiedReservations
996 			+ (int32)sFreeOrCachedPagesTarget;
997 }
998 
999 
1000 /*!	Reserves as many pages as possible from \c sUnreservedFreePages up to
1001 	\a count. Doesn't touch the last \a dontTouch pages of
1002 	\c sUnreservedFreePages, though.
1003 	\return The number of actually reserved pages.
1004 */
1005 static uint32
1006 reserve_some_pages(uint32 count, uint32 dontTouch)
1007 {
1008 	while (true) {
1009 		int32 freePages = sUnreservedFreePages;
1010 		if (freePages <= (int32)dontTouch)
1011 			return 0;
1012 
1013 		int32 toReserve = std::min(count, freePages - dontTouch);
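		// atomic_test_and_set() only stores the new value if the counter still
		// equals freePages and returns the previous value, so equality below
		// means the reservation was committed atomically; otherwise another
		// thread changed the counter and we retry.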
1014 		if (atomic_test_and_set(&sUnreservedFreePages,
1015 					freePages - toReserve, freePages)
1016 				== freePages) {
1017 			return toReserve;
1018 		}
1019 
1020 		// the count changed in the meantime -- retry
1021 	}
1022 }
1023 
1024 
1025 static void
1026 wake_up_page_reservation_waiters()
1027 {
1028 	MutexLocker pageDeficitLocker(sPageDeficitLock);
1029 
1030 	// TODO: If this is a low priority thread, we might want to disable
1031 	// interrupts or otherwise ensure that we aren't unscheduled. Otherwise
1032 	// high priority threads wait be kept waiting while a medium priority thread
1033 	// prevents us from running.
1034 
1035 	while (PageReservationWaiter* waiter = sPageReservationWaiters.Head()) {
1036 		int32 reserved = reserve_some_pages(waiter->missing,
1037 			waiter->dontTouch);
1038 		if (reserved == 0)
1039 			return;
1040 
1041 		atomic_add(&sUnsatisfiedPageReservations, -reserved);
1042 		waiter->missing -= reserved;
1043 
1044 		if (waiter->missing > 0)
1045 			return;
1046 
1047 		sPageReservationWaiters.Remove(waiter);
1048 
1049 		InterruptsSpinLocker threadLocker(gThreadSpinlock);
1050 		thread_unblock_locked(waiter->thread, B_OK);
1051 	}
1052 }
1053 
1054 
1055 static inline void
1056 unreserve_pages(uint32 count)
1057 {
1058 	atomic_add(&sUnreservedFreePages, count);
1059 	if (sUnsatisfiedPageReservations != 0)
1060 		wake_up_page_reservation_waiters();
1061 }
1062 
1063 
1064 static void
1065 free_page(vm_page* page, bool clear)
1066 {
1067 	DEBUG_PAGE_ACCESS_CHECK(page);
1068 
1069 	PAGE_ASSERT(page, !page->IsMapped());
1070 
1071 	VMPageQueue* fromQueue;
1072 
1073 	switch (page->State()) {
1074 		case PAGE_STATE_ACTIVE:
1075 			fromQueue = &sActivePageQueue;
1076 			break;
1077 		case PAGE_STATE_INACTIVE:
1078 			fromQueue = &sInactivePageQueue;
1079 			break;
1080 		case PAGE_STATE_MODIFIED:
1081 			fromQueue = &sModifiedPageQueue;
1082 			break;
1083 		case PAGE_STATE_CACHED:
1084 			fromQueue = &sCachedPageQueue;
1085 			break;
1086 		case PAGE_STATE_FREE:
1087 		case PAGE_STATE_CLEAR:
1088 			panic("free_page(): page %p already free", page);
1089 			return;
1090 		case PAGE_STATE_WIRED:
1091 		case PAGE_STATE_UNUSED:
1092 			fromQueue = NULL;
1093 			break;
1094 		default:
1095 			panic("free_page(): page %p in invalid state %d",
1096 				page, page->State());
1097 			return;
1098 	}
1099 
1100 	if (page->CacheRef() != NULL)
1101 		panic("to be freed page %p has cache", page);
1102 	if (page->IsMapped())
1103 		panic("to be freed page %p has mappings", page);
1104 
1105 	if (fromQueue != NULL)
1106 		fromQueue->RemoveUnlocked(page);
1107 
1108 	TA(FreePage());
1109 
1110 	ReadLocker locker(sFreePageQueuesLock);
1111 
1112 	DEBUG_PAGE_ACCESS_END(page);
1113 
1114 	if (clear) {
1115 		page->SetState(PAGE_STATE_CLEAR);
1116 		sClearPageQueue.PrependUnlocked(page);
1117 	} else {
1118 		page->SetState(PAGE_STATE_FREE);
1119 		sFreePageQueue.PrependUnlocked(page);
1120 	}
1121 
1122 	locker.Unlock();
1123 
1124 	unreserve_pages(1);
1125 }
1126 
1127 
1128 /*!	The caller must make sure that no-one else tries to change the page's state
1129 	while the function is called. If the page has a cache, this can be done by
1130 	locking the cache.
1131 */
1132 static void
1133 set_page_state(vm_page *page, int pageState)
1134 {
1135 	DEBUG_PAGE_ACCESS_CHECK(page);
1136 
1137 	if (pageState == page->State())
1138 		return;
1139 
1140 	VMPageQueue* fromQueue;
1141 
1142 	switch (page->State()) {
1143 		case PAGE_STATE_ACTIVE:
1144 			fromQueue = &sActivePageQueue;
1145 			break;
1146 		case PAGE_STATE_INACTIVE:
1147 			fromQueue = &sInactivePageQueue;
1148 			break;
1149 		case PAGE_STATE_MODIFIED:
1150 			fromQueue = &sModifiedPageQueue;
1151 			break;
1152 		case PAGE_STATE_CACHED:
1153 			fromQueue = &sCachedPageQueue;
1154 			break;
1155 		case PAGE_STATE_FREE:
1156 		case PAGE_STATE_CLEAR:
1157 			panic("set_page_state(): page %p is free/clear", page);
1158 			return;
1159 		case PAGE_STATE_WIRED:
1160 		case PAGE_STATE_UNUSED:
1161 			fromQueue = NULL;
1162 			break;
1163 		default:
1164 			panic("set_page_state(): page %p in invalid state %d",
1165 				page, page->State());
1166 			return;
1167 	}
1168 
1169 	VMPageQueue* toQueue;
1170 
1171 	switch (pageState) {
1172 		case PAGE_STATE_ACTIVE:
1173 			toQueue = &sActivePageQueue;
1174 			break;
1175 		case PAGE_STATE_INACTIVE:
1176 			toQueue = &sInactivePageQueue;
1177 			break;
1178 		case PAGE_STATE_MODIFIED:
1179 			toQueue = &sModifiedPageQueue;
1180 			break;
1181 		case PAGE_STATE_CACHED:
1182 			PAGE_ASSERT(page, !page->IsMapped());
1183 			PAGE_ASSERT(page, !page->modified);
1184 			toQueue = &sCachedPageQueue;
1185 			break;
1186 		case PAGE_STATE_FREE:
1187 		case PAGE_STATE_CLEAR:
1188 			panic("set_page_state(): target state is free/clear");
1189 			return;
1190 		case PAGE_STATE_WIRED:
1191 		case PAGE_STATE_UNUSED:
1192 			toQueue = NULL;
1193 			break;
1194 		default:
1195 			panic("set_page_state(): invalid target state %d", pageState);
1196 			return;
1197 	}
1198 
1199 	VMCache* cache = page->Cache();
1200 	if (cache != NULL && cache->temporary) {
1201 		if (pageState == PAGE_STATE_MODIFIED)
1202 			atomic_add(&sModifiedTemporaryPages, 1);
1203 		else if (page->State() == PAGE_STATE_MODIFIED)
1204 			atomic_add(&sModifiedTemporaryPages, -1);
1205 	}
1206 
1207 	// move the page
1208 	if (toQueue == fromQueue) {
1209 		// Note: Theoretically we are required to lock when changing the page
1210 		// state, even if we don't change the queue. We actually don't have to
1211 		// do this, though, since only for the active queue there are different
1212 		// page states and active pages have a cache that must be locked at
1213 		// this point. So we rely on the fact that everyone must lock the cache
1214 		// before trying to change/interpret the page state.
1215 		PAGE_ASSERT(page, cache != NULL);
1216 		cache->AssertLocked();
1217 		page->SetState(pageState);
1218 	} else {
1219 		if (fromQueue != NULL)
1220 			fromQueue->RemoveUnlocked(page);
1221 
1222 		page->SetState(pageState);
1223 
1224 		if (toQueue != NULL)
1225 			toQueue->AppendUnlocked(page);
1226 	}
1227 }
1228 
1229 
1230 /*! Moves a previously modified page into a now appropriate queue.
1231 	The page queues must not be locked.
1232 */
1233 static void
1234 move_page_to_appropriate_queue(vm_page *page)
1235 {
1236 	DEBUG_PAGE_ACCESS_CHECK(page);
1237 
1238 	// Note, this logic must be in sync with what the page daemon does.
1239 	int32 state;
1240 	if (page->IsMapped())
1241 		state = PAGE_STATE_ACTIVE;
1242 	else if (page->modified)
1243 		state = PAGE_STATE_MODIFIED;
1244 	else
1245 		state = PAGE_STATE_CACHED;
1246 
1247 // TODO: If free + cached pages are low, we might directly want to free the
1248 // page.
1249 	set_page_state(page, state);
1250 }
1251 
1252 
1253 static void
1254 clear_page(struct vm_page *page)
1255 {
1256 	vm_memset_physical(page->physical_page_number << PAGE_SHIFT, 0,
1257 		B_PAGE_SIZE);
1258 }
1259 
1260 
1261 static status_t
1262 mark_page_range_in_use(addr_t startPage, addr_t length, bool wired)
1263 {
1264 	TRACE(("mark_page_range_in_use: start 0x%lx, len 0x%lx\n",
1265 		startPage, length));
1266 
1267 	if (sPhysicalPageOffset > startPage) {
1268 		TRACE(("mark_page_range_in_use: start page %ld is before free list\n",
1269 			startPage));
1270 		return B_BAD_VALUE;
1271 	}
1272 	startPage -= sPhysicalPageOffset;
1273 	if (startPage + length > sNumPages) {
1274 		TRACE(("mark_page_range_in_use: range would extend past free list\n"));
1275 		return B_BAD_VALUE;
1276 	}
1277 
1278 	WriteLocker locker(sFreePageQueuesLock);
1279 
1280 	for (addr_t i = 0; i < length; i++) {
1281 		vm_page *page = &sPages[startPage + i];
1282 		switch (page->State()) {
1283 			case PAGE_STATE_FREE:
1284 			case PAGE_STATE_CLEAR:
1285 			{
1286 // TODO: This violates the page reservation policy, since we remove pages from
1287 // the free/clear queues without having reserved them before. This should happen
1288 // in the early boot process only, though.
1289 				DEBUG_PAGE_ACCESS_START(page);
1290 				VMPageQueue& queue = page->State() == PAGE_STATE_FREE
1291 					? sFreePageQueue : sClearPageQueue;
1292 				queue.Remove(page);
1293 				page->SetState(wired ? PAGE_STATE_WIRED : PAGE_STATE_UNUSED);
1294 				page->busy = false;
1295 				atomic_add(&sUnreservedFreePages, -1);
1296 				DEBUG_PAGE_ACCESS_END(page);
1297 				break;
1298 			}
1299 			case PAGE_STATE_WIRED:
1300 			case PAGE_STATE_UNUSED:
1301 				break;
1302 			case PAGE_STATE_ACTIVE:
1303 			case PAGE_STATE_INACTIVE:
1304 			case PAGE_STATE_MODIFIED:
1305 			case PAGE_STATE_CACHED:
1306 			default:
1307 				// uh
1308 				dprintf("mark_page_range_in_use: page 0x%lx in non-free state %d!\n",
1309 					startPage + i, page->State());
1310 				break;
1311 		}
1312 	}
1313 
1314 	return B_OK;
1315 }
1316 
1317 
1318 /*!
1319 	This is a background thread that wakes up periodically (every 100 ms)
1320 	and moves some pages from the free queue over to the clear queue.
1321 	Given enough time, it will clear out all pages from the free queue - we
1322 	could probably slow it down after having reached a certain threshold.
1323 */
1324 static int32
1325 page_scrubber(void *unused)
1326 {
1327 	(void)(unused);
1328 
1329 	TRACE(("page_scrubber starting...\n"));
1330 
1331 	for (;;) {
1332 		snooze(100000); // 100ms
1333 
1334 		if (sFreePageQueue.Count() == 0
1335 				|| sUnreservedFreePages < (int32)sFreePagesTarget) {
1336 			continue;
1337 		}
1338 
1339 		// Since we temporarily remove pages from the free pages reserve,
1340 		// we must make sure we don't cause a violation of the page
1341 		// reservation guarantee. The following is usually stricter than
1342 		// necessary, because we don't have information on how many of the
1343 		// reserved pages have already been allocated.
1344 		int32 reserved = reserve_some_pages(SCRUB_SIZE,
1345 			kPageReserveForPriority[VM_PRIORITY_USER]);
1346 		if (reserved == 0)
1347 			continue;
1348 
1349 		// get some pages from the free queue
1350 		ReadLocker locker(sFreePageQueuesLock);
1351 
1352 		vm_page *page[SCRUB_SIZE];
1353 		int32 scrubCount = 0;
1354 		for (int32 i = 0; i < reserved; i++) {
1355 			page[i] = sFreePageQueue.RemoveHeadUnlocked();
1356 			if (page[i] == NULL)
1357 				break;
1358 
1359 			DEBUG_PAGE_ACCESS_START(page[i]);
1360 
1361 			page[i]->SetState(PAGE_STATE_ACTIVE);
1362 			page[i]->busy = true;
1363 			scrubCount++;
1364 		}
1365 
1366 		locker.Unlock();
1367 
1368 		if (scrubCount == 0) {
1369 			unreserve_pages(reserved);
1370 			continue;
1371 		}
1372 
1373 		TA(ScrubbingPages(scrubCount));
1374 
1375 		// clear them
1376 		for (int32 i = 0; i < scrubCount; i++)
1377 			clear_page(page[i]);
1378 
1379 		locker.Lock();
1380 
1381 		// and put them into the clear queue
1382 		for (int32 i = 0; i < scrubCount; i++) {
1383 			page[i]->SetState(PAGE_STATE_CLEAR);
1384 			page[i]->busy = false;
1385 			DEBUG_PAGE_ACCESS_END(page[i]);
1386 			sClearPageQueue.PrependUnlocked(page[i]);
1387 		}
1388 
1389 		locker.Unlock();
1390 
1391 		unreserve_pages(reserved);
1392 
1393 		TA(ScrubbedPages(scrubCount));
1394 	}
1395 
1396 	return 0;
1397 }
1398 
1399 
1400 static void
1401 init_page_marker(vm_page &marker)
1402 {
1403 	marker.SetCacheRef(NULL);
1404 	marker.InitState(PAGE_STATE_UNUSED);
1405 	marker.busy = true;
1406 #if DEBUG_PAGE_QUEUE
1407 	marker.queue = NULL;
1408 #endif
1409 #if DEBUG_PAGE_ACCESS
1410 	marker.accessing_thread = thread_get_current_thread_id();
1411 #endif
1412 }
1413 
1414 
1415 static void
1416 remove_page_marker(struct vm_page &marker)
1417 {
1418 	DEBUG_PAGE_ACCESS_CHECK(&marker);
1419 
1420 	if (marker.State() < PAGE_STATE_FIRST_UNQUEUED)
1421 		sPageQueues[marker.State()].RemoveUnlocked(&marker);
1422 
1423 	marker.SetState(PAGE_STATE_UNUSED);
1424 }
1425 
1426 
1427 static vm_page *
1428 next_modified_page(struct vm_page &marker)
1429 {
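	// The marker dummy page is kept enqueued right after the page returned
	// last time, so that the scan can resume at the correct position on the
	// next call even if other pages have been added to or removed from the
	// modified queue in the meantime.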
1430 	InterruptsSpinLocker locker(sModifiedPageQueue.GetLock());
1431 	vm_page *page;
1432 
1433 	DEBUG_PAGE_ACCESS_CHECK(&marker);
1434 
1435 	if (marker.State() == PAGE_STATE_MODIFIED) {
1436 		page = sModifiedPageQueue.Next(&marker);
1437 		sModifiedPageQueue.Remove(&marker);
1438 		marker.SetState(PAGE_STATE_UNUSED);
1439 	} else
1440 		page = sModifiedPageQueue.Head();
1441 
1442 	for (; page != NULL; page = sModifiedPageQueue.Next(page)) {
1443 		if (!page->busy) {
1444 			// insert marker
1445 			marker.SetState(PAGE_STATE_MODIFIED);
1446 			sModifiedPageQueue.InsertAfter(page, &marker);
1447 			return page;
1448 		}
1449 	}
1450 
1451 	return NULL;
1452 }
1453 
1454 
1455 // #pragma mark -
1456 
1457 
1458 class PageWriteTransfer;
1459 class PageWriteWrapper;
1460 
1461 
1462 class PageWriterRun {
1463 public:
1464 	status_t Init(uint32 maxPages);
1465 
1466 	void PrepareNextRun();
1467 	void AddPage(vm_page* page);
1468 	void Go();
1469 
1470 	void PageWritten(PageWriteTransfer* transfer, status_t status,
1471 		bool partialTransfer, size_t bytesTransferred);
1472 
1473 private:
1474 	uint32				fMaxPages;
1475 	uint32				fWrapperCount;
1476 	uint32				fTransferCount;
1477 	vint32				fPendingTransfers;
1478 	PageWriteWrapper*	fWrappers;
1479 	PageWriteTransfer*	fTransfers;
1480 	ConditionVariable	fAllFinishedCondition;
1481 };
1482 
1483 
1484 class PageWriteTransfer : public AsyncIOCallback {
1485 public:
1486 	void SetTo(PageWriterRun* run, vm_page* page, int32 maxPages);
1487 	bool AddPage(vm_page* page);
1488 
1489 	status_t Schedule(uint32 flags);
1490 
1491 	void SetStatus(status_t status, size_t transferred);
1492 
1493 	status_t Status() const	{ return fStatus; }
1494 	struct VMCache* Cache() const { return fCache; }
1495 	uint32 PageCount() const { return fPageCount; }
1496 
1497 	virtual void IOFinished(status_t status, bool partialTransfer,
1498 		size_t bytesTransferred);
1499 private:
1500 	PageWriterRun*		fRun;
1501 	struct VMCache*		fCache;
1502 	off_t				fOffset;
1503 	uint32				fPageCount;
1504 	int32				fMaxPages;
1505 	status_t			fStatus;
1506 	uint32				fVecCount;
1507 	iovec				fVecs[32]; // TODO: make dynamic/configurable
1508 };
1509 
1510 
1511 class PageWriteWrapper {
1512 public:
1513 	PageWriteWrapper();
1514 	~PageWriteWrapper();
1515 	void SetTo(vm_page* page);
1516 	void Done(status_t result);
1517 
1518 private:
1519 	vm_page*			fPage;
1520 	struct VMCache*		fCache;
1521 	bool				fIsActive;
1522 };
1523 
1524 
1525 PageWriteWrapper::PageWriteWrapper()
1526 	:
1527 	fIsActive(false)
1528 {
1529 }
1530 
1531 
1532 PageWriteWrapper::~PageWriteWrapper()
1533 {
1534 	if (fIsActive)
1535 		panic("page write wrapper going out of scope but isn't completed");
1536 }
1537 
1538 
1539 /*!	The page's cache must be locked.
1540 */
1541 void
1542 PageWriteWrapper::SetTo(vm_page* page)
1543 {
1544 	DEBUG_PAGE_ACCESS_CHECK(page);
1545 
1546 	if (page->busy)
1547 		panic("setting page write wrapper to busy page");
1548 
1549 	if (fIsActive)
1550 		panic("re-setting page write wrapper that isn't completed");
1551 
1552 	fPage = page;
1553 	fCache = page->Cache();
1554 	fIsActive = true;
1555 
1556 	fPage->busy = true;
1557 	fPage->busy_writing = true;
1558 
1559 	// We have a modified page -- however, while we're writing it back,
1560 	// the page might still be mapped. In order not to lose any changes to the
1561 	// page, we mark it clean before actually writing it back; if
1562 	// writing the page fails for some reason, we'll just keep it in the
1563 	// modified page list, but that should happen only rarely.
1564 
1565 	// If the page is changed after we cleared the dirty flag, but before we
1566 	// had the chance to write it back, then we'll write it again later -- that
1567 	// will probably not happen that often, though.
1568 
1569 	vm_clear_map_flags(fPage, PAGE_MODIFIED);
1570 }
1571 
1572 
1573 /*!	The page's cache must be locked.
1574 	The page queues must not be locked.
1575 */
1576 void
1577 PageWriteWrapper::Done(status_t result)
1578 {
1579 	if (!fIsActive)
1580 		panic("completing page write wrapper that is not active");
1581 
1582 	DEBUG_PAGE_ACCESS_START(fPage);
1583 
1584 	fPage->busy = false;
1585 		// Set unbusy and notify later by hand, since we might free the page.
1586 
1587 	if (result == B_OK) {
1588 		// put it into the active/inactive queue
1589 		move_page_to_appropriate_queue(fPage);
1590 		fPage->busy_writing = false;
1591 		DEBUG_PAGE_ACCESS_END(fPage);
1592 	} else {
1593 		// Writing the page failed. One reason would be that the cache has been
1594 		// shrunk and the page no longer belongs to the file. Otherwise the
1595 		// actual I/O failed, in which case we'll simply keep the page modified.
1596 
1597 		if (!fPage->busy_writing) {
1598 			// The busy_writing flag was cleared. That means the cache has been
1599 			// shrunk while we were trying to write the page and we have to free
1600 			// it now.
1601 			vm_remove_all_page_mappings(fPage);
1602 // TODO: Unmapping should already happen when resizing the cache!
1603 			fCache->RemovePage(fPage);
1604 			free_page(fPage, false);
1605 		} else {
1606 			// Writing the page failed -- mark the page modified and move it to
1607 			// an appropriate queue other than the modified queue, so we don't
1608 			// keep trying to write it over and over again. We keep
1609 			// non-temporary pages in the modified queue, though, so they don't
1610 			// get lost in the inactive queue.
1611 			dprintf("PageWriteWrapper: Failed to write page %p: %s\n", fPage,
1612 				strerror(result));
1613 
1614 			fPage->modified = true;
1615 			if (!fCache->temporary)
1616 				set_page_state(fPage, PAGE_STATE_MODIFIED);
1617 			else if (fPage->IsMapped())
1618 				set_page_state(fPage, PAGE_STATE_ACTIVE);
1619 			else
1620 				set_page_state(fPage, PAGE_STATE_INACTIVE);
1621 
1622 			fPage->busy_writing = false;
1623 			DEBUG_PAGE_ACCESS_END(fPage);
1624 		}
1625 	}
1626 
1627 	fCache->NotifyPageEvents(fPage, PAGE_EVENT_NOT_BUSY);
1628 	fIsActive = false;
1629 }
1630 
1631 
1632 /*!	The page's cache must be locked.
1633 */
1634 void
1635 PageWriteTransfer::SetTo(PageWriterRun* run, vm_page* page, int32 maxPages)
1636 {
1637 	fRun = run;
1638 	fCache = page->Cache();
1639 	fOffset = page->cache_offset;
1640 	fPageCount = 1;
1641 	fMaxPages = maxPages;
1642 	fStatus = B_OK;
1643 
1644 	fVecs[0].iov_base = (void*)(page->physical_page_number << PAGE_SHIFT);
1645 	fVecs[0].iov_len = B_PAGE_SIZE;
1646 	fVecCount = 1;
1647 }
1648 
1649 
1650 /*!	The page's cache must be locked.
1651 */
1652 bool
1653 PageWriteTransfer::AddPage(vm_page* page)
1654 {
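	// Try to coalesce the page into this transfer: extend the last iovec if
	// the page is contiguous both in the cache and physically, extend the
	// first iovec if it immediately precedes the current run, or start a new
	// iovec if only the cache offset is adjacent. Anything else is rejected
	// and will start a new transfer.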
1655 	if (page->Cache() != fCache
1656 		|| (fMaxPages >= 0 && fPageCount >= (uint32)fMaxPages))
1657 		return false;
1658 
1659 	addr_t nextBase
1660 		= (addr_t)fVecs[fVecCount - 1].iov_base + fVecs[fVecCount - 1].iov_len;
1661 
1662 	if (page->physical_page_number << PAGE_SHIFT == nextBase
1663 		&& page->cache_offset == fOffset + fPageCount) {
1664 		// append to last iovec
1665 		fVecs[fVecCount - 1].iov_len += B_PAGE_SIZE;
1666 		fPageCount++;
1667 		return true;
1668 	}
1669 
1670 	nextBase = (addr_t)fVecs[0].iov_base - B_PAGE_SIZE;
1671 	if (page->physical_page_number << PAGE_SHIFT == nextBase
1672 		&& page->cache_offset == fOffset - 1) {
1673 		// prepend to first iovec and adjust offset
1674 		fVecs[0].iov_base = (void*)nextBase;
1675 		fVecs[0].iov_len += B_PAGE_SIZE;
1676 		fOffset = page->cache_offset;
1677 		fPageCount++;
1678 		return true;
1679 	}
1680 
1681 	if ((page->cache_offset == fOffset + fPageCount
1682 			|| page->cache_offset == fOffset - 1)
1683 		&& fVecCount < sizeof(fVecs) / sizeof(fVecs[0])) {
1684 		// not physically contiguous or not in the right order
1685 		uint32 vectorIndex;
1686 		if (page->cache_offset < fOffset) {
1687 			// we are pre-pending another vector, move the other vecs
1688 			for (uint32 i = fVecCount; i > 0; i--)
1689 				fVecs[i] = fVecs[i - 1];
1690 
1691 			fOffset = page->cache_offset;
1692 			vectorIndex = 0;
1693 		} else
1694 			vectorIndex = fVecCount;
1695 
1696 		fVecs[vectorIndex].iov_base
1697 			= (void*)(page->physical_page_number << PAGE_SHIFT);
1698 		fVecs[vectorIndex].iov_len = B_PAGE_SIZE;
1699 
1700 		fVecCount++;
1701 		fPageCount++;
1702 		return true;
1703 	}
1704 
1705 	return false;
1706 }
1707 
1708 
1709 status_t
1710 PageWriteTransfer::Schedule(uint32 flags)
1711 {
1712 	off_t writeOffset = (off_t)fOffset << PAGE_SHIFT;
1713 	size_t writeLength = fPageCount << PAGE_SHIFT;
1714 
1715 	if (fRun != NULL) {
1716 		return fCache->WriteAsync(writeOffset, fVecs, fVecCount, writeLength,
1717 			flags | B_PHYSICAL_IO_REQUEST, this);
1718 	}
1719 
1720 	status_t status = fCache->Write(writeOffset, fVecs, fVecCount,
1721 		flags | B_PHYSICAL_IO_REQUEST, &writeLength);
1722 
1723 	SetStatus(status, writeLength);
1724 	return fStatus;
1725 }
1726 
1727 
1728 void
1729 PageWriteTransfer::SetStatus(status_t status, size_t transferred)
1730 {
1731 	// only succeed if all pages up to the last one have been written fully
1732 	// and the last page has at least been written partially
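	// For example, a four page transfer only counts as successful if more than
	// three full pages (3 * B_PAGE_SIZE bytes) were transferred.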
1733 	if (status == B_OK && transferred <= (fPageCount - 1) * B_PAGE_SIZE)
1734 		status = B_ERROR;
1735 
1736 	fStatus = status;
1737 }
1738 
1739 
1740 void
1741 PageWriteTransfer::IOFinished(status_t status, bool partialTransfer,
1742 	size_t bytesTransferred)
1743 {
1744 	SetStatus(status, bytesTransferred);
1745 	fRun->PageWritten(this, fStatus, partialTransfer, bytesTransferred);
1746 }
1747 
1748 
1749 status_t
1750 PageWriterRun::Init(uint32 maxPages)
1751 {
1752 	fMaxPages = maxPages;
1753 	fWrapperCount = 0;
1754 	fTransferCount = 0;
1755 	fPendingTransfers = 0;
1756 
1757 	fWrappers = new(std::nothrow) PageWriteWrapper[maxPages];
1758 	fTransfers = new(std::nothrow) PageWriteTransfer[maxPages];
1759 	if (fWrappers == NULL || fTransfers == NULL)
1760 		return B_NO_MEMORY;
1761 
1762 	return B_OK;
1763 }
1764 
1765 
1766 void
1767 PageWriterRun::PrepareNextRun()
1768 {
1769 	fWrapperCount = 0;
1770 	fTransferCount = 0;
1771 	fPendingTransfers = 0;
1772 }
1773 
1774 
1775 /*!	The page's cache must be locked.
1776 */
1777 void
1778 PageWriterRun::AddPage(vm_page* page)
1779 {
1780 	fWrappers[fWrapperCount++].SetTo(page);
1781 
1782 	if (fTransferCount == 0 || !fTransfers[fTransferCount - 1].AddPage(page)) {
1783 		fTransfers[fTransferCount++].SetTo(this, page,
1784 			page->Cache()->MaxPagesPerAsyncWrite());
1785 	}
1786 }
1787 
1788 
1789 void
1790 PageWriterRun::Go()
1791 {
1792 	fPendingTransfers = fTransferCount;
1793 
1794 	fAllFinishedCondition.Init(this, "page writer wait for I/O");
1795 	ConditionVariableEntry waitEntry;
1796 	fAllFinishedCondition.Add(&waitEntry);
1797 
1798 	// schedule writes
1799 	for (uint32 i = 0; i < fTransferCount; i++)
1800 		fTransfers[i].Schedule(B_VIP_IO_REQUEST);
1801 
1802 	// wait until all pages have been written
1803 	waitEntry.Wait();
1804 
1805 	// mark pages depending on whether they could be written or not
1806 
1807 	uint32 wrapperIndex = 0;
1808 	for (uint32 i = 0; i < fTransferCount; i++) {
1809 		PageWriteTransfer& transfer = fTransfers[i];
1810 		transfer.Cache()->Lock();
1811 
1812 		for (uint32 j = 0; j < transfer.PageCount(); j++)
1813 			fWrappers[wrapperIndex++].Done(transfer.Status());
1814 
1815 		transfer.Cache()->Unlock();
1816 	}
1817 
1818 	ASSERT(wrapperIndex == fWrapperCount);
1819 
1820 	for (uint32 i = 0; i < fTransferCount; i++) {
1821 		PageWriteTransfer& transfer = fTransfers[i];
1822 		struct VMCache* cache = transfer.Cache();
1823 
1824 		// We've acquired a reference for each page
1825 		for (uint32 j = 0; j < transfer.PageCount(); j++) {
1826 			// We release the cache references after all pages were made
1827 			// unbusy again - otherwise releasing a vnode could deadlock.
1828 			cache->ReleaseStoreRef();
1829 			cache->ReleaseRef();
1830 		}
1831 	}
1832 }
1833 
1834 
1835 void
1836 PageWriterRun::PageWritten(PageWriteTransfer* transfer, status_t status,
1837 	bool partialTransfer, size_t bytesTransferred)
1838 {
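	// atomic_add() returns the previous value, so a result of 1 means this was
	// the last outstanding transfer and the waiting Go() can be released.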
1839 	if (atomic_add(&fPendingTransfers, -1) == 1)
1840 		fAllFinishedCondition.NotifyAll();
1841 }
1842 
1843 
1844 /*!	The page writer continuously takes some pages from the modified
1845 	queue, writes them back, and moves them back to the active queue.
1846 	It runs in its own thread, and is only there to keep the number
1847 	of modified pages low, so that more pages can be reused with
1848 	fewer costs.
1849 */
1850 status_t
1851 page_writer(void* /*unused*/)
1852 {
1853 	const uint32 kNumPages = 256;
1854 	uint32 writtenPages = 0;
1855 	bigtime_t lastWrittenTime = 0;
1856 	bigtime_t pageCollectionTime = 0;
1857 	bigtime_t pageWritingTime = 0;
1858 
1859 	PageWriterRun run;
1860 	if (run.Init(kNumPages) != B_OK) {
1861 		panic("page writer: Failed to init PageWriterRun!");
1862 		return B_ERROR;
1863 	}
1864 
1865 	vm_page marker;
1866 	init_page_marker(marker);
1867 
1868 	while (true) {
1869 // TODO: Maybe wait shorter when memory is low!
1870 		if (sModifiedPageQueue.Count() < kNumPages) {
1871 			sPageWriterCondition.Wait(3000000, true);
1872 				// every 3 seconds when no one triggers us
1873 		}
1874 
1875 		int32 modifiedPages = sModifiedPageQueue.Count();
1876 		if (modifiedPages == 0)
1877 			continue;
1878 
1879 #if ENABLE_SWAP_SUPPORT
1880 		page_stats pageStats;
1881 		get_page_stats(pageStats);
1882 		bool activePaging = do_active_paging(pageStats);
1883 #endif
1884 
1885 		// depending on how urgent it becomes to get pages to disk, we adjust
1886 		// our I/O priority
1887 		uint32 lowPagesState = low_resource_state(B_KERNEL_RESOURCE_PAGES);
1888 		int32 ioPriority = B_IDLE_PRIORITY;
1889 		if (lowPagesState >= B_LOW_RESOURCE_CRITICAL
1890 			|| modifiedPages > MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD) {
1891 			ioPriority = MAX_PAGE_WRITER_IO_PRIORITY;
1892 		} else {
1893 			ioPriority = (uint64)MAX_PAGE_WRITER_IO_PRIORITY * modifiedPages
1894 				/ MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD;
1895 		}
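		// E.g. with the threshold of 10000, 5000 modified pages yield half of
		// MAX_PAGE_WRITER_IO_PRIORITY, while a critically low page resource or
		// more than 10000 modified pages use the full priority.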
1896 
1897 		thread_set_io_priority(ioPriority);
1898 
1899 		uint32 numPages = 0;
1900 		run.PrepareNextRun();
1901 
1902 		// TODO: make this laptop friendly, too (ie. only start doing
1903 		// something if someone else did something or there is really
1904 		// enough to do).
1905 
1906 		// collect pages to be written
1907 		pageCollectionTime -= system_time();
1908 
1909 		while (numPages < kNumPages) {
1910 			vm_page *page = next_modified_page(marker);
1911 			if (page == NULL)
1912 				break;
1913 
1914 			PageCacheLocker cacheLocker(page, false);
1915 			if (!cacheLocker.IsLocked())
1916 				continue;
1917 
1918 			VMCache *cache = page->Cache();
1919 
1920 			// If the page is busy or its state has changed while we were
1921 			// locking the cache, just ignore it.
1922 			if (page->busy || page->State() != PAGE_STATE_MODIFIED)
1923 				continue;
1924 
1925 			DEBUG_PAGE_ACCESS_START(page);
1926 
1927 			// Don't write back wired (locked) pages.
1928 			if (page->wired_count > 0) {
1929 				set_page_state(page, PAGE_STATE_ACTIVE);
1930 				DEBUG_PAGE_ACCESS_END(page);
1931 				continue;
1932 			}
1933 
1934 			// Write back temporary pages only when we're actively paging.
1935 			if (cache->temporary
1936 #if ENABLE_SWAP_SUPPORT
1937 				&& (!activePaging
1938 					|| !cache->CanWritePage(
1939 							(off_t)page->cache_offset << PAGE_SHIFT))
1940 #endif
1941 				) {
1942 				// We can't/don't want to do anything with this page, so move it
1943 				// to one of the other queues.
1944 				if (page->mappings.IsEmpty())
1945 					set_page_state(page, PAGE_STATE_INACTIVE);
1946 				else
1947 					set_page_state(page, PAGE_STATE_ACTIVE);
1948 
1949 				DEBUG_PAGE_ACCESS_END(page);
1950 				continue;
1951 			}
1952 
1953 			// We need our own reference to the store, as it might currently be
1954 			// destroyed.
1955 			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
1956 				DEBUG_PAGE_ACCESS_END(page);
1957 				cacheLocker.Unlock();
1958 				thread_yield(true);
1959 				continue;
1960 			}
1961 
1962 			run.AddPage(page);
1963 
1964 			DEBUG_PAGE_ACCESS_END(page);
1965 
1966 			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
1967 			TPW(WritePage(page));
1968 
1969 			cache->AcquireRefLocked();
1970 			numPages++;
1971 		}
1972 
1973 		pageCollectionTime += system_time();
1974 
1975 		if (numPages == 0)
1976 			continue;
1977 
1978 		// write pages to disk and do all the cleanup
1979 		pageWritingTime -= system_time();
1980 		run.Go();
1981 		pageWritingTime += system_time();
1982 
1983 		// debug output only...
1984 		writtenPages += numPages;
1985 		if (writtenPages >= 1024) {
1986 			bigtime_t now = system_time();
1987 			TRACE(("page writer: wrote 1024 pages (total: %llu ms, "
1988 				"collect: %llu ms, write: %llu ms)\n",
1989 				(now - lastWrittenTime) / 1000,
1990 				pageCollectionTime / 1000, pageWritingTime / 1000));
1991 			writtenPages -= 1024;
1992 			lastWrittenTime = now;
1993 			pageCollectionTime = 0;
1994 			pageWritingTime = 0;
1995 		}
1996 	}
1997 
1998 	remove_page_marker(marker);
1999 	return B_OK;
2000 }
2001 
2002 
2003 // #pragma mark -
2004 
2005 
2006 // TODO: This should be done in the page daemon!
2007 #if 0
2008 #if ENABLE_SWAP_SUPPORT
2009 static bool
2010 free_page_swap_space(int32 index)
2011 {
2012 	vm_page *page = vm_page_at_index(index);
2013 	PageCacheLocker locker(page);
2014 	if (!locker.IsLocked())
2015 		return false;
2016 
2017 	DEBUG_PAGE_ACCESS_START(page);
2018 
2019 	VMCache* cache = page->Cache();
2020 	if (cache->temporary && page->wired_count == 0
2021 			&& cache->HasPage(page->cache_offset << PAGE_SHIFT)
2022 			&& page->usage_count > 0) {
2023 		// TODO: how to judge a page is highly active?
2024 		if (swap_free_page_swap_space(page)) {
2025 			// We need to mark the page modified, since otherwise it could be
2026 			// stolen and we'd lose its data.
2027 			vm_page_set_state(page, PAGE_STATE_MODIFIED);
2028 			TD(FreedPageSwap(page));
2029 			DEBUG_PAGE_ACCESS_END(page);
2030 			return true;
2031 		}
2032 	}
2033 	DEBUG_PAGE_ACCESS_END(page);
2034 	return false;
2035 }
2036 #endif
2037 #endif	// 0
2038 
2039 
2040 static vm_page *
2041 find_cached_page_candidate(struct vm_page &marker)
2042 {
2043 	DEBUG_PAGE_ACCESS_CHECK(&marker);
2044 
2045 	InterruptsSpinLocker locker(sCachedPageQueue.GetLock());
2046 	vm_page *page;
2047 
2048 	if (marker.State() == PAGE_STATE_UNUSED) {
2049 		// Get the first page of the cached queue
2050 		page = sCachedPageQueue.Head();
2051 	} else {
2052 		// Get the next page of the current queue
2053 		if (marker.State() != PAGE_STATE_CACHED) {
2054 			panic("invalid marker %p state", &marker);
2055 			return NULL;
2056 		}
2057 
2058 		page = sCachedPageQueue.Next(&marker);
2059 		sCachedPageQueue.Remove(&marker);
2060 		marker.SetState(PAGE_STATE_UNUSED);
2061 	}
2062 
2063 	while (page != NULL) {
2064 		if (!page->busy) {
2065 			// we found a candidate, insert marker
2066 			marker.SetState(PAGE_STATE_CACHED);
2067 			sCachedPageQueue.InsertAfter(page, &marker);
2068 			return page;
2069 		}
2070 
2071 		page = sCachedPageQueue.Next(page);
2072 	}
2073 
2074 	return NULL;
2075 }
2076 
2077 
2078 static bool
2079 free_cached_page(vm_page *page, bool dontWait)
2080 {
2081 	// try to lock the page's cache
2082 	if (vm_cache_acquire_locked_page_cache(page, dontWait) == NULL)
2083 		return false;
2084 	VMCache* cache = page->Cache();
2085 
2086 	AutoLocker<VMCache> cacheLocker(cache, true);
2087 	MethodDeleter<VMCache> _2(cache, &VMCache::ReleaseRefLocked);
2088 
2089 	// check again if that page is still a candidate
2090 	if (page->busy || page->State() != PAGE_STATE_CACHED)
2091 		return false;
2092 
2093 	DEBUG_PAGE_ACCESS_START(page);
2094 
2095 	PAGE_ASSERT(page, !page->IsMapped());
2096 	PAGE_ASSERT(page, !page->modified);
2097 
2098 	// we can now steal this page
2099 
2100 	cache->RemovePage(page);
2101 
2102 	sCachedPageQueue.RemoveUnlocked(page);
2103 	return true;
2104 }
2105 
2106 
2107 static uint32
2108 free_cached_pages(uint32 pagesToFree, bool dontWait)
2109 {
2110 	vm_page marker;
2111 	init_page_marker(marker);
2112 
2113 	uint32 pagesFreed = 0;
2114 
2115 	while (pagesFreed < pagesToFree) {
2116 		vm_page *page = find_cached_page_candidate(marker);
2117 		if (page == NULL)
2118 			break;
2119 
2120 		if (free_cached_page(page, dontWait)) {
2121 			ReadLocker locker(sFreePageQueuesLock);
2122 			page->SetState(PAGE_STATE_FREE);
2123 			DEBUG_PAGE_ACCESS_END(page);
2124 			sFreePageQueue.PrependUnlocked(page);
2125 			locker.Unlock();
2126 
2127 			TA(StolenPage());
2128 
2129 			pagesFreed++;
2130 		}
2131 	}
2132 
2133 	remove_page_marker(marker);
2134 
2135 	return pagesFreed;
2136 }
2137 
2138 
2139 static void
2140 idle_scan_active_pages(page_stats& pageStats)
2141 {
2142 	VMPageQueue& queue = sActivePageQueue;
2143 
2144 	// We want to scan the whole queue in roughly kIdleRunsForFullQueue runs.
2145 	uint32 maxToScan = queue.Count() / kIdleRunsForFullQueue + 1;
2146 
2147 	while (maxToScan > 0) {
2148 		maxToScan--;
2149 
2150 		// Get the next page. Note that we don't bother to lock here. We go with
2151 		// the assumption that on all architectures reading/writing pointers is
2152 		// atomic. Beyond that it doesn't really matter. We have to unlock the
2153 		// queue anyway to lock the page's cache, and we'll recheck afterwards.
2154 		vm_page* page = queue.Head();
2155 		if (page == NULL)
2156 			break;
2157 
2158 		// lock the page's cache
2159 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2160 		if (cache == NULL)
2161 			continue;
2162 
2163 		if (page->State() != PAGE_STATE_ACTIVE) {
2164 			// page is no longer in the cache or in this queue
2165 			cache->ReleaseRefAndUnlock();
2166 			continue;
2167 		}
2168 
2169 		DEBUG_PAGE_ACCESS_START(page);
2170 
2171 		if (page->busy) {
2172 			// page is busy -- requeue at the end
2173 			vm_page_requeue(page, true);
2174 			cache->ReleaseRefAndUnlock();
2175 			DEBUG_PAGE_ACCESS_END(page);
2176 			continue;
2177 		}
2178 
2179 		// Get the page active/modified flags and update the page's usage count.
2180 		// We completely unmap inactive temporary pages. This saves us from
2181 		// having to iterate through the inactive list as well, since we'll be
2182 		// notified via page fault whenever such an inactive page is used again.
2183 		// We don't remove the mappings of non-temporary pages, since we
2184 		// wouldn't notice when they become unused and could thus be moved to
2185 		// the cached list.
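		// A worked example of the bookkeeping below (numbers are illustrative,
		// and this assumes vm_clear_page_mapping_accessed_flags() returns the
		// number of mappings whose accessed flag was set): a page with
		// usage_count 10 that was found accessed in 2 mappings gets
		// min(2 + 10 + kPageUsageAdvance, kPageUsageMax) = 15, while an
		// unaccessed one drops to 10 - kPageUsageDecline = 9; once the count
		// would fall below 0 it is clamped to 0 and the page is moved to the
		// inactive queue.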
2186 		int32 usageCount;
2187 		if (page->wired_count > 0 || page->usage_count > 0 || !cache->temporary)
2188 			usageCount = vm_clear_page_mapping_accessed_flags(page);
2189 		else
2190 			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2191 
2192 		if (usageCount > 0) {
2193 			usageCount += page->usage_count + kPageUsageAdvance;
2194 			if (usageCount > kPageUsageMax)
2195 				usageCount = kPageUsageMax;
2196 // TODO: This would probably also be the place to reclaim swap space.
2197 		} else {
2198 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2199 			if (usageCount < 0) {
2200 				usageCount = 0;
2201 				set_page_state(page, PAGE_STATE_INACTIVE);
2202 			}
2203 		}
2204 
2205 		page->usage_count = usageCount;
2206 
2207 		DEBUG_PAGE_ACCESS_END(page);
2208 
2209 		cache->ReleaseRefAndUnlock();
2210 	}
2211 }
2212 
2213 
2214 static void
2215 full_scan_inactive_pages(page_stats& pageStats, int32 despairLevel)
2216 {
2217 	int32 pagesToFree = pageStats.unsatisfiedReservations
2218 		+ sFreeOrCachedPagesTarget
2219 		- (pageStats.totalFreePages + pageStats.cachedPages);
2220 	if (pagesToFree <= 0)
2221 		return;
2222 
2223 	bigtime_t time = system_time();
2224 	uint32 pagesScanned = 0;
2225 	uint32 pagesToCached = 0;
2226 	uint32 pagesToModified = 0;
2227 	uint32 pagesToActive = 0;
2228 
2229 	// Determine how many pages at maximum to send to the modified queue. Since
2230 	// it is relatively expensive to page out pages, we do that on a grander
2231 	// scale only when things get desperate.
2232 	uint32 maxToFlush = despairLevel <= 1 ? 32 : 10000;
2233 
2234 	vm_page marker;
2235 	init_page_marker(marker);
2236 
2237 	VMPageQueue& queue = sInactivePageQueue;
2238 	InterruptsSpinLocker queueLocker(queue.GetLock());
2239 	uint32 maxToScan = queue.Count();
2240 
2241 	vm_page* nextPage = queue.Head();
2242 
2243 	while (pagesToFree > 0 && maxToScan > 0) {
2244 		maxToScan--;
2245 
2246 		// get the next page
2247 		vm_page* page = nextPage;
2248 		if (page == NULL)
2249 			break;
2250 		nextPage = queue.Next(page);
2251 
2252 		if (page->busy)
2253 			continue;
2254 
2255 		// mark the position
2256 		queue.InsertAfter(page, &marker);
2257 		queueLocker.Unlock();
2258 
2259 		// lock the page's cache
2260 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2261 		if (cache == NULL || page->busy
2262 				|| page->State() != PAGE_STATE_INACTIVE) {
2263 			if (cache != NULL)
2264 				cache->ReleaseRefAndUnlock();
2265 			queueLocker.Lock();
2266 			nextPage = queue.Next(&marker);
2267 			queue.Remove(&marker);
2268 			continue;
2269 		}
2270 
2271 		pagesScanned++;
2272 
2273 		DEBUG_PAGE_ACCESS_START(page);
2274 
2275 		// Get the accessed count, clear the accessed/modified flags and
2276 		// unmap the page, if it hasn't been accessed.
2277 		int32 usageCount;
2278 		if (page->wired_count > 0)
2279 			usageCount = vm_clear_page_mapping_accessed_flags(page);
2280 		else
2281 			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2282 
2283 		// update usage count
2284 		if (usageCount > 0) {
2285 			usageCount += page->usage_count + kPageUsageAdvance;
2286 			if (usageCount > kPageUsageMax)
2287 				usageCount = kPageUsageMax;
2288 		} else {
2289 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2290 			if (usageCount < 0)
2291 				usageCount = 0;
2292 		}
2293 
2294 		page->usage_count = usageCount;
2295 
2296 		// Move to fitting queue or requeue:
2297 		// * Active mapped pages go to the active queue.
2298 		// * Inactive mapped (i.e. wired) pages are requeued.
2299 		// * The remaining pages are cachable. Thus, if unmodified they go to
2300 		//   the cached queue, otherwise to the modified queue (up to a limit).
2301 		//   Note that, unlike in the idle scan, we don't exempt pages of
2302 		//   temporary caches here. Apparently we really need memory, so we
2303 		//   had better page it out as well.
2304 		bool isMapped = page->IsMapped();
2305 		if (usageCount > 0) {
2306 			if (isMapped) {
2307 				set_page_state(page, PAGE_STATE_ACTIVE);
2308 				pagesToActive++;
2309 			} else
2310 				vm_page_requeue(page, true);
2311 		} else if (isMapped) {
2312 			vm_page_requeue(page, true);
2313 		} else if (!page->modified) {
2314 			set_page_state(page, PAGE_STATE_CACHED);
2315 			pagesToFree--;
2316 			pagesToCached++;
2317 		} else if (maxToFlush > 0) {
2318 			set_page_state(page, PAGE_STATE_MODIFIED);
2319 			maxToFlush--;
2320 			pagesToModified++;
2321 		} else
2322 			vm_page_requeue(page, true);
2323 
2324 		DEBUG_PAGE_ACCESS_END(page);
2325 
2326 		cache->ReleaseRefAndUnlock();
2327 
2328 		// remove the marker
2329 		queueLocker.Lock();
2330 		nextPage = queue.Next(&marker);
2331 		queue.Remove(&marker);
2332 	}
2333 
2334 	queueLocker.Unlock();
2335 
2336 	time = system_time() - time;
2337 	TRACE_DAEMON("  -> inactive scan (%7lld us): scanned: %7lu, "
2338 		"moved: %lu -> cached, %lu -> modified, %lu -> active\n", time,
2339 		pagesScanned, pagesToCached, pagesToModified, pagesToActive);
2340 
2341 	// wake up the page writer, if we tossed it some pages
2342 	if (pagesToModified > 0)
2343 		sPageWriterCondition.WakeUp();
2344 }
2345 
2346 
2347 static void
2348 full_scan_active_pages(page_stats& pageStats, int32 despairLevel)
2349 {
2350 	vm_page marker;
2351 	init_page_marker(marker);
2352 
2353 	VMPageQueue& queue = sActivePageQueue;
2354 	InterruptsSpinLocker queueLocker(queue.GetLock());
2355 	uint32 maxToScan = queue.Count();
2356 
2357 	int32 pagesToDeactivate = pageStats.unsatisfiedReservations
2358 		+ sFreeOrCachedPagesTarget
2359 		- (pageStats.totalFreePages + pageStats.cachedPages)
2360 		+ std::max((int32)sInactivePagesTarget - (int32)maxToScan, (int32)0);
2361 	if (pagesToDeactivate <= 0)
2362 		return;
2363 
2364 	bigtime_t time = system_time();
2365 	uint32 pagesAccessed = 0;
2366 	uint32 pagesToInactive = 0;
2367 	uint32 pagesScanned = 0;
2368 
2369 	vm_page* nextPage = queue.Head();
2370 
2371 	while (pagesToDeactivate > 0 && maxToScan > 0) {
2372 		maxToScan--;
2373 
2374 		// get the next page
2375 		vm_page* page = nextPage;
2376 		if (page == NULL)
2377 			break;
2378 		nextPage = queue.Next(page);
2379 
2380 		if (page->busy)
2381 			continue;
2382 
2383 		// mark the position
2384 		queue.InsertAfter(page, &marker);
2385 		queueLocker.Unlock();
2386 
2387 		// lock the page's cache
2388 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2389 		if (cache == NULL || page->busy || page->State() != PAGE_STATE_ACTIVE) {
2390 			if (cache != NULL)
2391 				cache->ReleaseRefAndUnlock();
2392 			queueLocker.Lock();
2393 			nextPage = queue.Next(&marker);
2394 			queue.Remove(&marker);
2395 			continue;
2396 		}
2397 
2398 		pagesScanned++;
2399 
2400 		DEBUG_PAGE_ACCESS_START(page);
2401 
2402 		// Get the page active/modified flags and update the page's usage count.
2403 		int32 usageCount = vm_clear_page_mapping_accessed_flags(page);
2404 
2405 		if (usageCount > 0) {
2406 			usageCount += page->usage_count + kPageUsageAdvance;
2407 			if (usageCount > kPageUsageMax)
2408 				usageCount = kPageUsageMax;
2409 			pagesAccessed++;
2410 // TODO: This would probably also be the place to reclaim swap space.
2411 		} else {
2412 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2413 			if (usageCount <= 0) {
2414 				usageCount = 0;
2415 				set_page_state(page, PAGE_STATE_INACTIVE);
2416 				pagesToInactive++;
2417 			}
2418 		}
2419 
2420 		page->usage_count = usageCount;
2421 
2422 		DEBUG_PAGE_ACCESS_END(page);
2423 
2424 		cache->ReleaseRefAndUnlock();
2425 
2426 		// remove the marker
2427 		queueLocker.Lock();
2428 		nextPage = queue.Next(&marker);
2429 		queue.Remove(&marker);
2430 	}
2431 
2432 	time = system_time() - time;
2433 	TRACE_DAEMON("  ->   active scan (%7lld us): scanned: %7lu, "
2434 		"moved: %lu -> inactive, encountered %lu accessed ones\n", time,
2435 		pagesScanned, pagesToInactive, pagesAccessed);
2436 }
2437 
2438 
2439 static void
2440 page_daemon_idle_scan(page_stats& pageStats)
2441 {
2442 	TRACE_DAEMON("page daemon: idle run\n");
2443 
2444 	if (pageStats.totalFreePages < (int32)sFreePagesTarget) {
2445 		// We want more actually free pages, so free some from the cached
2446 		// ones.
2447 		uint32 freed = free_cached_pages(
2448 			sFreePagesTarget - pageStats.totalFreePages, false);
2449 		if (freed > 0)
2450 			unreserve_pages(freed);
2451 		get_page_stats(pageStats);
2452 	}
2453 
2454 	// Walk the active list and move pages to the inactive queue.
2455 	get_page_stats(pageStats);
2456 	idle_scan_active_pages(pageStats);
2457 }
2458 
2459 
2460 static void
2461 page_daemon_full_scan(page_stats& pageStats, int32 despairLevel)
2462 {
2463 	TRACE_DAEMON("page daemon: full run: free: %lu, cached: %lu, "
2464 		"to free: %lu\n", pageStats.totalFreePages, pageStats.cachedPages,
2465 		pageStats.unsatisfiedReservations + sFreeOrCachedPagesTarget
2466 			- (pageStats.totalFreePages + pageStats.cachedPages));
2467 
2468 	// Walk the inactive list and transfer pages to the cached and modified
2469 	// queues.
2470 	full_scan_inactive_pages(pageStats, despairLevel);
2471 
2472 	// Free cached pages. Also wake up reservation waiters.
2473 	get_page_stats(pageStats);
2474 	int32 pagesToFree = pageStats.unsatisfiedReservations + sFreePagesTarget
2475 		- (pageStats.totalFreePages);
2476 	if (pagesToFree > 0) {
2477 		uint32 freed = free_cached_pages(pagesToFree, true);
2478 		if (freed > 0)
2479 			unreserve_pages(freed);
2480 	}
2481 
2482 	// Walk the active list and move pages to the inactive queue.
2483 	get_page_stats(pageStats);
2484 	full_scan_active_pages(pageStats, despairLevel);
2485 }
2486 
2487 
2488 static status_t
2489 page_daemon(void* /*unused*/)
2490 {
2491 	int32 despairLevel = 0;
2492 
2493 	while (true) {
2494 		sPageDaemonCondition.ClearActivated();
2495 
2496 		// evaluate the free pages situation
2497 		page_stats pageStats;
2498 		get_page_stats(pageStats);
2499 
2500 		if (!do_active_paging(pageStats)) {
2501 			// Things look good -- just maintain statistics and keep the pool
2502 			// of actually free pages full enough.
2503 			despairLevel = 0;
2504 			page_daemon_idle_scan(pageStats);
2505 			sPageDaemonCondition.Wait(kIdleScanWaitInterval, false);
2506 		} else {
2507 			// Not enough free pages. We need to do some real work.
2508 			despairLevel = std::min(despairLevel + 1, (int32)3);
2509 			page_daemon_full_scan(pageStats, despairLevel);
2510 
2511 			// Don't wait after the first full scan, but rather immediately
2512 			// check whether we were successful in freeing enough pages and
2513 			// re-run with increased despair level. The first scan is
2514 			// conservative with respect to moving inactive modified pages to
2515 			// the modified list to avoid thrashing. The second scan, however,
2516 			// will not hold back.
2517 			if (despairLevel > 1)
2518 				snooze(kBusyScanWaitInterval);
2519 		}
2520 	}
2521 
2522 	return B_OK;
2523 }
2524 
2525 
2526 /*!	Returns how many pages could *not* be reserved.
2527 */
2528 static uint32
2529 reserve_pages(uint32 count, int priority, bool dontWait)
2530 {
2531 	int32 dontTouch = kPageReserveForPriority[priority];
2532 
2533 	while (true) {
2534 		count -= reserve_some_pages(count, dontTouch);
2535 		if (count == 0)
2536 			return 0;
2537 
2538 		if (sUnsatisfiedPageReservations == 0) {
2539 			count -= free_cached_pages(count, dontWait);
2540 			if (count == 0)
2541 				return count;
2542 		}
2543 
2544 		if (dontWait)
2545 			return count;
2546 
2547 		// we need to wait for pages to become available
2548 
2549 		MutexLocker pageDeficitLocker(sPageDeficitLock);
2550 
2551 		bool notifyDaemon = sUnsatisfiedPageReservations == 0;
2552 		sUnsatisfiedPageReservations += count;
2553 
2554 		if (sUnreservedFreePages > dontTouch) {
2555 			// the situation changed
2556 			sUnsatisfiedPageReservations -= count;
2557 			continue;
2558 		}
2559 
2560 		PageReservationWaiter waiter;
2561 		waiter.dontTouch = dontTouch;
2562 		waiter.missing = count;
2563 		waiter.thread = thread_get_current_thread();
2564 		waiter.threadPriority = waiter.thread->priority;
2565 
2566 		// insert ordered (i.e. after all waiters with higher or equal priority)
2567 		PageReservationWaiter* otherWaiter = NULL;
2568 		for (PageReservationWaiterList::Iterator it
2569 				= sPageReservationWaiters.GetIterator();
2570 			(otherWaiter = it.Next()) != NULL;) {
2571 			if (waiter < *otherWaiter)
2572 				break;
2573 		}
2574 
2575 		sPageReservationWaiters.InsertBefore(otherWaiter, &waiter);
2576 
2577 		thread_prepare_to_block(waiter.thread, 0, THREAD_BLOCK_TYPE_OTHER,
2578 			"waiting for pages");
2579 
2580 		if (notifyDaemon)
2581 			sPageDaemonCondition.WakeUp();
2582 
2583 		pageDeficitLocker.Unlock();
2584 
2585 		low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
2586 		thread_block();
2587 
2588 		pageDeficitLocker.Lock();
2589 
2590 		return 0;
2591 	}
2592 }
2593 
2594 
2595 //	#pragma mark - private kernel API
2596 
2597 
2598 /*!	Writes a range of modified pages of a cache to disk.
2599 	You need to hold the VMCache lock when calling this function.
2600 	Note that the cache lock may be temporarily released in this function.
2601 	\param cache The cache.
2602 	\param firstPage Offset (in page size units) of the first page in the range.
2603 	\param endPage End offset (in page size units) of the page range. The page
2604 		at this offset is not included.
2605 */
2606 status_t
2607 vm_page_write_modified_page_range(struct VMCache* cache, uint32 firstPage,
2608 	uint32 endPage)
2609 {
2610 	static const int32 kMaxPages = 256;
2611 	int32 maxPages = cache->MaxPagesPerWrite();
2612 	if (maxPages < 0 || maxPages > kMaxPages)
2613 		maxPages = kMaxPages;
2614 
2615 	const uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
2616 		| HEAP_DONT_LOCK_KERNEL_SPACE;
2617 
2618 	PageWriteWrapper stackWrappers[2];
2619 	PageWriteWrapper* wrapperPool
2620 		= new(malloc_flags(allocationFlags)) PageWriteWrapper[maxPages + 1];
2621 	if (wrapperPool == NULL) {
2622 		// don't fail, just limit our capabilities
2623 		wrapperPool = stackWrappers;
2624 		maxPages = 1;
2625 	}
2626 
2627 	int32 nextWrapper = 0;
2628 
2629 	PageWriteWrapper* wrappers[maxPages];
2630 	int32 usedWrappers = 0;
2631 
2632 	PageWriteTransfer transfer;
2633 	bool transferEmpty = true;
2634 
2635 	VMCachePagesTree::Iterator it
2636 		= cache->pages.GetIterator(firstPage, true, true);
2637 
2638 	while (true) {
2639 		vm_page* page = it.Next();
2640 		if (page == NULL || page->cache_offset >= endPage) {
2641 			if (transferEmpty)
2642 				break;
2643 
2644 			page = NULL;
2645 		}
2646 
2647 		if (page != NULL) {
2648 			if (page->busy
2649 				|| (page->State() != PAGE_STATE_MODIFIED
2650 					&& !vm_test_map_modification(page))) {
2651 				page = NULL;
2652 			}
2653 		}
2654 
2655 		PageWriteWrapper* wrapper = NULL;
2656 		if (page != NULL) {
2657 			wrapper = &wrapperPool[nextWrapper++];
2658 			if (nextWrapper > maxPages)
2659 				nextWrapper = 0;
2660 
2661 			DEBUG_PAGE_ACCESS_START(page);
2662 
2663 			wrapper->SetTo(page);
2664 
2665 			if (transferEmpty || transfer.AddPage(page)) {
2666 				if (transferEmpty) {
2667 					transfer.SetTo(NULL, page, maxPages);
2668 					transferEmpty = false;
2669 				}
2670 
2671 				DEBUG_PAGE_ACCESS_END(page);
2672 
2673 				wrappers[usedWrappers++] = wrapper;
2674 				continue;
2675 			}
2676 
2677 			DEBUG_PAGE_ACCESS_END(page);
2678 		}
2679 
2680 		if (transferEmpty)
2681 			continue;
2682 
2683 		cache->Unlock();
2684 		status_t status = transfer.Schedule(0);
2685 		cache->Lock();
2686 
2687 		for (int32 i = 0; i < usedWrappers; i++)
2688 			wrappers[i]->Done(status);
2689 
2690 		usedWrappers = 0;
2691 
2692 		if (page != NULL) {
2693 			transfer.SetTo(NULL, page, maxPages);
2694 			wrappers[usedWrappers++] = wrapper;
2695 		} else
2696 			transferEmpty = true;
2697 	}
2698 
2699 	if (wrapperPool != stackWrappers)
2700 		delete[] wrapperPool;
2701 
2702 	return B_OK;
2703 }
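
// Usage sketch (illustrative only, not part of the original file): flushing
// the first 16 pages of a cache to disk. It assumes "cache" is a VMCache the
// caller has referenced and locked, as the documentation above requires.
//
//	AutoLocker<VMCache> locker(cache);
//	status_t status = vm_page_write_modified_page_range(cache, 0, 16);
//	if (status != B_OK)
//		dprintf("writing modified pages failed: %s\n", strerror(status));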
2704 
2705 
2706 /*!	You need to hold the VMCache lock when calling this function.
2707 	Note that the cache lock may be temporarily released in this function.
2708 */
2709 status_t
2710 vm_page_write_modified_pages(VMCache *cache)
2711 {
2712 	return vm_page_write_modified_page_range(cache, 0,
2713 		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
2714 }
2715 
2716 
2717 /*!	Schedules the page writer to write back the specified \a page.
2718 	Note, however, that it might not do this immediately, and it can well
2719 	take several seconds until the page is actually written out.
2720 */
2721 void
2722 vm_page_schedule_write_page(vm_page *page)
2723 {
2724 	PAGE_ASSERT(page, page->State() == PAGE_STATE_MODIFIED);
2725 
2726 	vm_page_requeue(page, false);
2727 
2728 	sPageWriterCondition.WakeUp();
2729 }
2730 
2731 
2732 /*!	Cache must be locked.
2733 */
2734 void
2735 vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
2736 	uint32 endPage)
2737 {
2738 	uint32 modified = 0;
2739 	for (VMCachePagesTree::Iterator it
2740 				= cache->pages.GetIterator(firstPage, true, true);
2741 			vm_page *page = it.Next();) {
2742 		if (page->cache_offset >= endPage)
2743 			break;
2744 
2745 		if (!page->busy && page->State() == PAGE_STATE_MODIFIED) {
2746 			DEBUG_PAGE_ACCESS_START(page);
2747 			vm_page_requeue(page, false);
2748 			modified++;
2749 			DEBUG_PAGE_ACCESS_END(page);
2750 		}
2751 	}
2752 
2753 	if (modified > 0)
2754 		sPageWriterCondition.WakeUp();
2755 }
2756 
2757 
2758 void
2759 vm_page_init_num_pages(kernel_args *args)
2760 {
2761 	// calculate the size of memory by looking at the physical_memory_range array
2762 	addr_t physicalPagesEnd = 0;
2763 	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
2764 
2765 	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
2766 		physicalPagesEnd = (args->physical_memory_range[i].start
2767 			+ args->physical_memory_range[i].size) / B_PAGE_SIZE;
2768 	}
2769 
2770 	TRACE(("first phys page = 0x%lx, end 0x%lx\n", sPhysicalPageOffset,
2771 		physicalPagesEnd));
2772 
2773 	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
2774 
2775 #ifdef LIMIT_AVAILABLE_MEMORY
2776 	if (sNumPages > LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE))
2777 		sNumPages = LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE);
2778 #endif
2779 }
2780 
2781 
2782 status_t
2783 vm_page_init(kernel_args *args)
2784 {
2785 	TRACE(("vm_page_init: entry\n"));
2786 
2787 	// init page queues
2788 	sModifiedPageQueue.Init("modified pages queue");
2789 	sInactivePageQueue.Init("inactive pages queue");
2790 	sActivePageQueue.Init("active pages queue");
2791 	sCachedPageQueue.Init("cached pages queue");
2792 	sFreePageQueue.Init("free pages queue");
2793 	sClearPageQueue.Init("clear pages queue");
2794 
2795 	new (&sPageReservationWaiters) PageReservationWaiterList;
2796 
2797 	// map in the new free page table
2798 	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
2799 		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, false);
2800 
2801 	TRACE(("vm_init: putting free_page_table @ %p, # ents %ld (size 0x%x)\n",
2802 		sPages, sNumPages, (unsigned int)(sNumPages * sizeof(vm_page))));
2803 
2804 	// initialize the free page table
2805 	for (uint32 i = 0; i < sNumPages; i++) {
2806 		sPages[i].physical_page_number = sPhysicalPageOffset + i;
2807 		sPages[i].InitState(PAGE_STATE_FREE);
2808 		new(&sPages[i].mappings) vm_page_mappings();
2809 		sPages[i].wired_count = 0;
2810 		sPages[i].usage_count = 0;
2811 		sPages[i].busy_writing = false;
2812 		sPages[i].SetCacheRef(NULL);
2813 		#if DEBUG_PAGE_QUEUE
2814 			sPages[i].queue = NULL;
2815 		#endif
2816 		#if DEBUG_PAGE_ACCESS
2817 			sPages[i].accessing_thread = -1;
2818 		#endif
2819 		sFreePageQueue.Append(&sPages[i]);
2820 	}
2821 
2822 	sUnreservedFreePages = sNumPages;
2823 
2824 	TRACE(("initialized table\n"));
2825 
2826 	// mark the ranges between usable physical memory unused
2827 	addr_t previousEnd = 0;
2828 	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
2829 		addr_t base = args->physical_memory_range[i].start;
2830 		addr_t size = args->physical_memory_range[i].size;
2831 		if (base > previousEnd) {
2832 			mark_page_range_in_use(previousEnd / B_PAGE_SIZE,
2833 				(base - previousEnd) / B_PAGE_SIZE, false);
2834 		}
2835 		previousEnd = base + size;
2836 	}
2837 
2838 	// mark the allocated physical page ranges wired
2839 	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
2840 		mark_page_range_in_use(
2841 			args->physical_allocated_range[i].start / B_PAGE_SIZE,
2842 			args->physical_allocated_range[i].size / B_PAGE_SIZE, true);
2843 	}
2844 
2845 	// The target of actually free pages. This must be at least the system
2846 	// reserve, but should be a few more pages, so we don't have to extract
2847 	// a cached page with each allocation.
2848 	sFreePagesTarget = VM_PAGE_RESERVE_USER
2849 		+ std::max((uint32)32, sNumPages / 1024);
2850 
2851 	// The target of free + cached and inactive pages. On low-memory machines
2852 	// keep things tight. free + cached is the pool of immediately allocatable
2853 	// pages. We want a few inactive pages, so when we're actually paging, we
2854 	// have a reasonably large set of pages to work with.
2855 	if (sUnreservedFreePages < 16 * 1024) {
2856 		sFreeOrCachedPagesTarget = sFreePagesTarget + 128;
2857 		sInactivePagesTarget = sFreePagesTarget / 3;
2858 	} else {
2859 		sFreeOrCachedPagesTarget = 2 * sFreePagesTarget;
2860 		sInactivePagesTarget = sFreePagesTarget / 2;
2861 	}
2862 
2863 	TRACE(("vm_page_init: exit\n"));
2864 
2865 	return B_OK;
2866 }
2867 
2868 
2869 status_t
2870 vm_page_init_post_area(kernel_args *args)
2871 {
2872 	void *dummy;
2873 
2874 	dummy = sPages;
2875 	create_area("page structures", &dummy, B_EXACT_ADDRESS,
2876 		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
2877 		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
2878 
2879 	add_debugger_command("page_stats", &dump_page_stats,
2880 		"Dump statistics about page usage");
2881 	add_debugger_command_etc("page", &dump_page,
2882 		"Dump page info",
2883 		"[ \"-p\" | \"-v\" ] [ \"-m\" ] <address>\n"
2884 		"Prints information for the physical page. If neither \"-p\" nor\n"
2885 		"\"-v\" are given, the provided address is interpreted as address of\n"
2886 		"the vm_page data structure for the page in question. If \"-p\" is\n"
2887 		"given, the address is the physical address of the page. If \"-v\" is\n"
2888 		"given, the address is interpreted as virtual address in the current\n"
2889 		"thread's address space and for the page it is mapped to (if any)\n"
2890 		"information are printed. If \"-m\" is specified, the command will\n"
2891 		"search all known address spaces for mappings to that page and print\n"
2892 		"them.\n", 0);
2893 	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
2894 	add_debugger_command("find_page", &find_page,
2895 		"Find out which queue a page is actually in");
2896 
2897 #ifdef TRACK_PAGE_USAGE_STATS
2898 	add_debugger_command_etc("page_usage", &dump_page_usage_stats,
2899 		"Dumps statistics about page usage counts",
2900 		"\n"
2901 		"Dumps statistics about page usage counts.\n",
2902 		B_KDEBUG_DONT_PARSE_ARGUMENTS);
2903 #endif
2904 
2905 	return B_OK;
2906 }
2907 
2908 
2909 status_t
2910 vm_page_init_post_thread(kernel_args *args)
2911 {
2912 	new (&sFreePageCondition) ConditionVariable;
2913 	sFreePageCondition.Publish(&sFreePageQueue, "free page");
2914 
2915 	// create a kernel thread to clear out pages
2916 
2917 	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
2918 		B_LOWEST_ACTIVE_PRIORITY, NULL);
2919 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
2920 
2921 	// start page writer
2922 
2923 	sPageWriterCondition.Init("page writer");
2924 
2925 	thread = spawn_kernel_thread(&page_writer, "page writer",
2926 		B_NORMAL_PRIORITY + 1, NULL);
2927 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
2928 
2929 	// start page daemon
2930 
2931 	sPageDaemonCondition.Init("page daemon");
2932 
2933 	thread = spawn_kernel_thread(&page_daemon, "page daemon",
2934 		B_NORMAL_PRIORITY, NULL);
2935 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
2936 
2937 	return B_OK;
2938 }
2939 
2940 
2941 status_t
2942 vm_mark_page_inuse(addr_t page)
2943 {
2944 	return vm_mark_page_range_inuse(page, 1);
2945 }
2946 
2947 
2948 status_t
2949 vm_mark_page_range_inuse(addr_t startPage, addr_t length)
2950 {
2951 	return mark_page_range_in_use(startPage, length, false);
2952 }
2953 
2954 
2955 /*!	Unreserve pages previously reserved with vm_page_reserve_pages().
2956 */
2957 void
2958 vm_page_unreserve_pages(vm_page_reservation* reservation)
2959 {
2960 	uint32 count = reservation->count;
2961 	reservation->count = 0;
2962 
2963 	if (count == 0)
2964 		return;
2965 
2966 	TA(UnreservePages(count));
2967 
2968 	unreserve_pages(count);
2969 }
2970 
2971 
2972 /*!	With this call, you can reserve a number of free pages in the system.
2973 	They will only be handed out to someone who has actually reserved them.
2974 	This call returns as soon as the number of requested pages has been
2975 	reached.
2976 	The caller must not hold any cache lock or the function might deadlock.
2977 */
2978 void
2979 vm_page_reserve_pages(vm_page_reservation* reservation, uint32 count,
2980 	int priority)
2981 {
2982 	reservation->count = count;
2983 
2984 	if (count == 0)
2985 		return;
2986 
2987 	TA(ReservePages(count));
2988 
2989 	reserve_pages(count, priority, false);
2990 }
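
// Usage sketch (illustrative only, not part of the original file): reserve a
// few pages up front, allocate one of them, and hand the rest back.
// VM_PRIORITY_SYSTEM is assumed to be one of the reservation priority
// constants that index kPageReserveForPriority.
//
//	vm_page_reservation reservation;
//	vm_page_reserve_pages(&reservation, 4, VM_PRIORITY_SYSTEM);
//		// may block -- must not be called while holding a cache lock
//
//	vm_page* page = vm_page_allocate_page(&reservation,
//		PAGE_STATE_WIRED | VM_PAGE_ALLOC_CLEAR);
//
//	// ... map and use the page ...
//
//	// return the three reserved pages we did not allocate
//	vm_page_unreserve_pages(&reservation);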
2991 
2992 
2993 bool
2994 vm_page_try_reserve_pages(vm_page_reservation* reservation, uint32 count,
2995 	int priority)
2996 {
2997 	if (count == 0) {
2998 		reservation->count = count;
2999 		return true;
3000 	}
3001 
3002 	uint32 remaining = reserve_pages(count, priority, true);
3003 	if (remaining == 0) {
3004 		TA(ReservePages(count));
3005 		reservation->count = count;
3006 		return true;
3007 	}
3008 
3009 	unreserve_pages(count - remaining);
3010 
3011 	return false;
3012 }
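
// Usage sketch (illustrative only): try the non-blocking reservation first
// and fall back to the blocking variant if the pages are not immediately
// available. VM_PRIORITY_USER is assumed to be the lowest reservation
// priority constant.
//
//	vm_page_reservation reservation;
//	if (!vm_page_try_reserve_pages(&reservation, 32, VM_PRIORITY_USER)) {
//		// not enough pages right now -- wait until they become available
//		vm_page_reserve_pages(&reservation, 32, VM_PRIORITY_USER);
//	}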
3013 
3014 
3015 vm_page *
3016 vm_page_allocate_page(vm_page_reservation* reservation, uint32 flags)
3017 {
3018 	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3019 	ASSERT(pageState != PAGE_STATE_FREE);
3020 	ASSERT(pageState != PAGE_STATE_CLEAR);
3021 
3022 	ASSERT(reservation->count > 0);
3023 	reservation->count--;
3024 
3025 	VMPageQueue* queue;
3026 	VMPageQueue* otherQueue;
3027 
3028 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3029 		queue = &sClearPageQueue;
3030 		otherQueue = &sFreePageQueue;
3031 	} else {
3032 		queue = &sFreePageQueue;
3033 		otherQueue = &sClearPageQueue;
3034 	}
3035 
3036 	TA(AllocatePage());
3037 
3038 	ReadLocker locker(sFreePageQueuesLock);
3039 
3040 	vm_page* page = queue->RemoveHeadUnlocked();
3041 	if (page == NULL) {
3042 		// if the primary queue was empty, grab the page from the
3043 		// secondary queue
3044 		page = otherQueue->RemoveHeadUnlocked();
3045 
3046 		if (page == NULL) {
3047 			// Unlikely, but possible: the page we have reserved has moved
3048 			// between the queues after we checked the first queue. Grab the
3049 			// write locker to make sure this doesn't happen again.
3050 			locker.Unlock();
3051 			WriteLocker writeLocker(sFreePageQueuesLock);
3052 
3053 			page = queue->RemoveHead();
3054 			if (page == NULL)
3055 				page = otherQueue->RemoveHead();
3056 
3057 			if (page == NULL) {
3058 				panic("Had reserved page, but there is none!");
3059 				return NULL;
3060 			}
3061 
3062 			// downgrade to read lock
3063 			locker.Lock();
3064 		}
3065 	}
3066 
3067 	if (page->CacheRef() != NULL)
3068 		panic("supposed to be free page %p has cache\n", page);
3069 
3070 	DEBUG_PAGE_ACCESS_START(page);
3071 
3072 	int oldPageState = page->State();
3073 	page->SetState(pageState);
3074 	page->busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3075 	page->usage_count = 0;
3076 	page->accessed = false;
3077 	page->modified = false;
3078 
3079 	locker.Unlock();
3080 
3081 	if (pageState < PAGE_STATE_FIRST_UNQUEUED)
3082 		sPageQueues[pageState].AppendUnlocked(page);
3083 
3084 	// clear the page, if we had to take it from the free queue and a clear
3085 	// page was requested
3086 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0 && oldPageState != PAGE_STATE_CLEAR)
3087 		clear_page(page);
3088 
3089 	return page;
3090 }
3091 
3092 
3093 static vm_page*
3094 allocate_page_run(page_num_t start, page_num_t length, uint32 flags,
3095 	WriteLocker& freeClearQueueLocker)
3096 {
3097 	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3098 	ASSERT(pageState != PAGE_STATE_FREE);
3099 	ASSERT(pageState != PAGE_STATE_CLEAR);
3100 
3101 	TA(AllocatePageRun(length));
3102 
3103 	// pull the pages out of the appropriate queues
3104 	VMPageQueue::PageList freePages;
3105 	VMPageQueue::PageList clearPages;
3106 	for (page_num_t i = 0; i < length; i++) {
3107 		vm_page& page = sPages[start + i];
3108 		DEBUG_PAGE_ACCESS_START(&page);
3109 		if (page.State() == PAGE_STATE_CLEAR) {
3110 			sClearPageQueue.Remove(&page);
3111 			clearPages.Add(&page);
3112 		} else {
3113 			sFreePageQueue.Remove(&page);
3114 			freePages.Add(&page);
3115 		}
3116 
3117 		page.SetState(pageState);
3118 		page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3119 		page.usage_count = 0;
3120 		page.accessed = false;
3121 		page.modified = false;
3122 	}
3123 
3124 	freeClearQueueLocker.Unlock();
3125 
3126 	// clear pages, if requested
3127 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3128 		for (VMPageQueue::PageList::Iterator it = freePages.GetIterator();
3129 				vm_page* page = it.Next();) {
3130  			clear_page(page);
3131 		}
3132 	}
3133 
3134 	// add pages to target queue
3135 	if (pageState < PAGE_STATE_FIRST_UNQUEUED) {
3136 		freePages.MoveFrom(&clearPages);
3137 		sPageQueues[pageState].AppendUnlocked(freePages, length);
3138 	}
3139 
3140 	// Note: We don't unreserve the pages since we pulled them out of the
3141 	// free/clear queues without adjusting sUnreservedFreePages.
3142 
3143 	return &sPages[start];
3144 }
3145 
3146 
3147 vm_page *
3148 vm_page_allocate_page_run(uint32 flags, addr_t base, addr_t length,
3149 	int priority)
3150 {
3151 	uint32 start = base >> PAGE_SHIFT;
3152 
3153 	vm_page_reservation reservation;
3154 	if (!vm_page_try_reserve_pages(&reservation, length, priority))
3155 		return NULL;
3156 		// TODO: add more tries, ie. free some inactive, ...
3157 		// no free space
3158 
3159 	WriteLocker freeClearQueueLocker(sFreePageQueuesLock);
3160 
3161 	for (;;) {
3162 		bool foundRun = true;
3163 		if (start + length > sNumPages) {
3164 			freeClearQueueLocker.Unlock();
3165 			vm_page_unreserve_pages(&reservation);
3166 			return NULL;
3167 		}
3168 
3169 		uint32 i;
3170 		for (i = 0; i < length; i++) {
3171 			if (sPages[start + i].State() != PAGE_STATE_FREE
3172 				&& sPages[start + i].State() != PAGE_STATE_CLEAR) {
3173 				foundRun = false;
3174 				i++;
3175 				break;
3176 			}
3177 		}
3178 
3179 		if (foundRun)
3180 			return allocate_page_run(start, length, flags,
3181 				freeClearQueueLocker);
3182 
3183 		start += i;
3184 	}
3185 }
3186 
3187 
3188 vm_page *
3189 vm_page_allocate_page_run_no_base(uint32 flags, addr_t count, int priority)
3190 {
3191 	VMPageQueue* queue;
3192 	VMPageQueue* otherQueue;
3193 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3194 		queue = &sClearPageQueue;
3195 		otherQueue = &sFreePageQueue;
3196 	} else {
3197 		queue = &sFreePageQueue;
3198 		otherQueue = &sClearPageQueue;
3199 	}
3200 
3201 	vm_page_reservation reservation;
3202 	if (!vm_page_try_reserve_pages(&reservation, count, priority))
3203 		return NULL;
3204 		// TODO: add more tries, ie. free some inactive, ...
3205 		// no free space
3206 
3207 	WriteLocker freeClearQueueLocker(sFreePageQueuesLock);
3208 
3209 	for (;;) {
3210 		VMPageQueue::Iterator it = queue->GetIterator();
3211 		while (vm_page *page = it.Next()) {
3212 			vm_page *current = page;
3213 			if (current >= &sPages[sNumPages - count])
3214 				continue;
3215 
3216 			bool foundRun = true;
3217 			for (uint32 i = 0; i < count; i++, current++) {
3218 				if (current->State() != PAGE_STATE_FREE
3219 					&& current->State() != PAGE_STATE_CLEAR) {
3220 					foundRun = false;
3221 					break;
3222 				}
3223 			}
3224 
3225 			if (foundRun) {
3226 				return allocate_page_run(page - sPages, count, flags,
3227 					freeClearQueueLocker);
3228 			}
3229 		}
3230 
3231 		if (queue == otherQueue) {
3232 			freeClearQueueLocker.Unlock();
3233 			vm_page_unreserve_pages(&reservation);
			return NULL;
3234 		}
3235 
3236 		queue = otherQueue;
3237 	}
3238 }
3239 
3240 
3241 vm_page *
3242 vm_page_at_index(int32 index)
3243 {
3244 	return &sPages[index];
3245 }
3246 
3247 
3248 vm_page *
3249 vm_lookup_page(addr_t pageNumber)
3250 {
3251 	if (pageNumber < sPhysicalPageOffset)
3252 		return NULL;
3253 
3254 	pageNumber -= sPhysicalPageOffset;
3255 	if (pageNumber >= sNumPages)
3256 		return NULL;
3257 
3258 	return &sPages[pageNumber];
3259 }
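
// Usage sketch (illustrative only): translating a physical address back to
// its vm_page structure; "physicalAddress" is a hypothetical caller-supplied
// value.
//
//	vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
//	if (page != NULL)
//		dprintf("page %p is in state %d\n", page, page->State());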
3260 
3261 
3262 bool
3263 vm_page_is_dummy(struct vm_page *page)
3264 {
3265 	return page < sPages || page >= sPages + sNumPages;
3266 }
3267 
3268 
3269 /*!	Free the page that belonged to a certain cache.
3270 	You can use vm_page_set_state() manually if you prefer, but only
3271 	if the page does not equal PAGE_STATE_MODIFIED.
3272 */
3273 void
3274 vm_page_free(VMCache *cache, vm_page *page)
3275 {
3276 	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
3277 		&& page->State() != PAGE_STATE_CLEAR);
3278 
3279 	if (page->State() == PAGE_STATE_MODIFIED && cache->temporary)
3280 		atomic_add(&sModifiedTemporaryPages, -1);
3281 
3282 	free_page(page, false);
3283 }
3284 
3285 
3286 void
3287 vm_page_set_state(vm_page *page, int pageState)
3288 {
3289 	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
3290 		&& page->State() != PAGE_STATE_CLEAR);
3291 
3292 	if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR)
3293 		free_page(page, pageState == PAGE_STATE_CLEAR);
3294 	else
3295 		set_page_state(page, pageState);
3296 }
3297 
3298 
3299 /*!	Moves a page to either the tail or the head of its current queue,
3300 	depending on \a tail.
3301 	The page must have a cache and the cache must be locked!
3302 */
3303 void
3304 vm_page_requeue(struct vm_page *page, bool tail)
3305 {
3306 	PAGE_ASSERT(page, page->Cache() != NULL);
3307 	DEBUG_PAGE_ACCESS_CHECK(page);
3308 
3309 	VMPageQueue *queue = NULL;
3310 
3311 	switch (page->State()) {
3312 		case PAGE_STATE_ACTIVE:
3313 			queue = &sActivePageQueue;
3314 			break;
3315 		case PAGE_STATE_INACTIVE:
3316 			queue = &sInactivePageQueue;
3317 			break;
3318 		case PAGE_STATE_MODIFIED:
3319 			queue = &sModifiedPageQueue;
3320 			break;
3321 		case PAGE_STATE_CACHED:
3322 			queue = &sCachedPageQueue;
3323 			break;
3324 		case PAGE_STATE_FREE:
3325 		case PAGE_STATE_CLEAR:
3326 			panic("vm_page_requeue() called for free/clear page %p", page);
3327 			return;
3328 		case PAGE_STATE_WIRED:
3329 		case PAGE_STATE_UNUSED:
3330 			return;
3331 		default:
3332 			panic("vm_page_touch: vm_page %p in invalid state %d\n",
3333 				page, page->State());
3334 			break;
3335 	}
3336 
3337 	queue->RequeueUnlocked(page, tail);
3338 }
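
// Usage sketch (illustrative only; the caller holds the page's cache lock):
// pushing a page that was just found accessed to the tail of its queue so
// that the scanning daemons will look at it last.
//
//	DEBUG_PAGE_ACCESS_START(page);
//	vm_page_requeue(page, true);
//	DEBUG_PAGE_ACCESS_END(page);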
3339 
3340 
3341 size_t
3342 vm_page_num_pages(void)
3343 {
3344 	return sNumPages;
3345 }
3346 
3347 
3348 /*! There is a subtle distinction between the page counts returned by
3349 	this function and vm_page_num_free_pages():
3350 	The latter returns the number of pages that are completely uncommitted,
3351 	whereas this one returns the number of pages that are available for
3352 	use by being reclaimed as well (IOW it factors in things like cache pages
3353 	as available).
3354 */
3355 size_t
3356 vm_page_num_available_pages(void)
3357 {
3358 	return vm_available_memory() / B_PAGE_SIZE;
3359 }
3360 
3361 
3362 size_t
3363 vm_page_num_free_pages(void)
3364 {
3365 	int32 count = sUnreservedFreePages + sCachedPageQueue.Count();
3366 	return count > 0 ? count : 0;
3367 }
3368 
3369 
3370 size_t
3371 vm_page_num_unused_pages(void)
3372 {
3373 	int32 count = sUnreservedFreePages;
3374 	return count > 0 ? count : 0;
3375 }
3376 
3377 
3378 void
3379 vm_page_get_stats(system_info *info)
3380 {
3381 	// Get free pages count -- not really exact, since we don't know how many
3382 	// of the reserved pages have already been allocated, but good citizens
3383 	// unreserve chunk-wise as they are allocating the pages, if they have
3384 	// reserved a larger quantity.
3385 	int32 free = sUnreservedFreePages;
3386 	if (free < 0)
3387 		free = 0;
3388 
3389 	// The pages used for the block cache buffers. Those should not be counted
3390 	// as used but as cached pages.
3391 	// TODO: We should subtract the blocks that are in use ATM, since those
3392 	// can't really be freed in a low memory situation.
3393 	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
3394 
3395 	info->max_pages = sNumPages;
3396 	info->used_pages = gMappedPagesCount - blockCachePages;
3397 	info->cached_pages = sNumPages >= (uint32)free + info->used_pages
3398 		? sNumPages - free - info->used_pages : 0;
3399 	info->page_faults = vm_num_page_faults();
3400 
3401 	// TODO: We don't consider pages used for page directories/tables yet.
3402 }
3403