xref: /haiku/src/system/kernel/vm/vm_page.cpp (revision fc75f2df0c666dcc61be83c4facdd3132340c2fb)
1 /*
2  * Copyright 2010-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2010, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 #include <string.h>
12 #include <stdlib.h>
13 
14 #include <algorithm>
15 
16 #include <KernelExport.h>
17 #include <OS.h>
18 
19 #include <AutoDeleter.h>
20 
21 #include <arch/cpu.h>
22 #include <arch/vm_translation_map.h>
23 #include <block_cache.h>
24 #include <boot/kernel_args.h>
25 #include <condition_variable.h>
26 #include <heap.h>
27 #include <kernel.h>
28 #include <low_resource_manager.h>
29 #include <thread.h>
30 #include <tracing.h>
31 #include <util/AutoLock.h>
32 #include <vfs.h>
33 #include <vm/vm.h>
34 #include <vm/vm_priv.h>
35 #include <vm/vm_page.h>
36 #include <vm/VMAddressSpace.h>
37 #include <vm/VMArea.h>
38 #include <vm/VMCache.h>
39 
40 #include "IORequest.h"
41 #include "PageCacheLocker.h"
42 #include "VMAnonymousCache.h"
43 #include "VMPageQueue.h"
44 
45 
46 //#define TRACE_VM_PAGE
47 #ifdef TRACE_VM_PAGE
48 #	define TRACE(x) dprintf x
49 #else
50 #	define TRACE(x) ;
51 #endif
52 
53 //#define TRACE_VM_DAEMONS
54 #ifdef TRACE_VM_DAEMONS
55 #define TRACE_DAEMON(x...) dprintf(x)
56 #else
57 #define TRACE_DAEMON(x...) do {} while (false)
58 #endif
59 
60 //#define TRACK_PAGE_USAGE_STATS	1
61 
62 #define PAGE_ASSERT(page, condition)	\
63 	ASSERT_PRINT((condition), "page: %p", (page))
64 
65 #define SCRUB_SIZE 16
66 	// this many pages will be cleared at once in the page scrubber thread
67 
68 #define MAX_PAGE_WRITER_IO_PRIORITY				B_URGENT_DISPLAY_PRIORITY
69 	// maximum I/O priority of the page writer
70 #define MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD	10000
71 	// the maximum I/O priority shall be reached when this many pages need to
72 	// be written
73 
74 
75 // The page reserve that an allocation of a given priority must not touch.
76 static const size_t kPageReserveForPriority[] = {
77 	VM_PAGE_RESERVE_USER,		// user
78 	VM_PAGE_RESERVE_SYSTEM,		// system
79 	0							// VIP
80 };
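// For example, an allocation with VM_PRIORITY_USER must leave
// VM_PAGE_RESERVE_USER unreserved pages untouched so that system and VIP
// allocations can still succeed, while a VIP allocation may use the entire
// remaining reserve (dontTouch == 0).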
81 
82 // Minimum number of free pages the page daemon will try to achieve.
83 static uint32 sFreePagesTarget;
84 static uint32 sFreeOrCachedPagesTarget;
85 static uint32 sInactivePagesTarget;
86 
87 // Wait interval between page daemon runs.
88 static const bigtime_t kIdleScanWaitInterval = 1000000LL;	// 1 sec
89 static const bigtime_t kBusyScanWaitInterval = 500000LL;	// 0.5 sec
90 
91 // Number of idle runs after which we want to have processed the full active
92 // queue.
93 static const uint32 kIdleRunsForFullQueue = 20;
94 
95 // Maximum limit for the vm_page::usage_count.
96 static const int32 kPageUsageMax = 64;
97 // vm_page::usage_count buff an accessed page receives in a scan.
98 static const int32 kPageUsageAdvance = 3;
99 // vm_page::usage_count debuff an unaccessed page receives in a scan.
100 static const int32 kPageUsageDecline = 1;
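// Taken together these constants mean that a page found accessed in a scan
// gains kPageUsageAdvance (3) up to the kPageUsageMax cap of 64, while an
// unaccessed page loses kPageUsageDecline (1) per scan, so a formerly hot
// page only ages out after many scans without accesses.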
101 
102 int32 gMappedPagesCount;
103 
104 static VMPageQueue sPageQueues[PAGE_STATE_COUNT];
105 
106 static VMPageQueue& sFreePageQueue = sPageQueues[PAGE_STATE_FREE];
107 static VMPageQueue& sClearPageQueue = sPageQueues[PAGE_STATE_CLEAR];
108 static VMPageQueue& sModifiedPageQueue = sPageQueues[PAGE_STATE_MODIFIED];
109 static VMPageQueue& sInactivePageQueue = sPageQueues[PAGE_STATE_INACTIVE];
110 static VMPageQueue& sActivePageQueue = sPageQueues[PAGE_STATE_ACTIVE];
111 static VMPageQueue& sCachedPageQueue = sPageQueues[PAGE_STATE_CACHED];
112 
113 static vm_page *sPages;
114 static page_num_t sPhysicalPageOffset;
115 static page_num_t sNumPages;
116 static page_num_t sNonExistingPages;
117 	// pages in the sPages array that aren't backed by physical memory
118 static uint64 sIgnoredPages;
119 	// pages of physical memory ignored by the boot loader (and thus not
120 	// available here)
121 static vint32 sUnreservedFreePages;
122 static vint32 sUnsatisfiedPageReservations;
123 static vint32 sModifiedTemporaryPages;
124 
125 static ConditionVariable sFreePageCondition;
126 static mutex sPageDeficitLock = MUTEX_INITIALIZER("page deficit");
127 
128 // This lock must be used whenever the free or clear page queues are changed.
129 // If you need to work on both queues at the same time, you must hold a write
130 // lock; otherwise a read lock suffices (each queue still has a spinlock to
131 // guard against concurrent changes).
132 static rw_lock sFreePageQueuesLock
133 	= RW_LOCK_INITIALIZER("free/clear page queues");
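// Usage pattern in this file: free_page() and the page scrubber only touch
// one queue at a time (relying on the per-queue spinlock) and therefore take
// a ReadLocker on sFreePageQueuesLock, whereas mark_page_range_in_use(),
// which may remove pages from either queue, takes a WriteLocker.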
134 
135 #ifdef TRACK_PAGE_USAGE_STATS
136 static page_num_t sPageUsageArrays[512];
137 static page_num_t* sPageUsage = sPageUsageArrays;
138 static page_num_t sPageUsagePageCount;
139 static page_num_t* sNextPageUsage = sPageUsageArrays + 256;
140 static page_num_t sNextPageUsagePageCount;
141 #endif
142 
143 
144 struct page_stats {
145 	int32	totalFreePages;
146 	int32	unsatisfiedReservations;
147 	int32	cachedPages;
148 };
149 
150 
151 struct PageReservationWaiter
152 		: public DoublyLinkedListLinkImpl<PageReservationWaiter> {
153 	Thread*	thread;
154 	uint32	dontTouch;		// reserve not to touch
155 	uint32	missing;		// pages missing for the reservation
156 	int32	threadPriority;
157 
158 	bool operator<(const PageReservationWaiter& other) const
159 	{
160 		// Implies an order by descending VM priority (ascending dontTouch)
161 		// and (secondarily) descending thread priority.
162 		if (dontTouch != other.dontTouch)
163 			return dontTouch < other.dontTouch;
164 		return threadPriority > other.threadPriority;
165 	}
166 };
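// Ordering example: a VIP waiter (dontTouch == 0) sorts before a system
// waiter, which sorts before a user waiter; among waiters with equal
// dontTouch, the one with the higher thread priority comes first. The waiter
// list is expected to be kept sorted with this comparator, so the Head()
// served first by wake_up_page_reservation_waiters() is always the most
// privileged waiter.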
167 
168 typedef DoublyLinkedList<PageReservationWaiter> PageReservationWaiterList;
169 static PageReservationWaiterList sPageReservationWaiters;
170 
171 
172 struct DaemonCondition {
173 	void Init(const char* name)
174 	{
175 		mutex_init(&fLock, "daemon condition");
176 		fCondition.Init(this, name);
177 		fActivated = false;
178 	}
179 
180 	bool Lock()
181 	{
182 		return mutex_lock(&fLock) == B_OK;
183 	}
184 
185 	void Unlock()
186 	{
187 		mutex_unlock(&fLock);
188 	}
189 
190 	bool Wait(bigtime_t timeout, bool clearActivated)
191 	{
192 		MutexLocker locker(fLock);
193 		if (clearActivated)
194 			fActivated = false;
195 		else if (fActivated)
196 			return true;
197 
198 		ConditionVariableEntry entry;
199 		fCondition.Add(&entry);
200 
201 		locker.Unlock();
202 
203 		return entry.Wait(B_RELATIVE_TIMEOUT, timeout) == B_OK;
204 	}
205 
206 	void WakeUp()
207 	{
208 		if (fActivated)
209 			return;
210 
211 		MutexLocker locker(fLock);
212 		fActivated = true;
213 		fCondition.NotifyOne();
214 	}
215 
216 	void ClearActivated()
217 	{
218 		MutexLocker locker(fLock);
219 		fActivated = false;
220 	}
221 
222 private:
223 	mutex				fLock;
224 	ConditionVariable	fCondition;
225 	bool				fActivated;
226 };
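// A DaemonCondition is the wake-up mechanism for the page writer and page
// daemon threads: producers call WakeUp(), which is cheap when the condition
// is already activated, and the daemon itself blocks in Wait() with a
// relative timeout -- see e.g. the sPageWriterCondition.Wait(3000000, true)
// call in page_writer() below.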
227 
228 
229 static DaemonCondition sPageWriterCondition;
230 static DaemonCondition sPageDaemonCondition;
231 
232 
233 #if PAGE_ALLOCATION_TRACING
234 
235 namespace PageAllocationTracing {
236 
237 class ReservePages : public AbstractTraceEntry {
238 	public:
239 		ReservePages(uint32 count)
240 			:
241 			fCount(count)
242 		{
243 			Initialized();
244 		}
245 
246 		virtual void AddDump(TraceOutput& out)
247 		{
248 			out.Print("page reserve:   %lu", fCount);
249 		}
250 
251 	private:
252 		uint32		fCount;
253 };
254 
255 
256 class UnreservePages : public AbstractTraceEntry {
257 	public:
258 		UnreservePages(uint32 count)
259 			:
260 			fCount(count)
261 		{
262 			Initialized();
263 		}
264 
265 		virtual void AddDump(TraceOutput& out)
266 		{
267 			out.Print("page unreserve: %lu", fCount);
268 		}
269 
270 	private:
271 		uint32		fCount;
272 };
273 
274 
275 class AllocatePage : public AbstractTraceEntry {
276 	public:
277 		AllocatePage()
278 		{
279 			Initialized();
280 		}
281 
282 		virtual void AddDump(TraceOutput& out)
283 		{
284 			out.Print("page alloc");
285 		}
286 };
287 
288 
289 class AllocatePageRun : public AbstractTraceEntry {
290 	public:
291 		AllocatePageRun(uint32 length)
292 			:
293 			fLength(length)
294 		{
295 			Initialized();
296 		}
297 
298 		virtual void AddDump(TraceOutput& out)
299 		{
300 			out.Print("page alloc run: length: %ld", fLength);
301 		}
302 
303 	private:
304 		uint32		fLength;
305 };
306 
307 
308 class FreePage : public AbstractTraceEntry {
309 	public:
310 		FreePage()
311 		{
312 			Initialized();
313 		}
314 
315 		virtual void AddDump(TraceOutput& out)
316 		{
317 			out.Print("page free");
318 		}
319 };
320 
321 
322 class ScrubbingPages : public AbstractTraceEntry {
323 	public:
324 		ScrubbingPages(uint32 count)
325 			:
326 			fCount(count)
327 		{
328 			Initialized();
329 		}
330 
331 		virtual void AddDump(TraceOutput& out)
332 		{
333 			out.Print("page scrubbing: %lu", fCount);
334 		}
335 
336 	private:
337 		uint32		fCount;
338 };
339 
340 
341 class ScrubbedPages : public AbstractTraceEntry {
342 	public:
343 		ScrubbedPages(uint32 count)
344 			:
345 			fCount(count)
346 		{
347 			Initialized();
348 		}
349 
350 		virtual void AddDump(TraceOutput& out)
351 		{
352 			out.Print("page scrubbed:  %lu", fCount);
353 		}
354 
355 	private:
356 		uint32		fCount;
357 };
358 
359 
360 class StolenPage : public AbstractTraceEntry {
361 	public:
362 		StolenPage()
363 		{
364 			Initialized();
365 		}
366 
367 		virtual void AddDump(TraceOutput& out)
368 		{
369 			out.Print("page stolen");
370 		}
371 };
372 
373 }	// namespace PageAllocationTracing
374 
375 #	define TA(x)	new(std::nothrow) PageAllocationTracing::x
376 
377 #else
378 #	define TA(x)
379 #endif	// PAGE_ALLOCATION_TRACING
380 
381 
382 #if PAGE_DAEMON_TRACING
383 
384 namespace PageDaemonTracing {
385 
386 class ActivatePage : public AbstractTraceEntry {
387 	public:
388 		ActivatePage(vm_page* page)
389 			:
390 			fCache(page->cache),
391 			fPage(page)
392 		{
393 			Initialized();
394 		}
395 
396 		virtual void AddDump(TraceOutput& out)
397 		{
398 			out.Print("page activated:   %p, cache: %p", fPage, fCache);
399 		}
400 
401 	private:
402 		VMCache*	fCache;
403 		vm_page*	fPage;
404 };
405 
406 
407 class DeactivatePage : public AbstractTraceEntry {
408 	public:
409 		DeactivatePage(vm_page* page)
410 			:
411 			fCache(page->cache),
412 			fPage(page)
413 		{
414 			Initialized();
415 		}
416 
417 		virtual void AddDump(TraceOutput& out)
418 		{
419 			out.Print("page deactivated: %p, cache: %p", fPage, fCache);
420 		}
421 
422 	private:
423 		VMCache*	fCache;
424 		vm_page*	fPage;
425 };
426 
427 
428 class FreedPageSwap : public AbstractTraceEntry {
429 	public:
430 		FreedPageSwap(vm_page* page)
431 			:
432 			fCache(page->cache),
433 			fPage(page)
434 		{
435 			Initialized();
436 		}
437 
438 		virtual void AddDump(TraceOutput& out)
439 		{
440 			out.Print("page swap freed:  %p, cache: %p", fPage, fCache);
441 		}
442 
443 	private:
444 		VMCache*	fCache;
445 		vm_page*	fPage;
446 };
447 
448 }	// namespace PageDaemonTracing
449 
450 #	define TD(x)	new(std::nothrow) PageDaemonTracing::x
451 
452 #else
453 #	define TD(x)
454 #endif	// PAGE_DAEMON_TRACING
455 
456 
457 #if PAGE_WRITER_TRACING
458 
459 namespace PageWriterTracing {
460 
461 class WritePage : public AbstractTraceEntry {
462 	public:
463 		WritePage(vm_page* page)
464 			:
465 			fCache(page->Cache()),
466 			fPage(page)
467 		{
468 			Initialized();
469 		}
470 
471 		virtual void AddDump(TraceOutput& out)
472 		{
473 			out.Print("page write: %p, cache: %p", fPage, fCache);
474 		}
475 
476 	private:
477 		VMCache*	fCache;
478 		vm_page*	fPage;
479 };
480 
481 }	// namespace PageWriterTracing
482 
483 #	define TPW(x)	new(std::nothrow) PageWriterTracing::x
484 
485 #else
486 #	define TPW(x)
487 #endif	// PAGE_WRITER_TRACING
488 
489 
490 #if PAGE_STATE_TRACING
491 
492 namespace PageStateTracing {
493 
494 class SetPageState : public AbstractTraceEntry {
495 	public:
496 		SetPageState(vm_page* page, uint8 newState)
497 			:
498 			fPage(page),
499 			fOldState(page->State()),
500 			fNewState(newState),
501 			fBusy(page->busy),
502 			fWired(page->WiredCount() > 0),
503 			fMapped(!page->mappings.IsEmpty()),
504 			fAccessed(page->accessed),
505 			fModified(page->modified)
506 		{
507 #if PAGE_STATE_TRACING_STACK_TRACE
508 			fStackTrace = capture_tracing_stack_trace(
509 				PAGE_STATE_TRACING_STACK_TRACE, 0, true);
510 				// Don't capture userland stack trace to avoid potential
511 				// deadlocks.
512 #endif
513 			Initialized();
514 		}
515 
516 #if PAGE_STATE_TRACING_STACK_TRACE
517 		virtual void DumpStackTrace(TraceOutput& out)
518 		{
519 			out.PrintStackTrace(fStackTrace);
520 		}
521 #endif
522 
523 		virtual void AddDump(TraceOutput& out)
524 		{
525 			out.Print("page set state: %p (%c%c%c%c%c): %s -> %s", fPage,
526 				fBusy ? 'b' : '-',
527 				fWired ? 'w' : '-',
528 				fMapped ? 'm' : '-',
529 				fAccessed ? 'a' : '-',
530 				fModified ? 'm' : '-',
531 				page_state_to_string(fOldState),
532 				page_state_to_string(fNewState));
533 		}
534 
535 	private:
536 		vm_page*	fPage;
537 #if PAGE_STATE_TRACING_STACK_TRACE
538 		tracing_stack_trace* fStackTrace;
539 #endif
540 		uint8		fOldState;
541 		uint8		fNewState;
542 		bool		fBusy : 1;
543 		bool		fWired : 1;
544 		bool		fMapped : 1;
545 		bool		fAccessed : 1;
546 		bool		fModified : 1;
547 };
548 
549 }	// namespace PageStateTracing
550 
551 #	define TPS(x)	new(std::nothrow) PageStateTracing::x
552 
553 #else
554 #	define TPS(x)
555 #endif	// PAGE_STATE_TRACING
556 
557 
558 static int
559 find_page(int argc, char **argv)
560 {
561 	struct vm_page *page;
562 	addr_t address;
563 	int32 index = 1;
564 	int i;
565 
566 	struct {
567 		const char*	name;
568 		VMPageQueue*	queue;
569 	} pageQueueInfos[] = {
570 		{ "free",		&sFreePageQueue },
571 		{ "clear",		&sClearPageQueue },
572 		{ "modified",	&sModifiedPageQueue },
573 		{ "active",		&sActivePageQueue },
574 		{ "inactive",	&sInactivePageQueue },
575 		{ "cached",		&sCachedPageQueue },
576 		{ NULL, NULL }
577 	};
578 
579 	if (argc < 2
580 		|| strlen(argv[index]) <= 2
581 		|| argv[index][0] != '0'
582 		|| argv[index][1] != 'x') {
583 		kprintf("usage: find_page <address>\n");
584 		return 0;
585 	}
586 
587 	address = strtoul(argv[index], NULL, 0);
588 	page = (vm_page*)address;
589 
590 	for (i = 0; pageQueueInfos[i].name; i++) {
591 		VMPageQueue::Iterator it = pageQueueInfos[i].queue->GetIterator();
592 		while (vm_page* p = it.Next()) {
593 			if (p == page) {
594 				kprintf("found page %p in queue %p (%s)\n", page,
595 					pageQueueInfos[i].queue, pageQueueInfos[i].name);
596 				return 0;
597 			}
598 		}
599 	}
600 
601 	kprintf("page %p isn't in any queue\n", page);
602 
603 	return 0;
604 }
605 
606 
607 const char *
608 page_state_to_string(int state)
609 {
610 	switch(state) {
611 		case PAGE_STATE_ACTIVE:
612 			return "active";
613 		case PAGE_STATE_INACTIVE:
614 			return "inactive";
615 		case PAGE_STATE_MODIFIED:
616 			return "modified";
617 		case PAGE_STATE_CACHED:
618 			return "cached";
619 		case PAGE_STATE_FREE:
620 			return "free";
621 		case PAGE_STATE_CLEAR:
622 			return "clear";
623 		case PAGE_STATE_WIRED:
624 			return "wired";
625 		case PAGE_STATE_UNUSED:
626 			return "unused";
627 		default:
628 			return "unknown";
629 	}
630 }
631 
632 
633 static int
634 dump_page(int argc, char **argv)
635 {
636 	bool addressIsPointer = true;
637 	bool physical = false;
638 	bool searchMappings = false;
639 	int32 index = 1;
640 
641 	while (index < argc) {
642 		if (argv[index][0] != '-')
643 			break;
644 
645 		if (!strcmp(argv[index], "-p")) {
646 			addressIsPointer = false;
647 			physical = true;
648 		} else if (!strcmp(argv[index], "-v")) {
649 			addressIsPointer = false;
650 		} else if (!strcmp(argv[index], "-m")) {
651 			searchMappings = true;
652 		} else {
653 			print_debugger_command_usage(argv[0]);
654 			return 0;
655 		}
656 
657 		index++;
658 	}
659 
660 	if (index + 1 != argc) {
661 		print_debugger_command_usage(argv[0]);
662 		return 0;
663 	}
664 
665 	uint64 value;
666 	if (!evaluate_debug_expression(argv[index], &value, false))
667 		return 0;
668 
669 	uint64 pageAddress = value;
670 	struct vm_page* page;
671 
672 	if (addressIsPointer) {
673 		page = (struct vm_page *)(addr_t)pageAddress;
674 	} else {
675 		if (!physical) {
676 			VMAddressSpace *addressSpace = VMAddressSpace::Kernel();
677 
678 			if (debug_get_debugged_thread()->team->address_space != NULL)
679 				addressSpace = debug_get_debugged_thread()->team->address_space;
680 
681 			uint32 flags = 0;
682 			phys_addr_t physicalAddress;
683 			if (addressSpace->TranslationMap()->QueryInterrupt(pageAddress,
684 					&physicalAddress, &flags) != B_OK
685 				|| (flags & PAGE_PRESENT) == 0) {
686 				kprintf("Virtual address not mapped to a physical page in this "
687 					"address space.\n");
688 				return 0;
689 			}
690 			pageAddress = physicalAddress;
691 		}
692 
693 		page = vm_lookup_page(pageAddress / B_PAGE_SIZE);
694 	}
695 
696 	kprintf("PAGE: %p\n", page);
697 	kprintf("queue_next,prev: %p, %p\n", page->queue_link.next,
698 		page->queue_link.previous);
699 	kprintf("physical_number: %#" B_PRIxPHYSADDR "\n",
700 		page->physical_page_number);
701 	kprintf("cache:           %p\n", page->Cache());
702 	kprintf("cache_offset:    %" B_PRIuPHYSADDR "\n", page->cache_offset);
703 	kprintf("cache_next:      %p\n", page->cache_next);
704 	kprintf("state:           %s\n", page_state_to_string(page->State()));
705 	kprintf("wired_count:     %d\n", page->WiredCount());
706 	kprintf("usage_count:     %d\n", page->usage_count);
707 	kprintf("busy:            %d\n", page->busy);
708 	kprintf("busy_writing:    %d\n", page->busy_writing);
709 	kprintf("accessed:        %d\n", page->accessed);
710 	kprintf("modified:        %d\n", page->modified);
711 	#if DEBUG_PAGE_QUEUE
712 		kprintf("queue:           %p\n", page->queue);
713 	#endif
714 	#if DEBUG_PAGE_ACCESS
715 		kprintf("accessor:        %" B_PRId32 "\n", page->accessing_thread);
716 	#endif
717 	kprintf("area mappings:\n");
718 
719 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
720 	vm_page_mapping *mapping;
721 	while ((mapping = iterator.Next()) != NULL) {
722 		kprintf("  %p (%" B_PRId32 ")\n", mapping->area, mapping->area->id);
723 		mapping = mapping->page_link.next;
724 	}
725 
726 	if (searchMappings) {
727 		kprintf("all mappings:\n");
728 		VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
729 		while (addressSpace != NULL) {
730 			size_t pageCount = addressSpace->Size() / B_PAGE_SIZE;
731 			for (addr_t address = addressSpace->Base(); pageCount != 0;
732 					address += B_PAGE_SIZE, pageCount--) {
733 				phys_addr_t physicalAddress;
734 				uint32 flags = 0;
735 				if (addressSpace->TranslationMap()->QueryInterrupt(address,
736 						&physicalAddress, &flags) == B_OK
737 					&& (flags & PAGE_PRESENT) != 0
738 					&& physicalAddress / B_PAGE_SIZE
739 						== page->physical_page_number) {
740 					VMArea* area = addressSpace->LookupArea(address);
741 					kprintf("  aspace %ld, area %ld: %#" B_PRIxADDR
742 						" (%c%c%s%s)\n", addressSpace->ID(),
743 						area != NULL ? area->id : -1, address,
744 						(flags & B_KERNEL_READ_AREA) != 0 ? 'r' : '-',
745 						(flags & B_KERNEL_WRITE_AREA) != 0 ? 'w' : '-',
746 						(flags & PAGE_MODIFIED) != 0 ? " modified" : "",
747 						(flags & PAGE_ACCESSED) != 0 ? " accessed" : "");
748 				}
749 			}
750 			addressSpace = VMAddressSpace::DebugNext(addressSpace);
751 		}
752 	}
753 
754 	set_debug_variable("_cache", (addr_t)page->Cache());
755 	#if DEBUG_PAGE_ACCESS
756 		set_debug_variable("_accessor", page->accessing_thread);
757 	#endif
758 
759 	return 0;
760 }
761 
762 
763 static int
764 dump_page_queue(int argc, char **argv)
765 {
766 	struct VMPageQueue *queue;
767 
768 	if (argc < 2) {
769 		kprintf("usage: page_queue <address/name> [list]\n");
770 		return 0;
771 	}
772 
773 	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
774 		queue = (VMPageQueue*)strtoul(argv[1], NULL, 16);
775 	else if (!strcmp(argv[1], "free"))
776 		queue = &sFreePageQueue;
777 	else if (!strcmp(argv[1], "clear"))
778 		queue = &sClearPageQueue;
779 	else if (!strcmp(argv[1], "modified"))
780 		queue = &sModifiedPageQueue;
781 	else if (!strcmp(argv[1], "active"))
782 		queue = &sActivePageQueue;
783 	else if (!strcmp(argv[1], "inactive"))
784 		queue = &sInactivePageQueue;
785 	else if (!strcmp(argv[1], "cached"))
786 		queue = &sCachedPageQueue;
787 	else {
788 		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
789 		return 0;
790 	}
791 
792 	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %"
793 		B_PRIuPHYSADDR "\n", queue, queue->Head(), queue->Tail(),
794 		queue->Count());
795 
796 	if (argc == 3) {
797 		struct vm_page *page = queue->Head();
798 		const char *type = "none";
799 		int i;
800 
801 		if (page->Cache() != NULL) {
802 			switch (page->Cache()->type) {
803 				case CACHE_TYPE_RAM:
804 					type = "RAM";
805 					break;
806 				case CACHE_TYPE_DEVICE:
807 					type = "device";
808 					break;
809 				case CACHE_TYPE_VNODE:
810 					type = "vnode";
811 					break;
812 				case CACHE_TYPE_NULL:
813 					type = "null";
814 					break;
815 				default:
816 					type = "???";
817 					break;
818 			}
819 		}
820 
821 		kprintf("page        cache       type       state  wired  usage\n");
822 		for (i = 0; page; i++, page = queue->Next(page)) {
823 			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->Cache(),
824 				type, page_state_to_string(page->State()),
825 				page->WiredCount(), page->usage_count);
826 		}
827 	}
828 	return 0;
829 }
830 
831 
832 static int
833 dump_page_stats(int argc, char **argv)
834 {
835 	page_num_t swappableModified = 0;
836 	page_num_t swappableModifiedInactive = 0;
837 
838 	size_t counter[8];
839 	size_t busyCounter[8];
840 	memset(counter, 0, sizeof(counter));
841 	memset(busyCounter, 0, sizeof(busyCounter));
842 
843 	struct page_run {
844 		page_num_t	start;
845 		page_num_t	end;
846 
847 		page_num_t Length() const	{ return end - start; }
848 	};
849 
850 	page_run currentFreeRun = { 0, 0 };
851 	page_run currentCachedRun = { 0, 0 };
852 	page_run longestFreeRun = { 0, 0 };
853 	page_run longestCachedRun = { 0, 0 };
854 
855 	for (page_num_t i = 0; i < sNumPages; i++) {
856 		if (sPages[i].State() > 7) {
857 			panic("page %" B_PRIuPHYSADDR " at %p has invalid state!\n", i,
858 				&sPages[i]);
859 		}
860 
861 		uint32 pageState = sPages[i].State();
862 
863 		counter[pageState]++;
864 		if (sPages[i].busy)
865 			busyCounter[pageState]++;
866 
867 		if (pageState == PAGE_STATE_MODIFIED
868 			&& sPages[i].Cache() != NULL
869 			&& sPages[i].Cache()->temporary && sPages[i].WiredCount() == 0) {
870 			swappableModified++;
871 			if (sPages[i].usage_count == 0)
872 				swappableModifiedInactive++;
873 		}
874 
875 		// track free and cached pages runs
876 		if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
877 			currentFreeRun.end = i + 1;
878 			currentCachedRun.end = i + 1;
879 		} else {
880 			if (currentFreeRun.Length() > longestFreeRun.Length())
881 				longestFreeRun = currentFreeRun;
882 			currentFreeRun.start = currentFreeRun.end = i + 1;
883 
884 			if (pageState == PAGE_STATE_CACHED) {
885 				currentCachedRun.end = i + 1;
886 			} else {
887 				if (currentCachedRun.Length() > longestCachedRun.Length())
888 					longestCachedRun = currentCachedRun;
889 				currentCachedRun.start = currentCachedRun.end = i + 1;
890 			}
891 		}
892 	}
893 
894 	kprintf("page stats:\n");
895 	kprintf("total: %" B_PRIuPHYSADDR "\n", sNumPages);
896 
897 	kprintf("active: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
898 		counter[PAGE_STATE_ACTIVE], busyCounter[PAGE_STATE_ACTIVE]);
899 	kprintf("inactive: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
900 		counter[PAGE_STATE_INACTIVE], busyCounter[PAGE_STATE_INACTIVE]);
901 	kprintf("cached: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
902 		counter[PAGE_STATE_CACHED], busyCounter[PAGE_STATE_CACHED]);
903 	kprintf("unused: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
904 		counter[PAGE_STATE_UNUSED], busyCounter[PAGE_STATE_UNUSED]);
905 	kprintf("wired: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
906 		counter[PAGE_STATE_WIRED], busyCounter[PAGE_STATE_WIRED]);
907 	kprintf("modified: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
908 		counter[PAGE_STATE_MODIFIED], busyCounter[PAGE_STATE_MODIFIED]);
909 	kprintf("free: %" B_PRIuSIZE "\n", counter[PAGE_STATE_FREE]);
910 	kprintf("clear: %" B_PRIuSIZE "\n", counter[PAGE_STATE_CLEAR]);
911 
912 	kprintf("unreserved free pages: %" B_PRId32 "\n", sUnreservedFreePages);
913 	kprintf("unsatisfied page reservations: %" B_PRId32 "\n",
914 		sUnsatisfiedPageReservations);
915 	kprintf("mapped pages: %lu\n", gMappedPagesCount);
916 	kprintf("longest free pages run: %" B_PRIuPHYSADDR " pages (at %"
917 		B_PRIuPHYSADDR ")\n", longestFreeRun.Length(),
918 		sPages[longestFreeRun.start].physical_page_number);
919 	kprintf("longest free/cached pages run: %" B_PRIuPHYSADDR " pages (at %"
920 		B_PRIuPHYSADDR ")\n", longestCachedRun.Length(),
921 		sPages[longestCachedRun.start].physical_page_number);
922 
923 	kprintf("waiting threads:\n");
924 	for (PageReservationWaiterList::Iterator it
925 			= sPageReservationWaiters.GetIterator();
926 		PageReservationWaiter* waiter = it.Next();) {
927 		kprintf("  %6" B_PRId32 ": missing: %6" B_PRIu32
928 			", don't touch: %6" B_PRIu32 "\n", waiter->thread->id,
929 			waiter->missing, waiter->dontTouch);
930 	}
931 
932 	kprintf("\nfree queue: %p, count = %" B_PRIuPHYSADDR "\n", &sFreePageQueue,
933 		sFreePageQueue.Count());
934 	kprintf("clear queue: %p, count = %" B_PRIuPHYSADDR "\n", &sClearPageQueue,
935 		sClearPageQueue.Count());
936 	kprintf("modified queue: %p, count = %" B_PRIuPHYSADDR " (%" B_PRId32
937 		" temporary, %" B_PRIuPHYSADDR " swappable, " "inactive: %"
938 		B_PRIuPHYSADDR ")\n", &sModifiedPageQueue, sModifiedPageQueue.Count(),
939 		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
940 	kprintf("active queue: %p, count = %" B_PRIuPHYSADDR "\n",
941 		&sActivePageQueue, sActivePageQueue.Count());
942 	kprintf("inactive queue: %p, count = %" B_PRIuPHYSADDR "\n",
943 		&sInactivePageQueue, sInactivePageQueue.Count());
944 	kprintf("cached queue: %p, count = %" B_PRIuPHYSADDR "\n",
945 		&sCachedPageQueue, sCachedPageQueue.Count());
946 	return 0;
947 }
948 
949 
950 #ifdef TRACK_PAGE_USAGE_STATS
951 
952 static void
953 track_page_usage(vm_page* page)
954 {
955 	if (page->WiredCount() == 0) {
956 		sNextPageUsage[(int32)page->usage_count + 128]++;
957 		sNextPageUsagePageCount++;
958 	}
959 }
960 
961 
962 static void
963 update_page_usage_stats()
964 {
965 	std::swap(sPageUsage, sNextPageUsage);
966 	sPageUsagePageCount = sNextPageUsagePageCount;
967 
968 	memset(sNextPageUsage, 0, sizeof(page_num_t) * 256);
969 	sNextPageUsagePageCount = 0;
970 
971 	// compute average
972 	if (sPageUsagePageCount > 0) {
973 		int64 sum = 0;
974 		for (int32 i = 0; i < 256; i++)
975 			sum += (int64)sPageUsage[i] * (i - 128);
976 
977 		TRACE_DAEMON("average page usage: %f (%lu pages)\n",
978 			(float)sum / sPageUsagePageCount, sPageUsagePageCount);
979 	}
980 }
981 
982 
983 static int
984 dump_page_usage_stats(int argc, char** argv)
985 {
986 	kprintf("distribution of page usage counts (%lu pages):",
987 		sPageUsagePageCount);
988 
989 	int64 sum = 0;
990 	for (int32 i = 0; i < 256; i++) {
991 		if (i % 8 == 0)
992 			kprintf("\n%4ld:", i - 128);
993 
994 		int64 count = sPageUsage[i];
995 		sum += count * (i - 128);
996 
997 		kprintf("  %9llu", count);
998 	}
999 
1000 	kprintf("\n\n");
1001 
1002 	kprintf("average usage count: %f\n",
1003 		sPageUsagePageCount > 0 ? (float)sum / sPageUsagePageCount : 0);
1004 
1005 	return 0;
1006 }
1007 
1008 #endif	// TRACK_PAGE_USAGE_STATS
1009 
1010 
1011 // #pragma mark - vm_page
1012 
1013 
1014 inline void
1015 vm_page::InitState(uint8 newState)
1016 {
1017 	state = newState;
1018 }
1019 
1020 
1021 inline void
1022 vm_page::SetState(uint8 newState)
1023 {
1024 	TPS(SetPageState(this, newState));
1025 
1026 	state = newState;
1027 }
1028 
1029 
1030 // #pragma mark -
1031 
1032 
1033 static void
1034 get_page_stats(page_stats& _pageStats)
1035 {
1036 	_pageStats.totalFreePages = sUnreservedFreePages;
1037 	_pageStats.cachedPages = sCachedPageQueue.Count();
1038 	_pageStats.unsatisfiedReservations = sUnsatisfiedPageReservations;
1039 	// TODO: We don't get an actual snapshot here!
1040 }
1041 
1042 
1043 static bool
1044 do_active_paging(const page_stats& pageStats)
1045 {
1046 	return pageStats.totalFreePages + pageStats.cachedPages
1047 		< pageStats.unsatisfiedReservations
1048 			+ (int32)sFreeOrCachedPagesTarget;
1049 }
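// Illustrative numbers: with 1000 free and 500 cached pages, 200 pages of
// unsatisfied reservations and a (hypothetical) sFreeOrCachedPagesTarget of
// 2048, we get 1000 + 500 = 1500 < 200 + 2048 = 2248, so the system counts
// as actively paging.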
1050 
1051 
1052 /*!	Reserves as many pages as possible from \c sUnreservedFreePages up to
1053 	\a count. Doesn't touch the last \a dontTouch pages of
1054 	\c sUnreservedFreePages, though.
1055 	\return The number of actually reserved pages.
1056 */
1057 static uint32
1058 reserve_some_pages(uint32 count, uint32 dontTouch)
1059 {
1060 	while (true) {
1061 		int32 freePages = sUnreservedFreePages;
1062 		if (freePages <= (int32)dontTouch)
1063 			return 0;
1064 
1065 		int32 toReserve = std::min(count, freePages - dontTouch);
1066 		if (atomic_test_and_set(&sUnreservedFreePages,
1067 					freePages - toReserve, freePages)
1068 				== freePages) {
1069 			return toReserve;
1070 		}
1071 
1072 		// the count changed in the meantime -- retry
1073 	}
1074 }
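// Worked example: with sUnreservedFreePages == 50, count == 64 and
// dontTouch == 32, at most 50 - 32 = 18 pages may be taken, so the loop
// attempts to CAS the counter from 50 down to 32 and returns 18; if another
// thread modified the counter in the meantime, the CAS fails and we retry.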
1075 
1076 
1077 static void
1078 wake_up_page_reservation_waiters()
1079 {
1080 	MutexLocker pageDeficitLocker(sPageDeficitLock);
1081 
1082 	// TODO: If this is a low priority thread, we might want to disable
1083 	// interrupts or otherwise ensure that we aren't unscheduled. Otherwise
1084 	// high priority threads will be kept waiting while a medium priority thread
1085 	// prevents us from running.
1086 
1087 	while (PageReservationWaiter* waiter = sPageReservationWaiters.Head()) {
1088 		int32 reserved = reserve_some_pages(waiter->missing,
1089 			waiter->dontTouch);
1090 		if (reserved == 0)
1091 			return;
1092 
1093 		atomic_add(&sUnsatisfiedPageReservations, -reserved);
1094 		waiter->missing -= reserved;
1095 
1096 		if (waiter->missing > 0)
1097 			return;
1098 
1099 		sPageReservationWaiters.Remove(waiter);
1100 
1101 		InterruptsSpinLocker schedulerLocker(gSchedulerLock);
1102 		thread_unblock_locked(waiter->thread, B_OK);
1103 	}
1104 }
1105 
1106 
1107 static inline void
1108 unreserve_pages(uint32 count)
1109 {
1110 	atomic_add(&sUnreservedFreePages, count);
1111 	if (sUnsatisfiedPageReservations != 0)
1112 		wake_up_page_reservation_waiters();
1113 }
1114 
1115 
1116 static void
1117 free_page(vm_page* page, bool clear)
1118 {
1119 	DEBUG_PAGE_ACCESS_CHECK(page);
1120 
1121 	PAGE_ASSERT(page, !page->IsMapped());
1122 
1123 	VMPageQueue* fromQueue;
1124 
1125 	switch (page->State()) {
1126 		case PAGE_STATE_ACTIVE:
1127 			fromQueue = &sActivePageQueue;
1128 			break;
1129 		case PAGE_STATE_INACTIVE:
1130 			fromQueue = &sInactivePageQueue;
1131 			break;
1132 		case PAGE_STATE_MODIFIED:
1133 			fromQueue = &sModifiedPageQueue;
1134 			break;
1135 		case PAGE_STATE_CACHED:
1136 			fromQueue = &sCachedPageQueue;
1137 			break;
1138 		case PAGE_STATE_FREE:
1139 		case PAGE_STATE_CLEAR:
1140 			panic("free_page(): page %p already free", page);
1141 			return;
1142 		case PAGE_STATE_WIRED:
1143 		case PAGE_STATE_UNUSED:
1144 			fromQueue = NULL;
1145 			break;
1146 		default:
1147 			panic("free_page(): page %p in invalid state %d",
1148 				page, page->State());
1149 			return;
1150 	}
1151 
1152 	if (page->CacheRef() != NULL)
1153 		panic("to be freed page %p has cache", page);
1154 	if (page->IsMapped())
1155 		panic("to be freed page %p has mappings", page);
1156 
1157 	if (fromQueue != NULL)
1158 		fromQueue->RemoveUnlocked(page);
1159 
1160 	TA(FreePage());
1161 
1162 	ReadLocker locker(sFreePageQueuesLock);
1163 
1164 	DEBUG_PAGE_ACCESS_END(page);
1165 
1166 	if (clear) {
1167 		page->SetState(PAGE_STATE_CLEAR);
1168 		sClearPageQueue.PrependUnlocked(page);
1169 	} else {
1170 		page->SetState(PAGE_STATE_FREE);
1171 		sFreePageQueue.PrependUnlocked(page);
1172 	}
1173 
1174 	locker.Unlock();
1175 
1176 	unreserve_pages(1);
1177 }
1178 
1179 
1180 /*!	The caller must make sure that no-one else tries to change the page's state
1181 	while the function is called. If the page has a cache, this can be done by
1182 	locking the cache.
1183 */
1184 static void
1185 set_page_state(vm_page *page, int pageState)
1186 {
1187 	DEBUG_PAGE_ACCESS_CHECK(page);
1188 
1189 	if (pageState == page->State())
1190 		return;
1191 
1192 	VMPageQueue* fromQueue;
1193 
1194 	switch (page->State()) {
1195 		case PAGE_STATE_ACTIVE:
1196 			fromQueue = &sActivePageQueue;
1197 			break;
1198 		case PAGE_STATE_INACTIVE:
1199 			fromQueue = &sInactivePageQueue;
1200 			break;
1201 		case PAGE_STATE_MODIFIED:
1202 			fromQueue = &sModifiedPageQueue;
1203 			break;
1204 		case PAGE_STATE_CACHED:
1205 			fromQueue = &sCachedPageQueue;
1206 			break;
1207 		case PAGE_STATE_FREE:
1208 		case PAGE_STATE_CLEAR:
1209 			panic("set_page_state(): page %p is free/clear", page);
1210 			return;
1211 		case PAGE_STATE_WIRED:
1212 		case PAGE_STATE_UNUSED:
1213 			fromQueue = NULL;
1214 			break;
1215 		default:
1216 			panic("set_page_state(): page %p in invalid state %d",
1217 				page, page->State());
1218 			return;
1219 	}
1220 
1221 	VMPageQueue* toQueue;
1222 
1223 	switch (pageState) {
1224 		case PAGE_STATE_ACTIVE:
1225 			toQueue = &sActivePageQueue;
1226 			break;
1227 		case PAGE_STATE_INACTIVE:
1228 			toQueue = &sInactivePageQueue;
1229 			break;
1230 		case PAGE_STATE_MODIFIED:
1231 			toQueue = &sModifiedPageQueue;
1232 			break;
1233 		case PAGE_STATE_CACHED:
1234 			PAGE_ASSERT(page, !page->IsMapped());
1235 			PAGE_ASSERT(page, !page->modified);
1236 			toQueue = &sCachedPageQueue;
1237 			break;
1238 		case PAGE_STATE_FREE:
1239 		case PAGE_STATE_CLEAR:
1240 			panic("set_page_state(): target state is free/clear");
1241 			return;
1242 		case PAGE_STATE_WIRED:
1243 		case PAGE_STATE_UNUSED:
1244 			toQueue = NULL;
1245 			break;
1246 		default:
1247 			panic("set_page_state(): invalid target state %d", pageState);
1248 			return;
1249 	}
1250 
1251 	VMCache* cache = page->Cache();
1252 	if (cache != NULL && cache->temporary) {
1253 		if (pageState == PAGE_STATE_MODIFIED)
1254 			atomic_add(&sModifiedTemporaryPages, 1);
1255 		else if (page->State() == PAGE_STATE_MODIFIED)
1256 			atomic_add(&sModifiedTemporaryPages, -1);
1257 	}
1258 
1259 	// move the page
1260 	if (toQueue == fromQueue) {
1261 		// Note: Theoretically we are required to lock when changing the page
1262 		// state, even if we don't change the queue. We actually don't have to
1263 		// do this, though, since only for the active queue there are different
1264 		// page states and active pages have a cache that must be locked at
1265 		// this point. So we rely on the fact that everyone must lock the cache
1266 		// before trying to change/interpret the page state.
1267 		PAGE_ASSERT(page, cache != NULL);
1268 		cache->AssertLocked();
1269 		page->SetState(pageState);
1270 	} else {
1271 		if (fromQueue != NULL)
1272 			fromQueue->RemoveUnlocked(page);
1273 
1274 		page->SetState(pageState);
1275 
1276 		if (toQueue != NULL)
1277 			toQueue->AppendUnlocked(page);
1278 	}
1279 }
1280 
1281 
1282 /*! Moves a previously modified page into a now appropriate queue.
1283 	The page queues must not be locked.
1284 */
1285 static void
1286 move_page_to_appropriate_queue(vm_page *page)
1287 {
1288 	DEBUG_PAGE_ACCESS_CHECK(page);
1289 
1290 	// Note, this logic must be in sync with what the page daemon does.
1291 	int32 state;
1292 	if (page->IsMapped())
1293 		state = PAGE_STATE_ACTIVE;
1294 	else if (page->modified)
1295 		state = PAGE_STATE_MODIFIED;
1296 	else
1297 		state = PAGE_STATE_CACHED;
1298 
1299 // TODO: If free + cached pages are low, we might directly want to free the
1300 // page.
1301 	set_page_state(page, state);
1302 }
1303 
1304 
1305 static void
1306 clear_page(struct vm_page *page)
1307 {
1308 	vm_memset_physical(page->physical_page_number << PAGE_SHIFT, 0,
1309 		B_PAGE_SIZE);
1310 }
1311 
1312 
1313 static status_t
1314 mark_page_range_in_use(page_num_t startPage, page_num_t length, bool wired)
1315 {
1316 	TRACE(("mark_page_range_in_use: start %#" B_PRIxPHYSADDR ", len %#"
1317 		B_PRIxPHYSADDR "\n", startPage, length));
1318 
1319 	if (sPhysicalPageOffset > startPage) {
1320 		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1321 			"): start page is before free list\n", startPage, length);
1322 		if (sPhysicalPageOffset - startPage >= length)
1323 			return B_OK;
1324 		length -= sPhysicalPageOffset - startPage;
1325 		startPage = sPhysicalPageOffset;
1326 	}
1327 
1328 	startPage -= sPhysicalPageOffset;
1329 
1330 	if (startPage + length > sNumPages) {
1331 		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1332 			"): range would extend past free list\n", startPage, length);
1333 		if (startPage >= sNumPages)
1334 			return B_OK;
1335 		length = sNumPages - startPage;
1336 	}
1337 
1338 	WriteLocker locker(sFreePageQueuesLock);
1339 
1340 	for (page_num_t i = 0; i < length; i++) {
1341 		vm_page *page = &sPages[startPage + i];
1342 		switch (page->State()) {
1343 			case PAGE_STATE_FREE:
1344 			case PAGE_STATE_CLEAR:
1345 			{
1346 // TODO: This violates the page reservation policy, since we remove pages from
1347 // the free/clear queues without having reserved them before. This should happen
1348 // in the early boot process only, though.
1349 				DEBUG_PAGE_ACCESS_START(page);
1350 				VMPageQueue& queue = page->State() == PAGE_STATE_FREE
1351 					? sFreePageQueue : sClearPageQueue;
1352 				queue.Remove(page);
1353 				page->SetState(wired ? PAGE_STATE_WIRED : PAGE_STATE_UNUSED);
1354 				page->busy = false;
1355 				atomic_add(&sUnreservedFreePages, -1);
1356 				DEBUG_PAGE_ACCESS_END(page);
1357 				break;
1358 			}
1359 			case PAGE_STATE_WIRED:
1360 			case PAGE_STATE_UNUSED:
1361 				break;
1362 			case PAGE_STATE_ACTIVE:
1363 			case PAGE_STATE_INACTIVE:
1364 			case PAGE_STATE_MODIFIED:
1365 			case PAGE_STATE_CACHED:
1366 			default:
1367 				// uh
1368 				dprintf("mark_page_range_in_use: page %#" B_PRIxPHYSADDR
1369 					" in non-free state %d!\n", startPage + i, page->State());
1370 				break;
1371 		}
1372 	}
1373 
1374 	return B_OK;
1375 }
1376 
1377 
1378 /*!
1379 	This is a background thread that wakes up periodically (every 100ms)
1380 	and moves some pages from the free queue over to the clear queue.
1381 	Given enough time, it will clear out all pages from the free queue -- we
1382 	could probably slow it down after having reached a certain threshold.
1383 */
1384 static int32
1385 page_scrubber(void *unused)
1386 {
1387 	(void)(unused);
1388 
1389 	TRACE(("page_scrubber starting...\n"));
1390 
1391 	for (;;) {
1392 		snooze(100000); // 100ms
1393 
1394 		if (sFreePageQueue.Count() == 0
1395 				|| sUnreservedFreePages < (int32)sFreePagesTarget) {
1396 			continue;
1397 		}
1398 
1399 		// Since we temporarily remove pages from the free pages reserve,
1400 		// we must make sure we don't cause a violation of the page
1401 		// reservation guarantee. The following is usually stricter than
1402 		// necessary, because we don't have information on how many of the
1403 		// reserved pages have already been allocated.
1404 		int32 reserved = reserve_some_pages(SCRUB_SIZE,
1405 			kPageReserveForPriority[VM_PRIORITY_USER]);
1406 		if (reserved == 0)
1407 			continue;
1408 
1409 		// get some pages from the free queue
1410 		ReadLocker locker(sFreePageQueuesLock);
1411 
1412 		vm_page *page[SCRUB_SIZE];
1413 		int32 scrubCount = 0;
1414 		for (int32 i = 0; i < reserved; i++) {
1415 			page[i] = sFreePageQueue.RemoveHeadUnlocked();
1416 			if (page[i] == NULL)
1417 				break;
1418 
1419 			DEBUG_PAGE_ACCESS_START(page[i]);
1420 
1421 			page[i]->SetState(PAGE_STATE_ACTIVE);
1422 			page[i]->busy = true;
1423 			scrubCount++;
1424 		}
1425 
1426 		locker.Unlock();
1427 
1428 		if (scrubCount == 0) {
1429 			unreserve_pages(reserved);
1430 			continue;
1431 		}
1432 
1433 		TA(ScrubbingPages(scrubCount));
1434 
1435 		// clear them
1436 		for (int32 i = 0; i < scrubCount; i++)
1437 			clear_page(page[i]);
1438 
1439 		locker.Lock();
1440 
1441 		// and put them into the clear queue
1442 		for (int32 i = 0; i < scrubCount; i++) {
1443 			page[i]->SetState(PAGE_STATE_CLEAR);
1444 			page[i]->busy = false;
1445 			DEBUG_PAGE_ACCESS_END(page[i]);
1446 			sClearPageQueue.PrependUnlocked(page[i]);
1447 		}
1448 
1449 		locker.Unlock();
1450 
1451 		unreserve_pages(reserved);
1452 
1453 		TA(ScrubbedPages(scrubCount));
1454 	}
1455 
1456 	return 0;
1457 }
1458 
1459 
1460 static void
1461 init_page_marker(vm_page &marker)
1462 {
1463 	marker.SetCacheRef(NULL);
1464 	marker.InitState(PAGE_STATE_UNUSED);
1465 	marker.busy = true;
1466 #if DEBUG_PAGE_QUEUE
1467 	marker.queue = NULL;
1468 #endif
1469 #if DEBUG_PAGE_ACCESS
1470 	marker.accessing_thread = thread_get_current_thread_id();
1471 #endif
1472 }
1473 
1474 
1475 static void
1476 remove_page_marker(struct vm_page &marker)
1477 {
1478 	DEBUG_PAGE_ACCESS_CHECK(&marker);
1479 
1480 	if (marker.State() < PAGE_STATE_FIRST_UNQUEUED)
1481 		sPageQueues[marker.State()].RemoveUnlocked(&marker);
1482 
1483 	marker.SetState(PAGE_STATE_UNUSED);
1484 }
1485 
1486 
1487 static vm_page*
1488 next_modified_page(page_num_t& maxPagesToSee)
1489 {
1490 	InterruptsSpinLocker locker(sModifiedPageQueue.GetLock());
1491 
1492 	while (maxPagesToSee > 0) {
1493 		vm_page* page = sModifiedPageQueue.Head();
1494 		if (page == NULL)
1495 			return NULL;
1496 
1497 		sModifiedPageQueue.Requeue(page, true);
1498 
1499 		maxPagesToSee--;
1500 
1501 		if (!page->busy)
1502 			return page;
1503 	}
1504 
1505 	return NULL;
1506 }
1507 
1508 
1509 // #pragma mark -
1510 
1511 
1512 class PageWriteTransfer;
1513 class PageWriteWrapper;
1514 
1515 
1516 class PageWriterRun {
1517 public:
1518 	status_t Init(uint32 maxPages);
1519 
1520 	void PrepareNextRun();
1521 	void AddPage(vm_page* page);
1522 	uint32 Go();
1523 
1524 	void PageWritten(PageWriteTransfer* transfer, status_t status,
1525 		bool partialTransfer, size_t bytesTransferred);
1526 
1527 private:
1528 	uint32				fMaxPages;
1529 	uint32				fWrapperCount;
1530 	uint32				fTransferCount;
1531 	vint32				fPendingTransfers;
1532 	PageWriteWrapper*	fWrappers;
1533 	PageWriteTransfer*	fTransfers;
1534 	ConditionVariable	fAllFinishedCondition;
1535 };
1536 
1537 
1538 class PageWriteTransfer : public AsyncIOCallback {
1539 public:
1540 	void SetTo(PageWriterRun* run, vm_page* page, int32 maxPages);
1541 	bool AddPage(vm_page* page);
1542 
1543 	status_t Schedule(uint32 flags);
1544 
1545 	void SetStatus(status_t status, size_t transferred);
1546 
1547 	status_t Status() const	{ return fStatus; }
1548 	struct VMCache* Cache() const { return fCache; }
1549 	uint32 PageCount() const { return fPageCount; }
1550 
1551 	virtual void IOFinished(status_t status, bool partialTransfer,
1552 		generic_size_t bytesTransferred);
1553 private:
1554 	PageWriterRun*		fRun;
1555 	struct VMCache*		fCache;
1556 	off_t				fOffset;
1557 	uint32				fPageCount;
1558 	int32				fMaxPages;
1559 	status_t			fStatus;
1560 	uint32				fVecCount;
1561 	generic_io_vec		fVecs[32]; // TODO: make dynamic/configurable
1562 };
1563 
1564 
1565 class PageWriteWrapper {
1566 public:
1567 	PageWriteWrapper();
1568 	~PageWriteWrapper();
1569 	void SetTo(vm_page* page);
1570 	bool Done(status_t result);
1571 
1572 private:
1573 	vm_page*			fPage;
1574 	struct VMCache*		fCache;
1575 	bool				fIsActive;
1576 };
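// How the three classes above cooperate in page_writer() below: a
// PageWriterRun owns one PageWriteWrapper per collected page and groups
// adjacent pages of the same cache into PageWriteTransfers. The sequence is
// PrepareNextRun(), then AddPage() for every page to write, then Go(), which
// schedules all transfers asynchronously and blocks until PageWritten() has
// been called for the last pending one.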
1577 
1578 
1579 PageWriteWrapper::PageWriteWrapper()
1580 	:
1581 	fIsActive(false)
1582 {
1583 }
1584 
1585 
1586 PageWriteWrapper::~PageWriteWrapper()
1587 {
1588 	if (fIsActive)
1589 		panic("page write wrapper going out of scope but isn't completed");
1590 }
1591 
1592 
1593 /*!	The page's cache must be locked.
1594 */
1595 void
1596 PageWriteWrapper::SetTo(vm_page* page)
1597 {
1598 	DEBUG_PAGE_ACCESS_CHECK(page);
1599 
1600 	if (page->busy)
1601 		panic("setting page write wrapper to busy page");
1602 
1603 	if (fIsActive)
1604 		panic("re-setting page write wrapper that isn't completed");
1605 
1606 	fPage = page;
1607 	fCache = page->Cache();
1608 	fIsActive = true;
1609 
1610 	fPage->busy = true;
1611 	fPage->busy_writing = true;
1612 
1613 	// We have a modified page -- however, while we're writing it back,
1614 	// the page might still be mapped. In order not to lose any changes to the
1615 	// page, we mark it clean before actually writing it back; if
1616 	// writing the page fails for some reason, we'll just keep it in the
1617 	// modified page list, but that should happen only rarely.
1618 
1619 	// If the page is changed after we cleared the dirty flag, but before we
1620 	// had the chance to write it back, then we'll write it again later -- that
1621 	// will probably not happen that often, though.
1622 
1623 	vm_clear_map_flags(fPage, PAGE_MODIFIED);
1624 }
1625 
1626 
1627 /*!	The page's cache must be locked.
1628 	The page queues must not be locked.
1629 	\return \c true if the page was written successfully or could otherwise be
1630 		handled, \c false otherwise.
1631 */
1632 bool
1633 PageWriteWrapper::Done(status_t result)
1634 {
1635 	if (!fIsActive)
1636 		panic("completing page write wrapper that is not active");
1637 
1638 	DEBUG_PAGE_ACCESS_START(fPage);
1639 
1640 	fPage->busy = false;
1641 		// Set unbusy and notify later by hand, since we might free the page.
1642 
1643 	bool success = true;
1644 
1645 	if (result == B_OK) {
1646 		// put it into the active/inactive queue
1647 		move_page_to_appropriate_queue(fPage);
1648 		fPage->busy_writing = false;
1649 		DEBUG_PAGE_ACCESS_END(fPage);
1650 	} else {
1651 		// Writing the page failed. One reason would be that the cache has been
1652 		// shrunk and the page does no longer belong to the file. Otherwise the
1653 		// actual I/O failed, in which case we'll simply keep the page modified.
1654 
1655 		if (!fPage->busy_writing) {
1656 			// The busy_writing flag was cleared. That means the cache has been
1657 			// shrunk while we were trying to write the page and we have to free
1658 			// it now.
1659 			vm_remove_all_page_mappings(fPage);
1660 // TODO: Unmapping should already happen when resizing the cache!
1661 			fCache->RemovePage(fPage);
1662 			free_page(fPage, false);
1663 		} else {
1664 			// Writing the page failed -- mark the page modified and move it to
1665 			// an appropriate queue other than the modified queue, so we don't
1666 			// keep trying to write it over and over again. We keep
1667 			// non-temporary pages in the modified queue, though, so they don't
1668 			// get lost in the inactive queue.
1669 			dprintf("PageWriteWrapper: Failed to write page %p: %s\n", fPage,
1670 				strerror(result));
1671 
1672 			fPage->modified = true;
1673 			if (!fCache->temporary)
1674 				set_page_state(fPage, PAGE_STATE_MODIFIED);
1675 			else if (fPage->IsMapped())
1676 				set_page_state(fPage, PAGE_STATE_ACTIVE);
1677 			else
1678 				set_page_state(fPage, PAGE_STATE_INACTIVE);
1679 
1680 			fPage->busy_writing = false;
1681 			DEBUG_PAGE_ACCESS_END(fPage);
1682 
1683 			success = false;
1684 		}
1685 	}
1686 
1687 	fCache->NotifyPageEvents(fPage, PAGE_EVENT_NOT_BUSY);
1688 	fIsActive = false;
1689 
1690 	return success;
1691 }
1692 
1693 
1694 /*!	The page's cache must be locked.
1695 */
1696 void
1697 PageWriteTransfer::SetTo(PageWriterRun* run, vm_page* page, int32 maxPages)
1698 {
1699 	fRun = run;
1700 	fCache = page->Cache();
1701 	fOffset = page->cache_offset;
1702 	fPageCount = 1;
1703 	fMaxPages = maxPages;
1704 	fStatus = B_OK;
1705 
1706 	fVecs[0].base = (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
1707 	fVecs[0].length = B_PAGE_SIZE;
1708 	fVecCount = 1;
1709 }
1710 
1711 
1712 /*!	The page's cache must be locked.
1713 */
1714 bool
1715 PageWriteTransfer::AddPage(vm_page* page)
1716 {
1717 	if (page->Cache() != fCache
1718 		|| (fMaxPages >= 0 && fPageCount >= (uint32)fMaxPages))
1719 		return false;
1720 
1721 	phys_addr_t nextBase = fVecs[fVecCount - 1].base
1722 		+ fVecs[fVecCount - 1].length;
1723 
1724 	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
1725 		&& (off_t)page->cache_offset == fOffset + fPageCount) {
1726 		// append to last iovec
1727 		fVecs[fVecCount - 1].length += B_PAGE_SIZE;
1728 		fPageCount++;
1729 		return true;
1730 	}
1731 
1732 	nextBase = fVecs[0].base - B_PAGE_SIZE;
1733 	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
1734 		&& (off_t)page->cache_offset == fOffset - 1) {
1735 		// prepend to first iovec and adjust offset
1736 		fVecs[0].base = nextBase;
1737 		fVecs[0].length += B_PAGE_SIZE;
1738 		fOffset = page->cache_offset;
1739 		fPageCount++;
1740 		return true;
1741 	}
1742 
1743 	if (((off_t)page->cache_offset == fOffset + fPageCount
1744 			|| (off_t)page->cache_offset == fOffset - 1)
1745 		&& fVecCount < sizeof(fVecs) / sizeof(fVecs[0])) {
1746 		// not physically contiguous or not in the right order
1747 		uint32 vectorIndex;
1748 		if ((off_t)page->cache_offset < fOffset) {
1749 			// we are pre-pending another vector, move the other vecs
1750 			for (uint32 i = fVecCount; i > 0; i--)
1751 				fVecs[i] = fVecs[i - 1];
1752 
1753 			fOffset = page->cache_offset;
1754 			vectorIndex = 0;
1755 		} else
1756 			vectorIndex = fVecCount;
1757 
1758 		fVecs[vectorIndex].base
1759 			= (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
1760 		fVecs[vectorIndex].length = B_PAGE_SIZE;
1761 
1762 		fVecCount++;
1763 		fPageCount++;
1764 		return true;
1765 	}
1766 
1767 	return false;
1768 }
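// Coalescing behavior illustrated: a physically contiguous page at cache
// offset fOffset + fPageCount (or fOffset - 1) extends the last (or first)
// vec; a logically adjacent but physically non-contiguous page gets a vec of
// its own, up to the 32 vecs available; any other page is rejected, and
// PageWriterRun::AddPage() then starts a new transfer for it.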
1769 
1770 
1771 status_t
1772 PageWriteTransfer::Schedule(uint32 flags)
1773 {
1774 	off_t writeOffset = (off_t)fOffset << PAGE_SHIFT;
1775 	generic_size_t writeLength = (phys_size_t)fPageCount << PAGE_SHIFT;
1776 
1777 	if (fRun != NULL) {
1778 		return fCache->WriteAsync(writeOffset, fVecs, fVecCount, writeLength,
1779 			flags | B_PHYSICAL_IO_REQUEST, this);
1780 	}
1781 
1782 	status_t status = fCache->Write(writeOffset, fVecs, fVecCount,
1783 		flags | B_PHYSICAL_IO_REQUEST, &writeLength);
1784 
1785 	SetStatus(status, writeLength);
1786 	return fStatus;
1787 }
1788 
1789 
1790 void
1791 PageWriteTransfer::SetStatus(status_t status, size_t transferred)
1792 {
1793 	// only succeed if all pages up to the last one have been written fully
1794 	// and the last page has at least been written partially
1795 	if (status == B_OK && transferred <= (fPageCount - 1) * B_PAGE_SIZE)
1796 		status = B_ERROR;
1797 
1798 	fStatus = status;
1799 }
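// Example: for a transfer of fPageCount == 4 pages, the write only counts as
// successful if more than 3 * B_PAGE_SIZE bytes were transferred, i.e. the
// first three pages completely and the fourth at least partially.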
1800 
1801 
1802 void
1803 PageWriteTransfer::IOFinished(status_t status, bool partialTransfer,
1804 	generic_size_t bytesTransferred)
1805 {
1806 	SetStatus(status, bytesTransferred);
1807 	fRun->PageWritten(this, fStatus, partialTransfer, bytesTransferred);
1808 }
1809 
1810 
1811 status_t
1812 PageWriterRun::Init(uint32 maxPages)
1813 {
1814 	fMaxPages = maxPages;
1815 	fWrapperCount = 0;
1816 	fTransferCount = 0;
1817 	fPendingTransfers = 0;
1818 
1819 	fWrappers = new(std::nothrow) PageWriteWrapper[maxPages];
1820 	fTransfers = new(std::nothrow) PageWriteTransfer[maxPages];
1821 	if (fWrappers == NULL || fTransfers == NULL)
1822 		return B_NO_MEMORY;
1823 
1824 	return B_OK;
1825 }
1826 
1827 
1828 void
1829 PageWriterRun::PrepareNextRun()
1830 {
1831 	fWrapperCount = 0;
1832 	fTransferCount = 0;
1833 	fPendingTransfers = 0;
1834 }
1835 
1836 
1837 /*!	The page's cache must be locked.
1838 */
1839 void
1840 PageWriterRun::AddPage(vm_page* page)
1841 {
1842 	fWrappers[fWrapperCount++].SetTo(page);
1843 
1844 	if (fTransferCount == 0 || !fTransfers[fTransferCount - 1].AddPage(page)) {
1845 		fTransfers[fTransferCount++].SetTo(this, page,
1846 			page->Cache()->MaxPagesPerAsyncWrite());
1847 	}
1848 }
1849 
1850 
1851 /*!	Writes all pages previously added.
1852 	\return The number of pages that could not be written or otherwise handled.
1853 */
1854 uint32
1855 PageWriterRun::Go()
1856 {
1857 	fPendingTransfers = fTransferCount;
1858 
1859 	fAllFinishedCondition.Init(this, "page writer wait for I/O");
1860 	ConditionVariableEntry waitEntry;
1861 	fAllFinishedCondition.Add(&waitEntry);
1862 
1863 	// schedule writes
1864 	for (uint32 i = 0; i < fTransferCount; i++)
1865 		fTransfers[i].Schedule(B_VIP_IO_REQUEST);
1866 
1867 	// wait until all pages have been written
1868 	waitEntry.Wait();
1869 
1870 	// mark pages depending on whether they could be written or not
1871 
1872 	uint32 failedPages = 0;
1873 	uint32 wrapperIndex = 0;
1874 	for (uint32 i = 0; i < fTransferCount; i++) {
1875 		PageWriteTransfer& transfer = fTransfers[i];
1876 		transfer.Cache()->Lock();
1877 
1878 		for (uint32 j = 0; j < transfer.PageCount(); j++) {
1879 			if (!fWrappers[wrapperIndex++].Done(transfer.Status()))
1880 				failedPages++;
1881 		}
1882 
1883 		transfer.Cache()->Unlock();
1884 	}
1885 
1886 	ASSERT(wrapperIndex == fWrapperCount);
1887 
1888 	for (uint32 i = 0; i < fTransferCount; i++) {
1889 		PageWriteTransfer& transfer = fTransfers[i];
1890 		struct VMCache* cache = transfer.Cache();
1891 
1892 		// We've acquired a cache reference and a store reference for each page
1893 		for (uint32 j = 0; j < transfer.PageCount(); j++) {
1894 			// We release the cache references after all pages were made
1895 			// unbusy again - otherwise releasing a vnode could deadlock.
1896 			cache->ReleaseStoreRef();
1897 			cache->ReleaseRef();
1898 		}
1899 	}
1900 
1901 	return failedPages;
1902 }
1903 
1904 
1905 void
1906 PageWriterRun::PageWritten(PageWriteTransfer* transfer, status_t status,
1907 	bool partialTransfer, size_t bytesTransferred)
1908 {
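	// atomic_add() returns the previous value, so a result of 1 means this
	// was the last pending transfer; wake everyone waiting in Go() on
	// fAllFinishedCondition.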
1909 	if (atomic_add(&fPendingTransfers, -1) == 1)
1910 		fAllFinishedCondition.NotifyAll();
1911 }
1912 
1913 
1914 /*!	The page writer continuously takes some pages from the modified
1915 	queue, writes them back, and moves them back to the active queue.
1916 	It runs in its own thread, and is only there to keep the number
1917 	of modified pages low, so that more pages can be reused with
1918 	of modified pages low, so that more pages can be reused at a
1919 	lower cost.
1920 status_t
1921 page_writer(void* /*unused*/)
1922 {
1923 	const uint32 kNumPages = 256;
1924 	uint32 writtenPages = 0;
1925 	bigtime_t lastWrittenTime = 0;
1926 	bigtime_t pageCollectionTime = 0;
1927 	bigtime_t pageWritingTime = 0;
1928 
1929 	PageWriterRun run;
1930 	if (run.Init(kNumPages) != B_OK) {
1931 		panic("page writer: Failed to init PageWriterRun!");
1932 		return B_ERROR;
1933 	}
1934 
1935 	page_num_t pagesSinceLastSuccessfulWrite = 0;
1936 
1937 	while (true) {
1938 // TODO: Maybe wait shorter when memory is low!
1939 		if (sModifiedPageQueue.Count() < kNumPages) {
1940 			sPageWriterCondition.Wait(3000000, true);
1941 				// every 3 seconds when no one triggers us
1942 		}
1943 
1944 		page_num_t modifiedPages = sModifiedPageQueue.Count();
1945 		if (modifiedPages == 0)
1946 			continue;
1947 
1948 		if (modifiedPages <= pagesSinceLastSuccessfulWrite) {
1949 			// We ran through the whole queue without being able to write a
1950 			// single page. Take a break.
1951 			snooze(500000);
1952 			pagesSinceLastSuccessfulWrite = 0;
1953 		}
1954 
1955 #if ENABLE_SWAP_SUPPORT
1956 		page_stats pageStats;
1957 		get_page_stats(pageStats);
1958 		bool activePaging = do_active_paging(pageStats);
1959 #endif
1960 
1961 		// depending on how urgent it becomes to get pages to disk, we adjust
1962 		// our I/O priority
1963 		uint32 lowPagesState = low_resource_state(B_KERNEL_RESOURCE_PAGES);
1964 		int32 ioPriority = B_IDLE_PRIORITY;
1965 		if (lowPagesState >= B_LOW_RESOURCE_CRITICAL
1966 			|| modifiedPages > MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD) {
1967 			ioPriority = MAX_PAGE_WRITER_IO_PRIORITY;
1968 		} else {
1969 			ioPriority = (uint64)MAX_PAGE_WRITER_IO_PRIORITY * modifiedPages
1970 				/ MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD;
1971 		}
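		// The I/O priority thus scales linearly with the backlog: e.g. with
		// 2500 modified pages and a threshold of 10000, ioPriority ends up at
		// about a quarter of MAX_PAGE_WRITER_IO_PRIORITY.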
1972 
1973 		thread_set_io_priority(ioPriority);
1974 
1975 		uint32 numPages = 0;
1976 		run.PrepareNextRun();
1977 
1978 		// TODO: make this laptop friendly, too (ie. only start doing
1979 		// something if someone else did something or there is really
1980 		// enough to do).
1981 
1982 		// collect pages to be written
1983 		pageCollectionTime -= system_time();
1984 
1985 		page_num_t maxPagesToSee = modifiedPages;
1986 
1987 		while (numPages < kNumPages && maxPagesToSee > 0) {
1988 			vm_page *page = next_modified_page(maxPagesToSee);
1989 			if (page == NULL)
1990 				break;
1991 
1992 			PageCacheLocker cacheLocker(page, false);
1993 			if (!cacheLocker.IsLocked())
1994 				continue;
1995 
1996 			VMCache *cache = page->Cache();
1997 
1998 			// If the page is busy or its state has changed while we were
1999 			// locking the cache, just ignore it.
2000 			if (page->busy || page->State() != PAGE_STATE_MODIFIED)
2001 				continue;
2002 
2003 			DEBUG_PAGE_ACCESS_START(page);
2004 
2005 			// Don't write back wired (locked) pages.
2006 			if (page->WiredCount() > 0) {
2007 				set_page_state(page, PAGE_STATE_ACTIVE);
2008 				DEBUG_PAGE_ACCESS_END(page);
2009 				continue;
2010 			}
2011 
2012 			// Write back temporary pages only when we're actively paging.
2013 			if (cache->temporary
2014 #if ENABLE_SWAP_SUPPORT
2015 				&& (!activePaging
2016 					|| !cache->CanWritePage(
2017 							(off_t)page->cache_offset << PAGE_SHIFT))
2018 #endif
2019 				) {
2020 				// We can't/don't want to do anything with this page, so move it
2021 				// to one of the other queues.
2022 				if (page->mappings.IsEmpty())
2023 					set_page_state(page, PAGE_STATE_INACTIVE);
2024 				else
2025 					set_page_state(page, PAGE_STATE_ACTIVE);
2026 
2027 				DEBUG_PAGE_ACCESS_END(page);
2028 				continue;
2029 			}
2030 
2031 			// We need our own reference to the store, as it might currently be
2032 			// destroyed.
2033 			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
2034 				DEBUG_PAGE_ACCESS_END(page);
2035 				cacheLocker.Unlock();
2036 				thread_yield(true);
2037 				continue;
2038 			}
2039 
2040 			run.AddPage(page);
2041 				// TODO: We're possibly adding pages of different caches and
2042 				// thus maybe of different underlying file systems here. This
2043 				// is a potential problem for loop file systems/devices, since
2044 				// we could mark a page busy that would need to be accessed
2045 				// when writing back another page, thus causing a deadlock.
2046 
2047 			DEBUG_PAGE_ACCESS_END(page);
2048 
2049 			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
2050 			TPW(WritePage(page));
2051 
2052 			cache->AcquireRefLocked();
2053 			numPages++;
2054 		}
2055 
2056 		pageCollectionTime += system_time();
2057 
2058 		if (numPages == 0)
2059 			continue;
2060 
2061 		// write pages to disk and do all the cleanup
2062 		pageWritingTime -= system_time();
2063 		uint32 failedPages = run.Go();
2064 		pageWritingTime += system_time();
2065 
2066 		// debug output only...
2067 		writtenPages += numPages;
2068 		if (writtenPages >= 1024) {
2069 			bigtime_t now = system_time();
2070 			TRACE(("page writer: wrote 1024 pages (total: %llu ms, "
2071 				"collect: %llu ms, write: %llu ms)\n",
2072 				(now - lastWrittenTime) / 1000,
2073 				pageCollectionTime / 1000, pageWritingTime / 1000));
2074 			writtenPages -= 1024;
2075 			lastWrittenTime = now;
2076 			pageCollectionTime = 0;
2077 			pageWritingTime = 0;
2078 		}
2079 
2080 		if (failedPages == numPages)
2081 			pagesSinceLastSuccessfulWrite += modifiedPages - maxPagesToSee;
2082 		else
2083 			pagesSinceLastSuccessfulWrite = 0;
2084 	}
2085 
2086 	return B_OK;
2087 }
2088 
2089 
2090 // #pragma mark -
2091 
2092 
2093 // TODO: This should be done in the page daemon!
2094 #if 0
2095 #if ENABLE_SWAP_SUPPORT
2096 static bool
2097 free_page_swap_space(int32 index)
2098 {
2099 	vm_page *page = vm_page_at_index(index);
2100 	PageCacheLocker locker(page);
2101 	if (!locker.IsLocked())
2102 		return false;
2103 
2104 	DEBUG_PAGE_ACCESS_START(page);
2105 
2106 	VMCache* cache = page->Cache();
2107 	if (cache->temporary && page->WiredCount() == 0
2108 			&& cache->HasPage(page->cache_offset << PAGE_SHIFT)
2109 			&& page->usage_count > 0) {
2110 		// TODO: how to judge a page is highly active?
2111 		if (swap_free_page_swap_space(page)) {
2112 			// We need to mark the page modified, since otherwise it could be
2113 			// stolen and we'd lose its data.
2114 			vm_page_set_state(page, PAGE_STATE_MODIFIED);
2115 			TD(FreedPageSwap(page));
2116 			DEBUG_PAGE_ACCESS_END(page);
2117 			return true;
2118 		}
2119 	}
2120 	DEBUG_PAGE_ACCESS_END(page);
2121 	return false;
2122 }
2123 #endif
2124 #endif	// 0
2125 
2126 
2127 static vm_page *
2128 find_cached_page_candidate(struct vm_page &marker)
2129 {
2130 	DEBUG_PAGE_ACCESS_CHECK(&marker);
2131 
2132 	InterruptsSpinLocker locker(sCachedPageQueue.GetLock());
2133 	vm_page *page;
2134 
2135 	if (marker.State() == PAGE_STATE_UNUSED) {
2136 		// Start with the first page of the cached queue
2137 		page = sCachedPageQueue.Head();
2138 	} else {
2139 		// Get the next page of the current queue
2140 		if (marker.State() != PAGE_STATE_CACHED) {
2141 			panic("invalid marker %p state", &marker);
2142 			return NULL;
2143 		}
2144 
2145 		page = sCachedPageQueue.Next(&marker);
2146 		sCachedPageQueue.Remove(&marker);
2147 		marker.SetState(PAGE_STATE_UNUSED);
2148 	}
2149 
2150 	while (page != NULL) {
2151 		if (!page->busy) {
2152 			// we found a candidate, insert marker
2153 			marker.SetState(PAGE_STATE_CACHED);
2154 			sCachedPageQueue.InsertAfter(page, &marker);
2155 			return page;
2156 		}
2157 
2158 		page = sCachedPageQueue.Next(page);
2159 	}
2160 
2161 	return NULL;
2162 }
2163 
2164 
2165 static bool
2166 free_cached_page(vm_page *page, bool dontWait)
2167 {
2168 	// try to lock the page's cache
2169 	if (vm_cache_acquire_locked_page_cache(page, dontWait) == NULL)
2170 		return false;
2171 	VMCache* cache = page->Cache();
2172 
2173 	AutoLocker<VMCache> cacheLocker(cache, true);
2174 	MethodDeleter<VMCache> _2(cache, &VMCache::ReleaseRefLocked);
2175 
2176 	// check again if that page is still a candidate
2177 	if (page->busy || page->State() != PAGE_STATE_CACHED)
2178 		return false;
2179 
2180 	DEBUG_PAGE_ACCESS_START(page);
2181 
2182 	PAGE_ASSERT(page, !page->IsMapped());
2183 	PAGE_ASSERT(page, !page->modified);
2184 
2185 	// we can now steal this page
2186 
2187 	cache->RemovePage(page);
2188 		// Now the page doesn't have a cache anymore, so no one else (e.g.
2189 		// vm_page_allocate_page_run()) can pick it up, since they would be
2190 		// required to lock the cache first, which would fail.
2191 
2192 	sCachedPageQueue.RemoveUnlocked(page);
2193 	return true;
2194 }
2195 
2196 
2197 static uint32
2198 free_cached_pages(uint32 pagesToFree, bool dontWait)
2199 {
2200 	vm_page marker;
2201 	init_page_marker(marker);
2202 
2203 	uint32 pagesFreed = 0;
2204 
2205 	while (pagesFreed < pagesToFree) {
2206 		vm_page *page = find_cached_page_candidate(marker);
2207 		if (page == NULL)
2208 			break;
2209 
2210 		if (free_cached_page(page, dontWait)) {
2211 			ReadLocker locker(sFreePageQueuesLock);
2212 			page->SetState(PAGE_STATE_FREE);
2213 			DEBUG_PAGE_ACCESS_END(page);
2214 			sFreePageQueue.PrependUnlocked(page);
2215 			locker.Unlock();
2216 
2217 			TA(StolenPage());
2218 
2219 			pagesFreed++;
2220 		}
2221 	}
2222 
2223 	remove_page_marker(marker);
2224 
2225 	return pagesFreed;
2226 }
2227 
2228 
2229 static void
2230 idle_scan_active_pages(page_stats& pageStats)
2231 {
2232 	VMPageQueue& queue = sActivePageQueue;
2233 
2234 	// We want to scan the whole queue in roughly kIdleRunsForFullQueue runs.
2235 	uint32 maxToScan = queue.Count() / kIdleRunsForFullQueue + 1;
2236 
2237 	while (maxToScan > 0) {
2238 		maxToScan--;
2239 
2240 		// Get the next page. Note that we don't bother to lock here. We go with
2241 		// the assumption that on all architectures reading/writing pointers is
2242 		// atomic. Beyond that it doesn't really matter. We have to unlock the
2243 		// queue anyway to lock the page's cache, and we'll recheck afterwards.
2244 		vm_page* page = queue.Head();
2245 		if (page == NULL)
2246 			break;
2247 
2248 		// lock the page's cache
2249 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2250 		if (cache == NULL)
2251 			continue;
2252 
2253 		if (page->State() != PAGE_STATE_ACTIVE) {
2254 			// page is no longer in the cache or in this queue
2255 			cache->ReleaseRefAndUnlock();
2256 			continue;
2257 		}
2258 
2259 		if (page->busy) {
2260 			// page is busy -- requeue at the end
2261 			vm_page_requeue(page, true);
2262 			cache->ReleaseRefAndUnlock();
2263 			continue;
2264 		}
2265 
2266 		DEBUG_PAGE_ACCESS_START(page);
2267 
2268 		// Get the page active/modified flags and update the page's usage count.
2269 		// We completely unmap inactive temporary pages. This saves us from having
2270 		// to iterate through the inactive list as well, since we'll be notified
2271 		// via a page fault whenever such an inactive page is used again.
2272 		// We don't remove the mappings of non-temporary pages, since we
2273 		// wouldn't notice when those would become unused and could thus be
2274 		// moved to the cached list.
2275 		int32 usageCount;
2276 		if (page->WiredCount() > 0 || page->usage_count > 0
2277 			|| !cache->temporary) {
2278 			usageCount = vm_clear_page_mapping_accessed_flags(page);
2279 		} else
2280 			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2281 
2282 		if (usageCount > 0) {
2283 			usageCount += page->usage_count + kPageUsageAdvance;
2284 			if (usageCount > kPageUsageMax)
2285 				usageCount = kPageUsageMax;
2286 // TODO: This would probably also be the place to reclaim swap space.
2287 		} else {
2288 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2289 			if (usageCount < 0) {
2290 				usageCount = 0;
2291 				set_page_state(page, PAGE_STATE_INACTIVE);
2292 			}
2293 		}
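		// Worked example for the update above (assuming
		// vm_clear_page_mapping_accessed_flags() reported two accessed
		// mappings and usage_count was 5): the new value is
		// 2 + 5 + kPageUsageAdvance, clamped to kPageUsageMax. An unaccessed
		// page instead drops to 5 - kPageUsageDecline, and a value that would
		// become negative is clamped to 0 and the page is moved to the
		// inactive queue.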
2294 
2295 		page->usage_count = usageCount;
2296 
2297 		DEBUG_PAGE_ACCESS_END(page);
2298 
2299 		cache->ReleaseRefAndUnlock();
2300 	}
2301 }
2302 
2303 
2304 static void
2305 full_scan_inactive_pages(page_stats& pageStats, int32 despairLevel)
2306 {
2307 	int32 pagesToFree = pageStats.unsatisfiedReservations
2308 		+ sFreeOrCachedPagesTarget
2309 		- (pageStats.totalFreePages + pageStats.cachedPages);
2310 	if (pagesToFree <= 0)
2311 		return;
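	// Illustrative arithmetic: with 100 unsatisfied reservations, a
	// free-or-cached target of 1024 pages, 300 actually free and 500 cached
	// pages, the scan below tries to free 100 + 1024 - (300 + 500) = 324
	// pages.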
2312 
2313 	bigtime_t time = system_time();
2314 	uint32 pagesScanned = 0;
2315 	uint32 pagesToCached = 0;
2316 	uint32 pagesToModified = 0;
2317 	uint32 pagesToActive = 0;
2318 
2319 	// Determine how many pages at maximum to send to the modified queue. Since
2320 	// it is relatively expensive to page out pages, we do that on a grander
2321 	// scale only when things get desperate.
2322 	uint32 maxToFlush = despairLevel <= 1 ? 32 : 10000;
2323 
2324 	vm_page marker;
2325 	init_page_marker(marker);
2326 
2327 	VMPageQueue& queue = sInactivePageQueue;
2328 	InterruptsSpinLocker queueLocker(queue.GetLock());
2329 	uint32 maxToScan = queue.Count();
2330 
2331 	vm_page* nextPage = queue.Head();
2332 
2333 	while (pagesToFree > 0 && maxToScan > 0) {
2334 		maxToScan--;
2335 
2336 		// get the next page
2337 		vm_page* page = nextPage;
2338 		if (page == NULL)
2339 			break;
2340 		nextPage = queue.Next(page);
2341 
2342 		if (page->busy)
2343 			continue;
2344 
2345 		// mark the position
2346 		queue.InsertAfter(page, &marker);
2347 		queueLocker.Unlock();
2348 
2349 		// lock the page's cache
2350 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2351 		if (cache == NULL || page->busy
2352 				|| page->State() != PAGE_STATE_INACTIVE) {
2353 			if (cache != NULL)
2354 				cache->ReleaseRefAndUnlock();
2355 			queueLocker.Lock();
2356 			nextPage = queue.Next(&marker);
2357 			queue.Remove(&marker);
2358 			continue;
2359 		}
2360 
2361 		pagesScanned++;
2362 
2363 		DEBUG_PAGE_ACCESS_START(page);
2364 
2365 		// Get the accessed count, clear the accessed/modified flags and
2366 		// unmap the page, if it hasn't been accessed.
2367 		int32 usageCount;
2368 		if (page->WiredCount() > 0)
2369 			usageCount = vm_clear_page_mapping_accessed_flags(page);
2370 		else
2371 			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2372 
2373 		// update usage count
2374 		if (usageCount > 0) {
2375 			usageCount += page->usage_count + kPageUsageAdvance;
2376 			if (usageCount > kPageUsageMax)
2377 				usageCount = kPageUsageMax;
2378 		} else {
2379 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2380 			if (usageCount < 0)
2381 				usageCount = 0;
2382 		}
2383 
2384 		page->usage_count = usageCount;
2385 
2386 		// Move to fitting queue or requeue:
2387 		// * Active mapped pages go to the active queue.
2388 		// * Inactive mapped (i.e. wired) pages are requeued.
2389 		// * The remaining pages are cachable. Thus, if unmodified they go to
2390 		//   the cached queue, otherwise to the modified queue (up to a limit).
2391 		//   Note that, unlike in the idle scan, we don't exempt pages of
2392 		//   temporary caches here. Apparently we really need memory, so we better
2393 		//   page out memory as well.
2394 		bool isMapped = page->IsMapped();
2395 		if (usageCount > 0) {
2396 			if (isMapped) {
2397 				set_page_state(page, PAGE_STATE_ACTIVE);
2398 				pagesToActive++;
2399 			} else
2400 				vm_page_requeue(page, true);
2401 		} else if (isMapped) {
2402 			vm_page_requeue(page, true);
2403 		} else if (!page->modified) {
2404 			set_page_state(page, PAGE_STATE_CACHED);
2405 			pagesToFree--;
2406 			pagesToCached++;
2407 		} else if (maxToFlush > 0) {
2408 			set_page_state(page, PAGE_STATE_MODIFIED);
2409 			maxToFlush--;
2410 			pagesToModified++;
2411 		} else
2412 			vm_page_requeue(page, true);
2413 
2414 		DEBUG_PAGE_ACCESS_END(page);
2415 
2416 		cache->ReleaseRefAndUnlock();
2417 
2418 		// remove the marker
2419 		queueLocker.Lock();
2420 		nextPage = queue.Next(&marker);
2421 		queue.Remove(&marker);
2422 	}
2423 
2424 	queueLocker.Unlock();
2425 
2426 	time = system_time() - time;
2427 	TRACE_DAEMON("  -> inactive scan (%7lld us): scanned: %7lu, "
2428 		"moved: %lu -> cached, %lu -> modified, %lu -> active\n", time,
2429 		pagesScanned, pagesToCached, pagesToModified, pagesToActive);
2430 
2431 	// wake up the page writer, if we tossed it some pages
2432 	if (pagesToModified > 0)
2433 		sPageWriterCondition.WakeUp();
2434 }
2435 
2436 
2437 static void
2438 full_scan_active_pages(page_stats& pageStats, int32 despairLevel)
2439 {
2440 	vm_page marker;
2441 	init_page_marker(marker);
2442 
2443 	VMPageQueue& queue = sActivePageQueue;
2444 	InterruptsSpinLocker queueLocker(queue.GetLock());
2445 	uint32 maxToScan = queue.Count();
2446 
2447 	int32 pagesToDeactivate = pageStats.unsatisfiedReservations
2448 		+ sFreeOrCachedPagesTarget
2449 		- (pageStats.totalFreePages + pageStats.cachedPages)
2450 		+ std::max((int32)sInactivePagesTarget - (int32)maxToScan, (int32)0);
2451 	if (pagesToDeactivate <= 0)
2452 		return;
2453 
2454 	bigtime_t time = system_time();
2455 	uint32 pagesAccessed = 0;
2456 	uint32 pagesToInactive = 0;
2457 	uint32 pagesScanned = 0;
2458 
2459 	vm_page* nextPage = queue.Head();
2460 
2461 	while (pagesToDeactivate > 0 && maxToScan > 0) {
2462 		maxToScan--;
2463 
2464 		// get the next page
2465 		vm_page* page = nextPage;
2466 		if (page == NULL)
2467 			break;
2468 		nextPage = queue.Next(page);
2469 
2470 		if (page->busy)
2471 			continue;
2472 
2473 		// mark the position
2474 		queue.InsertAfter(page, &marker);
2475 		queueLocker.Unlock();
2476 
2477 		// lock the page's cache
2478 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2479 		if (cache == NULL || page->busy || page->State() != PAGE_STATE_ACTIVE) {
2480 			if (cache != NULL)
2481 				cache->ReleaseRefAndUnlock();
2482 			queueLocker.Lock();
2483 			nextPage = queue.Next(&marker);
2484 			queue.Remove(&marker);
2485 			continue;
2486 		}
2487 
2488 		pagesScanned++;
2489 
2490 		DEBUG_PAGE_ACCESS_START(page);
2491 
2492 		// Get the page active/modified flags and update the page's usage count.
2493 		int32 usageCount = vm_clear_page_mapping_accessed_flags(page);
2494 
2495 		if (usageCount > 0) {
2496 			usageCount += page->usage_count + kPageUsageAdvance;
2497 			if (usageCount > kPageUsageMax)
2498 				usageCount = kPageUsageMax;
2499 			pagesAccessed++;
2500 // TODO: This would probably also be the place to reclaim swap space.
2501 		} else {
2502 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2503 			if (usageCount <= 0) {
2504 				usageCount = 0;
2505 				set_page_state(page, PAGE_STATE_INACTIVE);
2506 				pagesToInactive++;
2507 			}
2508 		}
2509 
2510 		page->usage_count = usageCount;
2511 
2512 		DEBUG_PAGE_ACCESS_END(page);
2513 
2514 		cache->ReleaseRefAndUnlock();
2515 
2516 		// remove the marker
2517 		queueLocker.Lock();
2518 		nextPage = queue.Next(&marker);
2519 		queue.Remove(&marker);
2520 	}
2521 
2522 	time = system_time() - time;
2523 	TRACE_DAEMON("  ->   active scan (%7lld us): scanned: %7lu, "
2524 		"moved: %lu -> inactive, encountered %lu accessed ones\n", time,
2525 		pagesScanned, pagesToInactive, pagesAccessed);
2526 }
2527 
2528 
2529 static void
2530 page_daemon_idle_scan(page_stats& pageStats)
2531 {
2532 	TRACE_DAEMON("page daemon: idle run\n");
2533 
2534 	if (pageStats.totalFreePages < (int32)sFreePagesTarget) {
2535 		// We want more actually free pages, so free some from the cached
2536 		// ones.
2537 		uint32 freed = free_cached_pages(
2538 			sFreePagesTarget - pageStats.totalFreePages, false);
2539 		if (freed > 0)
2540 			unreserve_pages(freed);
2541 		get_page_stats(pageStats);
2542 	}
2543 
2544 	// Walk the active list and move pages to the inactive queue.
2545 	get_page_stats(pageStats);
2546 	idle_scan_active_pages(pageStats);
2547 }
2548 
2549 
2550 static void
2551 page_daemon_full_scan(page_stats& pageStats, int32 despairLevel)
2552 {
2553 	TRACE_DAEMON("page daemon: full run: free: %lu, cached: %lu, "
2554 		"to free: %lu\n", pageStats.totalFreePages, pageStats.cachedPages,
2555 		pageStats.unsatisfiedReservations + sFreeOrCachedPagesTarget
2556 			- (pageStats.totalFreePages + pageStats.cachedPages));
2557 
2558 	// Walk the inactive list and transfer pages to the cached and modified
2559 	// queues.
2560 	full_scan_inactive_pages(pageStats, despairLevel);
2561 
2562 	// Free cached pages. Also wake up reservation waiters.
2563 	get_page_stats(pageStats);
2564 	int32 pagesToFree = pageStats.unsatisfiedReservations + sFreePagesTarget
2565 		- (pageStats.totalFreePages);
2566 	if (pagesToFree > 0) {
2567 		uint32 freed = free_cached_pages(pagesToFree, true);
2568 		if (freed > 0)
2569 			unreserve_pages(freed);
2570 	}
2571 
2572 	// Walk the active list and move pages to the inactive queue.
2573 	get_page_stats(pageStats);
2574 	full_scan_active_pages(pageStats, despairLevel);
2575 }
2576 
2577 
2578 static status_t
2579 page_daemon(void* /*unused*/)
2580 {
2581 	int32 despairLevel = 0;
2582 
2583 	while (true) {
2584 		sPageDaemonCondition.ClearActivated();
2585 
2586 		// evaluate the free pages situation
2587 		page_stats pageStats;
2588 		get_page_stats(pageStats);
2589 
2590 		if (!do_active_paging(pageStats)) {
2591 			// Things look good -- just maintain statistics and keep the pool
2592 			// of actually free pages full enough.
2593 			despairLevel = 0;
2594 			page_daemon_idle_scan(pageStats);
2595 			sPageDaemonCondition.Wait(kIdleScanWaitInterval, false);
2596 		} else {
2597 			// Not enough free pages. We need to do some real work.
2598 			despairLevel = std::min(despairLevel + 1, (int32)3);
2599 			page_daemon_full_scan(pageStats, despairLevel);
2600 
2601 			// Don't wait after the first full scan, but rather immediately
2602 			// check whether we were successful in freeing enough pages and
2603 			// re-run with increased despair level. The first scan is
2604 			// conservative with respect to moving inactive modified pages to
2605 			// the modified list to avoid thrashing. The second scan, however,
2606 			// will not hold back.
2607 			if (despairLevel > 1)
2608 				snooze(kBusyScanWaitInterval);
2609 		}
2610 	}
2611 
2612 	return B_OK;
2613 }
2614 
2615 
2616 /*!	Returns how many pages could *not* be reserved.
2617 */
2618 static uint32
2619 reserve_pages(uint32 count, int priority, bool dontWait)
2620 {
2621 	int32 dontTouch = kPageReserveForPriority[priority];
2622 
2623 	while (true) {
2624 		count -= reserve_some_pages(count, dontTouch);
2625 		if (count == 0)
2626 			return 0;
2627 
2628 		if (sUnsatisfiedPageReservations == 0) {
2629 			count -= free_cached_pages(count, dontWait);
2630 			if (count == 0)
2631 				return count;
2632 		}
2633 
2634 		if (dontWait)
2635 			return count;
2636 
2637 		// we need to wait for pages to become available
2638 
2639 		MutexLocker pageDeficitLocker(sPageDeficitLock);
2640 
2641 		bool notifyDaemon = sUnsatisfiedPageReservations == 0;
2642 		sUnsatisfiedPageReservations += count;
2643 
2644 		if (sUnreservedFreePages > dontTouch) {
2645 			// the situation changed
2646 			sUnsatisfiedPageReservations -= count;
2647 			continue;
2648 		}
2649 
2650 		PageReservationWaiter waiter;
2651 		waiter.dontTouch = dontTouch;
2652 		waiter.missing = count;
2653 		waiter.thread = thread_get_current_thread();
2654 		waiter.threadPriority = waiter.thread->priority;
2655 
2656 		// insert ordered (i.e. after all waiters with higher or equal priority)
2657 		PageReservationWaiter* otherWaiter = NULL;
2658 		for (PageReservationWaiterList::Iterator it
2659 				= sPageReservationWaiters.GetIterator();
2660 			(otherWaiter = it.Next()) != NULL;) {
2661 			if (waiter < *otherWaiter)
2662 				break;
2663 		}
2664 
2665 		sPageReservationWaiters.InsertBefore(otherWaiter, &waiter);
2666 
2667 		thread_prepare_to_block(waiter.thread, 0, THREAD_BLOCK_TYPE_OTHER,
2668 			"waiting for pages");
2669 
2670 		if (notifyDaemon)
2671 			sPageDaemonCondition.WakeUp();
2672 
2673 		pageDeficitLocker.Unlock();
2674 
2675 		low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
2676 		thread_block();
2677 
2678 		pageDeficitLocker.Lock();
2679 
2680 		return 0;
2681 	}
2682 }
2683 
2684 
2685 //	#pragma mark - private kernel API
2686 
2687 
2688 /*!	Writes a range of modified pages of a cache to disk.
2689 	You need to hold the VMCache lock when calling this function.
2690 	Note that the cache lock is temporarily released while the transfers are
2691 	scheduled.
2691 	\param cache The cache.
2692 	\param firstPage Offset (in page size units) of the first page in the range.
2693 	\param endPage End offset (in page size units) of the page range. The page
2694 		at this offset is not included.
2695 */
2696 status_t
2697 vm_page_write_modified_page_range(struct VMCache* cache, uint32 firstPage,
2698 	uint32 endPage)
2699 {
2700 	static const int32 kMaxPages = 256;
2701 	int32 maxPages = cache->MaxPagesPerWrite();
2702 	if (maxPages < 0 || maxPages > kMaxPages)
2703 		maxPages = kMaxPages;
2704 
2705 	const uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
2706 		| HEAP_DONT_LOCK_KERNEL_SPACE;
2707 
2708 	PageWriteWrapper stackWrappers[2];
2709 	PageWriteWrapper* wrapperPool
2710 		= new(malloc_flags(allocationFlags)) PageWriteWrapper[maxPages + 1];
2711 	if (wrapperPool == NULL) {
2712 		// don't fail, just limit our capabilities
2713 		wrapperPool = stackWrappers;
2714 		maxPages = 1;
2715 	}
2716 
2717 	int32 nextWrapper = 0;
2718 
2719 	PageWriteWrapper* wrappers[maxPages];
2720 	int32 usedWrappers = 0;
2721 
2722 	PageWriteTransfer transfer;
2723 	bool transferEmpty = true;
2724 
2725 	VMCachePagesTree::Iterator it
2726 		= cache->pages.GetIterator(firstPage, true, true);
2727 
2728 	while (true) {
2729 		vm_page* page = it.Next();
2730 		if (page == NULL || page->cache_offset >= endPage) {
2731 			if (transferEmpty)
2732 				break;
2733 
2734 			page = NULL;
2735 		}
2736 
2737 		if (page != NULL) {
2738 			if (page->busy
2739 				|| (page->State() != PAGE_STATE_MODIFIED
2740 					&& !vm_test_map_modification(page))) {
2741 				page = NULL;
2742 			}
2743 		}
2744 
2745 		PageWriteWrapper* wrapper = NULL;
2746 		if (page != NULL) {
2747 			wrapper = &wrapperPool[nextWrapper++];
2748 			if (nextWrapper > maxPages)
2749 				nextWrapper = 0;
2750 
2751 			DEBUG_PAGE_ACCESS_START(page);
2752 
2753 			wrapper->SetTo(page);
2754 
2755 			if (transferEmpty || transfer.AddPage(page)) {
2756 				if (transferEmpty) {
2757 					transfer.SetTo(NULL, page, maxPages);
2758 					transferEmpty = false;
2759 				}
2760 
2761 				DEBUG_PAGE_ACCESS_END(page);
2762 
2763 				wrappers[usedWrappers++] = wrapper;
2764 				continue;
2765 			}
2766 
2767 			DEBUG_PAGE_ACCESS_END(page);
2768 		}
2769 
2770 		if (transferEmpty)
2771 			continue;
2772 
2773 		cache->Unlock();
2774 		status_t status = transfer.Schedule(0);
2775 		cache->Lock();
2776 
2777 		for (int32 i = 0; i < usedWrappers; i++)
2778 			wrappers[i]->Done(status);
2779 
2780 		usedWrappers = 0;
2781 
2782 		if (page != NULL) {
2783 			transfer.SetTo(NULL, page, maxPages);
2784 			wrappers[usedWrappers++] = wrapper;
2785 		} else
2786 			transferEmpty = true;
2787 	}
2788 
2789 	if (wrapperPool != stackWrappers)
2790 		delete[] wrapperPool;
2791 
2792 	return B_OK;
2793 }
2794 
2795 
2796 /*!	You need to hold the VMCache lock when calling this function.
2797 	Note that the cache lock is temporarily released while the transfers are
2798 	scheduled.
2798 */
2799 status_t
2800 vm_page_write_modified_pages(VMCache *cache)
2801 {
2802 	return vm_page_write_modified_page_range(cache, 0,
2803 		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
2804 }
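
/*	A minimal, hypothetical usage sketch for the two write-back functions
	above, assuming the caller holds a reference to "cache":

		AutoLocker<VMCache> locker(cache);
		vm_page_write_modified_pages(cache);
			// the cache lock is temporarily dropped and re-acquired internally
			// while the transfers are scheduled
*/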
2805 
2806 
2807 /*!	Schedules the page writer to write back the specified \a page.
2808 	Note, however, that it might not do this immediately, and it can well
2809 	take several seconds until the page is actually written out.
2810 */
2811 void
2812 vm_page_schedule_write_page(vm_page *page)
2813 {
2814 	PAGE_ASSERT(page, page->State() == PAGE_STATE_MODIFIED);
2815 
2816 	vm_page_requeue(page, false);
2817 
2818 	sPageWriterCondition.WakeUp();
2819 }
2820 
2821 
2822 /*!	Cache must be locked.
2823 */
2824 void
2825 vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
2826 	uint32 endPage)
2827 {
2828 	uint32 modified = 0;
2829 	for (VMCachePagesTree::Iterator it
2830 				= cache->pages.GetIterator(firstPage, true, true);
2831 			vm_page *page = it.Next();) {
2832 		if (page->cache_offset >= endPage)
2833 			break;
2834 
2835 		if (!page->busy && page->State() == PAGE_STATE_MODIFIED) {
2836 			DEBUG_PAGE_ACCESS_START(page);
2837 			vm_page_requeue(page, false);
2838 			modified++;
2839 			DEBUG_PAGE_ACCESS_END(page);
2840 		}
2841 	}
2842 
2843 	if (modified > 0)
2844 		sPageWriterCondition.WakeUp();
2845 }
2846 
2847 
2848 void
2849 vm_page_init_num_pages(kernel_args *args)
2850 {
2851 	// calculate the size of memory by looking at the physical_memory_range array
2852 	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
2853 	page_num_t physicalPagesEnd = sPhysicalPageOffset
2854 		+ args->physical_memory_range[0].size / B_PAGE_SIZE;
2855 
2856 	sNonExistingPages = 0;
2857 	sIgnoredPages = args->ignored_physical_memory / B_PAGE_SIZE;
2858 
2859 	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
2860 		page_num_t start = args->physical_memory_range[i].start / B_PAGE_SIZE;
2861 		if (start > physicalPagesEnd)
2862 			sNonExistingPages += start - physicalPagesEnd;
2863 		physicalPagesEnd = start
2864 			+ args->physical_memory_range[i].size / B_PAGE_SIZE;
2865 	}
2866 
2867 	TRACE(("first phys page = %#" B_PRIxPHYSADDR ", end %#" B_PRIxPHYSADDR "\n",
2868 		sPhysicalPageOffset, physicalPagesEnd));
2869 
2870 	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
2871 
2872 #ifdef LIMIT_AVAILABLE_MEMORY
2873 	if (sNumPages > LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE))
2874 		sNumPages = LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE);
2875 #endif
2876 }
2877 
2878 
2879 status_t
2880 vm_page_init(kernel_args *args)
2881 {
2882 	TRACE(("vm_page_init: entry\n"));
2883 
2884 	// init page queues
2885 	sModifiedPageQueue.Init("modified pages queue");
2886 	sInactivePageQueue.Init("inactive pages queue");
2887 	sActivePageQueue.Init("active pages queue");
2888 	sCachedPageQueue.Init("cached pages queue");
2889 	sFreePageQueue.Init("free pages queue");
2890 	sClearPageQueue.Init("clear pages queue");
2891 
2892 	new (&sPageReservationWaiters) PageReservationWaiterList;
2893 
2894 	// map in the new free page table
2895 	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
2896 		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
2897 
2898 	TRACE(("vm_init: putting free_page_table @ %p, # ents %" B_PRIuPHYSADDR
2899 		" (size %#" B_PRIxPHYSADDR ")\n", sPages, sNumPages,
2900 		(phys_addr_t)(sNumPages * sizeof(vm_page))));
2901 
2902 	// initialize the free page table
2903 	for (uint32 i = 0; i < sNumPages; i++) {
2904 		sPages[i].Init(sPhysicalPageOffset + i);
2905 		sFreePageQueue.Append(&sPages[i]);
2906 	}
2907 
2908 	sUnreservedFreePages = sNumPages;
2909 
2910 	TRACE(("initialized table\n"));
2911 
2912 	// mark the gaps between the usable physical memory ranges as unused
2913 	phys_addr_t previousEnd = 0;
2914 	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
2915 		phys_addr_t base = args->physical_memory_range[i].start;
2916 		phys_size_t size = args->physical_memory_range[i].size;
2917 		if (base > previousEnd) {
2918 			mark_page_range_in_use(previousEnd / B_PAGE_SIZE,
2919 				(base - previousEnd) / B_PAGE_SIZE, false);
2920 		}
2921 		previousEnd = base + size;
2922 	}
2923 
2924 	// mark the allocated physical page ranges wired
2925 	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
2926 		mark_page_range_in_use(
2927 			args->physical_allocated_range[i].start / B_PAGE_SIZE,
2928 			args->physical_allocated_range[i].size / B_PAGE_SIZE, true);
2929 	}
2930 
2931 	// The target of actually free pages. This must be at least the system
2932 	// reserve, plus a few more pages, so we don't have to extract
2933 	// a cached page with each allocation.
2934 	sFreePagesTarget = VM_PAGE_RESERVE_USER
2935 		+ std::max((page_num_t)32, (sNumPages - sNonExistingPages) / 1024);
2936 
2937 	// The target of free + cached and inactive pages. On low-memory machines
2938 	// keep things tight. free + cached is the pool of immediately allocatable
2939 	// pages. We want a few inactive pages, so when we're actually paging, we
2940 	// have a reasonably large set of pages to work with.
2941 	if (sUnreservedFreePages < 16 * 1024) {
2942 		sFreeOrCachedPagesTarget = sFreePagesTarget + 128;
2943 		sInactivePagesTarget = sFreePagesTarget / 3;
2944 	} else {
2945 		sFreeOrCachedPagesTarget = 2 * sFreePagesTarget;
2946 		sInactivePagesTarget = sFreePagesTarget / 2;
2947 	}
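	// Illustrative arithmetic, assuming 4 KiB pages, 1 GiB of RAM and no
	// memory holes (262144 pages): sFreePagesTarget becomes
	// VM_PAGE_RESERVE_USER + 256; since more than 16 * 1024 pages are
	// unreserved, sFreeOrCachedPagesTarget ends up at 2 * sFreePagesTarget
	// and sInactivePagesTarget at sFreePagesTarget / 2.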
2948 
2949 	TRACE(("vm_page_init: exit\n"));
2950 
2951 	return B_OK;
2952 }
2953 
2954 
2955 status_t
2956 vm_page_init_post_area(kernel_args *args)
2957 {
2958 	void *dummy;
2959 
2960 	dummy = sPages;
2961 	create_area("page structures", &dummy, B_EXACT_ADDRESS,
2962 		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
2963 		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
2964 
2965 	add_debugger_command("page_stats", &dump_page_stats,
2966 		"Dump statistics about page usage");
2967 	add_debugger_command_etc("page", &dump_page,
2968 		"Dump page info",
2969 		"[ \"-p\" | \"-v\" ] [ \"-m\" ] <address>\n"
2970 		"Prints information for the physical page. If neither \"-p\" nor\n"
2971 		"\"-v\" are given, the provided address is interpreted as address of\n"
2972 		"the vm_page data structure for the page in question. If \"-p\" is\n"
2973 		"given, the address is the physical address of the page. If \"-v\" is\n"
2974 		"given, the address is interpreted as a virtual address in the\n"
2975 		"current thread's address space, and information for the page it is\n"
2976 		"mapped to (if any) is printed. If \"-m\" is specified, the command will\n"
2977 		"search all known address spaces for mappings to that page and print\n"
2978 		"them.\n", 0);
2979 	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
2980 	add_debugger_command("find_page", &find_page,
2981 		"Find out which queue a page is actually in");
2982 
2983 #ifdef TRACK_PAGE_USAGE_STATS
2984 	add_debugger_command_etc("page_usage", &dump_page_usage_stats,
2985 		"Dumps statistics about page usage counts",
2986 		"\n"
2987 		"Dumps statistics about page usage counts.\n",
2988 		B_KDEBUG_DONT_PARSE_ARGUMENTS);
2989 #endif
2990 
2991 	return B_OK;
2992 }
2993 
2994 
2995 status_t
2996 vm_page_init_post_thread(kernel_args *args)
2997 {
2998 	new (&sFreePageCondition) ConditionVariable;
2999 	sFreePageCondition.Publish(&sFreePageQueue, "free page");
3000 
3001 	// create a kernel thread to clear out pages
3002 
3003 	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
3004 		B_LOWEST_ACTIVE_PRIORITY, NULL);
3005 	resume_thread(thread);
3006 
3007 	// start page writer
3008 
3009 	sPageWriterCondition.Init("page writer");
3010 
3011 	thread = spawn_kernel_thread(&page_writer, "page writer",
3012 		B_NORMAL_PRIORITY + 1, NULL);
3013 	resume_thread(thread);
3014 
3015 	// start page daemon
3016 
3017 	sPageDaemonCondition.Init("page daemon");
3018 
3019 	thread = spawn_kernel_thread(&page_daemon, "page daemon",
3020 		B_NORMAL_PRIORITY, NULL);
3021 	resume_thread(thread);
3022 
3023 	return B_OK;
3024 }
3025 
3026 
3027 status_t
3028 vm_mark_page_inuse(page_num_t page)
3029 {
3030 	return vm_mark_page_range_inuse(page, 1);
3031 }
3032 
3033 
3034 status_t
3035 vm_mark_page_range_inuse(page_num_t startPage, page_num_t length)
3036 {
3037 	return mark_page_range_in_use(startPage, length, false);
3038 }
3039 
3040 
3041 /*!	Unreserve pages previously reserved with vm_page_reserve_pages().
3042 */
3043 void
3044 vm_page_unreserve_pages(vm_page_reservation* reservation)
3045 {
3046 	uint32 count = reservation->count;
3047 	reservation->count = 0;
3048 
3049 	if (count == 0)
3050 		return;
3051 
3052 	TA(UnreservePages(count));
3053 
3054 	unreserve_pages(count);
3055 }
3056 
3057 
3058 /*!	With this call, you can reserve a number of free pages in the system.
3059 	They will only be handed out to someone who has actually reserved them.
3060 	This call returns as soon as the number of requested pages has been
3061 	reached.
3062 	The caller must not hold any cache lock or the function might deadlock.
3063 */
3064 void
3065 vm_page_reserve_pages(vm_page_reservation* reservation, uint32 count,
3066 	int priority)
3067 {
3068 	reservation->count = count;
3069 
3070 	if (count == 0)
3071 		return;
3072 
3073 	TA(ReservePages(count));
3074 
3075 	reserve_pages(count, priority, false);
3076 }
3077 
3078 
3079 bool
3080 vm_page_try_reserve_pages(vm_page_reservation* reservation, uint32 count,
3081 	int priority)
3082 {
3083 	if (count == 0) {
3084 		reservation->count = count;
3085 		return true;
3086 	}
3087 
3088 	uint32 remaining = reserve_pages(count, priority, true);
3089 	if (remaining == 0) {
3090 		TA(ReservePages(count));
3091 		reservation->count = count;
3092 		return true;
3093 	}
3094 
3095 	unreserve_pages(count - remaining);
3096 
3097 	return false;
3098 }
3099 
3100 
3101 vm_page *
3102 vm_page_allocate_page(vm_page_reservation* reservation, uint32 flags)
3103 {
3104 	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3105 	ASSERT(pageState != PAGE_STATE_FREE);
3106 	ASSERT(pageState != PAGE_STATE_CLEAR);
3107 
3108 	ASSERT(reservation->count > 0);
3109 	reservation->count--;
3110 
3111 	VMPageQueue* queue;
3112 	VMPageQueue* otherQueue;
3113 
3114 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3115 		queue = &sClearPageQueue;
3116 		otherQueue = &sFreePageQueue;
3117 	} else {
3118 		queue = &sFreePageQueue;
3119 		otherQueue = &sClearPageQueue;
3120 	}
3121 
3122 	TA(AllocatePage());
3123 
3124 	ReadLocker locker(sFreePageQueuesLock);
3125 
3126 	vm_page* page = queue->RemoveHeadUnlocked();
3127 	if (page == NULL) {
3128 		// if the primary queue was empty, grab the page from the
3129 		// secondary queue
3130 		page = otherQueue->RemoveHeadUnlocked();
3131 
3132 		if (page == NULL) {
3133 			// Unlikely, but possible: the page we have reserved has moved
3134 			// between the queues after we checked the first queue. Grab the
3135 			// write locker to make sure this doesn't happen again.
3136 			locker.Unlock();
3137 			WriteLocker writeLocker(sFreePageQueuesLock);
3138 
3139 			page = queue->RemoveHead();
3140 			if (page == NULL)
3141 				page = otherQueue->RemoveHead();
3142 
3143 			if (page == NULL) {
3144 				panic("Had reserved page, but there is none!");
3145 				return NULL;
3146 			}
3147 
3148 			// downgrade to read lock
3149 			locker.Lock();
3150 		}
3151 	}
3152 
3153 	if (page->CacheRef() != NULL)
3154 		panic("supposed to be free page %p has cache\n", page);
3155 
3156 	DEBUG_PAGE_ACCESS_START(page);
3157 
3158 	int oldPageState = page->State();
3159 	page->SetState(pageState);
3160 	page->busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3161 	page->usage_count = 0;
3162 	page->accessed = false;
3163 	page->modified = false;
3164 
3165 	locker.Unlock();
3166 
3167 	if (pageState < PAGE_STATE_FIRST_UNQUEUED)
3168 		sPageQueues[pageState].AppendUnlocked(page);
3169 
3170 	// clear the page, if we had to take it from the free queue and a clear
3171 	// page was requested
3172 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0 && oldPageState != PAGE_STATE_CLEAR)
3173 		clear_page(page);
3174 
3175 	return page;
3176 }
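
/*	A minimal, hypothetical usage sketch of the reservation/allocation API
	above (the priority and state constants are chosen for illustration only):

		vm_page_reservation reservation;
		vm_page_reserve_pages(&reservation, 1, VM_PRIORITY_SYSTEM);

		vm_page* page = vm_page_allocate_page(&reservation,
			PAGE_STATE_WIRED | VM_PAGE_ALLOC_CLEAR);
		// ... map and use the page ...

		vm_page_unreserve_pages(&reservation);
			// a no-op here, since the single reserved page was consumed; it
			// only returns pages that were reserved but never allocated
*/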
3177 
3178 
3179 static void
3180 allocate_page_run_cleanup(VMPageQueue::PageList& freePages,
3181 	VMPageQueue::PageList& clearPages)
3182 {
3183 	while (vm_page* page = freePages.RemoveHead()) {
3184 		page->busy = false;
3185 		page->SetState(PAGE_STATE_FREE);
3186 		DEBUG_PAGE_ACCESS_END(page);
3187 		sFreePageQueue.PrependUnlocked(page);
3188 	}
3189 
3190 	while (vm_page* page = clearPages.RemoveHead()) {
3191 		page->busy = false;
3192 		page->SetState(PAGE_STATE_CLEAR);
3193 		DEBUG_PAGE_ACCESS_END(page);
3194 		sClearPageQueue.PrependUnlocked(page);
3195 	}
3196 }
3197 
3198 
3199 /*!	Tries to allocate a contiguous run of \a length pages starting at
3200 	index \a start.
3201 
3202 	The caller must have write-locked the free/clear page queues. The function
3203 	will unlock regardless of whether it succeeds or fails.
3204 
3205 	If the function fails, it cleans up after itself, i.e. it will free all
3206 	pages it managed to allocate.
3207 
3208 	\param start The start index (into \c sPages) of the run.
3209 	\param length The number of pages to allocate.
3210 	\param flags Page allocation flags. Encodes the state the function shall
3211 		set the allocated pages to, whether the pages shall be marked busy
3212 		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3213 		(VM_PAGE_ALLOC_CLEAR).
3214 	\param freeClearQueueLocker WriteLocker for the free/clear page queues,
3215 		in locked state. Will be unlocked by the function.
3216 	\return The index of the first page that could not be allocated. \a length
3217 		is returned when the function was successful.
3218 */
3219 static page_num_t
3220 allocate_page_run(page_num_t start, page_num_t length, uint32 flags,
3221 	WriteLocker& freeClearQueueLocker)
3222 {
3223 	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3224 	ASSERT(pageState != PAGE_STATE_FREE);
3225 	ASSERT(pageState != PAGE_STATE_CLEAR);
3226 	ASSERT(start + length <= sNumPages);
3227 
3228 	TA(AllocatePageRun(length));
3229 
3230 	// Pull the free/clear pages out of their respective queues. Cached pages
3231 	// are allocated later.
3232 	page_num_t cachedPages = 0;
3233 	VMPageQueue::PageList freePages;
3234 	VMPageQueue::PageList clearPages;
3235 	page_num_t i = 0;
3236 	for (; i < length; i++) {
3237 		bool pageAllocated = true;
3238 		bool noPage = false;
3239 		vm_page& page = sPages[start + i];
3240 		switch (page.State()) {
3241 			case PAGE_STATE_CLEAR:
3242 				DEBUG_PAGE_ACCESS_START(&page);
3243 				sClearPageQueue.Remove(&page);
3244 				clearPages.Add(&page);
3245 				break;
3246 			case PAGE_STATE_FREE:
3247 				DEBUG_PAGE_ACCESS_START(&page);
3248 				sFreePageQueue.Remove(&page);
3249 				freePages.Add(&page);
3250 				break;
3251 			case PAGE_STATE_CACHED:
3252 				// We allocate cached pages later.
3253 				cachedPages++;
3254 				pageAllocated = false;
3255 				break;
3256 
3257 			default:
3258 				// The page was probably cached when our caller checked. Now
3259 				// it's in some other state, so we have to abort.
3260 				noPage = true;
3261 				break;
3262 		}
3263 
3264 		if (noPage)
3265 			break;
3266 
3267 		if (pageAllocated) {
3268 			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3269 			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3270 			page.usage_count = 0;
3271 			page.accessed = false;
3272 			page.modified = false;
3273 		}
3274 	}
3275 
3276 	if (i < length) {
3277 		// failed to allocate a page -- free all that we've got
3278 		allocate_page_run_cleanup(freePages, clearPages);
3279 		return i;
3280 	}
3281 
3282 	freeClearQueueLocker.Unlock();
3283 
3284 	if (cachedPages > 0) {
3285 		// allocate the pages that weren't free but cached
3286 		page_num_t freedCachedPages = 0;
3287 		page_num_t nextIndex = start;
3288 		vm_page* freePage = freePages.Head();
3289 		vm_page* clearPage = clearPages.Head();
3290 		while (cachedPages > 0) {
3291 			// skip, if we've already got the page
3292 			if (freePage != NULL && size_t(freePage - sPages) == nextIndex) {
3293 				freePage = freePages.GetNext(freePage);
3294 				nextIndex++;
3295 				continue;
3296 			}
3297 			if (clearPage != NULL && size_t(clearPage - sPages) == nextIndex) {
3298 				clearPage = clearPages.GetNext(clearPage);
3299 				nextIndex++;
3300 				continue;
3301 			}
3302 
3303 			// free the page, if it is still cached
3304 			vm_page& page = sPages[nextIndex];
3305 			if (!free_cached_page(&page, false)) {
3306 				// TODO: if the page turns out to have been freed already,
3307 				// there would be no need to fail
3308 				break;
3309 			}
3310 
3311 			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3312 			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3313 			page.usage_count = 0;
3314 			page.accessed = false;
3315 			page.modified = false;
3316 
3317 			freePages.InsertBefore(freePage, &page);
3318 			freedCachedPages++;
3319 			cachedPages--;
3320 			nextIndex++;
3321 		}
3322 
3323 		// If we have freed cached pages, we need to balance things.
3324 		if (freedCachedPages > 0)
3325 			unreserve_pages(freedCachedPages);
3326 
3327 		if (nextIndex - start < length) {
3328 			// failed to allocate all cached pages -- free all that we've got
3329 			freeClearQueueLocker.Lock();
3330 			allocate_page_run_cleanup(freePages, clearPages);
3331 			freeClearQueueLocker.Unlock();
3332 
3333 			return nextIndex - start;
3334 		}
3335 	}
3336 
3337 	// clear pages, if requested
3338 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3339 		for (VMPageQueue::PageList::Iterator it = freePages.GetIterator();
3340 				vm_page* page = it.Next();) {
3341 			clear_page(page);
3342 		}
3343 	}
3344 
3345 	// add pages to target queue
3346 	if (pageState < PAGE_STATE_FIRST_UNQUEUED) {
3347 		freePages.MoveFrom(&clearPages);
3348 		sPageQueues[pageState].AppendUnlocked(freePages, length);
3349 	}
3350 
3351 	// Note: We don't unreserve the pages since we pulled them out of the
3352 	// free/clear queues without adjusting sUnreservedFreePages.
3353 
3354 	return length;
3355 }
3356 
3357 
3358 /*! Allocate a physically contiguous range of pages.
3359 
3360 	\param flags Page allocation flags. Encodes the state the function shall
3361 		set the allocated pages to, whether the pages shall be marked busy
3362 		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3363 		(VM_PAGE_ALLOC_CLEAR).
3364 	\param length The number of contiguous pages to allocate.
3365 	\param restrictions Restrictions to the physical addresses of the page run
3366 		to allocate, including \c low_address, the first acceptable physical
3367 		address where the page run may start, \c high_address, the last
3368 		acceptable physical address where the page run may end (i.e. it must
3369 		hold \code runStartAddress + length <= high_address \endcode),
3370 		\c alignment, the alignment of the page run start address, and
3371 		\c boundary, multiples of which the page run must not cross.
3372 		Values set to \c 0 are ignored.
3373 	\param priority The page reservation priority (as passed to
3374 		vm_page_reserve_pages()).
3375 	\return The first page of the allocated page run on success; \c NULL
3376 		when the allocation failed.
3377 */
3378 vm_page*
3379 vm_page_allocate_page_run(uint32 flags, page_num_t length,
3380 	const physical_address_restrictions* restrictions, int priority)
3381 {
3382 	// compute start and end page index
3383 	page_num_t requestedStart
3384 		= std::max(restrictions->low_address / B_PAGE_SIZE, sPhysicalPageOffset)
3385 			- sPhysicalPageOffset;
3386 	page_num_t start = requestedStart;
3387 	page_num_t end;
3388 	if (restrictions->high_address > 0) {
3389 		end = std::max(restrictions->high_address / B_PAGE_SIZE,
3390 				sPhysicalPageOffset)
3391 			- sPhysicalPageOffset;
3392 		end = std::min(end, sNumPages);
3393 	} else
3394 		end = sNumPages;
3395 
3396 	// compute alignment mask
3397 	page_num_t alignmentMask
3398 		= std::max(restrictions->alignment / B_PAGE_SIZE, (phys_addr_t)1) - 1;
3399 	ASSERT(((alignmentMask + 1) & alignmentMask) == 0);
3400 		// alignment must be a power of 2
3401 
3402 	// compute the boundary shift
3403 	uint32 boundaryShift = 0;
3404 	if (restrictions->boundary != 0) {
3405 		page_num_t boundary = restrictions->boundary / B_PAGE_SIZE;
3406 		// boundary must be a power of two and not less than alignment and
3407 		// length
3408 		ASSERT(((boundary - 1) & boundary) == 0);
3409 		ASSERT(boundary >= alignmentMask + 1);
3410 		ASSERT(boundary >= length);
3411 
3412 		while ((boundary >>= 1) > 0)
3413 			boundaryShift++;
3414 	}
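	// For instance, a restrictions->boundary of 64 KiB with 4 KiB pages gives
	// a boundary of 16 pages and thus a boundaryShift of 4.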
3415 
3416 	vm_page_reservation reservation;
3417 	vm_page_reserve_pages(&reservation, length, priority);
3418 
3419 	WriteLocker freeClearQueueLocker(sFreePageQueuesLock);
3420 
3421 	// First we try to get a run with free pages only. If that fails, we also
3422 	// consider cached pages. If there are only a few free pages and many cached
3423 	// ones, the odds are that we won't find enough contiguous ones, so we skip
3424 	// the first iteration in this case.
3425 	int32 freePages = sUnreservedFreePages;
3426 	int useCached = freePages > 0 && (page_num_t)freePages > 2 * length ? 0 : 1;
3427 
3428 	for (;;) {
3429 		if (alignmentMask != 0 || boundaryShift != 0) {
3430 			page_num_t offsetStart = start + sPhysicalPageOffset;
3431 
3432 			// enforce alignment
3433 			if ((offsetStart & alignmentMask) != 0) {
3434 				offsetStart = (offsetStart + alignmentMask) & ~alignmentMask;
3435 			}
3436 
3437 			// enforce boundary
3438 			if (boundaryShift != 0
3439 					&& (offsetStart >> boundaryShift)
3440 						!= ((offsetStart + length - 1) >> boundaryShift)) {
3441 				offsetStart = (offsetStart + length - 1)
3442 					>> boundaryShift << boundaryShift;
3443 			}
3444 
3445 			start = offsetStart - sPhysicalPageOffset;
3446 		}
3447 
3448 		if (start + length > end) {
3449 			if (useCached == 0) {
3450 				// The first iteration with free pages only was unsuccessful.
3451 				// Try again also considering cached pages.
3452 				useCached = 1;
3453 				start = requestedStart;
3454 				continue;
3455 			}
3456 
3457 			dprintf("vm_page_allocate_page_run(): Failed to allocate run of "
3458 				"length %" B_PRIuPHYSADDR " in second iteration!", length);
3459 
3460 			freeClearQueueLocker.Unlock();
3461 			vm_page_unreserve_pages(&reservation);
3462 			return NULL;
3463 		}
3464 
3465 		bool foundRun = true;
3466 		page_num_t i;
3467 		for (i = 0; i < length; i++) {
3468 			uint32 pageState = sPages[start + i].State();
3469 			if (pageState != PAGE_STATE_FREE
3470 				&& pageState != PAGE_STATE_CLEAR
3471 				&& (pageState != PAGE_STATE_CACHED || useCached == 0)) {
3472 				foundRun = false;
3473 				break;
3474 			}
3475 		}
3476 
3477 		if (foundRun) {
3478 			i = allocate_page_run(start, length, flags, freeClearQueueLocker);
3479 			if (i == length)
3480 				return &sPages[start];
3481 
3482 			// apparently a cached page couldn't be allocated -- skip it and
3483 			// continue
3484 			freeClearQueueLocker.Lock();
3485 		}
3486 
3487 		start += i + 1;
3488 	}
3489 }
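
/*	A minimal, hypothetical usage sketch: allocate 16 physically contiguous,
	cleared pages below 16 MB, 64 KiB aligned (the constants and the priority
	value are purely illustrative):

		physical_address_restrictions restrictions = {};
		restrictions.high_address = 16 * 1024 * 1024;
		restrictions.alignment = 64 * 1024;

		vm_page* firstPage = vm_page_allocate_page_run(
			PAGE_STATE_WIRED | VM_PAGE_ALLOC_CLEAR, 16, &restrictions,
			VM_PRIORITY_SYSTEM);
		if (firstPage == NULL)
			dprintf("contiguous allocation failed\n");
*/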
3490 
3491 
3492 vm_page *
3493 vm_page_at_index(int32 index)
3494 {
3495 	return &sPages[index];
3496 }
3497 
3498 
3499 vm_page *
3500 vm_lookup_page(page_num_t pageNumber)
3501 {
3502 	if (pageNumber < sPhysicalPageOffset)
3503 		return NULL;
3504 
3505 	pageNumber -= sPhysicalPageOffset;
3506 	if (pageNumber >= sNumPages)
3507 		return NULL;
3508 
3509 	return &sPages[pageNumber];
3510 }
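
/*	For example, to look up the vm_page corresponding to a physical address
	(a sketch; "physicalAddress" is assumed to be supplied by the caller):

		vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
		if (page == NULL)
			;	// the address lies outside the known physical memory ranges
*/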
3511 
3512 
3513 bool
3514 vm_page_is_dummy(struct vm_page *page)
3515 {
3516 	return page < sPages || page >= sPages + sNumPages;
3517 }
3518 
3519 
3520 /*!	Free the page that belonged to a certain cache.
3521 	You can use vm_page_set_state() manually if you prefer, but only
3522 	if the page's state is not PAGE_STATE_MODIFIED.
3523 */
3524 void
3525 vm_page_free(VMCache *cache, vm_page *page)
3526 {
3527 	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
3528 		&& page->State() != PAGE_STATE_CLEAR);
3529 
3530 	if (page->State() == PAGE_STATE_MODIFIED && cache->temporary)
3531 		atomic_add(&sModifiedTemporaryPages, -1);
3532 
3533 	free_page(page, false);
3534 }
3535 
3536 
3537 void
3538 vm_page_set_state(vm_page *page, int pageState)
3539 {
3540 	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
3541 		&& page->State() != PAGE_STATE_CLEAR);
3542 
3543 	if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR)
3544 		free_page(page, pageState == PAGE_STATE_CLEAR);
3545 	else
3546 		set_page_state(page, pageState);
3547 }
3548 
3549 
3550 /*!	Moves a page to either the head or the tail of its current queue,
3551 	depending on \a tail.
3552 	The page must have a cache and the cache must be locked!
3553 */
3554 void
3555 vm_page_requeue(struct vm_page *page, bool tail)
3556 {
3557 	PAGE_ASSERT(page, page->Cache() != NULL);
3558 	page->Cache()->AssertLocked();
3559 	// DEBUG_PAGE_ACCESS_CHECK(page);
3560 		// TODO: This assertion cannot be satisfied by idle_scan_active_pages()
3561 		// when it requeues busy pages. The reason is that vm_soft_fault()
3562 		// (respectively fault_get_page()) and the file cache keep newly
3563 		// allocated pages accessed while they are reading them from disk. It
3564 		// would probably be better to change that code and reenable this
3565 		// check.
3566 
3567 	VMPageQueue *queue = NULL;
3568 
3569 	switch (page->State()) {
3570 		case PAGE_STATE_ACTIVE:
3571 			queue = &sActivePageQueue;
3572 			break;
3573 		case PAGE_STATE_INACTIVE:
3574 			queue = &sInactivePageQueue;
3575 			break;
3576 		case PAGE_STATE_MODIFIED:
3577 			queue = &sModifiedPageQueue;
3578 			break;
3579 		case PAGE_STATE_CACHED:
3580 			queue = &sCachedPageQueue;
3581 			break;
3582 		case PAGE_STATE_FREE:
3583 		case PAGE_STATE_CLEAR:
3584 			panic("vm_page_requeue() called for free/clear page %p", page);
3585 			return;
3586 		case PAGE_STATE_WIRED:
3587 		case PAGE_STATE_UNUSED:
3588 			return;
3589 		default:
3590 			panic("vm_page_requeue: vm_page %p in invalid state %d\n",
3591 				page, page->State());
3592 			break;
3593 	}
3594 
3595 	queue->RequeueUnlocked(page, tail);
3596 }
3597 
3598 
3599 page_num_t
3600 vm_page_num_pages(void)
3601 {
3602 	return sNumPages - sNonExistingPages;
3603 }
3604 
3605 
3606 /*! There is a subtle distinction between the page counts returned by
3607 	this function and vm_page_num_free_pages():
3608 	The latter returns the number of pages that are completely uncommitted,
3609 	whereas this one returns the number of pages that are available for
3610 	use by being reclaimed as well (IOW it factors in things like cache pages
3611 	as available).
3612 */
3613 page_num_t
3614 vm_page_num_available_pages(void)
3615 {
3616 	return vm_available_memory() / B_PAGE_SIZE;
3617 }
3618 
3619 
3620 page_num_t
3621 vm_page_num_free_pages(void)
3622 {
3623 	int32 count = sUnreservedFreePages + sCachedPageQueue.Count();
3624 	return count > 0 ? count : 0;
3625 }
3626 
3627 
3628 page_num_t
3629 vm_page_num_unused_pages(void)
3630 {
3631 	int32 count = sUnreservedFreePages;
3632 	return count > 0 ? count : 0;
3633 }
3634 
3635 
3636 void
3637 vm_page_get_stats(system_info *info)
3638 {
3639 	// Get free pages count -- not really exact, since we don't know how many
3640 	// of the reserved pages have already been allocated, but good citizens
3641 	// unreserve chunk-wise as they are allocating the pages, if they have
3642 	// reserved a larger quantity.
3643 	int32 free = sUnreservedFreePages;
3644 	if (free < 0)
3645 		free = 0;
3646 
3647 	// The pages used for the block cache buffers. Those should not be counted
3648 	// as used but as cached pages.
3649 	// TODO: We should subtract the blocks that are in use ATM, since those
3650 	// can't really be freed in a low memory situation.
3651 	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
3652 
3653 	info->max_pages = sNumPages - sNonExistingPages;
3654 	info->used_pages = gMappedPagesCount - blockCachePages;
3655 	info->cached_pages = info->max_pages >= free + info->used_pages
3656 		? info->max_pages - free - info->used_pages : 0;
3657 	info->page_faults = vm_num_page_faults();
3658 	info->ignored_pages = sIgnoredPages;
3659 
3660 	// TODO: We don't consider pages used for page directories/tables yet.
3661 }
3662 
3663 
3664 /*!	Returns the greatest address within the last page of accessible physical
3665 	memory.
3666 	The value is inclusive, i.e. in case of a 32 bit phys_addr_t 0xffffffff
3667 	means that the last page ends at exactly 4 GB.
3668 */
3669 phys_addr_t
3670 vm_page_max_address()
3671 {
3672 	return ((phys_addr_t)sPhysicalPageOffset + sNumPages) * B_PAGE_SIZE - 1;
3673 }
3674