xref: /haiku/src/system/kernel/vm/vm_page.cpp (revision caed67a8cba83913b9c21ac2b06ebc6bd1cb3111)
1 /*
2  * Copyright 2010-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2010, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 #include <string.h>
12 #include <stdlib.h>
13 
14 #include <algorithm>
15 
16 #include <KernelExport.h>
17 #include <OS.h>
18 
19 #include <AutoDeleter.h>
20 
21 #include <arch/cpu.h>
22 #include <arch/vm_translation_map.h>
23 #include <block_cache.h>
24 #include <boot/kernel_args.h>
25 #include <condition_variable.h>
26 #include <elf.h>
27 #include <heap.h>
28 #include <kernel.h>
29 #include <low_resource_manager.h>
30 #include <thread.h>
31 #include <tracing.h>
32 #include <util/AutoLock.h>
33 #include <vfs.h>
34 #include <vm/vm.h>
35 #include <vm/vm_priv.h>
36 #include <vm/vm_page.h>
37 #include <vm/VMAddressSpace.h>
38 #include <vm/VMArea.h>
39 #include <vm/VMCache.h>
40 
41 #include "IORequest.h"
42 #include "PageCacheLocker.h"
43 #include "VMAnonymousCache.h"
44 #include "VMPageQueue.h"
45 
46 
47 //#define TRACE_VM_PAGE
48 #ifdef TRACE_VM_PAGE
49 #	define TRACE(x) dprintf x
50 #else
51 #	define TRACE(x) ;
52 #endif
53 
54 //#define TRACE_VM_DAEMONS
55 #ifdef TRACE_VM_DAEMONS
56 #define TRACE_DAEMON(x...) dprintf(x)
57 #else
58 #define TRACE_DAEMON(x...) do {} while (false)
59 #endif
60 
61 //#define TRACK_PAGE_USAGE_STATS	1
62 
63 #define PAGE_ASSERT(page, condition)	\
64 	ASSERT_PRINT((condition), "page: %p", (page))
65 
66 #define SCRUB_SIZE 32
67 	// this many pages will be cleared at once in the page scrubber thread
68 
69 #define MAX_PAGE_WRITER_IO_PRIORITY				B_URGENT_DISPLAY_PRIORITY
70 	// maximum I/O priority of the page writer
71 #define MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD	10000
72 	// the maximum I/O priority shall be reached when this many pages need to
73 	// be written
74 
75 
76 // The page reserve that an allocation of a given priority must not touch.
77 static const size_t kPageReserveForPriority[] = {
78 	VM_PAGE_RESERVE_USER,		// user
79 	VM_PAGE_RESERVE_SYSTEM,		// system
80 	0							// VIP
81 };
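// Indexed by VM priority; see for instance the
// kPageReserveForPriority[VM_PRIORITY_USER] lookup in the page scrubber below.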
82 
83 // Minimum number of free pages the page daemon will try to achieve.
84 static uint32 sFreePagesTarget;
85 static uint32 sFreeOrCachedPagesTarget;
86 static uint32 sInactivePagesTarget;
87 
88 // Wait interval between page daemon runs.
89 static const bigtime_t kIdleScanWaitInterval = 1000000LL;	// 1 sec
90 static const bigtime_t kBusyScanWaitInterval = 500000LL;	// 0.5 sec
91 
92 // Number of idle runs after which we want to have processed the full active
93 // queue.
94 static const uint32 kIdleRunsForFullQueue = 20;
95 
96 // Maximum limit for the vm_page::usage_count.
97 static const int32 kPageUsageMax = 64;
98 // Boost that vm_page::usage_count receives when a page was accessed in a scan.
99 static const int32 kPageUsageAdvance = 3;
100 // Penalty applied to vm_page::usage_count when a page was not accessed in a scan.
101 static const int32 kPageUsageDecline = 1;
102 
103 int32 gMappedPagesCount;
104 
105 static VMPageQueue sPageQueues[PAGE_STATE_COUNT];
106 
107 static VMPageQueue& sFreePageQueue = sPageQueues[PAGE_STATE_FREE];
108 static VMPageQueue& sClearPageQueue = sPageQueues[PAGE_STATE_CLEAR];
109 static VMPageQueue& sModifiedPageQueue = sPageQueues[PAGE_STATE_MODIFIED];
110 static VMPageQueue& sInactivePageQueue = sPageQueues[PAGE_STATE_INACTIVE];
111 static VMPageQueue& sActivePageQueue = sPageQueues[PAGE_STATE_ACTIVE];
112 static VMPageQueue& sCachedPageQueue = sPageQueues[PAGE_STATE_CACHED];
113 
114 static vm_page *sPages;
115 static page_num_t sPhysicalPageOffset;
116 static page_num_t sNumPages;
117 static page_num_t sNonExistingPages;
118 	// pages in the sPages array that aren't backed by physical memory
119 static uint64 sIgnoredPages;
120 	// pages of physical memory ignored by the boot loader (and thus not
121 	// available here)
122 static int32 sUnreservedFreePages;
123 static int32 sUnsatisfiedPageReservations;
124 static int32 sModifiedTemporaryPages;
125 
126 static ConditionVariable sFreePageCondition;
127 static mutex sPageDeficitLock = MUTEX_INITIALIZER("page deficit");
128 
129 // This lock must be used whenever the free or clear page queues are changed.
130 // If you need to work on both queues at the same time, you need to hold a write
131 // lock; otherwise a read lock suffices (each queue still has a spinlock to
132 // guard against concurrent changes).
133 static rw_lock sFreePageQueuesLock
134 	= RW_LOCK_INITIALIZER("free/clear page queues");
135 
136 #ifdef TRACK_PAGE_USAGE_STATS
137 static page_num_t sPageUsageArrays[512];
138 static page_num_t* sPageUsage = sPageUsageArrays;
139 static page_num_t sPageUsagePageCount;
140 static page_num_t* sNextPageUsage = sPageUsageArrays + 256;
141 static page_num_t sNextPageUsagePageCount;
142 #endif
143 
144 
145 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
146 
147 struct caller_info {
148 	addr_t		caller;
149 	size_t		count;
150 };
151 
152 static const int32 kCallerInfoTableSize = 1024;
153 static caller_info sCallerInfoTable[kCallerInfoTableSize];
154 static int32 sCallerInfoCount = 0;
155 
156 static caller_info* get_caller_info(addr_t caller);
157 
158 
159 RANGE_MARKER_FUNCTION_PROTOTYPES(vm_page)
160 
161 static const addr_t kVMPageCodeAddressRange[] = {
162 	RANGE_MARKER_FUNCTION_ADDRESS_RANGE(vm_page)
163 };
164 
165 #endif
166 
167 
168 RANGE_MARKER_FUNCTION_BEGIN(vm_page)
169 
170 
171 struct page_stats {
172 	int32	totalFreePages;
173 	int32	unsatisfiedReservations;
174 	int32	cachedPages;
175 };
176 
177 
178 struct PageReservationWaiter
179 		: public DoublyLinkedListLinkImpl<PageReservationWaiter> {
180 	Thread*	thread;
181 	uint32	dontTouch;		// reserve not to touch
182 	uint32	missing;		// pages missing for the reservation
183 	int32	threadPriority;
184 
185 	bool operator<(const PageReservationWaiter& other) const
186 	{
187 		// Implies an order by descending VM priority (ascending dontTouch)
188 		// and (secondarily) descending thread priority.
189 		if (dontTouch != other.dontTouch)
190 			return dontTouch < other.dontTouch;
191 		return threadPriority > other.threadPriority;
192 	}
193 };
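// Note: a waiter with a smaller dontTouch value (i.e. a higher VM priority
// reservation) always sorts before one with a larger dontTouch, regardless of
// thread priority; thread priority only breaks ties.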
194 
195 typedef DoublyLinkedList<PageReservationWaiter> PageReservationWaiterList;
196 static PageReservationWaiterList sPageReservationWaiters;
197 
198 
199 struct DaemonCondition {
200 	void Init(const char* name)
201 	{
202 		mutex_init(&fLock, "daemon condition");
203 		fCondition.Init(this, name);
204 		fActivated = false;
205 	}
206 
207 	bool Lock()
208 	{
209 		return mutex_lock(&fLock) == B_OK;
210 	}
211 
212 	void Unlock()
213 	{
214 		mutex_unlock(&fLock);
215 	}
216 
217 	bool Wait(bigtime_t timeout, bool clearActivated)
218 	{
219 		MutexLocker locker(fLock);
220 		if (clearActivated)
221 			fActivated = false;
222 		else if (fActivated)
223 			return true;
224 
225 		ConditionVariableEntry entry;
226 		fCondition.Add(&entry);
227 
228 		locker.Unlock();
229 
230 		return entry.Wait(B_RELATIVE_TIMEOUT, timeout) == B_OK;
231 	}
232 
233 	void WakeUp()
234 	{
235 		if (fActivated)
236 			return;
237 
238 		MutexLocker locker(fLock);
239 		fActivated = true;
240 		fCondition.NotifyOne();
241 	}
242 
243 	void ClearActivated()
244 	{
245 		MutexLocker locker(fLock);
246 		fActivated = false;
247 	}
248 
249 private:
250 	mutex				fLock;
251 	ConditionVariable	fCondition;
252 	bool				fActivated;
253 };
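// Illustrative use only (not the literal daemon code): a daemon thread would
// typically loop on Wait() with its scan interval, while other threads call
// WakeUp() to trigger an early run, roughly like
//
//	for (;;) {
//		sPageDaemonCondition.Wait(kIdleScanWaitInterval, true);
//		// ... perform one scan pass ...
//	}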
254 
255 
256 static DaemonCondition sPageWriterCondition;
257 static DaemonCondition sPageDaemonCondition;
258 
259 
260 #if PAGE_ALLOCATION_TRACING
261 
262 namespace PageAllocationTracing {
263 
264 class ReservePages : public AbstractTraceEntry {
265 public:
266 	ReservePages(uint32 count)
267 		:
268 		fCount(count)
269 	{
270 		Initialized();
271 	}
272 
273 	virtual void AddDump(TraceOutput& out)
274 	{
275 		out.Print("page reserve:   %" B_PRIu32, fCount);
276 	}
277 
278 private:
279 	uint32		fCount;
280 };
281 
282 
283 class UnreservePages : public AbstractTraceEntry {
284 public:
285 	UnreservePages(uint32 count)
286 		:
287 		fCount(count)
288 	{
289 		Initialized();
290 	}
291 
292 	virtual void AddDump(TraceOutput& out)
293 	{
294 		out.Print("page unreserve: %" B_PRIu32, fCount);
295 	}
296 
297 private:
298 	uint32		fCount;
299 };
300 
301 
302 class AllocatePage
303 	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
304 public:
305 	AllocatePage(page_num_t pageNumber)
306 		:
307 		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
308 		fPageNumber(pageNumber)
309 	{
310 		Initialized();
311 	}
312 
313 	virtual void AddDump(TraceOutput& out)
314 	{
315 		out.Print("page alloc: %#" B_PRIxPHYSADDR, fPageNumber);
316 	}
317 
318 private:
319 	page_num_t	fPageNumber;
320 };
321 
322 
323 class AllocatePageRun
324 	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
325 public:
326 	AllocatePageRun(page_num_t startPage, uint32 length)
327 		:
328 		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
329 		fStartPage(startPage),
330 		fLength(length)
331 	{
332 		Initialized();
333 	}
334 
335 	virtual void AddDump(TraceOutput& out)
336 	{
337 		out.Print("page alloc run: start %#" B_PRIxPHYSADDR " length: %"
338 			B_PRIu32, fStartPage, fLength);
339 	}
340 
341 private:
342 	page_num_t	fStartPage;
343 	uint32		fLength;
344 };
345 
346 
347 class FreePage
348 	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
349 public:
350 	FreePage(page_num_t pageNumber)
351 		:
352 		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
353 		fPageNumber(pageNumber)
354 	{
355 		Initialized();
356 	}
357 
358 	virtual void AddDump(TraceOutput& out)
359 	{
360 		out.Print("page free: %#" B_PRIxPHYSADDR, fPageNumber);
361 	}
362 
363 private:
364 	page_num_t	fPageNumber;
365 };
366 
367 
368 class ScrubbingPages : public AbstractTraceEntry {
369 public:
370 	ScrubbingPages(uint32 count)
371 		:
372 		fCount(count)
373 	{
374 		Initialized();
375 	}
376 
377 	virtual void AddDump(TraceOutput& out)
378 	{
379 		out.Print("page scrubbing: %" B_PRIu32, fCount);
380 	}
381 
382 private:
383 	uint32		fCount;
384 };
385 
386 
387 class ScrubbedPages : public AbstractTraceEntry {
388 public:
389 	ScrubbedPages(uint32 count)
390 		:
391 		fCount(count)
392 	{
393 		Initialized();
394 	}
395 
396 	virtual void AddDump(TraceOutput& out)
397 	{
398 		out.Print("page scrubbed:  %" B_PRIu32, fCount);
399 	}
400 
401 private:
402 	uint32		fCount;
403 };
404 
405 
406 class StolenPage : public AbstractTraceEntry {
407 public:
408 	StolenPage()
409 	{
410 		Initialized();
411 	}
412 
413 	virtual void AddDump(TraceOutput& out)
414 	{
415 		out.Print("page stolen");
416 	}
417 };
418 
419 }	// namespace PageAllocationTracing
420 
421 #	define TA(x)	new(std::nothrow) PageAllocationTracing::x
422 
423 #else
424 #	define TA(x)
425 #endif	// PAGE_ALLOCATION_TRACING
426 
427 
428 #if PAGE_DAEMON_TRACING
429 
430 namespace PageDaemonTracing {
431 
432 class ActivatePage : public AbstractTraceEntry {
433 	public:
434 		ActivatePage(vm_page* page)
435 			:
436 			fCache(page->cache),
437 			fPage(page)
438 		{
439 			Initialized();
440 		}
441 
442 		virtual void AddDump(TraceOutput& out)
443 		{
444 			out.Print("page activated:   %p, cache: %p", fPage, fCache);
445 		}
446 
447 	private:
448 		VMCache*	fCache;
449 		vm_page*	fPage;
450 };
451 
452 
453 class DeactivatePage : public AbstractTraceEntry {
454 	public:
455 		DeactivatePage(vm_page* page)
456 			:
457 			fCache(page->cache),
458 			fPage(page)
459 		{
460 			Initialized();
461 		}
462 
463 		virtual void AddDump(TraceOutput& out)
464 		{
465 			out.Print("page deactivated: %p, cache: %p", fPage, fCache);
466 		}
467 
468 	private:
469 		VMCache*	fCache;
470 		vm_page*	fPage;
471 };
472 
473 
474 class FreedPageSwap : public AbstractTraceEntry {
475 	public:
476 		FreedPageSwap(vm_page* page)
477 			:
478 			fCache(page->cache),
479 			fPage(page)
480 		{
481 			Initialized();
482 		}
483 
484 		virtual void AddDump(TraceOutput& out)
485 		{
486 			out.Print("page swap freed:  %p, cache: %p", fPage, fCache);
487 		}
488 
489 	private:
490 		VMCache*	fCache;
491 		vm_page*	fPage;
492 };
493 
494 }	// namespace PageDaemonTracing
495 
496 #	define TD(x)	new(std::nothrow) PageDaemonTracing::x
497 
498 #else
499 #	define TD(x)
500 #endif	// PAGE_DAEMON_TRACING
501 
502 
503 #if PAGE_WRITER_TRACING
504 
505 namespace PageWriterTracing {
506 
507 class WritePage : public AbstractTraceEntry {
508 	public:
509 		WritePage(vm_page* page)
510 			:
511 			fCache(page->Cache()),
512 			fPage(page)
513 		{
514 			Initialized();
515 		}
516 
517 		virtual void AddDump(TraceOutput& out)
518 		{
519 			out.Print("page write: %p, cache: %p", fPage, fCache);
520 		}
521 
522 	private:
523 		VMCache*	fCache;
524 		vm_page*	fPage;
525 };
526 
527 }	// namespace PageWriterTracing
528 
529 #	define TPW(x)	new(std::nothrow) PageWriterTracing::x
530 
531 #else
532 #	define TPW(x)
533 #endif	// PAGE_WRITER_TRACING
534 
535 
536 #if PAGE_STATE_TRACING
537 
538 namespace PageStateTracing {
539 
540 class SetPageState : public AbstractTraceEntry {
541 	public:
542 		SetPageState(vm_page* page, uint8 newState)
543 			:
544 			fPage(page),
545 			fOldState(page->State()),
546 			fNewState(newState),
547 			fBusy(page->busy),
548 			fWired(page->WiredCount() > 0),
549 			fMapped(!page->mappings.IsEmpty()),
550 			fAccessed(page->accessed),
551 			fModified(page->modified)
552 		{
553 #if PAGE_STATE_TRACING_STACK_TRACE
554 			fStackTrace = capture_tracing_stack_trace(
555 				PAGE_STATE_TRACING_STACK_TRACE, 0, true);
556 				// Don't capture userland stack trace to avoid potential
557 				// deadlocks.
558 #endif
559 			Initialized();
560 		}
561 
562 #if PAGE_STATE_TRACING_STACK_TRACE
563 		virtual void DumpStackTrace(TraceOutput& out)
564 		{
565 			out.PrintStackTrace(fStackTrace);
566 		}
567 #endif
568 
569 		virtual void AddDump(TraceOutput& out)
570 		{
571 			out.Print("page set state: %p (%c%c%c%c%c): %s -> %s", fPage,
572 				fBusy ? 'b' : '-',
573 				fWired ? 'w' : '-',
574 				fMapped ? 'm' : '-',
575 				fAccessed ? 'a' : '-',
576 				fModified ? 'm' : '-',
577 				page_state_to_string(fOldState),
578 				page_state_to_string(fNewState));
579 		}
580 
581 	private:
582 		vm_page*	fPage;
583 #if PAGE_STATE_TRACING_STACK_TRACE
584 		tracing_stack_trace* fStackTrace;
585 #endif
586 		uint8		fOldState;
587 		uint8		fNewState;
588 		bool		fBusy : 1;
589 		bool		fWired : 1;
590 		bool		fMapped : 1;
591 		bool		fAccessed : 1;
592 		bool		fModified : 1;
593 };
594 
595 }	// namespace PageStateTracing
596 
597 #	define TPS(x)	new(std::nothrow) PageStateTracing::x
598 
599 #else
600 #	define TPS(x)
601 #endif	// PAGE_STATE_TRACING
602 
603 
604 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
605 
606 namespace BKernel {
607 
608 class AllocationTrackingCallback {
609 public:
610 	virtual						~AllocationTrackingCallback();
611 
612 	virtual	bool				ProcessTrackingInfo(
613 									AllocationTrackingInfo* info,
614 									page_num_t pageNumber) = 0;
615 };
616 
617 }
618 
619 using BKernel::AllocationTrackingCallback;
620 
621 
622 class AllocationCollectorCallback : public AllocationTrackingCallback {
623 public:
624 	AllocationCollectorCallback(bool resetInfos)
625 		:
626 		fResetInfos(resetInfos)
627 	{
628 	}
629 
630 	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
631 		page_num_t pageNumber)
632 	{
633 		if (!info->IsInitialized())
634 			return true;
635 
636 		addr_t caller = 0;
637 		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
638 
639 		if (traceEntry != NULL && info->IsTraceEntryValid()) {
640 			caller = tracing_find_caller_in_stack_trace(
641 				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
642 		}
643 
644 		caller_info* callerInfo = get_caller_info(caller);
645 		if (callerInfo == NULL) {
646 			kprintf("out of space for caller infos\n");
647 			return false;
648 		}
649 
650 		callerInfo->count++;
651 
652 		if (fResetInfos)
653 			info->Clear();
654 
655 		return true;
656 	}
657 
658 private:
659 	bool	fResetInfos;
660 };
661 
662 
663 class AllocationInfoPrinterCallback : public AllocationTrackingCallback {
664 public:
665 	AllocationInfoPrinterCallback(bool printStackTrace, page_num_t pageFilter,
666 		team_id teamFilter, thread_id threadFilter)
667 		:
668 		fPrintStackTrace(printStackTrace),
669 		fPageFilter(pageFilter),
670 		fTeamFilter(teamFilter),
671 		fThreadFilter(threadFilter)
672 	{
673 	}
674 
675 	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
676 		page_num_t pageNumber)
677 	{
678 		if (!info->IsInitialized())
679 			return true;
680 
681 		if (fPageFilter != 0 && pageNumber != fPageFilter)
682 			return true;
683 
684 		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
685 		if (traceEntry != NULL && !info->IsTraceEntryValid())
686 			traceEntry = NULL;
687 
688 		if (traceEntry != NULL) {
689 			if (fTeamFilter != -1 && traceEntry->TeamID() != fTeamFilter)
690 				return true;
691 			if (fThreadFilter != -1 && traceEntry->ThreadID() != fThreadFilter)
692 				return true;
693 		} else {
694 			// we need the info if we have filters set
695 			if (fTeamFilter != -1 || fThreadFilter != -1)
696 				return true;
697 		}
698 
699 		kprintf("page number %#" B_PRIxPHYSADDR, pageNumber);
700 
701 		if (traceEntry != NULL) {
702 			kprintf(", team: %" B_PRId32 ", thread %" B_PRId32
703 				", time %" B_PRId64 "\n", traceEntry->TeamID(),
704 				traceEntry->ThreadID(), traceEntry->Time());
705 
706 			if (fPrintStackTrace)
707 				tracing_print_stack_trace(traceEntry->StackTrace());
708 		} else
709 			kprintf("\n");
710 
711 		return true;
712 	}
713 
714 private:
715 	bool		fPrintStackTrace;
716 	page_num_t	fPageFilter;
717 	team_id		fTeamFilter;
718 	thread_id	fThreadFilter;
719 };
720 
721 
722 class AllocationDetailPrinterCallback : public AllocationTrackingCallback {
723 public:
724 	AllocationDetailPrinterCallback(addr_t caller)
725 		:
726 		fCaller(caller)
727 	{
728 	}
729 
730 	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
731 		page_num_t pageNumber)
732 	{
733 		if (!info->IsInitialized())
734 			return true;
735 
736 		addr_t caller = 0;
737 		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
738 		if (traceEntry != NULL && !info->IsTraceEntryValid())
739 			traceEntry = NULL;
740 
741 		if (traceEntry != NULL) {
742 			caller = tracing_find_caller_in_stack_trace(
743 				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
744 		}
745 
746 		if (caller != fCaller)
747 			return true;
748 
749 		kprintf("page %#" B_PRIxPHYSADDR "\n", pageNumber);
750 		if (traceEntry != NULL)
751 			tracing_print_stack_trace(traceEntry->StackTrace());
752 
753 		return true;
754 	}
755 
756 private:
757 	addr_t	fCaller;
758 };
759 
760 #endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
761 
762 
763 static void
764 list_page(vm_page* page)
765 {
766 	kprintf("0x%08" B_PRIxADDR " ",
767 		(addr_t)(page->physical_page_number * B_PAGE_SIZE));
768 	switch (page->State()) {
769 		case PAGE_STATE_ACTIVE:   kprintf("A"); break;
770 		case PAGE_STATE_INACTIVE: kprintf("I"); break;
771 		case PAGE_STATE_MODIFIED: kprintf("M"); break;
772 		case PAGE_STATE_CACHED:   kprintf("C"); break;
773 		case PAGE_STATE_FREE:     kprintf("F"); break;
774 		case PAGE_STATE_CLEAR:    kprintf("L"); break;
775 		case PAGE_STATE_WIRED:    kprintf("W"); break;
776 		case PAGE_STATE_UNUSED:   kprintf("-"); break;
777 	}
778 	kprintf(" ");
779 	if (page->busy)         kprintf("B"); else kprintf("-");
780 	if (page->busy_writing) kprintf("W"); else kprintf("-");
781 	if (page->accessed)     kprintf("A"); else kprintf("-");
782 	if (page->modified)     kprintf("M"); else kprintf("-");
783 	kprintf("-");
784 
785 	kprintf(" usage:%3u", page->usage_count);
786 	kprintf(" wired:%5u", page->WiredCount());
787 
788 	bool first = true;
789 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
790 	vm_page_mapping* mapping;
791 	while ((mapping = iterator.Next()) != NULL) {
792 		if (first) {
793 			kprintf(": ");
794 			first = false;
795 		} else
796 			kprintf(", ");
797 
798 		kprintf("%" B_PRId32 " (%s)", mapping->area->id, mapping->area->name);
799 		mapping = mapping->page_link.next;
800 	}
801 }
802 
803 
804 static int
805 dump_page_list(int argc, char **argv)
806 {
807 	kprintf("page table:\n");
808 	for (page_num_t i = 0; i < sNumPages; i++) {
809 		if (sPages[i].State() != PAGE_STATE_UNUSED) {
810 			list_page(&sPages[i]);
811 			kprintf("\n");
812 		}
813 	}
814 	kprintf("end of page table\n");
815 
816 	return 0;
817 }
818 
819 
820 static int
821 find_page(int argc, char **argv)
822 {
823 	struct vm_page *page;
824 	addr_t address;
825 	int32 index = 1;
826 	int i;
827 
828 	struct {
829 		const char*	name;
830 		VMPageQueue*	queue;
831 	} pageQueueInfos[] = {
832 		{ "free",		&sFreePageQueue },
833 		{ "clear",		&sClearPageQueue },
834 		{ "modified",	&sModifiedPageQueue },
835 		{ "active",		&sActivePageQueue },
836 		{ "inactive",	&sInactivePageQueue },
837 		{ "cached",		&sCachedPageQueue },
838 		{ NULL, NULL }
839 	};
840 
841 	if (argc < 2
842 		|| strlen(argv[index]) <= 2
843 		|| argv[index][0] != '0'
844 		|| argv[index][1] != 'x') {
845 		kprintf("usage: find_page <address>\n");
846 		return 0;
847 	}
848 
849 	address = strtoul(argv[index], NULL, 0);
850 	page = (vm_page*)address;
851 
852 	for (i = 0; pageQueueInfos[i].name; i++) {
853 		VMPageQueue::Iterator it = pageQueueInfos[i].queue->GetIterator();
854 		while (vm_page* p = it.Next()) {
855 			if (p == page) {
856 				kprintf("found page %p in queue %p (%s)\n", page,
857 					pageQueueInfos[i].queue, pageQueueInfos[i].name);
858 				return 0;
859 			}
860 		}
861 	}
862 
863 	kprintf("page %p isn't in any queue\n", page);
864 
865 	return 0;
866 }
867 
868 
869 const char *
870 page_state_to_string(int state)
871 {
872 	switch(state) {
873 		case PAGE_STATE_ACTIVE:
874 			return "active";
875 		case PAGE_STATE_INACTIVE:
876 			return "inactive";
877 		case PAGE_STATE_MODIFIED:
878 			return "modified";
879 		case PAGE_STATE_CACHED:
880 			return "cached";
881 		case PAGE_STATE_FREE:
882 			return "free";
883 		case PAGE_STATE_CLEAR:
884 			return "clear";
885 		case PAGE_STATE_WIRED:
886 			return "wired";
887 		case PAGE_STATE_UNUSED:
888 			return "unused";
889 		default:
890 			return "unknown";
891 	}
892 }
893 
894 
895 static int
896 dump_page_long(int argc, char **argv)
897 {
898 	bool addressIsPointer = true;
899 	bool physical = false;
900 	bool searchMappings = false;
901 	int32 index = 1;
902 
903 	while (index < argc) {
904 		if (argv[index][0] != '-')
905 			break;
906 
907 		if (!strcmp(argv[index], "-p")) {
908 			addressIsPointer = false;
909 			physical = true;
910 		} else if (!strcmp(argv[index], "-v")) {
911 			addressIsPointer = false;
912 		} else if (!strcmp(argv[index], "-m")) {
913 			searchMappings = true;
914 		} else {
915 			print_debugger_command_usage(argv[0]);
916 			return 0;
917 		}
918 
919 		index++;
920 	}
921 
922 	if (index + 1 != argc) {
923 		print_debugger_command_usage(argv[0]);
924 		return 0;
925 	}
926 
927 	uint64 value;
928 	if (!evaluate_debug_expression(argv[index], &value, false))
929 		return 0;
930 
931 	uint64 pageAddress = value;
932 	struct vm_page* page;
933 
934 	if (addressIsPointer) {
935 		page = (struct vm_page *)(addr_t)pageAddress;
936 	} else {
937 		if (!physical) {
938 			VMAddressSpace *addressSpace = VMAddressSpace::Kernel();
939 
940 			if (debug_get_debugged_thread()->team->address_space != NULL)
941 				addressSpace = debug_get_debugged_thread()->team->address_space;
942 
943 			uint32 flags = 0;
944 			phys_addr_t physicalAddress;
945 			if (addressSpace->TranslationMap()->QueryInterrupt(pageAddress,
946 					&physicalAddress, &flags) != B_OK
947 				|| (flags & PAGE_PRESENT) == 0) {
948 				kprintf("Virtual address not mapped to a physical page in this "
949 					"address space.\n");
950 				return 0;
951 			}
952 			pageAddress = physicalAddress;
953 		}
954 
955 		page = vm_lookup_page(pageAddress / B_PAGE_SIZE);
956 	}
957 
958 	const page_num_t expected = sPhysicalPageOffset + (page - sPages);
959 
960 	kprintf("PAGE: %p\n", page);
961 	kprintf("queue_next,prev: %p, %p\n", page->queue_link.next,
962 		page->queue_link.previous);
963 	kprintf("physical_number: %#" B_PRIxPHYSADDR "\n", page->physical_page_number);
964 	if (page->physical_page_number != expected)
965 		kprintf("\t(expected %#" B_PRIxPHYSADDR ")!\n", expected);
966 	kprintf("cache:           %p\n", page->Cache());
967 	kprintf("cache_offset:    %" B_PRIuPHYSADDR "\n", page->cache_offset);
968 	kprintf("cache_next:      %p\n", page->cache_next);
969 	kprintf("state:           %s\n", page_state_to_string(page->State()));
970 	kprintf("wired_count:     %d\n", page->WiredCount());
971 	kprintf("usage_count:     %d\n", page->usage_count);
972 	kprintf("busy:            %d\n", page->busy);
973 	kprintf("busy_writing:    %d\n", page->busy_writing);
974 	kprintf("accessed:        %d\n", page->accessed);
975 	kprintf("modified:        %d\n", page->modified);
976 	#if DEBUG_PAGE_QUEUE
977 		kprintf("queue:           %p\n", page->queue);
978 	#endif
979 	#if DEBUG_PAGE_ACCESS
980 		kprintf("accessor:        %" B_PRId32 "\n", page->accessing_thread);
981 	#endif
982 	kprintf("area mappings:\n");
983 
984 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
985 	vm_page_mapping *mapping;
986 	while ((mapping = iterator.Next()) != NULL) {
987 		kprintf("  %p (%" B_PRId32 ")\n", mapping->area, mapping->area->id);
988 		mapping = mapping->page_link.next;
989 	}
990 
991 	if (searchMappings) {
992 		kprintf("all mappings:\n");
993 		VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
994 		while (addressSpace != NULL) {
995 			size_t pageCount = addressSpace->Size() / B_PAGE_SIZE;
996 			for (addr_t address = addressSpace->Base(); pageCount != 0;
997 					address += B_PAGE_SIZE, pageCount--) {
998 				phys_addr_t physicalAddress;
999 				uint32 flags = 0;
1000 				if (addressSpace->TranslationMap()->QueryInterrupt(address,
1001 						&physicalAddress, &flags) == B_OK
1002 					&& (flags & PAGE_PRESENT) != 0
1003 					&& physicalAddress / B_PAGE_SIZE
1004 						== page->physical_page_number) {
1005 					VMArea* area = addressSpace->LookupArea(address);
1006 					kprintf("  aspace %" B_PRId32 ", area %" B_PRId32 ": %#"
1007 						B_PRIxADDR " (%c%c%s%s)\n", addressSpace->ID(),
1008 						area != NULL ? area->id : -1, address,
1009 						(flags & B_KERNEL_READ_AREA) != 0 ? 'r' : '-',
1010 						(flags & B_KERNEL_WRITE_AREA) != 0 ? 'w' : '-',
1011 						(flags & PAGE_MODIFIED) != 0 ? " modified" : "",
1012 						(flags & PAGE_ACCESSED) != 0 ? " accessed" : "");
1013 				}
1014 			}
1015 			addressSpace = VMAddressSpace::DebugNext(addressSpace);
1016 		}
1017 	}
1018 
1019 	set_debug_variable("_cache", (addr_t)page->Cache());
1020 	#if DEBUG_PAGE_ACCESS
1021 		set_debug_variable("_accessor", page->accessing_thread);
1022 	#endif
1023 
1024 	return 0;
1025 }
1026 
1027 
1028 static int
1029 dump_page_queue(int argc, char **argv)
1030 {
1031 	struct VMPageQueue *queue;
1032 
1033 	if (argc < 2) {
1034 		kprintf("usage: page_queue <address/name> [list]\n");
1035 		return 0;
1036 	}
1037 
1038 	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
1039 		queue = (VMPageQueue*)strtoul(argv[1], NULL, 16);
1040 	else if (!strcmp(argv[1], "free"))
1041 		queue = &sFreePageQueue;
1042 	else if (!strcmp(argv[1], "clear"))
1043 		queue = &sClearPageQueue;
1044 	else if (!strcmp(argv[1], "modified"))
1045 		queue = &sModifiedPageQueue;
1046 	else if (!strcmp(argv[1], "active"))
1047 		queue = &sActivePageQueue;
1048 	else if (!strcmp(argv[1], "inactive"))
1049 		queue = &sInactivePageQueue;
1050 	else if (!strcmp(argv[1], "cached"))
1051 		queue = &sCachedPageQueue;
1052 	else {
1053 		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
1054 		return 0;
1055 	}
1056 
1057 	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %"
1058 		B_PRIuPHYSADDR "\n", queue, queue->Head(), queue->Tail(),
1059 		queue->Count());
1060 
1061 	if (argc == 3) {
1062 		struct vm_page *page = queue->Head();
1063 
1064 		kprintf("page        cache       type       state  wired  usage\n");
1065 		for (page_num_t i = 0; page; i++, page = queue->Next(page)) {
1066 			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->Cache(),
1067 				vm_cache_type_to_string(page->Cache()->type),
1068 				page_state_to_string(page->State()),
1069 				page->WiredCount(), page->usage_count);
1070 		}
1071 	}
1072 	return 0;
1073 }
1074 
1075 
1076 static int
1077 dump_page_stats(int argc, char **argv)
1078 {
1079 	page_num_t swappableModified = 0;
1080 	page_num_t swappableModifiedInactive = 0;
1081 
1082 	size_t counter[8];
1083 	size_t busyCounter[8];
1084 	memset(counter, 0, sizeof(counter));
1085 	memset(busyCounter, 0, sizeof(busyCounter));
1086 
1087 	struct page_run {
1088 		page_num_t	start;
1089 		page_num_t	end;
1090 
1091 		page_num_t Length() const	{ return end - start; }
1092 	};
1093 
1094 	page_run currentFreeRun = { 0, 0 };
1095 	page_run currentCachedRun = { 0, 0 };
1096 	page_run longestFreeRun = { 0, 0 };
1097 	page_run longestCachedRun = { 0, 0 };
1098 
1099 	for (page_num_t i = 0; i < sNumPages; i++) {
1100 		if (sPages[i].State() > 7) {
1101 			panic("page %" B_PRIuPHYSADDR " at %p has invalid state!\n", i,
1102 				&sPages[i]);
1103 		}
1104 
1105 		uint32 pageState = sPages[i].State();
1106 
1107 		counter[pageState]++;
1108 		if (sPages[i].busy)
1109 			busyCounter[pageState]++;
1110 
1111 		if (pageState == PAGE_STATE_MODIFIED
1112 			&& sPages[i].Cache() != NULL
1113 			&& sPages[i].Cache()->temporary && sPages[i].WiredCount() == 0) {
1114 			swappableModified++;
1115 			if (sPages[i].usage_count == 0)
1116 				swappableModifiedInactive++;
1117 		}
1118 
1119 		// track free and cached pages runs
1120 		if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
1121 			currentFreeRun.end = i + 1;
1122 			currentCachedRun.end = i + 1;
1123 		} else {
1124 			if (currentFreeRun.Length() > longestFreeRun.Length())
1125 				longestFreeRun = currentFreeRun;
1126 			currentFreeRun.start = currentFreeRun.end = i + 1;
1127 
1128 			if (pageState == PAGE_STATE_CACHED) {
1129 				currentCachedRun.end = i + 1;
1130 			} else {
1131 				if (currentCachedRun.Length() > longestCachedRun.Length())
1132 					longestCachedRun = currentCachedRun;
1133 				currentCachedRun.start = currentCachedRun.end = i + 1;
1134 			}
1135 		}
1136 	}
1137 
1138 	kprintf("page stats:\n");
1139 	kprintf("total: %" B_PRIuPHYSADDR "\n", sNumPages);
1140 
1141 	kprintf("active: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1142 		counter[PAGE_STATE_ACTIVE], busyCounter[PAGE_STATE_ACTIVE]);
1143 	kprintf("inactive: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1144 		counter[PAGE_STATE_INACTIVE], busyCounter[PAGE_STATE_INACTIVE]);
1145 	kprintf("cached: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1146 		counter[PAGE_STATE_CACHED], busyCounter[PAGE_STATE_CACHED]);
1147 	kprintf("unused: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1148 		counter[PAGE_STATE_UNUSED], busyCounter[PAGE_STATE_UNUSED]);
1149 	kprintf("wired: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1150 		counter[PAGE_STATE_WIRED], busyCounter[PAGE_STATE_WIRED]);
1151 	kprintf("modified: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1152 		counter[PAGE_STATE_MODIFIED], busyCounter[PAGE_STATE_MODIFIED]);
1153 	kprintf("free: %" B_PRIuSIZE "\n", counter[PAGE_STATE_FREE]);
1154 	kprintf("clear: %" B_PRIuSIZE "\n", counter[PAGE_STATE_CLEAR]);
1155 
1156 	kprintf("unreserved free pages: %" B_PRId32 "\n", sUnreservedFreePages);
1157 	kprintf("unsatisfied page reservations: %" B_PRId32 "\n",
1158 		sUnsatisfiedPageReservations);
1159 	kprintf("mapped pages: %" B_PRId32 "\n", gMappedPagesCount);
1160 	kprintf("longest free pages run: %" B_PRIuPHYSADDR " pages (at %"
1161 		B_PRIuPHYSADDR ")\n", longestFreeRun.Length(),
1162 		sPages[longestFreeRun.start].physical_page_number);
1163 	kprintf("longest free/cached pages run: %" B_PRIuPHYSADDR " pages (at %"
1164 		B_PRIuPHYSADDR ")\n", longestCachedRun.Length(),
1165 		sPages[longestCachedRun.start].physical_page_number);
1166 
1167 	kprintf("waiting threads:\n");
1168 	for (PageReservationWaiterList::Iterator it
1169 			= sPageReservationWaiters.GetIterator();
1170 		PageReservationWaiter* waiter = it.Next();) {
1171 		kprintf("  %6" B_PRId32 ": missing: %6" B_PRIu32
1172 			", don't touch: %6" B_PRIu32 "\n", waiter->thread->id,
1173 			waiter->missing, waiter->dontTouch);
1174 	}
1175 
1176 	kprintf("\nfree queue: %p, count = %" B_PRIuPHYSADDR "\n", &sFreePageQueue,
1177 		sFreePageQueue.Count());
1178 	kprintf("clear queue: %p, count = %" B_PRIuPHYSADDR "\n", &sClearPageQueue,
1179 		sClearPageQueue.Count());
1180 	kprintf("modified queue: %p, count = %" B_PRIuPHYSADDR " (%" B_PRId32
1181 		" temporary, %" B_PRIuPHYSADDR " swappable, " "inactive: %"
1182 		B_PRIuPHYSADDR ")\n", &sModifiedPageQueue, sModifiedPageQueue.Count(),
1183 		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
1184 	kprintf("active queue: %p, count = %" B_PRIuPHYSADDR "\n",
1185 		&sActivePageQueue, sActivePageQueue.Count());
1186 	kprintf("inactive queue: %p, count = %" B_PRIuPHYSADDR "\n",
1187 		&sInactivePageQueue, sInactivePageQueue.Count());
1188 	kprintf("cached queue: %p, count = %" B_PRIuPHYSADDR "\n",
1189 		&sCachedPageQueue, sCachedPageQueue.Count());
1190 	return 0;
1191 }
1192 
1193 
1194 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1195 
1196 static caller_info*
1197 get_caller_info(addr_t caller)
1198 {
1199 	// find the caller info
1200 	for (int32 i = 0; i < sCallerInfoCount; i++) {
1201 		if (caller == sCallerInfoTable[i].caller)
1202 			return &sCallerInfoTable[i];
1203 	}
1204 
1205 	// not found, add a new entry, if there are free slots
1206 	if (sCallerInfoCount >= kCallerInfoTableSize)
1207 		return NULL;
1208 
1209 	caller_info* info = &sCallerInfoTable[sCallerInfoCount++];
1210 	info->caller = caller;
1211 	info->count = 0;
1212 
1213 	return info;
1214 }
1215 
1216 
1217 static int
1218 caller_info_compare_count(const void* _a, const void* _b)
1219 {
1220 	const caller_info* a = (const caller_info*)_a;
1221 	const caller_info* b = (const caller_info*)_b;
1222 	return (int)(b->count - a->count);
1223 }
1224 
1225 
1226 static int
1227 dump_page_allocations_per_caller(int argc, char** argv)
1228 {
1229 	bool resetAllocationInfos = false;
1230 	bool printDetails = false;
1231 	addr_t caller = 0;
1232 
1233 	for (int32 i = 1; i < argc; i++) {
1234 		if (strcmp(argv[i], "-d") == 0) {
1235 			uint64 callerAddress;
1236 			if (++i >= argc
1237 				|| !evaluate_debug_expression(argv[i], &callerAddress, true)) {
1238 				print_debugger_command_usage(argv[0]);
1239 				return 0;
1240 			}
1241 
1242 			caller = callerAddress;
1243 			printDetails = true;
1244 		} else if (strcmp(argv[i], "-r") == 0) {
1245 			resetAllocationInfos = true;
1246 		} else {
1247 			print_debugger_command_usage(argv[0]);
1248 			return 0;
1249 		}
1250 	}
1251 
1252 	sCallerInfoCount = 0;
1253 
1254 	AllocationCollectorCallback collectorCallback(resetAllocationInfos);
1255 	AllocationDetailPrinterCallback detailsCallback(caller);
1256 	AllocationTrackingCallback& callback = printDetails
1257 		? (AllocationTrackingCallback&)detailsCallback
1258 		: (AllocationTrackingCallback&)collectorCallback;
1259 
1260 	for (page_num_t i = 0; i < sNumPages; i++)
1261 		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1262 
1263 	if (printDetails)
1264 		return 0;
1265 
1266 	// sort the array
1267 	qsort(sCallerInfoTable, sCallerInfoCount, sizeof(caller_info),
1268 		&caller_info_compare_count);
1269 
1270 	kprintf("%" B_PRId32 " different callers\n\n", sCallerInfoCount);
1271 
1272 	size_t totalAllocationCount = 0;
1273 
1274 	kprintf("     count      caller\n");
1275 	kprintf("----------------------------------\n");
1276 	for (int32 i = 0; i < sCallerInfoCount; i++) {
1277 		caller_info& info = sCallerInfoTable[i];
1278 		kprintf("%10" B_PRIuSIZE "  %p", info.count, (void*)info.caller);
1279 
1280 		const char* symbol;
1281 		const char* imageName;
1282 		bool exactMatch;
1283 		addr_t baseAddress;
1284 
1285 		if (elf_debug_lookup_symbol_address(info.caller, &baseAddress, &symbol,
1286 				&imageName, &exactMatch) == B_OK) {
1287 			kprintf("  %s + %#" B_PRIxADDR " (%s)%s\n", symbol,
1288 				info.caller - baseAddress, imageName,
1289 				exactMatch ? "" : " (nearest)");
1290 		} else
1291 			kprintf("\n");
1292 
1293 		totalAllocationCount += info.count;
1294 	}
1295 
1296 	kprintf("\ntotal page allocations: %" B_PRIuSIZE "\n",
1297 		totalAllocationCount);
1298 
1299 	return 0;
1300 }
1301 
1302 
1303 static int
1304 dump_page_allocation_infos(int argc, char** argv)
1305 {
1306 	page_num_t pageFilter = 0;
1307 	team_id teamFilter = -1;
1308 	thread_id threadFilter = -1;
1309 	bool printStackTraces = false;
1310 
1311 	for (int32 i = 1; i < argc; i++) {
1312 		if (strcmp(argv[i], "--stacktrace") == 0)
1313 			printStackTraces = true;
1314 		else if (strcmp(argv[i], "-p") == 0) {
1315 			uint64 pageNumber;
1316 			if (++i >= argc
1317 				|| !evaluate_debug_expression(argv[i], &pageNumber, true)) {
1318 				print_debugger_command_usage(argv[0]);
1319 				return 0;
1320 			}
1321 
1322 			pageFilter = pageNumber;
1323 		} else if (strcmp(argv[i], "--team") == 0) {
1324 			uint64 team;
1325 			if (++i >= argc
1326 				|| !evaluate_debug_expression(argv[i], &team, true)) {
1327 				print_debugger_command_usage(argv[0]);
1328 				return 0;
1329 			}
1330 
1331 			teamFilter = team;
1332 		} else if (strcmp(argv[i], "--thread") == 0) {
1333 			uint64 thread;
1334 			if (++i >= argc
1335 				|| !evaluate_debug_expression(argv[i], &thread, true)) {
1336 				print_debugger_command_usage(argv[0]);
1337 				return 0;
1338 			}
1339 
1340 			threadFilter = thread;
1341 		} else {
1342 			print_debugger_command_usage(argv[0]);
1343 			return 0;
1344 		}
1345 	}
1346 
1347 	AllocationInfoPrinterCallback callback(printStackTraces, pageFilter,
1348 		teamFilter, threadFilter);
1349 
1350 	for (page_num_t i = 0; i < sNumPages; i++)
1351 		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1352 
1353 	return 0;
1354 }
1355 
1356 #endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1357 
1358 
1359 #ifdef TRACK_PAGE_USAGE_STATS
1360 
1361 static void
1362 track_page_usage(vm_page* page)
1363 {
1364 	if (page->WiredCount() == 0) {
1365 		sNextPageUsage[(int32)page->usage_count + 128]++;
1366 		sNextPageUsagePageCount++;
1367 	}
1368 }
1369 
1370 
1371 static void
1372 update_page_usage_stats()
1373 {
1374 	std::swap(sPageUsage, sNextPageUsage);
1375 	sPageUsagePageCount = sNextPageUsagePageCount;
1376 
1377 	memset(sNextPageUsage, 0, sizeof(page_num_t) * 256);
1378 	sNextPageUsagePageCount = 0;
1379 
1380 	// compute average
1381 	if (sPageUsagePageCount > 0) {
1382 		int64 sum = 0;
1383 		for (int32 i = 0; i < 256; i++)
1384 			sum += (int64)sPageUsage[i] * (i - 128);
1385 
1386 		TRACE_DAEMON("average page usage: %f (%lu pages)\n",
1387 			(float)sum / sPageUsagePageCount, sPageUsagePageCount);
1388 	}
1389 }
1390 
1391 
1392 static int
1393 dump_page_usage_stats(int argc, char** argv)
1394 {
1395 	kprintf("distribution of page usage counts (%" B_PRIuPHYSADDR " pages):",
1396 		sPageUsagePageCount);
1397 
1398 	int64 sum = 0;
1399 	for (int32 i = 0; i < 256; i++) {
1400 		if (i % 8 == 0)
1401 			kprintf("\n%4" B_PRId32 ":", i - 128);
1402 
1403 		int64 count = sPageUsage[i];
1404 		sum += count * (i - 128);
1405 
1406 		kprintf("  %9" B_PRId64, count);
1407 	}
1408 
1409 	kprintf("\n\n");
1410 
1411 	kprintf("average usage count: %f\n",
1412 		sPageUsagePageCount > 0 ? (float)sum / sPageUsagePageCount : 0);
1413 
1414 	return 0;
1415 }
1416 
1417 #endif	// TRACK_PAGE_USAGE_STATS
1418 
1419 
1420 // #pragma mark - vm_page
1421 
1422 
1423 inline void
1424 vm_page::InitState(uint8 newState)
1425 {
1426 	state = newState;
1427 }
1428 
1429 
1430 inline void
1431 vm_page::SetState(uint8 newState)
1432 {
1433 	TPS(SetPageState(this, newState));
1434 
1435 	state = newState;
1436 }
1437 
1438 
1439 // #pragma mark -
1440 
1441 
1442 static void
1443 get_page_stats(page_stats& _pageStats)
1444 {
1445 	_pageStats.totalFreePages = sUnreservedFreePages;
1446 	_pageStats.cachedPages = sCachedPageQueue.Count();
1447 	_pageStats.unsatisfiedReservations = sUnsatisfiedPageReservations;
1448 	// TODO: We don't get an actual snapshot here!
1449 }
1450 
1451 
1452 static bool
1453 do_active_paging(const page_stats& pageStats)
1454 {
1455 	return pageStats.totalFreePages + pageStats.cachedPages
1456 		< pageStats.unsatisfiedReservations
1457 			+ (int32)sFreeOrCachedPagesTarget;
1458 }
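// Example with made-up numbers: 300 free + 150 cached pages against 100
// unsatisfied reservations + a target of 400 gives 450 < 500, i.e. the page
// daemon has to actively free pages.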
1459 
1460 
1461 /*!	Reserves as many pages as possible from \c sUnreservedFreePages up to
1462 	\a count. Doesn't touch the last \a dontTouch pages of
1463 	\c sUnreservedFreePages, though.
1464 	\return The number of actually reserved pages.
1465 */
1466 static uint32
1467 reserve_some_pages(uint32 count, uint32 dontTouch)
1468 {
1469 	while (true) {
1470 		int32 freePages = atomic_get(&sUnreservedFreePages);
1471 		if (freePages <= (int32)dontTouch)
1472 			return 0;
1473 
1474 		int32 toReserve = std::min(count, freePages - dontTouch);
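		// atomic_test_and_set() behaves like a compare-and-swap here: it only
		// stores the new value if sUnreservedFreePages still equals freePages
		// and returns the previous value either way.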
1475 		if (atomic_test_and_set(&sUnreservedFreePages,
1476 					freePages - toReserve, freePages)
1477 				== freePages) {
1478 			return toReserve;
1479 		}
1480 
1481 		// the count changed in the meantime -- retry
1482 	}
1483 }
1484 
1485 
1486 static void
1487 wake_up_page_reservation_waiters()
1488 {
1489 	MutexLocker pageDeficitLocker(sPageDeficitLock);
1490 
1491 	// TODO: If this is a low priority thread, we might want to disable
1492 	// interrupts or otherwise ensure that we aren't unscheduled. Otherwise
1493 	// high priority threads might be kept waiting while a medium priority thread
1494 	// prevents us from running.
1495 
1496 	while (PageReservationWaiter* waiter = sPageReservationWaiters.Head()) {
1497 		int32 reserved = reserve_some_pages(waiter->missing,
1498 			waiter->dontTouch);
1499 		if (reserved == 0)
1500 			return;
1501 
1502 		atomic_add(&sUnsatisfiedPageReservations, -reserved);
1503 		waiter->missing -= reserved;
1504 
1505 		if (waiter->missing > 0)
1506 			return;
1507 
1508 		sPageReservationWaiters.Remove(waiter);
1509 
1510 		thread_unblock(waiter->thread, B_OK);
1511 	}
1512 }
1513 
1514 
1515 static inline void
1516 unreserve_pages(uint32 count)
1517 {
1518 	atomic_add(&sUnreservedFreePages, count);
1519 	if (atomic_get(&sUnsatisfiedPageReservations) != 0)
1520 		wake_up_page_reservation_waiters();
1521 }
1522 
1523 
1524 static void
1525 free_page(vm_page* page, bool clear)
1526 {
1527 	DEBUG_PAGE_ACCESS_CHECK(page);
1528 
1529 	PAGE_ASSERT(page, !page->IsMapped());
1530 
1531 	VMPageQueue* fromQueue;
1532 
1533 	switch (page->State()) {
1534 		case PAGE_STATE_ACTIVE:
1535 			fromQueue = &sActivePageQueue;
1536 			break;
1537 		case PAGE_STATE_INACTIVE:
1538 			fromQueue = &sInactivePageQueue;
1539 			break;
1540 		case PAGE_STATE_MODIFIED:
1541 			fromQueue = &sModifiedPageQueue;
1542 			break;
1543 		case PAGE_STATE_CACHED:
1544 			fromQueue = &sCachedPageQueue;
1545 			break;
1546 		case PAGE_STATE_FREE:
1547 		case PAGE_STATE_CLEAR:
1548 			panic("free_page(): page %p already free", page);
1549 			return;
1550 		case PAGE_STATE_WIRED:
1551 		case PAGE_STATE_UNUSED:
1552 			fromQueue = NULL;
1553 			break;
1554 		default:
1555 			panic("free_page(): page %p in invalid state %d",
1556 				page, page->State());
1557 			return;
1558 	}
1559 
1560 	if (page->CacheRef() != NULL)
1561 		panic("to be freed page %p has cache", page);
1562 	if (page->IsMapped())
1563 		panic("to be freed page %p has mappings", page);
1564 
1565 	if (fromQueue != NULL)
1566 		fromQueue->RemoveUnlocked(page);
1567 
1568 	TA(FreePage(page->physical_page_number));
1569 
1570 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1571 	page->allocation_tracking_info.Clear();
1572 #endif
1573 
1574 	ReadLocker locker(sFreePageQueuesLock);
1575 
1576 	DEBUG_PAGE_ACCESS_END(page);
1577 
1578 	if (clear) {
1579 		page->SetState(PAGE_STATE_CLEAR);
1580 		sClearPageQueue.PrependUnlocked(page);
1581 	} else {
1582 		page->SetState(PAGE_STATE_FREE);
1583 		sFreePageQueue.PrependUnlocked(page);
1584 		sFreePageCondition.NotifyAll();
1585 	}
1586 
1587 	locker.Unlock();
1588 }
1589 
1590 
1591 /*!	The caller must make sure that no-one else tries to change the page's state
1592 	while the function is called. If the page has a cache, this can be done by
1593 	locking the cache.
1594 */
1595 static void
1596 set_page_state(vm_page *page, int pageState)
1597 {
1598 	DEBUG_PAGE_ACCESS_CHECK(page);
1599 
1600 	if (pageState == page->State())
1601 		return;
1602 
1603 	VMPageQueue* fromQueue;
1604 
1605 	switch (page->State()) {
1606 		case PAGE_STATE_ACTIVE:
1607 			fromQueue = &sActivePageQueue;
1608 			break;
1609 		case PAGE_STATE_INACTIVE:
1610 			fromQueue = &sInactivePageQueue;
1611 			break;
1612 		case PAGE_STATE_MODIFIED:
1613 			fromQueue = &sModifiedPageQueue;
1614 			break;
1615 		case PAGE_STATE_CACHED:
1616 			fromQueue = &sCachedPageQueue;
1617 			break;
1618 		case PAGE_STATE_FREE:
1619 		case PAGE_STATE_CLEAR:
1620 			panic("set_page_state(): page %p is free/clear", page);
1621 			return;
1622 		case PAGE_STATE_WIRED:
1623 		case PAGE_STATE_UNUSED:
1624 			fromQueue = NULL;
1625 			break;
1626 		default:
1627 			panic("set_page_state(): page %p in invalid state %d",
1628 				page, page->State());
1629 			return;
1630 	}
1631 
1632 	VMPageQueue* toQueue;
1633 
1634 	switch (pageState) {
1635 		case PAGE_STATE_ACTIVE:
1636 			toQueue = &sActivePageQueue;
1637 			break;
1638 		case PAGE_STATE_INACTIVE:
1639 			toQueue = &sInactivePageQueue;
1640 			break;
1641 		case PAGE_STATE_MODIFIED:
1642 			toQueue = &sModifiedPageQueue;
1643 			break;
1644 		case PAGE_STATE_CACHED:
1645 			PAGE_ASSERT(page, !page->IsMapped());
1646 			PAGE_ASSERT(page, !page->modified);
1647 			toQueue = &sCachedPageQueue;
1648 			break;
1649 		case PAGE_STATE_FREE:
1650 		case PAGE_STATE_CLEAR:
1651 			panic("set_page_state(): target state is free/clear");
1652 			return;
1653 		case PAGE_STATE_WIRED:
1654 		case PAGE_STATE_UNUSED:
1655 			toQueue = NULL;
1656 			break;
1657 		default:
1658 			panic("set_page_state(): invalid target state %d", pageState);
1659 			return;
1660 	}
1661 
1662 	VMCache* cache = page->Cache();
1663 	if (cache != NULL && cache->temporary) {
1664 		if (pageState == PAGE_STATE_MODIFIED)
1665 			atomic_add(&sModifiedTemporaryPages, 1);
1666 		else if (page->State() == PAGE_STATE_MODIFIED)
1667 			atomic_add(&sModifiedTemporaryPages, -1);
1668 	}
1669 
1670 	// move the page
1671 	if (toQueue == fromQueue) {
1672 		// Note: Theoretically we are required to lock when changing the page
1673 		// state, even if we don't change the queue. We actually don't have to
1674 		// do this, though, since different page states exist only for the active
1675 		// queue, and active pages have a cache that must be locked at
1676 		// this point. So we rely on the fact that everyone must lock the cache
1677 		// before trying to change/interpret the page state.
1678 		PAGE_ASSERT(page, cache != NULL);
1679 		cache->AssertLocked();
1680 		page->SetState(pageState);
1681 	} else {
1682 		if (fromQueue != NULL)
1683 			fromQueue->RemoveUnlocked(page);
1684 
1685 		page->SetState(pageState);
1686 
1687 		if (toQueue != NULL)
1688 			toQueue->AppendUnlocked(page);
1689 	}
1690 }
1691 
1692 
1693 /*! Moves a previously modified page into a now appropriate queue.
1694 	The page queues must not be locked.
1695 */
1696 static void
1697 move_page_to_appropriate_queue(vm_page *page)
1698 {
1699 	DEBUG_PAGE_ACCESS_CHECK(page);
1700 
1701 	// Note, this logic must be in sync with what the page daemon does.
1702 	int32 state;
1703 	if (page->IsMapped())
1704 		state = PAGE_STATE_ACTIVE;
1705 	else if (page->modified)
1706 		state = PAGE_STATE_MODIFIED;
1707 	else
1708 		state = PAGE_STATE_CACHED;
1709 
1710 // TODO: If free + cached pages are low, we might directly want to free the
1711 // page.
1712 	set_page_state(page, state);
1713 }
1714 
1715 
1716 static void
1717 clear_page(struct vm_page *page)
1718 {
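	// physical_page_number << PAGE_SHIFT is the page's physical byte address,
	// i.e. the same as physical_page_number * B_PAGE_SIZE.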
1719 	vm_memset_physical(page->physical_page_number << PAGE_SHIFT, 0,
1720 		B_PAGE_SIZE);
1721 }
1722 
1723 
1724 static status_t
1725 mark_page_range_in_use(page_num_t startPage, page_num_t length, bool wired)
1726 {
1727 	TRACE(("mark_page_range_in_use: start %#" B_PRIxPHYSADDR ", len %#"
1728 		B_PRIxPHYSADDR "\n", startPage, length));
1729 
1730 	if (sPhysicalPageOffset > startPage) {
1731 		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1732 			"): start page is before free list\n", startPage, length);
1733 		if (sPhysicalPageOffset - startPage >= length)
1734 			return B_OK;
1735 		length -= sPhysicalPageOffset - startPage;
1736 		startPage = sPhysicalPageOffset;
1737 	}
1738 
1739 	startPage -= sPhysicalPageOffset;
1740 
1741 	if (startPage + length > sNumPages) {
1742 		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1743 			"): range would extend past free list\n", startPage, length);
1744 		if (startPage >= sNumPages)
1745 			return B_OK;
1746 		length = sNumPages - startPage;
1747 	}
1748 
1749 	WriteLocker locker(sFreePageQueuesLock);
1750 
1751 	for (page_num_t i = 0; i < length; i++) {
1752 		vm_page *page = &sPages[startPage + i];
1753 		switch (page->State()) {
1754 			case PAGE_STATE_FREE:
1755 			case PAGE_STATE_CLEAR:
1756 			{
1757 // TODO: This violates the page reservation policy, since we remove pages from
1758 // the free/clear queues without having reserved them before. This should happen
1759 // in the early boot process only, though.
1760 				DEBUG_PAGE_ACCESS_START(page);
1761 				VMPageQueue& queue = page->State() == PAGE_STATE_FREE
1762 					? sFreePageQueue : sClearPageQueue;
1763 				queue.Remove(page);
1764 				page->SetState(wired ? PAGE_STATE_WIRED : PAGE_STATE_UNUSED);
1765 				page->busy = false;
1766 				atomic_add(&sUnreservedFreePages, -1);
1767 				DEBUG_PAGE_ACCESS_END(page);
1768 				break;
1769 			}
1770 			case PAGE_STATE_WIRED:
1771 			case PAGE_STATE_UNUSED:
1772 				break;
1773 			case PAGE_STATE_ACTIVE:
1774 			case PAGE_STATE_INACTIVE:
1775 			case PAGE_STATE_MODIFIED:
1776 			case PAGE_STATE_CACHED:
1777 			default:
1778 				// uh
1779 				dprintf("mark_page_range_in_use: page %#" B_PRIxPHYSADDR
1780 					" in non-free state %d!\n", startPage + i, page->State());
1781 				break;
1782 		}
1783 	}
1784 
1785 	return B_OK;
1786 }
1787 
1788 
1789 /*!
1790 	This is a background thread that wakes up when its condition is notified
1791 	and moves some pages from the free queue over to the clear queue.
1792 	Given enough time, it will clear out all pages from the free queue - we
1793 	could probably slow it down after having reached a certain threshold.
1794 */
1795 static int32
1796 page_scrubber(void *unused)
1797 {
1798 	(void)(unused);
1799 
1800 	TRACE(("page_scrubber starting...\n"));
1801 
1802 	ConditionVariableEntry entry;
1803 	for (;;) {
1804 		while (sFreePageQueue.Count() == 0
1805 				|| atomic_get(&sUnreservedFreePages)
1806 					< (int32)sFreePagesTarget) {
1807 			sFreePageCondition.Add(&entry);
1808 			entry.Wait();
1809 		}
1810 
1811 		// Since we temporarily remove pages from the free pages reserve,
1812 		// we must make sure we don't cause a violation of the page
1813 		// reservation guarantee. The following is usually stricter than
1814 		// necessary, because we don't have information on how many of the
1815 		// reserved pages have already been allocated.
1816 		int32 reserved = reserve_some_pages(SCRUB_SIZE,
1817 			kPageReserveForPriority[VM_PRIORITY_USER]);
1818 		if (reserved == 0)
1819 			continue;
1820 
1821 		// get some pages from the free queue, mostly sorted
1822 		ReadLocker locker(sFreePageQueuesLock);
1823 
1824 		vm_page *page[SCRUB_SIZE];
1825 		int32 scrubCount = 0;
1826 		for (int32 i = 0; i < reserved; i++) {
1827 			page[i] = sFreePageQueue.RemoveHeadUnlocked();
1828 			if (page[i] == NULL)
1829 				break;
1830 
1831 			DEBUG_PAGE_ACCESS_START(page[i]);
1832 
1833 			page[i]->SetState(PAGE_STATE_ACTIVE);
1834 			page[i]->busy = true;
1835 			scrubCount++;
1836 		}
1837 
1838 		locker.Unlock();
1839 
1840 		if (scrubCount == 0) {
1841 			unreserve_pages(reserved);
1842 			continue;
1843 		}
1844 
1845 		TA(ScrubbingPages(scrubCount));
1846 
1847 		// clear them
1848 		for (int32 i = 0; i < scrubCount; i++)
1849 			clear_page(page[i]);
1850 
1851 		locker.Lock();
1852 
1853 		// and put them into the clear queue
1854 		// process the array reversed when prepending to preserve sequential order
1855 		for (int32 i = scrubCount - 1; i >= 0; i--) {
1856 			page[i]->SetState(PAGE_STATE_CLEAR);
1857 			page[i]->busy = false;
1858 			DEBUG_PAGE_ACCESS_END(page[i]);
1859 			sClearPageQueue.PrependUnlocked(page[i]);
1860 		}
1861 
1862 		locker.Unlock();
1863 
1864 		unreserve_pages(reserved);
1865 
1866 		TA(ScrubbedPages(scrubCount));
1867 
1868 		// wait at least 100ms between runs
1869 		snooze(100 * 1000);
1870 	}
1871 
1872 	return 0;
1873 }
1874 
1875 
1876 static void
1877 init_page_marker(vm_page &marker)
1878 {
1879 	marker.SetCacheRef(NULL);
1880 	marker.InitState(PAGE_STATE_UNUSED);
1881 	marker.busy = true;
1882 #if DEBUG_PAGE_QUEUE
1883 	marker.queue = NULL;
1884 #endif
1885 #if DEBUG_PAGE_ACCESS
1886 	marker.accessing_thread = thread_get_current_thread_id();
1887 #endif
1888 }
1889 
1890 
1891 static void
1892 remove_page_marker(struct vm_page &marker)
1893 {
1894 	DEBUG_PAGE_ACCESS_CHECK(&marker);
1895 
1896 	if (marker.State() < PAGE_STATE_FIRST_UNQUEUED)
1897 		sPageQueues[marker.State()].RemoveUnlocked(&marker);
1898 
1899 	marker.SetState(PAGE_STATE_UNUSED);
1900 }
1901 
1902 
1903 static vm_page*
1904 next_modified_page(page_num_t& maxPagesToSee)
1905 {
1906 	InterruptsSpinLocker locker(sModifiedPageQueue.GetLock());
1907 
1908 	while (maxPagesToSee > 0) {
1909 		vm_page* page = sModifiedPageQueue.Head();
1910 		if (page == NULL)
1911 			return NULL;
1912 
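		// Rotate the page to the back of the queue, so successive calls walk
		// through the modified queue instead of repeatedly looking at the same
		// (possibly busy) head page.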
1913 		sModifiedPageQueue.Requeue(page, true);
1914 
1915 		maxPagesToSee--;
1916 
1917 		if (!page->busy)
1918 			return page;
1919 	}
1920 
1921 	return NULL;
1922 }
1923 
1924 
1925 // #pragma mark -
1926 
1927 
1928 class PageWriteTransfer;
1929 class PageWriteWrapper;
1930 
1931 
1932 class PageWriterRun {
1933 public:
1934 	status_t Init(uint32 maxPages);
1935 
1936 	void PrepareNextRun();
1937 	void AddPage(vm_page* page);
1938 	uint32 Go();
1939 
1940 	void PageWritten(PageWriteTransfer* transfer, status_t status,
1941 		bool partialTransfer, size_t bytesTransferred);
1942 
1943 private:
1944 	uint32				fMaxPages;
1945 	uint32				fWrapperCount;
1946 	uint32				fTransferCount;
1947 	int32				fPendingTransfers;
1948 	PageWriteWrapper*	fWrappers;
1949 	PageWriteTransfer*	fTransfers;
1950 	ConditionVariable	fAllFinishedCondition;
1951 };
1952 
1953 
1954 class PageWriteTransfer : public AsyncIOCallback {
1955 public:
1956 	void SetTo(PageWriterRun* run, vm_page* page, int32 maxPages);
1957 	bool AddPage(vm_page* page);
1958 
1959 	status_t Schedule(uint32 flags);
1960 
1961 	void SetStatus(status_t status, size_t transferred);
1962 
1963 	status_t Status() const	{ return fStatus; }
1964 	struct VMCache* Cache() const { return fCache; }
1965 	uint32 PageCount() const { return fPageCount; }
1966 
1967 	virtual void IOFinished(status_t status, bool partialTransfer,
1968 		generic_size_t bytesTransferred);
1969 private:
1970 	PageWriterRun*		fRun;
1971 	struct VMCache*		fCache;
1972 	off_t				fOffset;
1973 	uint32				fPageCount;
1974 	int32				fMaxPages;
1975 	status_t			fStatus;
1976 	uint32				fVecCount;
1977 	generic_io_vec		fVecs[32]; // TODO: make dynamic/configurable
1978 };
1979 
1980 
1981 class PageWriteWrapper {
1982 public:
1983 	PageWriteWrapper();
1984 	~PageWriteWrapper();
1985 	void SetTo(vm_page* page);
1986 	bool Done(status_t result);
1987 
1988 private:
1989 	vm_page*			fPage;
1990 	struct VMCache*		fCache;
1991 	bool				fIsActive;
1992 };
1993 
1994 
1995 PageWriteWrapper::PageWriteWrapper()
1996 	:
1997 	fIsActive(false)
1998 {
1999 }
2000 
2001 
2002 PageWriteWrapper::~PageWriteWrapper()
2003 {
2004 	if (fIsActive)
2005 		panic("page write wrapper going out of scope but isn't completed");
2006 }
2007 
2008 
2009 /*!	The page's cache must be locked.
2010 */
2011 void
2012 PageWriteWrapper::SetTo(vm_page* page)
2013 {
2014 	DEBUG_PAGE_ACCESS_CHECK(page);
2015 
2016 	if (page->busy)
2017 		panic("setting page write wrapper to busy page");
2018 
2019 	if (fIsActive)
2020 		panic("re-setting page write wrapper that isn't completed");
2021 
2022 	fPage = page;
2023 	fCache = page->Cache();
2024 	fIsActive = true;
2025 
2026 	fPage->busy = true;
2027 	fPage->busy_writing = true;
2028 
2029 	// We have a modified page -- however, while we're writing it back,
2030 	// the page might still be mapped. In order not to lose any changes to the
2031 	// page, we mark it clean before actually writing it back; if
2032 	// writing the page fails for some reason, we'll just keep it in the
2033 	// modified page list, but that should happen only rarely.
2034 
2035 	// If the page is changed after we cleared the dirty flag, but before we
2036 	// had the chance to write it back, then we'll write it again later -- that
2037 	// will probably not happen that often, though.
2038 
2039 	vm_clear_map_flags(fPage, PAGE_MODIFIED);
2040 }
2041 
2042 
2043 /*!	The page's cache must be locked.
2044 	The page queues must not be locked.
2045 	\return \c true if the page was written successfully or could otherwise
2046 		be handled, \c false otherwise.
2047 */
2048 bool
2049 PageWriteWrapper::Done(status_t result)
2050 {
2051 	if (!fIsActive)
2052 		panic("completing page write wrapper that is not active");
2053 
2054 	DEBUG_PAGE_ACCESS_START(fPage);
2055 
2056 	fPage->busy = false;
2057 		// Set unbusy and notify later by hand, since we might free the page.
2058 
2059 	bool success = true;
2060 
2061 	if (result == B_OK) {
2062 		// put it into the active/inactive queue
2063 		move_page_to_appropriate_queue(fPage);
2064 		fPage->busy_writing = false;
2065 		DEBUG_PAGE_ACCESS_END(fPage);
2066 	} else {
2067 		// Writing the page failed. One reason would be that the cache has been
2068 		// shrunk and the page no longer belongs to the file. Otherwise the
2069 		// actual I/O failed, in which case we'll simply keep the page modified.
2070 
2071 		if (!fPage->busy_writing) {
2072 			// The busy_writing flag was cleared. That means the cache has been
2073 			// shrunk while we were trying to write the page and we have to free
2074 			// it now.
2075 			vm_remove_all_page_mappings(fPage);
2076 // TODO: Unmapping should already happen when resizing the cache!
2077 			fCache->RemovePage(fPage);
2078 			free_page(fPage, false);
2079 			unreserve_pages(1);
2080 		} else {
2081 			// Writing the page failed -- mark the page modified and move it to
2082 			// Writing the page failed -- mark the page modified again.
2083 			// Temporary pages are moved to a queue other than the modified
2084 			// queue, so we don't keep trying to write them over and over
2085 			// again. Non-temporary pages stay in the modified queue, though,
2086 			// so they don't get lost in the inactive queue.
2087 				strerror(result));
2088 
2089 			fPage->modified = true;
2090 			if (!fCache->temporary)
2091 				set_page_state(fPage, PAGE_STATE_MODIFIED);
2092 			else if (fPage->IsMapped())
2093 				set_page_state(fPage, PAGE_STATE_ACTIVE);
2094 			else
2095 				set_page_state(fPage, PAGE_STATE_INACTIVE);
2096 
2097 			fPage->busy_writing = false;
2098 			DEBUG_PAGE_ACCESS_END(fPage);
2099 
2100 			success = false;
2101 		}
2102 	}
2103 
2104 	fCache->NotifyPageEvents(fPage, PAGE_EVENT_NOT_BUSY);
2105 	fIsActive = false;
2106 
2107 	return success;
2108 }
2109 
2110 
2111 /*!	The page's cache must be locked.
2112 */
2113 void
2114 PageWriteTransfer::SetTo(PageWriterRun* run, vm_page* page, int32 maxPages)
2115 {
2116 	fRun = run;
2117 	fCache = page->Cache();
2118 	fOffset = page->cache_offset;
2119 	fPageCount = 1;
2120 	fMaxPages = maxPages;
2121 	fStatus = B_OK;
2122 
2123 	fVecs[0].base = (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2124 	fVecs[0].length = B_PAGE_SIZE;
2125 	fVecCount = 1;
2126 }
2127 
2128 
2129 /*!	The page's cache must be locked.
2130 */
2131 bool
2132 PageWriteTransfer::AddPage(vm_page* page)
2133 {
2134 	if (page->Cache() != fCache
2135 		|| (fMaxPages >= 0 && fPageCount >= (uint32)fMaxPages))
2136 		return false;
2137 
2138 	phys_addr_t nextBase = fVecs[fVecCount - 1].base
2139 		+ fVecs[fVecCount - 1].length;
2140 
2141 	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2142 		&& (off_t)page->cache_offset == fOffset + fPageCount) {
2143 		// append to last iovec
2144 		fVecs[fVecCount - 1].length += B_PAGE_SIZE;
2145 		fPageCount++;
2146 		return true;
2147 	}
2148 
2149 	nextBase = fVecs[0].base - B_PAGE_SIZE;
2150 	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2151 		&& (off_t)page->cache_offset == fOffset - 1) {
2152 		// prepend to first iovec and adjust offset
2153 		fVecs[0].base = nextBase;
2154 		fVecs[0].length += B_PAGE_SIZE;
2155 		fOffset = page->cache_offset;
2156 		fPageCount++;
2157 		return true;
2158 	}
2159 
2160 	if (((off_t)page->cache_offset == fOffset + fPageCount
2161 			|| (off_t)page->cache_offset == fOffset - 1)
2162 		&& fVecCount < sizeof(fVecs) / sizeof(fVecs[0])) {
2163 		// not physically contiguous or not in the right order
2164 		uint32 vectorIndex;
2165 		if ((off_t)page->cache_offset < fOffset) {
2166 			// we are pre-pending another vector, move the other vecs
2167 			for (uint32 i = fVecCount; i > 0; i--)
2168 				fVecs[i] = fVecs[i - 1];
2169 
2170 			fOffset = page->cache_offset;
2171 			vectorIndex = 0;
2172 		} else
2173 			vectorIndex = fVecCount;
2174 
2175 		fVecs[vectorIndex].base
2176 			= (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2177 		fVecs[vectorIndex].length = B_PAGE_SIZE;
2178 
2179 		fVecCount++;
2180 		fPageCount++;
2181 		return true;
2182 	}
2183 
2184 	return false;
2185 }
2186 
2187 
2188 status_t
2189 PageWriteTransfer::Schedule(uint32 flags)
2190 {
2191 	off_t writeOffset = (off_t)fOffset << PAGE_SHIFT;
2192 	generic_size_t writeLength = (phys_size_t)fPageCount << PAGE_SHIFT;
2193 
2194 	if (fRun != NULL) {
2195 		return fCache->WriteAsync(writeOffset, fVecs, fVecCount, writeLength,
2196 			flags | B_PHYSICAL_IO_REQUEST, this);
2197 	}
2198 
2199 	status_t status = fCache->Write(writeOffset, fVecs, fVecCount,
2200 		flags | B_PHYSICAL_IO_REQUEST, &writeLength);
2201 
2202 	SetStatus(status, writeLength);
2203 	return fStatus;
2204 }
2205 
2206 
2207 void
2208 PageWriteTransfer::SetStatus(status_t status, size_t transferred)
2209 {
2210 	// only succeed if all pages up to the last one have been written fully
2211 	// and the last page has at least been written partially
2212 	if (status == B_OK && transferred <= (fPageCount - 1) * B_PAGE_SIZE)
2213 		status = B_ERROR;
2214 
2215 	fStatus = status;
2216 }
2217 
2218 
2219 void
2220 PageWriteTransfer::IOFinished(status_t status, bool partialTransfer,
2221 	generic_size_t bytesTransferred)
2222 {
2223 	SetStatus(status, bytesTransferred);
2224 	fRun->PageWritten(this, fStatus, partialTransfer, bytesTransferred);
2225 }
2226 
2227 
2228 status_t
2229 PageWriterRun::Init(uint32 maxPages)
2230 {
2231 	fMaxPages = maxPages;
2232 	fWrapperCount = 0;
2233 	fTransferCount = 0;
2234 	fPendingTransfers = 0;
2235 
2236 	fWrappers = new(std::nothrow) PageWriteWrapper[maxPages];
2237 	fTransfers = new(std::nothrow) PageWriteTransfer[maxPages];
2238 	if (fWrappers == NULL || fTransfers == NULL)
2239 		return B_NO_MEMORY;
2240 
2241 	return B_OK;
2242 }
2243 
2244 
2245 void
2246 PageWriterRun::PrepareNextRun()
2247 {
2248 	fWrapperCount = 0;
2249 	fTransferCount = 0;
2250 	fPendingTransfers = 0;
2251 }
2252 
2253 
2254 /*!	The page's cache must be locked.
2255 */
2256 void
2257 PageWriterRun::AddPage(vm_page* page)
2258 {
2259 	fWrappers[fWrapperCount++].SetTo(page);
2260 
2261 	if (fTransferCount == 0 || !fTransfers[fTransferCount - 1].AddPage(page)) {
2262 		fTransfers[fTransferCount++].SetTo(this, page,
2263 			page->Cache()->MaxPagesPerAsyncWrite());
2264 	}
2265 }
2266 
2267 
2268 /*!	Writes all pages previously added.
2269 	\return The number of pages that could not be written or otherwise handled.
2270 */
2271 uint32
2272 PageWriterRun::Go()
2273 {
2274 	atomic_set(&fPendingTransfers, fTransferCount);
2275 
2276 	fAllFinishedCondition.Init(this, "page writer wait for I/O");
2277 	ConditionVariableEntry waitEntry;
2278 	fAllFinishedCondition.Add(&waitEntry);
2279 
2280 	// schedule writes
2281 	for (uint32 i = 0; i < fTransferCount; i++)
2282 		fTransfers[i].Schedule(B_VIP_IO_REQUEST);
2283 
2284 	// wait until all pages have been written
2285 	waitEntry.Wait();
2286 
2287 	// mark pages depending on whether they could be written or not
2288 
2289 	uint32 failedPages = 0;
2290 	uint32 wrapperIndex = 0;
2291 	for (uint32 i = 0; i < fTransferCount; i++) {
2292 		PageWriteTransfer& transfer = fTransfers[i];
2293 		transfer.Cache()->Lock();
2294 
2295 		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2296 			if (!fWrappers[wrapperIndex++].Done(transfer.Status()))
2297 				failedPages++;
2298 		}
2299 
2300 		transfer.Cache()->Unlock();
2301 	}
2302 
2303 	ASSERT(wrapperIndex == fWrapperCount);
2304 
2305 	for (uint32 i = 0; i < fTransferCount; i++) {
2306 		PageWriteTransfer& transfer = fTransfers[i];
2307 		struct VMCache* cache = transfer.Cache();
2308 
2309 		// We've acquired a reference and a store reference for each page
2310 		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2311 			// We release the cache references after all pages were made
2312 			// unbusy again - otherwise releasing a vnode could deadlock.
2313 			cache->ReleaseStoreRef();
2314 			cache->ReleaseRef();
2315 		}
2316 	}
2317 
2318 	return failedPages;
2319 }
2320 
2321 
2322 void
2323 PageWriterRun::PageWritten(PageWriteTransfer* transfer, status_t status,
2324 	bool partialTransfer, size_t bytesTransferred)
2325 {
2326 	if (atomic_add(&fPendingTransfers, -1) == 1)
2327 		fAllFinishedCondition.NotifyAll();
2328 }
2329 
2330 
2331 /*!	The page writer continuously takes some pages from the modified
2332 	queue, writes them back, and moves them back to the active queue.
2333 	It runs in its own thread, and is only there to keep the number
2334 	of modified pages low, so that more pages can be reused at a
2335 	lower cost.
2336 */
2337 status_t
2338 page_writer(void* /*unused*/)
2339 {
2340 	const uint32 kNumPages = 256;
2341 #ifdef TRACE_VM_PAGE
2342 	uint32 writtenPages = 0;
2343 	bigtime_t lastWrittenTime = 0;
2344 	bigtime_t pageCollectionTime = 0;
2345 	bigtime_t pageWritingTime = 0;
2346 #endif
2347 
2348 	PageWriterRun run;
2349 	if (run.Init(kNumPages) != B_OK) {
2350 		panic("page writer: Failed to init PageWriterRun!");
2351 		return B_ERROR;
2352 	}
2353 
2354 	page_num_t pagesSinceLastSuccessfulWrite = 0;
2355 
2356 	while (true) {
2357 // TODO: Maybe wait shorter when memory is low!
2358 		if (sModifiedPageQueue.Count() < kNumPages) {
2359 			sPageWriterCondition.Wait(3000000, true);
2360 				// every 3 seconds when no one triggers us
2361 		}
2362 
2363 		page_num_t modifiedPages = sModifiedPageQueue.Count();
2364 		if (modifiedPages == 0)
2365 			continue;
2366 
2367 		if (modifiedPages <= pagesSinceLastSuccessfulWrite) {
2368 			// We ran through the whole queue without being able to write a
2369 			// single page. Take a break.
2370 			snooze(500000);
2371 			pagesSinceLastSuccessfulWrite = 0;
2372 		}
2373 
2374 #if ENABLE_SWAP_SUPPORT
2375 		page_stats pageStats;
2376 		get_page_stats(pageStats);
2377 		bool activePaging = do_active_paging(pageStats);
2378 #endif
2379 
2380 		// depending on how urgent it becomes to get pages to disk, we adjust
2381 		// our I/O priority
2382 		uint32 lowPagesState = low_resource_state(B_KERNEL_RESOURCE_PAGES);
2383 		int32 ioPriority = B_IDLE_PRIORITY;
2384 		if (lowPagesState >= B_LOW_RESOURCE_CRITICAL
2385 			|| modifiedPages > MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD) {
2386 			ioPriority = MAX_PAGE_WRITER_IO_PRIORITY;
2387 		} else {
2388 			ioPriority = (uint64)MAX_PAGE_WRITER_IO_PRIORITY * modifiedPages
2389 				/ MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD;
2390 		}
2391 
2392 		thread_set_io_priority(ioPriority);
2393 
2394 		uint32 numPages = 0;
2395 		run.PrepareNextRun();
2396 
2397 		// TODO: make this laptop friendly, too (ie. only start doing
2398 		// something if someone else did something or there is really
2399 		// enough to do).
2400 
2401 		// collect pages to be written
2402 #ifdef TRACE_VM_PAGE
2403 		pageCollectionTime -= system_time();
2404 #endif
2405 
2406 		page_num_t maxPagesToSee = modifiedPages;
2407 
2408 		while (numPages < kNumPages && maxPagesToSee > 0) {
2409 			vm_page *page = next_modified_page(maxPagesToSee);
2410 			if (page == NULL)
2411 				break;
2412 
2413 			PageCacheLocker cacheLocker(page, false);
2414 			if (!cacheLocker.IsLocked())
2415 				continue;
2416 
2417 			VMCache *cache = page->Cache();
2418 
2419 			// If the page is busy or its state has changed while we were
2420 			// locking the cache, just ignore it.
2421 			if (page->busy || page->State() != PAGE_STATE_MODIFIED)
2422 				continue;
2423 
2424 			DEBUG_PAGE_ACCESS_START(page);
2425 
2426 			// Don't write back wired (locked) pages.
2427 			if (page->WiredCount() > 0) {
2428 				set_page_state(page, PAGE_STATE_ACTIVE);
2429 				DEBUG_PAGE_ACCESS_END(page);
2430 				continue;
2431 			}
2432 
2433 			// Write back temporary pages only when we're actively paging.
2434 			if (cache->temporary
2435 #if ENABLE_SWAP_SUPPORT
2436 				&& (!activePaging
2437 					|| !cache->CanWritePage(
2438 							(off_t)page->cache_offset << PAGE_SHIFT))
2439 #endif
2440 				) {
2441 				// We can't/don't want to do anything with this page, so move it
2442 				// to one of the other queues.
2443 				if (page->mappings.IsEmpty())
2444 					set_page_state(page, PAGE_STATE_INACTIVE);
2445 				else
2446 					set_page_state(page, PAGE_STATE_ACTIVE);
2447 
2448 				DEBUG_PAGE_ACCESS_END(page);
2449 				continue;
2450 			}
2451 
2452 			// We need our own reference to the store, as it might currently be
2453 			// in the process of being destroyed.
2454 			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
2455 				DEBUG_PAGE_ACCESS_END(page);
2456 				cacheLocker.Unlock();
2457 				thread_yield();
2458 				continue;
2459 			}
2460 
2461 			run.AddPage(page);
2462 				// TODO: We're possibly adding pages of different caches and
2463 				// thus maybe of different underlying file systems here. This
2464 				// is a potential problem for loop file systems/devices, since
2465 				// we could mark a page busy that would need to be accessed
2466 				// when writing back another page, thus causing a deadlock.
2467 
2468 			DEBUG_PAGE_ACCESS_END(page);
2469 
2470 			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
2471 			TPW(WritePage(page));
2472 
2473 			cache->AcquireRefLocked();
2474 			numPages++;
2475 		}
2476 
2477 #ifdef TRACE_VM_PAGE
2478 		pageCollectionTime += system_time();
2479 #endif
2480 		if (numPages == 0)
2481 			continue;
2482 
2483 		// write pages to disk and do all the cleanup
2484 #ifdef TRACE_VM_PAGE
2485 		pageWritingTime -= system_time();
2486 #endif
2487 		uint32 failedPages = run.Go();
2488 #ifdef TRACE_VM_PAGE
2489 		pageWritingTime += system_time();
2490 
2491 		// debug output only...
2492 		writtenPages += numPages;
2493 		if (writtenPages >= 1024) {
2494 			bigtime_t now = system_time();
2495 			TRACE(("page writer: wrote 1024 pages (total: %" B_PRIu64 " ms, "
2496 				"collect: %" B_PRIu64 " ms, write: %" B_PRIu64 " ms)\n",
2497 				(now - lastWrittenTime) / 1000,
2498 				pageCollectionTime / 1000, pageWritingTime / 1000));
2499 			lastWrittenTime = now;
2500 
2501 			writtenPages -= 1024;
2502 			pageCollectionTime = 0;
2503 			pageWritingTime = 0;
2504 		}
2505 #endif
2506 
2507 		if (failedPages == numPages)
2508 			pagesSinceLastSuccessfulWrite += modifiedPages - maxPagesToSee;
2509 		else
2510 			pagesSinceLastSuccessfulWrite = 0;
2511 	}
2512 
2513 	return B_OK;
2514 }
2515 
2516 
2517 // #pragma mark -
2518 
2519 
2520 // TODO: This should be done in the page daemon!
2521 #if 0
2522 #if ENABLE_SWAP_SUPPORT
2523 static bool
2524 free_page_swap_space(int32 index)
2525 {
2526 	vm_page *page = vm_page_at_index(index);
2527 	PageCacheLocker locker(page);
2528 	if (!locker.IsLocked())
2529 		return false;
2530 
2531 	DEBUG_PAGE_ACCESS_START(page);
2532 
2533 	VMCache* cache = page->Cache();
2534 	if (cache->temporary && page->WiredCount() == 0
2535 			&& cache->HasPage(page->cache_offset << PAGE_SHIFT)
2536 			&& page->usage_count > 0) {
2537 		// TODO: how to judge a page is highly active?
2538 		if (swap_free_page_swap_space(page)) {
2539 			// We need to mark the page modified, since otherwise it could be
2540 			// stolen and we'd lose its data.
2541 			vm_page_set_state(page, PAGE_STATE_MODIFIED);
2542 			TD(FreedPageSwap(page));
2543 			DEBUG_PAGE_ACCESS_END(page);
2544 			return true;
2545 		}
2546 	}
2547 	DEBUG_PAGE_ACCESS_END(page);
2548 	return false;
2549 }
2550 #endif
2551 #endif	// 0
2552 
2553 
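/*!	Finds the next non-busy page in the cached pages queue, using \a marker
	to remember the position across calls. The marker must have been set up
	with init_page_marker() and is re-inserted after the returned page, so
	the next call resumes the scan from there.
*/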
2554 static vm_page *
2555 find_cached_page_candidate(struct vm_page &marker)
2556 {
2557 	DEBUG_PAGE_ACCESS_CHECK(&marker);
2558 
2559 	InterruptsSpinLocker locker(sCachedPageQueue.GetLock());
2560 	vm_page *page;
2561 
2562 	if (marker.State() == PAGE_STATE_UNUSED) {
2563 		// Get the first page of the cached queue
2564 		page = sCachedPageQueue.Head();
2565 	} else {
2566 		// Get the next page of the current queue
2567 		if (marker.State() != PAGE_STATE_CACHED) {
2568 			panic("invalid marker %p state", &marker);
2569 			return NULL;
2570 		}
2571 
2572 		page = sCachedPageQueue.Next(&marker);
2573 		sCachedPageQueue.Remove(&marker);
2574 		marker.SetState(PAGE_STATE_UNUSED);
2575 	}
2576 
2577 	while (page != NULL) {
2578 		if (!page->busy) {
2579 			// we found a candidate, insert marker
2580 			marker.SetState(PAGE_STATE_CACHED);
2581 			sCachedPageQueue.InsertAfter(page, &marker);
2582 			return page;
2583 		}
2584 
2585 		page = sCachedPageQueue.Next(page);
2586 	}
2587 
2588 	return NULL;
2589 }
2590 
2591 
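/*!	Tries to steal the given cached page: locks the page's cache, re-checks
	that the page is still a non-busy cached page, and removes it from both
	its cache and the cached pages queue.
	\param dontWait If \c true, give up rather than wait for the cache lock.
	\return \c true if the page was stolen and may be reused by the caller,
		\c false otherwise.
*/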
2592 static bool
2593 free_cached_page(vm_page *page, bool dontWait)
2594 {
2595 	// try to lock the page's cache
2596 	if (vm_cache_acquire_locked_page_cache(page, dontWait) == NULL)
2597 		return false;
2598 	VMCache* cache = page->Cache();
2599 
2600 	AutoLocker<VMCache> cacheLocker(cache, true);
2601 	MethodDeleter<VMCache, void, &VMCache::ReleaseRefLocked> _2(cache);
2602 
2603 	// check again if that page is still a candidate
2604 	if (page->busy || page->State() != PAGE_STATE_CACHED)
2605 		return false;
2606 
2607 	DEBUG_PAGE_ACCESS_START(page);
2608 
2609 	PAGE_ASSERT(page, !page->IsMapped());
2610 	PAGE_ASSERT(page, !page->modified);
2611 
2612 	// we can now steal this page
2613 
2614 	cache->RemovePage(page);
2615 		// Now the page doesn't have a cache anymore, so no one else (e.g.
2616 		// vm_page_allocate_page_run()) can pick it up, since they would be
2617 		// required to lock the cache first, which would fail.
2618 
2619 	sCachedPageQueue.RemoveUnlocked(page);
2620 	return true;
2621 }
2622 
2623 
2624 static uint32
2625 free_cached_pages(uint32 pagesToFree, bool dontWait)
2626 {
2627 	vm_page marker;
2628 	init_page_marker(marker);
2629 
2630 	uint32 pagesFreed = 0;
2631 
2632 	while (pagesFreed < pagesToFree) {
2633 		vm_page *page = find_cached_page_candidate(marker);
2634 		if (page == NULL)
2635 			break;
2636 
2637 		if (free_cached_page(page, dontWait)) {
2638 			ReadLocker locker(sFreePageQueuesLock);
2639 			page->SetState(PAGE_STATE_FREE);
2640 			DEBUG_PAGE_ACCESS_END(page);
2641 			sFreePageQueue.PrependUnlocked(page);
2642 			locker.Unlock();
2643 
2644 			TA(StolenPage());
2645 
2646 			pagesFreed++;
2647 		}
2648 	}
2649 
2650 	remove_page_marker(marker);
2651 
2652 	sFreePageCondition.NotifyAll();
2653 
2654 	return pagesFreed;
2655 }
2656 
2657 
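/*!	Idle-time scan of (a part of) the active pages queue: clears the accessed
	flags of the pages it visits, updates their usage counts, and moves pages
	that appear unused to the inactive queue.
*/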
2658 static void
2659 idle_scan_active_pages(page_stats& pageStats)
2660 {
2661 	VMPageQueue& queue = sActivePageQueue;
2662 
2663 	// We want to scan the whole queue in roughly kIdleRunsForFullQueue runs.
2664 	uint32 maxToScan = queue.Count() / kIdleRunsForFullQueue + 1;
2665 
2666 	while (maxToScan > 0) {
2667 		maxToScan--;
2668 
2669 		// Get the next page. Note that we don't bother to lock here. We go with
2670 		// the assumption that on all architectures reading/writing pointers is
2671 		// atomic. Beyond that it doesn't really matter. We have to unlock the
2672 		// queue anyway to lock the page's cache, and we'll recheck afterwards.
2673 		vm_page* page = queue.Head();
2674 		if (page == NULL)
2675 			break;
2676 
2677 		// lock the page's cache
2678 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2679 		if (cache == NULL)
2680 			continue;
2681 
2682 		if (page->State() != PAGE_STATE_ACTIVE) {
2683 			// page is no longer in the cache or in this queue
2684 			cache->ReleaseRefAndUnlock();
2685 			continue;
2686 		}
2687 
2688 		if (page->busy) {
2689 			// page is busy -- requeue at the end
2690 			vm_page_requeue(page, true);
2691 			cache->ReleaseRefAndUnlock();
2692 			continue;
2693 		}
2694 
2695 		DEBUG_PAGE_ACCESS_START(page);
2696 
2697 		// Get the page active/modified flags and update the page's usage count.
2698 		// We completely unmap inactive temporary pages. This saves us from
2699 		// iterating through the inactive list as well, since we'll be
2700 		// notified via page fault whenever such an inactive page is used
2701 		// again. We don't remove the mappings of non-temporary pages, since
2702 		// we wouldn't notice when they become unused and could thus be moved
2703 		// to the cached list.
2704 		int32 usageCount;
2705 		if (page->WiredCount() > 0 || page->usage_count > 0
2706 			|| !cache->temporary) {
2707 			usageCount = vm_clear_page_mapping_accessed_flags(page);
2708 		} else
2709 			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2710 
2711 		if (usageCount > 0) {
2712 			usageCount += page->usage_count + kPageUsageAdvance;
2713 			if (usageCount > kPageUsageMax)
2714 				usageCount = kPageUsageMax;
2715 // TODO: This would probably also be the place to reclaim swap space.
2716 		} else {
2717 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2718 			if (usageCount < 0) {
2719 				usageCount = 0;
2720 				set_page_state(page, PAGE_STATE_INACTIVE);
2721 			}
2722 		}
2723 
2724 		page->usage_count = usageCount;
2725 
2726 		DEBUG_PAGE_ACCESS_END(page);
2727 
2728 		cache->ReleaseRefAndUnlock();
2729 	}
2730 }
2731 
2732 
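/*!	Scans the inactive pages queue when memory is needed: unmapped, unmodified
	pages are moved to the cached queue, modified pages are moved to the
	modified queue (up to a limit depending on \a despairLevel), and mapped
	pages that have been accessed go back to the active queue.
*/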
2733 static void
2734 full_scan_inactive_pages(page_stats& pageStats, int32 despairLevel)
2735 {
2736 	int32 pagesToFree = pageStats.unsatisfiedReservations
2737 		+ sFreeOrCachedPagesTarget
2738 		- (pageStats.totalFreePages + pageStats.cachedPages);
2739 	if (pagesToFree <= 0)
2740 		return;
2741 
2742 	bigtime_t time = system_time();
2743 	uint32 pagesScanned = 0;
2744 	uint32 pagesToCached = 0;
2745 	uint32 pagesToModified = 0;
2746 	uint32 pagesToActive = 0;
2747 
2748 	// Determine at most how many pages to send to the modified queue. Since
2749 	// it is relatively expensive to page out pages, we do that on a grander
2750 	// scale only when things get desperate.
2751 	uint32 maxToFlush = despairLevel <= 1 ? 32 : 10000;
2752 
2753 	vm_page marker;
2754 	init_page_marker(marker);
2755 
2756 	VMPageQueue& queue = sInactivePageQueue;
2757 	InterruptsSpinLocker queueLocker(queue.GetLock());
2758 	uint32 maxToScan = queue.Count();
2759 
2760 	vm_page* nextPage = queue.Head();
2761 
2762 	while (pagesToFree > 0 && maxToScan > 0) {
2763 		maxToScan--;
2764 
2765 		// get the next page
2766 		vm_page* page = nextPage;
2767 		if (page == NULL)
2768 			break;
2769 		nextPage = queue.Next(page);
2770 
2771 		if (page->busy)
2772 			continue;
2773 
2774 		// mark the position
2775 		queue.InsertAfter(page, &marker);
2776 		queueLocker.Unlock();
2777 
2778 		// lock the page's cache
2779 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2780 		if (cache == NULL || page->busy
2781 				|| page->State() != PAGE_STATE_INACTIVE) {
2782 			if (cache != NULL)
2783 				cache->ReleaseRefAndUnlock();
2784 			queueLocker.Lock();
2785 			nextPage = queue.Next(&marker);
2786 			queue.Remove(&marker);
2787 			continue;
2788 		}
2789 
2790 		pagesScanned++;
2791 
2792 		DEBUG_PAGE_ACCESS_START(page);
2793 
2794 		// Get the accessed count, clear the accessed/modified flags and
2795 		// unmap the page, if it hasn't been accessed.
2796 		int32 usageCount;
2797 		if (page->WiredCount() > 0)
2798 			usageCount = vm_clear_page_mapping_accessed_flags(page);
2799 		else
2800 			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2801 
2802 		// update usage count
2803 		if (usageCount > 0) {
2804 			usageCount += page->usage_count + kPageUsageAdvance;
2805 			if (usageCount > kPageUsageMax)
2806 				usageCount = kPageUsageMax;
2807 		} else {
2808 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2809 			if (usageCount < 0)
2810 				usageCount = 0;
2811 		}
2812 
2813 		page->usage_count = usageCount;
2814 
2815 		// Move to fitting queue or requeue:
2816 		// * Active mapped pages go to the active queue.
2817 		// * Inactive mapped (i.e. wired) pages are requeued.
2818 		// * The remaining pages are cachable. Thus, if unmodified they go to
2819 		//   the cached queue, otherwise to the modified queue (up to a limit).
2820 		//   Note that unlike in the idle scanning we don't exempt pages of
2821 		//   temporary caches. Apparently we really need memory, so we better
2822 		//   page out memory as well.
2823 		bool isMapped = page->IsMapped();
2824 		if (usageCount > 0) {
2825 			if (isMapped) {
2826 				set_page_state(page, PAGE_STATE_ACTIVE);
2827 				pagesToActive++;
2828 			} else
2829 				vm_page_requeue(page, true);
2830 		} else if (isMapped) {
2831 			vm_page_requeue(page, true);
2832 		} else if (!page->modified) {
2833 			set_page_state(page, PAGE_STATE_CACHED);
2834 			pagesToFree--;
2835 			pagesToCached++;
2836 		} else if (maxToFlush > 0) {
2837 			set_page_state(page, PAGE_STATE_MODIFIED);
2838 			maxToFlush--;
2839 			pagesToModified++;
2840 		} else
2841 			vm_page_requeue(page, true);
2842 
2843 		DEBUG_PAGE_ACCESS_END(page);
2844 
2845 		cache->ReleaseRefAndUnlock();
2846 
2847 		// remove the marker
2848 		queueLocker.Lock();
2849 		nextPage = queue.Next(&marker);
2850 		queue.Remove(&marker);
2851 	}
2852 
2853 	queueLocker.Unlock();
2854 
2855 	time = system_time() - time;
2856 	TRACE_DAEMON("  -> inactive scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2857 		", moved: %" B_PRIu32 " -> cached, %" B_PRIu32 " -> modified, %"
2858 		B_PRIu32 " -> active\n", time, pagesScanned, pagesToCached,
2859 		pagesToModified, pagesToActive);
2860 
2861 	// wake up the page writer, if we tossed it some pages
2862 	if (pagesToModified > 0)
2863 		sPageWriterCondition.WakeUp();
2864 }
2865 
2866 
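/*!	Scans the active pages queue when memory is needed: clears the accessed
	flags, updates the usage counts, and moves pages that appear unused to
	the inactive queue, so that a later inactive scan can reclaim them.
*/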
2867 static void
2868 full_scan_active_pages(page_stats& pageStats, int32 despairLevel)
2869 {
2870 	vm_page marker;
2871 	init_page_marker(marker);
2872 
2873 	VMPageQueue& queue = sActivePageQueue;
2874 	InterruptsSpinLocker queueLocker(queue.GetLock());
2875 	uint32 maxToScan = queue.Count();
2876 
2877 	int32 pagesToDeactivate = pageStats.unsatisfiedReservations
2878 		+ sFreeOrCachedPagesTarget
2879 		- (pageStats.totalFreePages + pageStats.cachedPages)
2880 		+ std::max((int32)sInactivePagesTarget - (int32)maxToScan, (int32)0);
2881 	if (pagesToDeactivate <= 0)
2882 		return;
2883 
2884 	bigtime_t time = system_time();
2885 	uint32 pagesAccessed = 0;
2886 	uint32 pagesToInactive = 0;
2887 	uint32 pagesScanned = 0;
2888 
2889 	vm_page* nextPage = queue.Head();
2890 
2891 	while (pagesToDeactivate > 0 && maxToScan > 0) {
2892 		maxToScan--;
2893 
2894 		// get the next page
2895 		vm_page* page = nextPage;
2896 		if (page == NULL)
2897 			break;
2898 		nextPage = queue.Next(page);
2899 
2900 		if (page->busy)
2901 			continue;
2902 
2903 		// mark the position
2904 		queue.InsertAfter(page, &marker);
2905 		queueLocker.Unlock();
2906 
2907 		// lock the page's cache
2908 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2909 		if (cache == NULL || page->busy || page->State() != PAGE_STATE_ACTIVE) {
2910 			if (cache != NULL)
2911 				cache->ReleaseRefAndUnlock();
2912 			queueLocker.Lock();
2913 			nextPage = queue.Next(&marker);
2914 			queue.Remove(&marker);
2915 			continue;
2916 		}
2917 
2918 		pagesScanned++;
2919 
2920 		DEBUG_PAGE_ACCESS_START(page);
2921 
2922 		// Get the page active/modified flags and update the page's usage count.
2923 		int32 usageCount = vm_clear_page_mapping_accessed_flags(page);
2924 
2925 		if (usageCount > 0) {
2926 			usageCount += page->usage_count + kPageUsageAdvance;
2927 			if (usageCount > kPageUsageMax)
2928 				usageCount = kPageUsageMax;
2929 			pagesAccessed++;
2930 // TODO: This would probably also be the place to reclaim swap space.
2931 		} else {
2932 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2933 			if (usageCount <= 0) {
2934 				usageCount = 0;
2935 				set_page_state(page, PAGE_STATE_INACTIVE);
2936 				pagesToInactive++;
2937 			}
2938 		}
2939 
2940 		page->usage_count = usageCount;
2941 
2942 		DEBUG_PAGE_ACCESS_END(page);
2943 
2944 		cache->ReleaseRefAndUnlock();
2945 
2946 		// remove the marker
2947 		queueLocker.Lock();
2948 		nextPage = queue.Next(&marker);
2949 		queue.Remove(&marker);
2950 	}
2951 
2952 	time = system_time() - time;
2953 	TRACE_DAEMON("  ->   active scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2954 		", moved: %" B_PRIu32 " -> inactive, encountered %" B_PRIu32 " accessed"
2955 		" ones\n", time, pagesScanned, pagesToInactive, pagesAccessed);
2956 }
2957 
2958 
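/*!	One idle run of the page daemon: replenishes the pool of actually free
	pages from the cached pages and ages the active queue.
*/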
2959 static void
2960 page_daemon_idle_scan(page_stats& pageStats)
2961 {
2962 	TRACE_DAEMON("page daemon: idle run\n");
2963 
2964 	if (pageStats.totalFreePages < (int32)sFreePagesTarget) {
2965 		// We want more actually free pages, so free some from the cached
2966 		// ones.
2967 		uint32 freed = free_cached_pages(
2968 			sFreePagesTarget - pageStats.totalFreePages, false);
2969 		if (freed > 0)
2970 			unreserve_pages(freed);
2971 		get_page_stats(pageStats);
2972 	}
2973 
2974 	// Walk the active list and move pages to the inactive queue.
2975 	get_page_stats(pageStats);
2976 	idle_scan_active_pages(pageStats);
2977 }
2978 
2979 
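/*!	One full run of the page daemon, used when free + cached pages are scarce:
	scans the inactive queue, frees cached pages, and finally scans the active
	queue to refill the inactive queue.
*/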
2980 static void
2981 page_daemon_full_scan(page_stats& pageStats, int32 despairLevel)
2982 {
2983 	TRACE_DAEMON("page daemon: full run: free: %" B_PRIu32 ", cached: %"
2984 		B_PRIu32 ", to free: %" B_PRIu32 "\n", pageStats.totalFreePages,
2985 		pageStats.cachedPages, pageStats.unsatisfiedReservations
2986 			+ sFreeOrCachedPagesTarget
2987 			- (pageStats.totalFreePages + pageStats.cachedPages));
2988 
2989 	// Walk the inactive list and transfer pages to the cached and modified
2990 	// queues.
2991 	full_scan_inactive_pages(pageStats, despairLevel);
2992 
2993 	// Free cached pages. Also wake up reservation waiters.
2994 	get_page_stats(pageStats);
2995 	int32 pagesToFree = pageStats.unsatisfiedReservations + sFreePagesTarget
2996 		- (pageStats.totalFreePages);
2997 	if (pagesToFree > 0) {
2998 		uint32 freed = free_cached_pages(pagesToFree, true);
2999 		if (freed > 0)
3000 			unreserve_pages(freed);
3001 	}
3002 
3003 	// Walk the active list and move pages to the inactive queue.
3004 	get_page_stats(pageStats);
3005 	full_scan_active_pages(pageStats, despairLevel);
3006 }
3007 
3008 
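/*!	Main loop of the page daemon thread: evaluates the free pages situation
	and either does an idle scan (no memory pressure) or full scans with an
	increasing despair level until enough pages are available again.
*/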
3009 static status_t
3010 page_daemon(void* /*unused*/)
3011 {
3012 	int32 despairLevel = 0;
3013 
3014 	while (true) {
3015 		sPageDaemonCondition.ClearActivated();
3016 
3017 		// evaluate the free pages situation
3018 		page_stats pageStats;
3019 		get_page_stats(pageStats);
3020 
3021 		if (!do_active_paging(pageStats)) {
3022 			// Things look good -- just maintain statistics and keep the pool
3023 			// of actually free pages full enough.
3024 			despairLevel = 0;
3025 			page_daemon_idle_scan(pageStats);
3026 			sPageDaemonCondition.Wait(kIdleScanWaitInterval, false);
3027 		} else {
3028 			// Not enough free pages. We need to do some real work.
3029 			despairLevel = std::min(despairLevel + 1, (int32)3);
3030 			page_daemon_full_scan(pageStats, despairLevel);
3031 
3032 			// Don't wait after the first full scan, but rather immediately
3033 			// check whether we were successful in freeing enough pages and
3034 			// re-run with increased despair level. The first scan is
3035 			// conservative with respect to moving inactive modified pages to
3036 			// the modified list to avoid thrashing. The second scan, however,
3037 			// will not hold back.
3038 			if (despairLevel > 1)
3039 				snooze(kBusyScanWaitInterval);
3040 		}
3041 	}
3042 
3043 	return B_OK;
3044 }
3045 
3046 
3047 /*!	Returns how many pages could *not* be reserved.
3048 */
3049 static uint32
3050 reserve_pages(uint32 count, int priority, bool dontWait)
3051 {
3052 	int32 dontTouch = kPageReserveForPriority[priority];
3053 
3054 	while (true) {
3055 		count -= reserve_some_pages(count, dontTouch);
3056 		if (count == 0)
3057 			return 0;
3058 
3059 		if (sUnsatisfiedPageReservations == 0) {
3060 			count -= free_cached_pages(count, dontWait);
3061 			if (count == 0)
3062 				return count;
3063 		}
3064 
3065 		if (dontWait)
3066 			return count;
3067 
3068 		// we need to wait for pages to become available
3069 
3070 		MutexLocker pageDeficitLocker(sPageDeficitLock);
3071 
3072 		bool notifyDaemon = sUnsatisfiedPageReservations == 0;
3073 		sUnsatisfiedPageReservations += count;
3074 
3075 		if (atomic_get(&sUnreservedFreePages) > dontTouch) {
3076 			// the situation changed
3077 			sUnsatisfiedPageReservations -= count;
3078 			continue;
3079 		}
3080 
3081 		PageReservationWaiter waiter;
3082 		waiter.dontTouch = dontTouch;
3083 		waiter.missing = count;
3084 		waiter.thread = thread_get_current_thread();
3085 		waiter.threadPriority = waiter.thread->priority;
3086 
3087 		// insert ordered (i.e. after all waiters with higher or equal priority)
3088 		PageReservationWaiter* otherWaiter = NULL;
3089 		for (PageReservationWaiterList::Iterator it
3090 				= sPageReservationWaiters.GetIterator();
3091 			(otherWaiter = it.Next()) != NULL;) {
3092 			if (waiter < *otherWaiter)
3093 				break;
3094 		}
3095 
3096 		sPageReservationWaiters.InsertBefore(otherWaiter, &waiter);
3097 
3098 		thread_prepare_to_block(waiter.thread, 0, THREAD_BLOCK_TYPE_OTHER,
3099 			"waiting for pages");
3100 
3101 		if (notifyDaemon)
3102 			sPageDaemonCondition.WakeUp();
3103 
3104 		pageDeficitLocker.Unlock();
3105 
3106 		low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
3107 		thread_block();
3108 
3109 		pageDeficitLocker.Lock();
3110 
3111 		return 0;
3112 	}
3113 }
3114 
3115 
3116 //	#pragma mark - private kernel API
3117 
3118 
3119 /*!	Writes a range of modified pages of a cache to disk.
3120 	You need to hold the VMCache lock when calling this function.
3121 	Note that the cache lock is released in this function.
3122 	\param cache The cache.
3123 	\param firstPage Offset (in page size units) of the first page in the range.
3124 	\param endPage End offset (in page size units) of the page range. The page
3125 		at this offset is not included.
3126 */
3127 status_t
3128 vm_page_write_modified_page_range(struct VMCache* cache, uint32 firstPage,
3129 	uint32 endPage)
3130 {
3131 	static const int32 kMaxPages = 256;
3132 	int32 maxPages = cache->MaxPagesPerWrite();
3133 	if (maxPages < 0 || maxPages > kMaxPages)
3134 		maxPages = kMaxPages;
3135 
3136 	const uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
3137 		| HEAP_DONT_LOCK_KERNEL_SPACE;
3138 
3139 	PageWriteWrapper stackWrappersPool[2];
3140 	PageWriteWrapper* stackWrappers[1];
3141 	PageWriteWrapper* wrapperPool
3142 		= new(malloc_flags(allocationFlags)) PageWriteWrapper[maxPages + 1];
3143 	PageWriteWrapper** wrappers
3144 		= new(malloc_flags(allocationFlags)) PageWriteWrapper*[maxPages];
3145 	if (wrapperPool == NULL || wrappers == NULL) {
3146 		// don't fail, just limit our capabilities
3147 		delete[] wrapperPool;
3148 		delete[] wrappers;
3149 		wrapperPool = stackWrappersPool;
3150 		wrappers = stackWrappers;
3151 		maxPages = 1;
3152 	}
3153 
3154 	int32 nextWrapper = 0;
3155 	int32 usedWrappers = 0;
3156 
3157 	PageWriteTransfer transfer;
3158 	bool transferEmpty = true;
3159 
3160 	VMCachePagesTree::Iterator it
3161 		= cache->pages.GetIterator(firstPage, true, true);
3162 
3163 	while (true) {
3164 		vm_page* page = it.Next();
3165 		if (page == NULL || page->cache_offset >= endPage) {
3166 			if (transferEmpty)
3167 				break;
3168 
3169 			page = NULL;
3170 		}
3171 
3172 		if (page != NULL) {
3173 			if (page->busy
3174 				|| (page->State() != PAGE_STATE_MODIFIED
3175 					&& !vm_test_map_modification(page))) {
3176 				page = NULL;
3177 			}
3178 		}
3179 
3180 		PageWriteWrapper* wrapper = NULL;
3181 		if (page != NULL) {
3182 			wrapper = &wrapperPool[nextWrapper++];
3183 			if (nextWrapper > maxPages)
3184 				nextWrapper = 0;
3185 
3186 			DEBUG_PAGE_ACCESS_START(page);
3187 
3188 			wrapper->SetTo(page);
3189 
3190 			if (transferEmpty || transfer.AddPage(page)) {
3191 				if (transferEmpty) {
3192 					transfer.SetTo(NULL, page, maxPages);
3193 					transferEmpty = false;
3194 				}
3195 
3196 				DEBUG_PAGE_ACCESS_END(page);
3197 
3198 				wrappers[usedWrappers++] = wrapper;
3199 				continue;
3200 			}
3201 
3202 			DEBUG_PAGE_ACCESS_END(page);
3203 		}
3204 
3205 		if (transferEmpty)
3206 			continue;
3207 
3208 		cache->Unlock();
3209 		status_t status = transfer.Schedule(0);
3210 		cache->Lock();
3211 
3212 		for (int32 i = 0; i < usedWrappers; i++)
3213 			wrappers[i]->Done(status);
3214 
3215 		usedWrappers = 0;
3216 
3217 		if (page != NULL) {
3218 			transfer.SetTo(NULL, page, maxPages);
3219 			wrappers[usedWrappers++] = wrapper;
3220 		} else
3221 			transferEmpty = true;
3222 	}
3223 
3224 	if (wrapperPool != stackWrappersPool) {
3225 		delete[] wrapperPool;
3226 		delete[] wrappers;
3227 	}
3228 
3229 	return B_OK;
3230 }
3231 
3232 
3233 /*!	You need to hold the VMCache lock when calling this function.
3234 	Note that the cache lock is released in this function.
3235 */
3236 status_t
3237 vm_page_write_modified_pages(VMCache *cache)
3238 {
3239 	return vm_page_write_modified_page_range(cache, 0,
3240 		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
3241 }
3242 
3243 
3244 /*!	Schedules the page writer to write back the specified \a page.
3245 	Note, however, that it might not do this immediately, and it can well
3246 	take several seconds until the page is actually written out.
3247 */
3248 void
3249 vm_page_schedule_write_page(vm_page *page)
3250 {
3251 	PAGE_ASSERT(page, page->State() == PAGE_STATE_MODIFIED);
3252 
3253 	vm_page_requeue(page, false);
3254 
3255 	sPageWriterCondition.WakeUp();
3256 }
3257 
3258 
3259 /*!	Cache must be locked.
3260 */
3261 void
3262 vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
3263 	uint32 endPage)
3264 {
3265 	uint32 modified = 0;
3266 	for (VMCachePagesTree::Iterator it
3267 				= cache->pages.GetIterator(firstPage, true, true);
3268 			vm_page *page = it.Next();) {
3269 		if (page->cache_offset >= endPage)
3270 			break;
3271 
3272 		if (!page->busy && page->State() == PAGE_STATE_MODIFIED) {
3273 			DEBUG_PAGE_ACCESS_START(page);
3274 			vm_page_requeue(page, false);
3275 			modified++;
3276 			DEBUG_PAGE_ACCESS_END(page);
3277 		}
3278 	}
3279 
3280 	if (modified > 0)
3281 		sPageWriterCondition.WakeUp();
3282 }
3283 
3284 
3285 void
3286 vm_page_init_num_pages(kernel_args *args)
3287 {
3288 	// calculate the size of memory by looking at the physical_memory_range array
3289 	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
3290 	page_num_t physicalPagesEnd = sPhysicalPageOffset
3291 		+ args->physical_memory_range[0].size / B_PAGE_SIZE;
3292 
3293 	sNonExistingPages = 0;
3294 	sIgnoredPages = args->ignored_physical_memory / B_PAGE_SIZE;
3295 
3296 	for (uint32 i = 1; i < args->num_physical_memory_ranges; i++) {
3297 		page_num_t start = args->physical_memory_range[i].start / B_PAGE_SIZE;
3298 		if (start > physicalPagesEnd)
3299 			sNonExistingPages += start - physicalPagesEnd;
3300 		physicalPagesEnd = start
3301 			+ args->physical_memory_range[i].size / B_PAGE_SIZE;
3302 
3303 #ifdef LIMIT_AVAILABLE_MEMORY
3304 		page_num_t available
3305 			= physicalPagesEnd - sPhysicalPageOffset - sNonExistingPages;
3306 		if (available > LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE)) {
3307 			physicalPagesEnd = sPhysicalPageOffset + sNonExistingPages
3308 				+ LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE);
3309 			break;
3310 		}
3311 #endif
3312 	}
3313 
3314 	TRACE(("first phys page = %#" B_PRIxPHYSADDR ", end %#" B_PRIxPHYSADDR "\n",
3315 		sPhysicalPageOffset, physicalPagesEnd));
3316 
3317 	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
3318 }
3319 
3320 
3321 status_t
3322 vm_page_init(kernel_args *args)
3323 {
3324 	TRACE(("vm_page_init: entry\n"));
3325 
3326 	// init page queues
3327 	sModifiedPageQueue.Init("modified pages queue");
3328 	sInactivePageQueue.Init("inactive pages queue");
3329 	sActivePageQueue.Init("active pages queue");
3330 	sCachedPageQueue.Init("cached pages queue");
3331 	sFreePageQueue.Init("free pages queue");
3332 	sClearPageQueue.Init("clear pages queue");
3333 
3334 	new (&sPageReservationWaiters) PageReservationWaiterList;
3335 
3336 	// map in the new free page table
3337 	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
3338 		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
3339 
3340 	TRACE(("vm_init: putting free_page_table @ %p, # ents %" B_PRIuPHYSADDR
3341 		" (size %#" B_PRIxPHYSADDR ")\n", sPages, sNumPages,
3342 		(phys_addr_t)(sNumPages * sizeof(vm_page))));
3343 
3344 	// initialize the free page table
3345 	for (uint32 i = 0; i < sNumPages; i++) {
3346 		sPages[i].Init(sPhysicalPageOffset + i);
3347 		sFreePageQueue.Append(&sPages[i]);
3348 
3349 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3350 		sPages[i].allocation_tracking_info.Clear();
3351 #endif
3352 	}
3353 
3354 	sUnreservedFreePages = sNumPages;
3355 
3356 	TRACE(("initialized table\n"));
3357 
3358 	// mark the ranges between usable physical memory as unused
3359 	phys_addr_t previousEnd = 0;
3360 	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
3361 		phys_addr_t base = args->physical_memory_range[i].start;
3362 		phys_size_t size = args->physical_memory_range[i].size;
3363 		if (base > previousEnd) {
3364 			mark_page_range_in_use(previousEnd / B_PAGE_SIZE,
3365 				(base - previousEnd) / B_PAGE_SIZE, false);
3366 		}
3367 		previousEnd = base + size;
3368 	}
3369 
3370 	// mark the allocated physical page ranges wired
3371 	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
3372 		mark_page_range_in_use(
3373 			args->physical_allocated_range[i].start / B_PAGE_SIZE,
3374 			args->physical_allocated_range[i].size / B_PAGE_SIZE, true);
3375 	}
3376 
3377 	// prevent future allocations from the kernel args ranges
3378 	args->num_physical_allocated_ranges = 0;
3379 
3380 	// The target of actually free pages. This must be at least the system
3381 	// reserve, but should be a few more pages, so we don't have to extract
3382 	// a cached page with each allocation.
3383 	sFreePagesTarget = VM_PAGE_RESERVE_USER
3384 		+ std::max((page_num_t)32, (sNumPages - sNonExistingPages) / 1024);
3385 
3386 	// The target of free + cached and inactive pages. On low-memory machines
3387 	// keep things tight. free + cached is the pool of immediately allocatable
3388 	// pages. We want a few inactive pages, so when we're actually paging, we
3389 	// have a reasonably large set of pages to work with.
3390 	if (sUnreservedFreePages < 16 * 1024) {
3391 		sFreeOrCachedPagesTarget = sFreePagesTarget + 128;
3392 		sInactivePagesTarget = sFreePagesTarget / 3;
3393 	} else {
3394 		sFreeOrCachedPagesTarget = 2 * sFreePagesTarget;
3395 		sInactivePagesTarget = sFreePagesTarget / 2;
3396 	}
3397 
3398 	TRACE(("vm_page_init: exit\n"));
3399 
3400 	return B_OK;
3401 }
3402 
3403 
3404 status_t
3405 vm_page_init_post_area(kernel_args *args)
3406 {
3407 	void *dummy;
3408 
3409 	dummy = sPages;
3410 	create_area("page structures", &dummy, B_EXACT_ADDRESS,
3411 		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
3412 		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3413 
3414 	add_debugger_command("list_pages", &dump_page_list,
3415 		"List physical pages");
3416 	add_debugger_command("page_stats", &dump_page_stats,
3417 		"Dump statistics about page usage");
3418 	add_debugger_command_etc("page", &dump_page_long,
3419 		"Dump page info",
3420 		"[ \"-p\" | \"-v\" ] [ \"-m\" ] <address>\n"
3421 		"Prints information for the physical page. If neither \"-p\" nor\n"
3422 		"\"-v\" are given, the provided address is interpreted as address of\n"
3423 		"the vm_page data structure for the page in question. If \"-p\" is\n"
3424 		"given, the address is the physical address of the page. If \"-v\" is\n"
3425 		"given, the address is interpreted as a virtual address in the current\n"
3426 		"thread's address space, and information for the page it is mapped to\n"
3427 		"(if any) is printed. If \"-m\" is specified, the command will\n"
3428 		"search all known address spaces for mappings to that page and print\n"
3429 		"them.\n", 0);
3430 	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
3431 	add_debugger_command("find_page", &find_page,
3432 		"Find out which queue a page is actually in");
3433 
3434 #ifdef TRACK_PAGE_USAGE_STATS
3435 	add_debugger_command_etc("page_usage", &dump_page_usage_stats,
3436 		"Dumps statistics about page usage counts",
3437 		"\n"
3438 		"Dumps statistics about page usage counts.\n",
3439 		B_KDEBUG_DONT_PARSE_ARGUMENTS);
3440 #endif
3441 
3442 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3443 	add_debugger_command_etc("page_allocations_per_caller",
3444 		&dump_page_allocations_per_caller,
3445 		"Dump current page allocations summed up per caller",
3446 		"[ -d <caller> ] [ -r ]\n"
3447 		"The current allocations will be summed up by caller (their count)\n"
3448 		"printed in decreasing order by count.\n"
3449 		"If \"-d\" is given, each allocation for caller <caller> is printed\n"
3450 		"including the respective stack trace.\n"
3451 		"If \"-r\" is given, the allocation infos are reset after gathering\n"
3452 		"the information, so the next command invocation will only show the\n"
3453 		"allocations made after the reset.\n", 0);
3454 	add_debugger_command_etc("page_allocation_infos",
3455 		&dump_page_allocation_infos,
3456 		"Dump current page allocations",
3457 		"[ --stacktrace ] [ -p <page number> ] [ --team <team ID> ] "
3458 		"[ --thread <thread ID> ]\n"
3459 		"The current allocations filtered by optional values will be printed.\n"
3460 		"The optional \"-p\" page number filters for a specific page,\n"
3461 		"with \"--team\" and \"--thread\" allocations by specific teams\n"
3462 		"with \"--team\" and \"--thread\" allocations can be filtered by\n"
3463 		"specific teams and/or threads (these only work if a corresponding\n"
3464 		"If \"--stacktrace\" is given, then stack traces of the allocation\n"
3465 		"callers are printed, where available.\n", 0);
3466 #endif
3467 
3468 	return B_OK;
3469 }
3470 
3471 
3472 status_t
3473 vm_page_init_post_thread(kernel_args *args)
3474 {
3475 	new (&sFreePageCondition) ConditionVariable;
3476 
3477 	// create a kernel thread to clear out pages
3478 
3479 	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
3480 		B_LOWEST_ACTIVE_PRIORITY, NULL);
3481 	resume_thread(thread);
3482 
3483 	// start page writer
3484 
3485 	sPageWriterCondition.Init("page writer");
3486 
3487 	thread = spawn_kernel_thread(&page_writer, "page writer",
3488 		B_NORMAL_PRIORITY + 1, NULL);
3489 	resume_thread(thread);
3490 
3491 	// start page daemon
3492 
3493 	sPageDaemonCondition.Init("page daemon");
3494 
3495 	thread = spawn_kernel_thread(&page_daemon, "page daemon",
3496 		B_NORMAL_PRIORITY, NULL);
3497 	resume_thread(thread);
3498 
3499 	return B_OK;
3500 }
3501 
3502 
3503 status_t
3504 vm_mark_page_inuse(page_num_t page)
3505 {
3506 	return vm_mark_page_range_inuse(page, 1);
3507 }
3508 
3509 
3510 status_t
3511 vm_mark_page_range_inuse(page_num_t startPage, page_num_t length)
3512 {
3513 	return mark_page_range_in_use(startPage, length, false);
3514 }
3515 
3516 
3517 /*!	Unreserve pages previously reserved with vm_page_reserve_pages().
3518 */
3519 void
3520 vm_page_unreserve_pages(vm_page_reservation* reservation)
3521 {
3522 	uint32 count = reservation->count;
3523 	reservation->count = 0;
3524 
3525 	if (count == 0)
3526 		return;
3527 
3528 	TA(UnreservePages(count));
3529 
3530 	unreserve_pages(count);
3531 }
3532 
3533 
3534 /*!	With this call, you can reserve a number of free pages in the system.
3535 	They will only be handed out to someone who has actually reserved them.
3536 	This call returns as soon as the number of requested pages has been
3537 	reached.
3538 	The caller must not hold any cache lock or the function might deadlock.
3539 */
3540 void
3541 vm_page_reserve_pages(vm_page_reservation* reservation, uint32 count,
3542 	int priority)
3543 {
3544 	reservation->count = count;
3545 
3546 	if (count == 0)
3547 		return;
3548 
3549 	TA(ReservePages(count));
3550 
3551 	reserve_pages(count, priority, false);
3552 }
3553 
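// A minimal usage sketch (hypothetical caller, not taken from this file;
// VM_PRIORITY_USER is assumed from the VM headers): pages are reserved up
// front, drawn from the reservation, and the unused remainder is returned.
//
//	vm_page_reservation reservation;
//	vm_page_reserve_pages(&reservation, 4, VM_PRIORITY_USER);
//	for (int i = 0; i < 4; i++) {
//		vm_page* page = vm_page_allocate_page(&reservation,
//			PAGE_STATE_ACTIVE | VM_PAGE_ALLOC_CLEAR);
//		// ... insert the page into a locked VMCache, map it, etc. ...
//	}
//	vm_page_unreserve_pages(&reservation);
//		// releases whatever part of the reservation wasn't consumed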
3554 
3555 bool
3556 vm_page_try_reserve_pages(vm_page_reservation* reservation, uint32 count,
3557 	int priority)
3558 {
3559 	if (count == 0) {
3560 		reservation->count = count;
3561 		return true;
3562 	}
3563 
3564 	uint32 remaining = reserve_pages(count, priority, true);
3565 	if (remaining == 0) {
3566 		TA(ReservePages(count));
3567 		reservation->count = count;
3568 		return true;
3569 	}
3570 
3571 	unreserve_pages(count - remaining);
3572 
3573 	return false;
3574 }
3575 
3576 
3577 vm_page *
3578 vm_page_allocate_page(vm_page_reservation* reservation, uint32 flags)
3579 {
3580 	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3581 	ASSERT(pageState != PAGE_STATE_FREE);
3582 	ASSERT(pageState != PAGE_STATE_CLEAR);
3583 
3584 	ASSERT(reservation->count > 0);
3585 	reservation->count--;
3586 
3587 	VMPageQueue* queue;
3588 	VMPageQueue* otherQueue;
3589 
3590 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3591 		queue = &sClearPageQueue;
3592 		otherQueue = &sFreePageQueue;
3593 	} else {
3594 		queue = &sFreePageQueue;
3595 		otherQueue = &sClearPageQueue;
3596 	}
3597 
3598 	ReadLocker locker(sFreePageQueuesLock);
3599 
3600 	vm_page* page = queue->RemoveHeadUnlocked();
3601 	if (page == NULL) {
3602 		// if the primary queue was empty, grab the page from the
3603 		// secondary queue
3604 		page = otherQueue->RemoveHeadUnlocked();
3605 
3606 		if (page == NULL) {
3607 			// Unlikely, but possible: the page we have reserved has moved
3608 			// between the queues after we checked the first queue. Grab the
3609 			// write locker to make sure this doesn't happen again.
3610 			locker.Unlock();
3611 			WriteLocker writeLocker(sFreePageQueuesLock);
3612 
3613 			page = queue->RemoveHead();
3614 			if (page == NULL)
3615 				page = otherQueue->RemoveHead();
3616 
3617 			if (page == NULL) {
3618 				panic("Had reserved page, but there is none!");
3619 				return NULL;
3620 			}
3621 
3622 			// downgrade to read lock
3623 			locker.Lock();
3624 		}
3625 	}
3626 
3627 	if (page->CacheRef() != NULL)
3628 		panic("supposed to be free page %p has cache\n", page);
3629 
3630 	DEBUG_PAGE_ACCESS_START(page);
3631 
3632 	int oldPageState = page->State();
3633 	page->SetState(pageState);
3634 	page->busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3635 	page->usage_count = 0;
3636 	page->accessed = false;
3637 	page->modified = false;
3638 
3639 	locker.Unlock();
3640 
3641 	if (pageState < PAGE_STATE_FIRST_UNQUEUED)
3642 		sPageQueues[pageState].AppendUnlocked(page);
3643 
3644 	// clear the page, if we had to take it from the free queue and a clear
3645 	// page was requested
3646 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0 && oldPageState != PAGE_STATE_CLEAR)
3647 		clear_page(page);
3648 
3649 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3650 	page->allocation_tracking_info.Init(
3651 		TA(AllocatePage(page->physical_page_number)));
3652 #else
3653 	TA(AllocatePage(page->physical_page_number));
3654 #endif
3655 
3656 	return page;
3657 }
3658 
3659 
3660 static void
3661 allocate_page_run_cleanup(VMPageQueue::PageList& freePages,
3662 	VMPageQueue::PageList& clearPages)
3663 {
3664 	// Page lists are sorted, so remove tails before prepending to the respective queue.
3665 
3666 	while (vm_page* page = freePages.RemoveTail()) {
3667 		page->busy = false;
3668 		page->SetState(PAGE_STATE_FREE);
3669 		DEBUG_PAGE_ACCESS_END(page);
3670 		sFreePageQueue.PrependUnlocked(page);
3671 	}
3672 
3673 	while (vm_page* page = clearPages.RemoveTail()) {
3674 		page->busy = false;
3675 		page->SetState(PAGE_STATE_CLEAR);
3676 		DEBUG_PAGE_ACCESS_END(page);
3677 		sClearPageQueue.PrependUnlocked(page);
3678 	}
3679 
3680 	sFreePageCondition.NotifyAll();
3681 }
3682 
3683 
3684 /*!	Tries to allocate a contiguous run of \a length pages starting at
3685 	index \a start.
3686 
3687 	The caller must have write-locked the free/clear page queues. The function
3688 	will unlock regardless of whether it succeeds or fails.
3689 
3690 	If the function fails, it cleans up after itself, i.e. it will free all
3691 	pages it managed to allocate.
3692 
3693 	\param start The start index (into \c sPages) of the run.
3694 	\param length The number of pages to allocate.
3695 	\param flags Page allocation flags. Encodes the state the function shall
3696 		set the allocated pages to, whether the pages shall be marked busy
3697 		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3698 		(VM_PAGE_ALLOC_CLEAR).
3699 	\param freeClearQueueLocker WriteLocker for the free/clear page queues,
3700 		in locked state. Will be unlocked by the function.
3701 	\return The index of the first page that could not be allocated. \a length
3702 		is returned when the function was successful.
3703 */
3704 static page_num_t
3705 allocate_page_run(page_num_t start, page_num_t length, uint32 flags,
3706 	WriteLocker& freeClearQueueLocker)
3707 {
3708 	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3709 	ASSERT(pageState != PAGE_STATE_FREE);
3710 	ASSERT(pageState != PAGE_STATE_CLEAR);
3711 	ASSERT(start + length <= sNumPages);
3712 
3713 	// Pull the free/clear pages out of their respective queues. Cached pages
3714 	// are allocated later.
3715 	page_num_t cachedPages = 0;
3716 	VMPageQueue::PageList freePages;
3717 	VMPageQueue::PageList clearPages;
3718 	page_num_t i = 0;
3719 	for (; i < length; i++) {
3720 		bool pageAllocated = true;
3721 		bool noPage = false;
3722 		vm_page& page = sPages[start + i];
3723 		switch (page.State()) {
3724 			case PAGE_STATE_CLEAR:
3725 				DEBUG_PAGE_ACCESS_START(&page);
3726 				sClearPageQueue.Remove(&page);
3727 				clearPages.Add(&page);
3728 				break;
3729 			case PAGE_STATE_FREE:
3730 				DEBUG_PAGE_ACCESS_START(&page);
3731 				sFreePageQueue.Remove(&page);
3732 				freePages.Add(&page);
3733 				break;
3734 			case PAGE_STATE_CACHED:
3735 				// We allocate cached pages later.
3736 				cachedPages++;
3737 				pageAllocated = false;
3738 				break;
3739 
3740 			default:
3741 				// Probably a page was cached when our caller checked. Now it's
3742 				// gone and we have to abort.
3743 				noPage = true;
3744 				break;
3745 		}
3746 
3747 		if (noPage)
3748 			break;
3749 
3750 		if (pageAllocated) {
3751 			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3752 			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3753 			page.usage_count = 0;
3754 			page.accessed = false;
3755 			page.modified = false;
3756 		}
3757 	}
3758 
3759 	if (i < length) {
3760 		// failed to allocate a page -- free all that we've got
3761 		allocate_page_run_cleanup(freePages, clearPages);
3762 		return i;
3763 	}
3764 
3765 	freeClearQueueLocker.Unlock();
3766 
3767 	if (cachedPages > 0) {
3768 		// allocate the pages that weren't free but cached
3769 		page_num_t freedCachedPages = 0;
3770 		page_num_t nextIndex = start;
3771 		vm_page* freePage = freePages.Head();
3772 		vm_page* clearPage = clearPages.Head();
3773 		while (cachedPages > 0) {
3774 			// skip, if we've already got the page
3775 			if (freePage != NULL && size_t(freePage - sPages) == nextIndex) {
3776 				freePage = freePages.GetNext(freePage);
3777 				nextIndex++;
3778 				continue;
3779 			}
3780 			if (clearPage != NULL && size_t(clearPage - sPages) == nextIndex) {
3781 				clearPage = clearPages.GetNext(clearPage);
3782 				nextIndex++;
3783 				continue;
3784 			}
3785 
3786 			// free the page, if it is still cached
3787 			vm_page& page = sPages[nextIndex];
3788 			if (!free_cached_page(&page, false)) {
3789 				// TODO: if the page turns out to have been freed already,
3790 				// there would be no need to fail
3791 				break;
3792 			}
3793 
3794 			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3795 			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3796 			page.usage_count = 0;
3797 			page.accessed = false;
3798 			page.modified = false;
3799 
3800 			freePages.InsertBefore(freePage, &page);
3801 			freedCachedPages++;
3802 			cachedPages--;
3803 			nextIndex++;
3804 		}
3805 
3806 		// If we have freed cached pages, we need to balance things.
3807 		if (freedCachedPages > 0)
3808 			unreserve_pages(freedCachedPages);
3809 
3810 		if (cachedPages > 0) {
3811 			// failed to allocate all cached pages -- free all that we've got
3812 			freeClearQueueLocker.Lock();
3813 			allocate_page_run_cleanup(freePages, clearPages);
3814 			freeClearQueueLocker.Unlock();
3815 
3816 			return nextIndex - start;
3817 		}
3818 	}
3819 
3820 	// clear pages, if requested
3821 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3822 		for (VMPageQueue::PageList::Iterator it = freePages.GetIterator();
3823 				vm_page* page = it.Next();) {
3824 			clear_page(page);
3825 		}
3826 	}
3827 
3828 	// add pages to target queue
3829 	if (pageState < PAGE_STATE_FIRST_UNQUEUED) {
3830 		freePages.MoveFrom(&clearPages);
3831 		sPageQueues[pageState].AppendUnlocked(freePages, length);
3832 	}
3833 
3834 	// Note: We don't unreserve the pages since we pulled them out of the
3835 	// free/clear queues without adjusting sUnreservedFreePages.
3836 
3837 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3838 	AbstractTraceEntryWithStackTrace* traceEntry
3839 		= TA(AllocatePageRun(start, length));
3840 
3841 	for (page_num_t i = start; i < start + length; i++)
3842 		sPages[i].allocation_tracking_info.Init(traceEntry);
3843 #else
3844 	TA(AllocatePageRun(start, length));
3845 #endif
3846 
3847 	return length;
3848 }
3849 
3850 
3851 /*! Allocates a physically contiguous range of pages.
3852 
3853 	\param flags Page allocation flags. Encodes the state the function shall
3854 		set the allocated pages to, whether the pages shall be marked busy
3855 		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3856 		(VM_PAGE_ALLOC_CLEAR).
3857 	\param length The number of contiguous pages to allocate.
3858 	\param restrictions Restrictions to the physical addresses of the page run
3859 		to allocate, including \c low_address, the first acceptable physical
3860 		address where the page run may start, \c high_address, the last
3861 		acceptable physical address where the page run may end (i.e. it must
3862 		hold \code runStartAddress + length * B_PAGE_SIZE <= high_address
3863 		\endcode), \c alignment, the alignment of the run start address, and
3864 		\c boundary, multiples of which the page run must not cross.
3865 		Values set to \c 0 are ignored.
3866 	\param priority The page reservation priority (as passed to
3867 		vm_page_reserve_pages()).
3868 	\return The first page of the allocated page run on success; \c NULL
3869 		when the allocation failed.
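
	A hypothetical caller allocating a 16 page run below 16 MB, aligned to
	64 KB (all values merely illustrative), might look like this:
	\code
		physical_address_restrictions restrictions = {};
		restrictions.high_address = 16 * 1024 * 1024;
		restrictions.alignment = 64 * 1024;

		vm_page* firstPage = vm_page_allocate_page_run(
			PAGE_STATE_WIRED | VM_PAGE_ALLOC_BUSY, 16, &restrictions,
			VM_PRIORITY_SYSTEM);
		if (firstPage == NULL)
			return B_NO_MEMORY;
	\endcode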
3870 */
3871 vm_page*
3872 vm_page_allocate_page_run(uint32 flags, page_num_t length,
3873 	const physical_address_restrictions* restrictions, int priority)
3874 {
3875 	// compute start and end page index
3876 	page_num_t requestedStart
3877 		= std::max(restrictions->low_address / B_PAGE_SIZE, sPhysicalPageOffset)
3878 			- sPhysicalPageOffset;
3879 	page_num_t start = requestedStart;
3880 	page_num_t end;
3881 	if (restrictions->high_address > 0) {
3882 		end = std::max(restrictions->high_address / B_PAGE_SIZE,
3883 				sPhysicalPageOffset)
3884 			- sPhysicalPageOffset;
3885 		end = std::min(end, sNumPages);
3886 	} else
3887 		end = sNumPages;
3888 
3889 	// compute alignment mask
3890 	page_num_t alignmentMask
3891 		= std::max(restrictions->alignment / B_PAGE_SIZE, (phys_addr_t)1) - 1;
3892 	ASSERT(((alignmentMask + 1) & alignmentMask) == 0);
3893 		// alignment must be a power of 2
3894 
3895 	// compute the boundary mask
3896 	page_num_t boundaryMask = 0;
3897 	if (restrictions->boundary != 0) {
3898 		page_num_t boundary = restrictions->boundary / B_PAGE_SIZE;
3899 		// boundary must be a power of two and not less than alignment and
3900 		// length
3901 		ASSERT(((boundary - 1) & boundary) == 0);
3902 		ASSERT(boundary >= alignmentMask + 1);
3903 		ASSERT(boundary >= length);
3904 
3905 		boundaryMask = -boundary;
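			// e.g. a boundary of 16 pages yields a mask clearing the low four
			// bits, so physical page numbers with equal masked values lie in
			// the same 16 page block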
3906 	}
3907 
3908 	vm_page_reservation reservation;
3909 	vm_page_reserve_pages(&reservation, length, priority);
3910 
3911 	WriteLocker freeClearQueueLocker(sFreePageQueuesLock);
3912 
3913 	// First we try to get a run with free pages only. If that fails, we also
3914 	// consider cached pages. If there are only a few free pages and many
3915 	// cached ones, the odds are that we won't find enough contiguous ones, so
3916 	// we skip the first iteration in this case.
3917 	int32 freePages = sUnreservedFreePages;
3918 	int useCached = freePages > 0 && (page_num_t)freePages > 2 * length ? 0 : 1;
3919 
3920 	for (;;) {
3921 		if (alignmentMask != 0 || boundaryMask != 0) {
3922 			page_num_t offsetStart = start + sPhysicalPageOffset;
3923 
3924 			// enforce alignment
3925 			if ((offsetStart & alignmentMask) != 0)
3926 				offsetStart = (offsetStart + alignmentMask) & ~alignmentMask;
3927 
3928 			// enforce boundary
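			// (the run crosses a boundary when its first and last physical
			// page numbers differ in the bits covered by boundaryMask; then
			// the start is advanced to the next boundary, where the run fits,
			// since boundary >= length)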
3929 			if (boundaryMask != 0 && ((offsetStart ^ (offsetStart
3930 				+ length - 1)) & boundaryMask) != 0) {
3931 				offsetStart = (offsetStart + length - 1) & boundaryMask;
3932 			}
3933 
3934 			start = offsetStart - sPhysicalPageOffset;
3935 		}
3936 
3937 		if (start + length > end) {
3938 			if (useCached == 0) {
3939 				// The first iteration with free pages only was unsuccessful.
3940 				// Try again also considering cached pages.
3941 				useCached = 1;
3942 				start = requestedStart;
3943 				continue;
3944 			}
3945 
3946 			dprintf("vm_page_allocate_page_run(): Failed to allocate run of "
3947 				"length %" B_PRIuPHYSADDR " (%" B_PRIuPHYSADDR " %"
3948 				B_PRIuPHYSADDR ") in second iteration (align: %" B_PRIuPHYSADDR
3949 				" boundary: %" B_PRIuPHYSADDR ")!\n", length, requestedStart,
3950 				end, restrictions->alignment, restrictions->boundary);
3951 
3952 			freeClearQueueLocker.Unlock();
3953 			vm_page_unreserve_pages(&reservation);
3954 			return NULL;
3955 		}
3956 
3957 		bool foundRun = true;
3958 		page_num_t i;
3959 		for (i = 0; i < length; i++) {
3960 			uint32 pageState = sPages[start + i].State();
3961 			if (pageState != PAGE_STATE_FREE
3962 				&& pageState != PAGE_STATE_CLEAR
3963 				&& (pageState != PAGE_STATE_CACHED || useCached == 0)) {
3964 				foundRun = false;
3965 				break;
3966 			}
3967 		}
3968 
3969 		if (foundRun) {
3970 			i = allocate_page_run(start, length, flags, freeClearQueueLocker);
3971 			if (i == length)
3972 				return &sPages[start];
3973 
3974 			// apparently a cached page couldn't be allocated -- skip it and
3975 			// continue
3976 			freeClearQueueLocker.Lock();
3977 		}
3978 
3979 		start += i + 1;
3980 	}
3981 }
3982 
3983 
3984 vm_page *
3985 vm_page_at_index(int32 index)
3986 {
3987 	return &sPages[index];
3988 }
3989 
3990 
3991 vm_page *
3992 vm_lookup_page(page_num_t pageNumber)
3993 {
3994 	if (pageNumber < sPhysicalPageOffset)
3995 		return NULL;
3996 
3997 	pageNumber -= sPhysicalPageOffset;
3998 	if (pageNumber >= sNumPages)
3999 		return NULL;
4000 
4001 	return &sPages[pageNumber];
4002 }
4003 
4004 
4005 bool
4006 vm_page_is_dummy(struct vm_page *page)
4007 {
4008 	return page < sPages || page >= sPages + sNumPages;
4009 }
4010 
4011 
4012 /*!	Frees the page that belonged to a certain cache.
4013 	You can use vm_page_set_state() manually if you prefer, but only
4014 	if the page's state is not PAGE_STATE_MODIFIED.
4015 
4016 	\param cache The cache the page was previously owned by or NULL. The page
4017 		must have been removed from its cache before calling this method in
4018 		either case.
4019 	\param page The page to free.
4020 	\param reservation If not NULL, the page count of the reservation will be
4021 		incremented, thus allowing the caller to allocate another page in place
4022 		of the freed one at a later time.
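
	A typical (purely illustrative) use, with \a cache locked by the caller:
	\code
		cache->RemovePage(page);
		vm_page_free_etc(cache, page, &reservation);
	\endcode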
4023 */
4024 void
4025 vm_page_free_etc(VMCache* cache, vm_page* page,
4026 	vm_page_reservation* reservation)
4027 {
4028 	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
4029 		&& page->State() != PAGE_STATE_CLEAR);
4030 
4031 	if (page->State() == PAGE_STATE_MODIFIED && cache->temporary)
4032 		atomic_add(&sModifiedTemporaryPages, -1);
4033 
4034 	free_page(page, false);
4035 	if (reservation == NULL)
4036 		unreserve_pages(1);
4037 }
4038 
4039 
4040 void
4041 vm_page_set_state(vm_page *page, int pageState)
4042 {
4043 	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
4044 		&& page->State() != PAGE_STATE_CLEAR);
4045 
4046 	if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
4047 		free_page(page, pageState == PAGE_STATE_CLEAR);
4048 		unreserve_pages(1);
4049 	} else
4050 		set_page_state(page, pageState);
4051 }
4052 
4053 
4054 /*!	Moves a page to either the tail or the head of its current queue,
4055 	depending on \a tail.
4056 	The page must have a cache and the cache must be locked!
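	For example (illustrative only), to move a page to the tail of its current
	queue while holding its cache's lock:
	\code
		vm_page_requeue(page, true);
	\endcode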
4057 */
4058 void
4059 vm_page_requeue(struct vm_page *page, bool tail)
4060 {
4061 	PAGE_ASSERT(page, page->Cache() != NULL);
4062 	page->Cache()->AssertLocked();
4063 	// DEBUG_PAGE_ACCESS_CHECK(page);
4064 		// TODO: This assertion cannot be satisfied by idle_scan_active_pages()
4065 		// when it requeues busy pages. The reason is that vm_soft_fault()
4066 		// (respectively fault_get_page()) and the file cache keep newly
4067 		// allocated pages accessed while they are reading them from disk. It
4068 		// would probably be better to change that code and reenable this
4069 		// check.
4070 
4071 	VMPageQueue *queue = NULL;
4072 
4073 	switch (page->State()) {
4074 		case PAGE_STATE_ACTIVE:
4075 			queue = &sActivePageQueue;
4076 			break;
4077 		case PAGE_STATE_INACTIVE:
4078 			queue = &sInactivePageQueue;
4079 			break;
4080 		case PAGE_STATE_MODIFIED:
4081 			queue = &sModifiedPageQueue;
4082 			break;
4083 		case PAGE_STATE_CACHED:
4084 			queue = &sCachedPageQueue;
4085 			break;
4086 		case PAGE_STATE_FREE:
4087 		case PAGE_STATE_CLEAR:
4088 			panic("vm_page_requeue() called for free/clear page %p", page);
4089 			return;
4090 		case PAGE_STATE_WIRED:
4091 		case PAGE_STATE_UNUSED:
4092 			return;
4093 		default:
4094 			panic("vm_page_requeue(): vm_page %p in invalid state %d\n",
4095 				page, page->State());
4096 			break;
4097 	}
4098 
4099 	queue->RequeueUnlocked(page, tail);
4100 }
4101 
4102 
4103 page_num_t
4104 vm_page_num_pages(void)
4105 {
4106 	return sNumPages - sNonExistingPages;
4107 }
4108 
4109 
4110 /*! There is a subtle distinction between the page counts returned by
4111 	this function and vm_page_num_free_pages():
4112 	The latter returns the number of pages that are completely uncommitted,
4113 	whereas this one also counts pages that can be made available by
4114 	reclaiming them (IOW it factors in things like cached pages as
4115 	available).
4116 */
4117 page_num_t
4118 vm_page_num_available_pages(void)
4119 {
4120 	return vm_available_memory() / B_PAGE_SIZE;
4121 }
4122 
4123 
4124 page_num_t
4125 vm_page_num_free_pages(void)
4126 {
4127 	int32 count = sUnreservedFreePages + sCachedPageQueue.Count();
4128 	return count > 0 ? count : 0;
4129 }
4130 
4131 
4132 page_num_t
4133 vm_page_num_unused_pages(void)
4134 {
4135 	int32 count = sUnreservedFreePages;
4136 	return count > 0 ? count : 0;
4137 }
4138 
4139 
4140 void
4141 vm_page_get_stats(system_info *info)
4142 {
4143 	// Note: there's no locking protecting any of the queues or counters here,
4144 	// so we run the risk of getting bogus values when evaluating them
4145 	// throughout this function. As these stats are for informational purposes
4146 	// only, it is not really worth introducing such locking. Therefore we just
4147 	// ensure that we don't under- or overflow any of the values.
4148 
4149 	// The pages used for the block cache buffers. Those should not be counted
4150 	// as used but as cached pages.
4151 	// TODO: We should subtract the blocks that are in use ATM, since those
4152 	// can't really be freed in a low memory situation.
4153 	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
4154 	info->block_cache_pages = blockCachePages;
4155 
4156 	// Non-temporary modified pages are special as they represent pages that
4157 	// can be written back, so they could be freed if necessary, which for us
4158 	// basically makes them cached pages with a higher overhead. The
4159 	// modified queue count is therefore split into temporary and non-temporary
4160 	// counts that are then added to the corresponding number.
4161 	page_num_t modifiedNonTemporaryPages
4162 		= (sModifiedPageQueue.Count() - sModifiedTemporaryPages);
4163 
4164 	info->max_pages = vm_page_num_pages();
4165 	info->cached_pages = sCachedPageQueue.Count() + modifiedNonTemporaryPages
4166 		+ blockCachePages;
4167 
4168 	// max_pages is composed of:
4169 	//	active + inactive + unused + wired + modified + cached + free + clear
4170 	// So taking out the cached (including modified non-temporary), free and
4171 	// clear ones leaves us with all used pages.
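	// (e.g. with max_pages == 1000, cached_pages == 300 and 100 free plus 50
	// clear pages, used_pages comes out as 1000 - 450 == 550)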
4172 	uint32 subtractPages = info->cached_pages + sFreePageQueue.Count()
4173 		+ sClearPageQueue.Count();
4174 	info->used_pages = subtractPages > info->max_pages
4175 		? 0 : info->max_pages - subtractPages;
4176 
4177 	if (info->used_pages + info->cached_pages > info->max_pages) {
4178 		// Something was shuffled around while we were summing up the counts.
4179 		// Make the values sane, preferring the worse case of more used pages.
4180 		info->cached_pages = info->max_pages - info->used_pages;
4181 	}
4182 
4183 	info->page_faults = vm_num_page_faults();
4184 	info->ignored_pages = sIgnoredPages;
4185 
4186 	// TODO: We don't consider pages used for page directories/tables yet.
4187 }
4188 
4189 
4190 /*!	Returns the greatest address within the last page of accessible physical
4191 	memory.
4192 	The value is inclusive, i.e. in case of a 32 bit phys_addr_t 0xffffffff
4193 	means that the last page ends at exactly 4 GB.
4194 */
4195 phys_addr_t
4196 vm_page_max_address()
4197 {
4198 	return ((phys_addr_t)sPhysicalPageOffset + sNumPages) * B_PAGE_SIZE - 1;
4199 }
4200 
4201 
4202 RANGE_MARKER_FUNCTION_END(vm_page)
4203