xref: /haiku/src/system/kernel/vm/vm_page.cpp (revision 7b1d3485677579da582509346646af2807024538)
1 /*
2  * Copyright 2010-2011, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2010, Axel Dörfler, axeld@pinc-software.de.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 #include <string.h>
12 #include <stdlib.h>
13 
14 #include <algorithm>
15 
16 #include <KernelExport.h>
17 #include <OS.h>
18 
19 #include <AutoDeleter.h>
20 
21 #include <arch/cpu.h>
22 #include <arch/vm_translation_map.h>
23 #include <block_cache.h>
24 #include <boot/kernel_args.h>
25 #include <condition_variable.h>
26 #include <elf.h>
27 #include <heap.h>
28 #include <kernel.h>
29 #include <low_resource_manager.h>
30 #include <thread.h>
31 #include <tracing.h>
32 #include <util/AutoLock.h>
33 #include <vfs.h>
34 #include <vm/vm.h>
35 #include <vm/vm_priv.h>
36 #include <vm/vm_page.h>
37 #include <vm/VMAddressSpace.h>
38 #include <vm/VMArea.h>
39 #include <vm/VMCache.h>
40 
41 #include "IORequest.h"
42 #include "PageCacheLocker.h"
43 #include "VMAnonymousCache.h"
44 #include "VMPageQueue.h"
45 
46 
47 //#define TRACE_VM_PAGE
48 #ifdef TRACE_VM_PAGE
49 #	define TRACE(x) dprintf x
50 #else
51 #	define TRACE(x) ;
52 #endif
53 
54 //#define TRACE_VM_DAEMONS
55 #ifdef TRACE_VM_DAEMONS
56 #define TRACE_DAEMON(x...) dprintf(x)
57 #else
58 #define TRACE_DAEMON(x...) do {} while (false)
59 #endif
60 
61 //#define TRACK_PAGE_USAGE_STATS	1
62 
63 #define PAGE_ASSERT(page, condition)	\
64 	ASSERT_PRINT((condition), "page: %p", (page))
65 
66 #define SCRUB_SIZE 32
67 	// this many pages will be cleared at once in the page scrubber thread
68 
69 #define MAX_PAGE_WRITER_IO_PRIORITY				B_URGENT_DISPLAY_PRIORITY
70 	// maximum I/O priority of the page writer
71 #define MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD	10000
72 	// the maximum I/O priority shall be reached when this many pages need to
73 	// be written
74 
75 
76 // The page reserve that an allocation of a certain priority must not touch.
77 static const size_t kPageReserveForPriority[] = {
78 	VM_PAGE_RESERVE_USER,		// user
79 	VM_PAGE_RESERVE_SYSTEM,		// system
80 	0							// VIP
81 };
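
// Indexed by the VM priority constants (VM_PRIORITY_USER, VM_PRIORITY_SYSTEM,
// VM_PRIORITY_VIP): a user priority allocation must leave VM_PAGE_RESERVE_USER
// pages untouched, a system priority allocation VM_PAGE_RESERVE_SYSTEM pages,
// and VIP allocations may use all remaining pages. These values are what gets
// passed as the "dontTouch" argument to reserve_some_pages() (see e.g.
// page_scrubber() below).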
82 
83 // Minimum number of free pages the page daemon will try to achieve.
84 static uint32 sFreePagesTarget;
85 static uint32 sFreeOrCachedPagesTarget;
86 static uint32 sInactivePagesTarget;
87 
88 // Wait interval between page daemon runs.
89 static const bigtime_t kIdleScanWaitInterval = 1000000LL;	// 1 sec
90 static const bigtime_t kBusyScanWaitInterval = 500000LL;	// 0.5 sec
91 
92 // Number of idle runs after which we want to have processed the full active
93 // queue.
94 static const uint32 kIdleRunsForFullQueue = 20;
95 
96 // Maximum value for vm_page::usage_count.
97 static const int32 kPageUsageMax = 64;
98 // The vm_page::usage_count buff that an accessed page receives in a scan.
99 static const int32 kPageUsageAdvance = 3;
100 // The vm_page::usage_count debuff that an unaccessed page receives in a scan.
101 static const int32 kPageUsageDecline = 1;
102 
103 int32 gMappedPagesCount;
104 
105 static VMPageQueue sPageQueues[PAGE_STATE_COUNT];
106 
107 static VMPageQueue& sFreePageQueue = sPageQueues[PAGE_STATE_FREE];
108 static VMPageQueue& sClearPageQueue = sPageQueues[PAGE_STATE_CLEAR];
109 static VMPageQueue& sModifiedPageQueue = sPageQueues[PAGE_STATE_MODIFIED];
110 static VMPageQueue& sInactivePageQueue = sPageQueues[PAGE_STATE_INACTIVE];
111 static VMPageQueue& sActivePageQueue = sPageQueues[PAGE_STATE_ACTIVE];
112 static VMPageQueue& sCachedPageQueue = sPageQueues[PAGE_STATE_CACHED];
113 
114 static vm_page *sPages;
115 static page_num_t sPhysicalPageOffset;
116 static page_num_t sNumPages;
117 static page_num_t sNonExistingPages;
118 	// pages in the sPages array that aren't backed by physical memory
119 static uint64 sIgnoredPages;
120 	// pages of physical memory ignored by the boot loader (and thus not
121 	// available here)
122 static int32 sUnreservedFreePages;
123 static int32 sUnsatisfiedPageReservations;
124 static int32 sModifiedTemporaryPages;
125 
126 static ConditionVariable sFreePageCondition;
127 static mutex sPageDeficitLock = MUTEX_INITIALIZER("page deficit");
128 
129 // This lock must be used whenever the free or clear page queues are changed.
130 // If you need to work on both queues at the same time, you need to hold a write
131 // lock; otherwise a read lock suffices (each queue still has a spinlock to
132 // guard against concurrent changes).
133 static rw_lock sFreePageQueuesLock
134 	= RW_LOCK_INITIALIZER("free/clear page queues");
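
// Illustrative locking pattern (sketch only; see free_page(), page_scrubber()
// and mark_page_range_in_use() below):
//
//	ReadLocker locker(sFreePageQueuesLock);
//		// working on only one of the two queues -- the queue's own lock
//		// still serializes the actual list manipulation (*Unlocked() methods)
//
//	WriteLocker locker(sFreePageQueuesLock);
//		// working on both queues consistently, e.g. checking a page's state
//		// and removing it from whichever of the two queues it is in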
135 
136 #ifdef TRACK_PAGE_USAGE_STATS
137 static page_num_t sPageUsageArrays[512];
138 static page_num_t* sPageUsage = sPageUsageArrays;
139 static page_num_t sPageUsagePageCount;
140 static page_num_t* sNextPageUsage = sPageUsageArrays + 256;
141 static page_num_t sNextPageUsagePageCount;
142 #endif
143 
144 
145 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
146 
147 struct caller_info {
148 	addr_t		caller;
149 	size_t		count;
150 };
151 
152 static const int32 kCallerInfoTableSize = 1024;
153 static caller_info sCallerInfoTable[kCallerInfoTableSize];
154 static int32 sCallerInfoCount = 0;
155 
156 static caller_info* get_caller_info(addr_t caller);
157 
158 
159 RANGE_MARKER_FUNCTION_PROTOTYPES(vm_page)
160 
161 static const addr_t kVMPageCodeAddressRange[] = {
162 	RANGE_MARKER_FUNCTION_ADDRESS_RANGE(vm_page)
163 };
164 
165 #endif
166 
167 
168 RANGE_MARKER_FUNCTION_BEGIN(vm_page)
169 
170 
171 struct page_stats {
172 	int32	totalFreePages;
173 	int32	unsatisfiedReservations;
174 	int32	cachedPages;
175 };
176 
177 
178 struct PageReservationWaiter
179 		: public DoublyLinkedListLinkImpl<PageReservationWaiter> {
180 	Thread*	thread;
181 	uint32	dontTouch;		// reserve not to touch
182 	uint32	missing;		// pages missing for the reservation
183 	int32	threadPriority;
184 
185 	bool operator<(const PageReservationWaiter& other) const
186 	{
187 		// Implies an order by descending VM priority (ascending dontTouch)
188 		// and (secondarily) descending thread priority.
189 		if (dontTouch != other.dontTouch)
190 			return dontTouch < other.dontTouch;
191 		return threadPriority > other.threadPriority;
192 	}
193 };
194 
195 typedef DoublyLinkedList<PageReservationWaiter> PageReservationWaiterList;
196 static PageReservationWaiterList sPageReservationWaiters;
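
// The waiters are queued sorted by the operator< above, so the head is always
// the waiter with the smallest dontTouch value (i.e. the highest VM priority)
// and, among those, the highest thread priority.
// wake_up_page_reservation_waiters() below always serves the head first.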
197 
198 
199 struct DaemonCondition {
200 	void Init(const char* name)
201 	{
202 		mutex_init(&fLock, "daemon condition");
203 		fCondition.Init(this, name);
204 		fActivated = false;
205 	}
206 
207 	bool Lock()
208 	{
209 		return mutex_lock(&fLock) == B_OK;
210 	}
211 
212 	void Unlock()
213 	{
214 		mutex_unlock(&fLock);
215 	}
216 
217 	bool Wait(bigtime_t timeout, bool clearActivated)
218 	{
219 		MutexLocker locker(fLock);
220 		if (clearActivated)
221 			fActivated = false;
222 		else if (fActivated)
223 			return true;
224 
225 		ConditionVariableEntry entry;
226 		fCondition.Add(&entry);
227 
228 		locker.Unlock();
229 
230 		return entry.Wait(B_RELATIVE_TIMEOUT, timeout) == B_OK;
231 	}
232 
233 	void WakeUp()
234 	{
235 		if (fActivated)
236 			return;
237 
238 		MutexLocker locker(fLock);
239 		fActivated = true;
240 		fCondition.NotifyOne();
241 	}
242 
243 	void ClearActivated()
244 	{
245 		MutexLocker locker(fLock);
246 		fActivated = false;
247 	}
248 
249 private:
250 	mutex				fLock;
251 	ConditionVariable	fCondition;
252 	bool				fActivated;
253 };
254 
255 
256 static DaemonCondition sPageWriterCondition;
257 static DaemonCondition sPageDaemonCondition;
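
// Usage pattern: the page writer and the page daemon block in
// DaemonCondition::Wait() with a timeout, while other threads request work by
// calling WakeUp(). The fActivated flag remembers a wake-up that arrives while
// the daemon is still busy, so the next Wait() (with clearActivated == false)
// returns immediately instead of losing the notification. WakeUp() reads
// fActivated without the lock first as a cheap fast path for the
// already-activated case.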
258 
259 
260 #if PAGE_ALLOCATION_TRACING
261 
262 namespace PageAllocationTracing {
263 
264 class ReservePages : public AbstractTraceEntry {
265 public:
266 	ReservePages(uint32 count)
267 		:
268 		fCount(count)
269 	{
270 		Initialized();
271 	}
272 
273 	virtual void AddDump(TraceOutput& out)
274 	{
275 		out.Print("page reserve:   %" B_PRIu32, fCount);
276 	}
277 
278 private:
279 	uint32		fCount;
280 };
281 
282 
283 class UnreservePages : public AbstractTraceEntry {
284 public:
285 	UnreservePages(uint32 count)
286 		:
287 		fCount(count)
288 	{
289 		Initialized();
290 	}
291 
292 	virtual void AddDump(TraceOutput& out)
293 	{
294 		out.Print("page unreserve: %" B_PRIu32, fCount);
295 	}
296 
297 private:
298 	uint32		fCount;
299 };
300 
301 
302 class AllocatePage
303 	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
304 public:
305 	AllocatePage(page_num_t pageNumber)
306 		:
307 		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
308 		fPageNumber(pageNumber)
309 	{
310 		Initialized();
311 	}
312 
313 	virtual void AddDump(TraceOutput& out)
314 	{
315 		out.Print("page alloc: %#" B_PRIxPHYSADDR, fPageNumber);
316 	}
317 
318 private:
319 	page_num_t	fPageNumber;
320 };
321 
322 
323 class AllocatePageRun
324 	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
325 public:
326 	AllocatePageRun(page_num_t startPage, uint32 length)
327 		:
328 		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
329 		fStartPage(startPage),
330 		fLength(length)
331 	{
332 		Initialized();
333 	}
334 
335 	virtual void AddDump(TraceOutput& out)
336 	{
337 		out.Print("page alloc run: start %#" B_PRIxPHYSADDR " length: %"
338 			B_PRIu32, fStartPage, fLength);
339 	}
340 
341 private:
342 	page_num_t	fStartPage;
343 	uint32		fLength;
344 };
345 
346 
347 class FreePage
348 	: public TRACE_ENTRY_SELECTOR(PAGE_ALLOCATION_TRACING_STACK_TRACE) {
349 public:
350 	FreePage(page_num_t pageNumber)
351 		:
352 		TraceEntryBase(PAGE_ALLOCATION_TRACING_STACK_TRACE, 0, true),
353 		fPageNumber(pageNumber)
354 	{
355 		Initialized();
356 	}
357 
358 	virtual void AddDump(TraceOutput& out)
359 	{
360 		out.Print("page free: %#" B_PRIxPHYSADDR, fPageNumber);
361 	}
362 
363 private:
364 	page_num_t	fPageNumber;
365 };
366 
367 
368 class ScrubbingPages : public AbstractTraceEntry {
369 public:
370 	ScrubbingPages(uint32 count)
371 		:
372 		fCount(count)
373 	{
374 		Initialized();
375 	}
376 
377 	virtual void AddDump(TraceOutput& out)
378 	{
379 		out.Print("page scrubbing: %" B_PRIu32, fCount);
380 	}
381 
382 private:
383 	uint32		fCount;
384 };
385 
386 
387 class ScrubbedPages : public AbstractTraceEntry {
388 public:
389 	ScrubbedPages(uint32 count)
390 		:
391 		fCount(count)
392 	{
393 		Initialized();
394 	}
395 
396 	virtual void AddDump(TraceOutput& out)
397 	{
398 		out.Print("page scrubbed:  %" B_PRIu32, fCount);
399 	}
400 
401 private:
402 	uint32		fCount;
403 };
404 
405 
406 class StolenPage : public AbstractTraceEntry {
407 public:
408 	StolenPage()
409 	{
410 		Initialized();
411 	}
412 
413 	virtual void AddDump(TraceOutput& out)
414 	{
415 		out.Print("page stolen");
416 	}
417 };
418 
419 }	// namespace PageAllocationTracing
420 
421 #	define TA(x)	new(std::nothrow) PageAllocationTracing::x
422 
423 #else
424 #	define TA(x)
425 #endif	// PAGE_ALLOCATION_TRACING
426 
427 
428 #if PAGE_DAEMON_TRACING
429 
430 namespace PageDaemonTracing {
431 
432 class ActivatePage : public AbstractTraceEntry {
433 	public:
434 		ActivatePage(vm_page* page)
435 			:
436 			fCache(page->cache),
437 			fPage(page)
438 		{
439 			Initialized();
440 		}
441 
442 		virtual void AddDump(TraceOutput& out)
443 		{
444 			out.Print("page activated:   %p, cache: %p", fPage, fCache);
445 		}
446 
447 	private:
448 		VMCache*	fCache;
449 		vm_page*	fPage;
450 };
451 
452 
453 class DeactivatePage : public AbstractTraceEntry {
454 	public:
455 		DeactivatePage(vm_page* page)
456 			:
457 			fCache(page->cache),
458 			fPage(page)
459 		{
460 			Initialized();
461 		}
462 
463 		virtual void AddDump(TraceOutput& out)
464 		{
465 			out.Print("page deactivated: %p, cache: %p", fPage, fCache);
466 		}
467 
468 	private:
469 		VMCache*	fCache;
470 		vm_page*	fPage;
471 };
472 
473 
474 class FreedPageSwap : public AbstractTraceEntry {
475 	public:
476 		FreedPageSwap(vm_page* page)
477 			:
478 			fCache(page->cache),
479 			fPage(page)
480 		{
481 			Initialized();
482 		}
483 
484 		virtual void AddDump(TraceOutput& out)
485 		{
486 			out.Print("page swap freed:  %p, cache: %p", fPage, fCache);
487 		}
488 
489 	private:
490 		VMCache*	fCache;
491 		vm_page*	fPage;
492 };
493 
494 }	// namespace PageDaemonTracing
495 
496 #	define TD(x)	new(std::nothrow) PageDaemonTracing::x
497 
498 #else
499 #	define TD(x)
500 #endif	// PAGE_DAEMON_TRACING
501 
502 
503 #if PAGE_WRITER_TRACING
504 
505 namespace PageWriterTracing {
506 
507 class WritePage : public AbstractTraceEntry {
508 	public:
509 		WritePage(vm_page* page)
510 			:
511 			fCache(page->Cache()),
512 			fPage(page)
513 		{
514 			Initialized();
515 		}
516 
517 		virtual void AddDump(TraceOutput& out)
518 		{
519 			out.Print("page write: %p, cache: %p", fPage, fCache);
520 		}
521 
522 	private:
523 		VMCache*	fCache;
524 		vm_page*	fPage;
525 };
526 
527 }	// namespace PageWriterTracing
528 
529 #	define TPW(x)	new(std::nothrow) PageWriterTracing::x
530 
531 #else
532 #	define TPW(x)
533 #endif	// PAGE_WRITER_TRACING
534 
535 
536 #if PAGE_STATE_TRACING
537 
538 namespace PageStateTracing {
539 
540 class SetPageState : public AbstractTraceEntry {
541 	public:
542 		SetPageState(vm_page* page, uint8 newState)
543 			:
544 			fPage(page),
545 			fOldState(page->State()),
546 			fNewState(newState),
547 			fBusy(page->busy),
548 			fWired(page->WiredCount() > 0),
549 			fMapped(!page->mappings.IsEmpty()),
550 			fAccessed(page->accessed),
551 			fModified(page->modified)
552 		{
553 #if PAGE_STATE_TRACING_STACK_TRACE
554 			fStackTrace = capture_tracing_stack_trace(
555 				PAGE_STATE_TRACING_STACK_TRACE, 0, true);
556 				// Don't capture userland stack trace to avoid potential
557 				// deadlocks.
558 #endif
559 			Initialized();
560 		}
561 
562 #if PAGE_STATE_TRACING_STACK_TRACE
563 		virtual void DumpStackTrace(TraceOutput& out)
564 		{
565 			out.PrintStackTrace(fStackTrace);
566 		}
567 #endif
568 
569 		virtual void AddDump(TraceOutput& out)
570 		{
571 			out.Print("page set state: %p (%c%c%c%c%c): %s -> %s", fPage,
572 				fBusy ? 'b' : '-',
573 				fWired ? 'w' : '-',
574 				fMapped ? 'm' : '-',
575 				fAccessed ? 'a' : '-',
576 				fModified ? 'm' : '-',
577 				page_state_to_string(fOldState),
578 				page_state_to_string(fNewState));
579 		}
580 
581 	private:
582 		vm_page*	fPage;
583 #if PAGE_STATE_TRACING_STACK_TRACE
584 		tracing_stack_trace* fStackTrace;
585 #endif
586 		uint8		fOldState;
587 		uint8		fNewState;
588 		bool		fBusy : 1;
589 		bool		fWired : 1;
590 		bool		fMapped : 1;
591 		bool		fAccessed : 1;
592 		bool		fModified : 1;
593 };
594 
595 }	// namespace PageStateTracing
596 
597 #	define TPS(x)	new(std::nothrow) PageStateTracing::x
598 
599 #else
600 #	define TPS(x)
601 #endif	// PAGE_STATE_TRACING
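
// Summary of the tracing helper macros defined above: TA() (page allocation),
// TD() (page daemon), TPW() (page writer) and TPS() (page state) create the
// corresponding tracing entries when the respective *_TRACING switch is
// enabled and expand to nothing otherwise.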
602 
603 
604 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
605 
606 namespace BKernel {
607 
608 class AllocationTrackingCallback {
609 public:
610 	virtual						~AllocationTrackingCallback();
611 
612 	virtual	bool				ProcessTrackingInfo(
613 									AllocationTrackingInfo* info,
614 									page_num_t pageNumber) = 0;
615 };
616 
617 }
618 
619 using BKernel::AllocationTrackingCallback;
620 
621 
622 class AllocationCollectorCallback : public AllocationTrackingCallback {
623 public:
624 	AllocationCollectorCallback(bool resetInfos)
625 		:
626 		fResetInfos(resetInfos)
627 	{
628 	}
629 
630 	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
631 		page_num_t pageNumber)
632 	{
633 		if (!info->IsInitialized())
634 			return true;
635 
636 		addr_t caller = 0;
637 		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
638 
639 		if (traceEntry != NULL && info->IsTraceEntryValid()) {
640 			caller = tracing_find_caller_in_stack_trace(
641 				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
642 		}
643 
644 		caller_info* callerInfo = get_caller_info(caller);
645 		if (callerInfo == NULL) {
646 			kprintf("out of space for caller infos\n");
647 			return false;
648 		}
649 
650 		callerInfo->count++;
651 
652 		if (fResetInfos)
653 			info->Clear();
654 
655 		return true;
656 	}
657 
658 private:
659 	bool	fResetInfos;
660 };
661 
662 
663 class AllocationInfoPrinterCallback : public AllocationTrackingCallback {
664 public:
665 	AllocationInfoPrinterCallback(bool printStackTrace, page_num_t pageFilter,
666 		team_id teamFilter, thread_id threadFilter)
667 		:
668 		fPrintStackTrace(printStackTrace),
669 		fPageFilter(pageFilter),
670 		fTeamFilter(teamFilter),
671 		fThreadFilter(threadFilter)
672 	{
673 	}
674 
675 	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
676 		page_num_t pageNumber)
677 	{
678 		if (!info->IsInitialized())
679 			return true;
680 
681 		if (fPageFilter != 0 && pageNumber != fPageFilter)
682 			return true;
683 
684 		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
685 		if (traceEntry != NULL && !info->IsTraceEntryValid())
686 			traceEntry = NULL;
687 
688 		if (traceEntry != NULL) {
689 			if (fTeamFilter != -1 && traceEntry->TeamID() != fTeamFilter)
690 				return true;
691 			if (fThreadFilter != -1 && traceEntry->ThreadID() != fThreadFilter)
692 				return true;
693 		} else {
694 			// without a trace entry we can't check the team/thread filters -- skip
695 			if (fTeamFilter != -1 || fThreadFilter != -1)
696 				return true;
697 		}
698 
699 		kprintf("page number %#" B_PRIxPHYSADDR, pageNumber);
700 
701 		if (traceEntry != NULL) {
702 			kprintf(", team: %" B_PRId32 ", thread %" B_PRId32
703 				", time %" B_PRId64 "\n", traceEntry->TeamID(),
704 				traceEntry->ThreadID(), traceEntry->Time());
705 
706 			if (fPrintStackTrace)
707 				tracing_print_stack_trace(traceEntry->StackTrace());
708 		} else
709 			kprintf("\n");
710 
711 		return true;
712 	}
713 
714 private:
715 	bool		fPrintStackTrace;
716 	page_num_t	fPageFilter;
717 	team_id		fTeamFilter;
718 	thread_id	fThreadFilter;
719 };
720 
721 
722 class AllocationDetailPrinterCallback : public AllocationTrackingCallback {
723 public:
724 	AllocationDetailPrinterCallback(addr_t caller)
725 		:
726 		fCaller(caller)
727 	{
728 	}
729 
730 	virtual bool ProcessTrackingInfo(AllocationTrackingInfo* info,
731 		page_num_t pageNumber)
732 	{
733 		if (!info->IsInitialized())
734 			return true;
735 
736 		addr_t caller = 0;
737 		AbstractTraceEntryWithStackTrace* traceEntry = info->TraceEntry();
738 		if (traceEntry != NULL && !info->IsTraceEntryValid())
739 			traceEntry = NULL;
740 
741 		if (traceEntry != NULL) {
742 			caller = tracing_find_caller_in_stack_trace(
743 				traceEntry->StackTrace(), kVMPageCodeAddressRange, 1);
744 		}
745 
746 		if (caller != fCaller)
747 			return true;
748 
749 		kprintf("page %#" B_PRIxPHYSADDR "\n", pageNumber);
750 		if (traceEntry != NULL)
751 			tracing_print_stack_trace(traceEntry->StackTrace());
752 
753 		return true;
754 	}
755 
756 private:
757 	addr_t	fCaller;
758 };
759 
760 #endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
761 
762 
763 static void
764 list_page(vm_page* page)
765 {
766 	kprintf("0x%08" B_PRIxADDR " ",
767 		(addr_t)(page->physical_page_number * B_PAGE_SIZE));
768 	switch (page->State()) {
769 		case PAGE_STATE_ACTIVE:   kprintf("A"); break;
770 		case PAGE_STATE_INACTIVE: kprintf("I"); break;
771 		case PAGE_STATE_MODIFIED: kprintf("M"); break;
772 		case PAGE_STATE_CACHED:   kprintf("C"); break;
773 		case PAGE_STATE_FREE:     kprintf("F"); break;
774 		case PAGE_STATE_CLEAR:    kprintf("L"); break;
775 		case PAGE_STATE_WIRED:    kprintf("W"); break;
776 		case PAGE_STATE_UNUSED:   kprintf("-"); break;
777 	}
778 	kprintf(" ");
779 	if (page->busy)         kprintf("B"); else kprintf("-");
780 	if (page->busy_writing) kprintf("W"); else kprintf("-");
781 	if (page->accessed)     kprintf("A"); else kprintf("-");
782 	if (page->modified)     kprintf("M"); else kprintf("-");
783 	kprintf("-");
784 
785 	kprintf(" usage:%3u", page->usage_count);
786 	kprintf(" wired:%5u", page->WiredCount());
787 
788 	bool first = true;
789 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
790 	vm_page_mapping* mapping;
791 	while ((mapping = iterator.Next()) != NULL) {
792 		if (first) {
793 			kprintf(": ");
794 			first = false;
795 		} else
796 			kprintf(", ");
797 
798 		kprintf("%" B_PRId32 " (%s)", mapping->area->id, mapping->area->name);
799 		mapping = mapping->page_link.next;
800 	}
801 }
802 
803 
804 static int
805 dump_page_list(int argc, char **argv)
806 {
807 	kprintf("page table:\n");
808 	for (page_num_t i = 0; i < sNumPages; i++) {
809 		if (sPages[i].State() != PAGE_STATE_UNUSED) {
810 			list_page(&sPages[i]);
811 			kprintf("\n");
812 		}
813 	}
814 	kprintf("end of page table\n");
815 
816 	return 0;
817 }
818 
819 
820 static int
821 find_page(int argc, char **argv)
822 {
823 	struct vm_page *page;
824 	addr_t address;
825 	int32 index = 1;
826 	int i;
827 
828 	struct {
829 		const char*	name;
830 		VMPageQueue*	queue;
831 	} pageQueueInfos[] = {
832 		{ "free",		&sFreePageQueue },
833 		{ "clear",		&sClearPageQueue },
834 		{ "modified",	&sModifiedPageQueue },
835 		{ "active",		&sActivePageQueue },
836 		{ "inactive",	&sInactivePageQueue },
837 		{ "cached",		&sCachedPageQueue },
838 		{ NULL, NULL }
839 	};
840 
841 	if (argc < 2
842 		|| strlen(argv[index]) <= 2
843 		|| argv[index][0] != '0'
844 		|| argv[index][1] != 'x') {
845 		kprintf("usage: find_page <address>\n");
846 		return 0;
847 	}
848 
849 	address = strtoul(argv[index], NULL, 0);
850 	page = (vm_page*)address;
851 
852 	for (i = 0; pageQueueInfos[i].name; i++) {
853 		VMPageQueue::Iterator it = pageQueueInfos[i].queue->GetIterator();
854 		while (vm_page* p = it.Next()) {
855 			if (p == page) {
856 				kprintf("found page %p in queue %p (%s)\n", page,
857 					pageQueueInfos[i].queue, pageQueueInfos[i].name);
858 				return 0;
859 			}
860 		}
861 	}
862 
863 	kprintf("page %p isn't in any queue\n", page);
864 
865 	return 0;
866 }
867 
868 
869 const char *
870 page_state_to_string(int state)
871 {
872 	switch(state) {
873 		case PAGE_STATE_ACTIVE:
874 			return "active";
875 		case PAGE_STATE_INACTIVE:
876 			return "inactive";
877 		case PAGE_STATE_MODIFIED:
878 			return "modified";
879 		case PAGE_STATE_CACHED:
880 			return "cached";
881 		case PAGE_STATE_FREE:
882 			return "free";
883 		case PAGE_STATE_CLEAR:
884 			return "clear";
885 		case PAGE_STATE_WIRED:
886 			return "wired";
887 		case PAGE_STATE_UNUSED:
888 			return "unused";
889 		default:
890 			return "unknown";
891 	}
892 }
893 
894 
895 static int
896 dump_page_long(int argc, char **argv)
897 {
898 	bool addressIsPointer = true;
899 	bool physical = false;
900 	bool searchMappings = false;
901 	int32 index = 1;
902 
903 	while (index < argc) {
904 		if (argv[index][0] != '-')
905 			break;
906 
907 		if (!strcmp(argv[index], "-p")) {
908 			addressIsPointer = false;
909 			physical = true;
910 		} else if (!strcmp(argv[index], "-v")) {
911 			addressIsPointer = false;
912 		} else if (!strcmp(argv[index], "-m")) {
913 			searchMappings = true;
914 		} else {
915 			print_debugger_command_usage(argv[0]);
916 			return 0;
917 		}
918 
919 		index++;
920 	}
921 
922 	if (index + 1 != argc) {
923 		print_debugger_command_usage(argv[0]);
924 		return 0;
925 	}
926 
927 	uint64 value;
928 	if (!evaluate_debug_expression(argv[index], &value, false))
929 		return 0;
930 
931 	uint64 pageAddress = value;
932 	struct vm_page* page;
933 
934 	if (addressIsPointer) {
935 		page = (struct vm_page *)(addr_t)pageAddress;
936 	} else {
937 		if (!physical) {
938 			VMAddressSpace *addressSpace = VMAddressSpace::Kernel();
939 
940 			if (debug_get_debugged_thread()->team->address_space != NULL)
941 				addressSpace = debug_get_debugged_thread()->team->address_space;
942 
943 			uint32 flags = 0;
944 			phys_addr_t physicalAddress;
945 			if (addressSpace->TranslationMap()->QueryInterrupt(pageAddress,
946 					&physicalAddress, &flags) != B_OK
947 				|| (flags & PAGE_PRESENT) == 0) {
948 				kprintf("Virtual address not mapped to a physical page in this "
949 					"address space.\n");
950 				return 0;
951 			}
952 			pageAddress = physicalAddress;
953 		}
954 
955 		page = vm_lookup_page(pageAddress / B_PAGE_SIZE);
956 	}
957 
958 	if (page == NULL) {
959 		kprintf("Page not found.\n");
960 		return 0;
961 	}
962 
963 	kprintf("PAGE: %p\n", page);
964 
965 	const off_t pageOffset = (addr_t)page - (addr_t)sPages;
966 	const off_t pageIndex = pageOffset / (off_t)sizeof(vm_page);
967 	if (pageIndex < 0) {
968 		kprintf("\taddress is before start of page array!"
969 			" (offset %" B_PRIdOFF ")\n", pageOffset);
970 	} else if ((page_num_t)pageIndex >= sNumPages) {
971 		kprintf("\taddress is after end of page array!"
972 			" (offset %" B_PRIdOFF ")\n", pageOffset);
973 	} else if ((pageIndex * (off_t)sizeof(vm_page)) != pageOffset) {
974 		kprintf("\taddress isn't a multiple of page structure size!"
975 			" (offset %" B_PRIdOFF ", expected align %" B_PRIuSIZE ")\n",
976 			pageOffset, sizeof(vm_page));
977 	}
978 
979 	kprintf("queue_next,prev: %p, %p\n", page->queue_link.next,
980 		page->queue_link.previous);
981 	kprintf("physical_number: %#" B_PRIxPHYSADDR "\n", page->physical_page_number);
982 	kprintf("cache:           %p\n", page->Cache());
983 	kprintf("cache_offset:    %" B_PRIuPHYSADDR "\n", page->cache_offset);
984 	kprintf("cache_next:      %p\n", page->cache_next);
985 	kprintf("state:           %s\n", page_state_to_string(page->State()));
986 	kprintf("wired_count:     %d\n", page->WiredCount());
987 	kprintf("usage_count:     %d\n", page->usage_count);
988 	kprintf("busy:            %d\n", page->busy);
989 	kprintf("busy_writing:    %d\n", page->busy_writing);
990 	kprintf("accessed:        %d\n", page->accessed);
991 	kprintf("modified:        %d\n", page->modified);
992 #if DEBUG_PAGE_QUEUE
993 	kprintf("queue:           %p\n", page->queue);
994 #endif
995 #if DEBUG_PAGE_ACCESS
996 	kprintf("accessor:        %" B_PRId32 "\n", page->accessing_thread);
997 #endif
998 
999 	if (pageIndex < 0 || (page_num_t)pageIndex >= sNumPages) {
1000 		// Don't try to read the mappings.
1001 		return 0;
1002 	}
1003 
1004 	kprintf("area mappings:\n");
1005 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
1006 	vm_page_mapping *mapping;
1007 	while ((mapping = iterator.Next()) != NULL) {
1008 		kprintf("  %p (%" B_PRId32 ")\n", mapping->area, mapping->area->id);
1009 		mapping = mapping->page_link.next;
1010 	}
1011 
1012 	if (searchMappings) {
1013 		struct Callback : VMTranslationMap::ReverseMappingInfoCallback {
1014 			VMAddressSpace*	fAddressSpace;
1015 
1016 			virtual bool HandleVirtualAddress(addr_t virtualAddress)
1017 			{
1018 				phys_addr_t physicalAddress;
1019 				uint32 flags = 0;
1020 				if (fAddressSpace->TranslationMap()->QueryInterrupt(virtualAddress,
1021 						&physicalAddress, &flags) != B_OK) {
1022 					kprintf(" aspace %" B_PRId32 ": %#"	B_PRIxADDR " (querying failed)\n",
1023 						fAddressSpace->ID(), virtualAddress);
1024 					return false;
1025 				}
1026 				VMArea* area = fAddressSpace->LookupArea(virtualAddress);
1027 				kprintf("  aspace %" B_PRId32 ", area %" B_PRId32 ": %#"
1028 					B_PRIxADDR " (%c%c%s%s)\n", fAddressSpace->ID(),
1029 					area != NULL ? area->id : -1, virtualAddress,
1030 					(flags & B_KERNEL_READ_AREA) != 0 ? 'r' : '-',
1031 					(flags & B_KERNEL_WRITE_AREA) != 0 ? 'w' : '-',
1032 					(flags & PAGE_MODIFIED) != 0 ? " modified" : "",
1033 					(flags & PAGE_ACCESSED) != 0 ? " accessed" : "");
1034 				return false;
1035 			}
1036 		} callback;
1037 
1038 		kprintf("all mappings:\n");
1039 		VMAddressSpace* addressSpace = VMAddressSpace::DebugFirst();
1040 		while (addressSpace != NULL) {
1041 			callback.fAddressSpace = addressSpace;
1042 			addressSpace->TranslationMap()->DebugGetReverseMappingInfo(
1043 				page->physical_page_number * B_PAGE_SIZE, callback);
1044 			addressSpace = VMAddressSpace::DebugNext(addressSpace);
1045 		}
1046 	}
1047 
1048 	set_debug_variable("_cache", (addr_t)page->Cache());
1049 #if DEBUG_PAGE_ACCESS
1050 	set_debug_variable("_accessor", page->accessing_thread);
1051 #endif
1052 
1053 	return 0;
1054 }
1055 
1056 
1057 static int
1058 dump_page_queue(int argc, char **argv)
1059 {
1060 	struct VMPageQueue *queue;
1061 
1062 	if (argc < 2) {
1063 		kprintf("usage: page_queue <address/name> [list]\n");
1064 		return 0;
1065 	}
1066 
1067 	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
1068 		queue = (VMPageQueue*)strtoul(argv[1], NULL, 16);
1069 	else if (!strcmp(argv[1], "free"))
1070 		queue = &sFreePageQueue;
1071 	else if (!strcmp(argv[1], "clear"))
1072 		queue = &sClearPageQueue;
1073 	else if (!strcmp(argv[1], "modified"))
1074 		queue = &sModifiedPageQueue;
1075 	else if (!strcmp(argv[1], "active"))
1076 		queue = &sActivePageQueue;
1077 	else if (!strcmp(argv[1], "inactive"))
1078 		queue = &sInactivePageQueue;
1079 	else if (!strcmp(argv[1], "cached"))
1080 		queue = &sCachedPageQueue;
1081 	else {
1082 		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
1083 		return 0;
1084 	}
1085 
1086 	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %"
1087 		B_PRIuPHYSADDR "\n", queue, queue->Head(), queue->Tail(),
1088 		queue->Count());
1089 
1090 	if (argc == 3) {
1091 		struct vm_page *page = queue->Head();
1092 
1093 		kprintf("page        cache       type       state  wired  usage\n");
1094 		for (page_num_t i = 0; page; i++, page = queue->Next(page)) {
1095 			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->Cache(),
1096 				vm_cache_type_to_string(page->Cache()->type),
1097 				page_state_to_string(page->State()),
1098 				page->WiredCount(), page->usage_count);
1099 		}
1100 	}
1101 	return 0;
1102 }
1103 
1104 
1105 static int
1106 dump_page_stats(int argc, char **argv)
1107 {
1108 	page_num_t swappableModified = 0;
1109 	page_num_t swappableModifiedInactive = 0;
1110 
1111 	size_t counter[8];
1112 	size_t busyCounter[8];
1113 	memset(counter, 0, sizeof(counter));
1114 	memset(busyCounter, 0, sizeof(busyCounter));
1115 
1116 	struct page_run {
1117 		page_num_t	start;
1118 		page_num_t	end;
1119 
1120 		page_num_t Length() const	{ return end - start; }
1121 	};
1122 
1123 	page_run currentFreeRun = { 0, 0 };
1124 	page_run currentCachedRun = { 0, 0 };
1125 	page_run longestFreeRun = { 0, 0 };
1126 	page_run longestCachedRun = { 0, 0 };
1127 
1128 	for (page_num_t i = 0; i < sNumPages; i++) {
1129 		if (sPages[i].State() > 7) {
1130 			panic("page %" B_PRIuPHYSADDR " at %p has invalid state!\n", i,
1131 				&sPages[i]);
1132 		}
1133 
1134 		uint32 pageState = sPages[i].State();
1135 
1136 		counter[pageState]++;
1137 		if (sPages[i].busy)
1138 			busyCounter[pageState]++;
1139 
1140 		if (pageState == PAGE_STATE_MODIFIED
1141 			&& sPages[i].Cache() != NULL
1142 			&& sPages[i].Cache()->temporary && sPages[i].WiredCount() == 0) {
1143 			swappableModified++;
1144 			if (sPages[i].usage_count == 0)
1145 				swappableModifiedInactive++;
1146 		}
1147 
1148 		// track free and cached pages runs
1149 		if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
1150 			currentFreeRun.end = i + 1;
1151 			currentCachedRun.end = i + 1;
1152 		} else {
1153 			if (currentFreeRun.Length() > longestFreeRun.Length())
1154 				longestFreeRun = currentFreeRun;
1155 			currentFreeRun.start = currentFreeRun.end = i + 1;
1156 
1157 			if (pageState == PAGE_STATE_CACHED) {
1158 				currentCachedRun.end = i + 1;
1159 			} else {
1160 				if (currentCachedRun.Length() > longestCachedRun.Length())
1161 					longestCachedRun = currentCachedRun;
1162 				currentCachedRun.start = currentCachedRun.end = i + 1;
1163 			}
1164 		}
1165 	}
1166 
1167 	kprintf("page stats:\n");
1168 	kprintf("total: %" B_PRIuPHYSADDR "\n", sNumPages);
1169 
1170 	kprintf("active: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1171 		counter[PAGE_STATE_ACTIVE], busyCounter[PAGE_STATE_ACTIVE]);
1172 	kprintf("inactive: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1173 		counter[PAGE_STATE_INACTIVE], busyCounter[PAGE_STATE_INACTIVE]);
1174 	kprintf("cached: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1175 		counter[PAGE_STATE_CACHED], busyCounter[PAGE_STATE_CACHED]);
1176 	kprintf("unused: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1177 		counter[PAGE_STATE_UNUSED], busyCounter[PAGE_STATE_UNUSED]);
1178 	kprintf("wired: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1179 		counter[PAGE_STATE_WIRED], busyCounter[PAGE_STATE_WIRED]);
1180 	kprintf("modified: %" B_PRIuSIZE " (busy: %" B_PRIuSIZE ")\n",
1181 		counter[PAGE_STATE_MODIFIED], busyCounter[PAGE_STATE_MODIFIED]);
1182 	kprintf("free: %" B_PRIuSIZE "\n", counter[PAGE_STATE_FREE]);
1183 	kprintf("clear: %" B_PRIuSIZE "\n", counter[PAGE_STATE_CLEAR]);
1184 
1185 	kprintf("unreserved free pages: %" B_PRId32 "\n", sUnreservedFreePages);
1186 	kprintf("unsatisfied page reservations: %" B_PRId32 "\n",
1187 		sUnsatisfiedPageReservations);
1188 	kprintf("mapped pages: %" B_PRId32 "\n", gMappedPagesCount);
1189 	kprintf("longest free pages run: %" B_PRIuPHYSADDR " pages (at %"
1190 		B_PRIuPHYSADDR ")\n", longestFreeRun.Length(),
1191 		sPages[longestFreeRun.start].physical_page_number);
1192 	kprintf("longest free/cached pages run: %" B_PRIuPHYSADDR " pages (at %"
1193 		B_PRIuPHYSADDR ")\n", longestCachedRun.Length(),
1194 		sPages[longestCachedRun.start].physical_page_number);
1195 
1196 	kprintf("waiting threads:\n");
1197 	for (PageReservationWaiterList::Iterator it
1198 			= sPageReservationWaiters.GetIterator();
1199 		PageReservationWaiter* waiter = it.Next();) {
1200 		kprintf("  %6" B_PRId32 ": missing: %6" B_PRIu32
1201 			", don't touch: %6" B_PRIu32 "\n", waiter->thread->id,
1202 			waiter->missing, waiter->dontTouch);
1203 	}
1204 
1205 	kprintf("\nfree queue: %p, count = %" B_PRIuPHYSADDR "\n", &sFreePageQueue,
1206 		sFreePageQueue.Count());
1207 	kprintf("clear queue: %p, count = %" B_PRIuPHYSADDR "\n", &sClearPageQueue,
1208 		sClearPageQueue.Count());
1209 	kprintf("modified queue: %p, count = %" B_PRIuPHYSADDR " (%" B_PRId32
1210 		" temporary, %" B_PRIuPHYSADDR " swappable, " "inactive: %"
1211 		B_PRIuPHYSADDR ")\n", &sModifiedPageQueue, sModifiedPageQueue.Count(),
1212 		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
1213 	kprintf("active queue: %p, count = %" B_PRIuPHYSADDR "\n",
1214 		&sActivePageQueue, sActivePageQueue.Count());
1215 	kprintf("inactive queue: %p, count = %" B_PRIuPHYSADDR "\n",
1216 		&sInactivePageQueue, sInactivePageQueue.Count());
1217 	kprintf("cached queue: %p, count = %" B_PRIuPHYSADDR "\n",
1218 		&sCachedPageQueue, sCachedPageQueue.Count());
1219 	return 0;
1220 }
1221 
1222 
1223 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1224 
1225 static caller_info*
1226 get_caller_info(addr_t caller)
1227 {
1228 	// find the caller info
1229 	for (int32 i = 0; i < sCallerInfoCount; i++) {
1230 		if (caller == sCallerInfoTable[i].caller)
1231 			return &sCallerInfoTable[i];
1232 	}
1233 
1234 	// not found, add a new entry, if there are free slots
1235 	if (sCallerInfoCount >= kCallerInfoTableSize)
1236 		return NULL;
1237 
1238 	caller_info* info = &sCallerInfoTable[sCallerInfoCount++];
1239 	info->caller = caller;
1240 	info->count = 0;
1241 
1242 	return info;
1243 }
1244 
1245 
1246 static int
1247 caller_info_compare_count(const void* _a, const void* _b)
1248 {
1249 	const caller_info* a = (const caller_info*)_a;
1250 	const caller_info* b = (const caller_info*)_b;
1251 	return (int)(b->count - a->count);
1252 }
1253 
1254 
1255 static int
1256 dump_page_allocations_per_caller(int argc, char** argv)
1257 {
1258 	bool resetAllocationInfos = false;
1259 	bool printDetails = false;
1260 	addr_t caller = 0;
1261 
1262 	for (int32 i = 1; i < argc; i++) {
1263 		if (strcmp(argv[i], "-d") == 0) {
1264 			uint64 callerAddress;
1265 			if (++i >= argc
1266 				|| !evaluate_debug_expression(argv[i], &callerAddress, true)) {
1267 				print_debugger_command_usage(argv[0]);
1268 				return 0;
1269 			}
1270 
1271 			caller = callerAddress;
1272 			printDetails = true;
1273 		} else if (strcmp(argv[i], "-r") == 0) {
1274 			resetAllocationInfos = true;
1275 		} else {
1276 			print_debugger_command_usage(argv[0]);
1277 			return 0;
1278 		}
1279 	}
1280 
1281 	sCallerInfoCount = 0;
1282 
1283 	AllocationCollectorCallback collectorCallback(resetAllocationInfos);
1284 	AllocationDetailPrinterCallback detailsCallback(caller);
1285 	AllocationTrackingCallback& callback = printDetails
1286 		? (AllocationTrackingCallback&)detailsCallback
1287 		: (AllocationTrackingCallback&)collectorCallback;
1288 
1289 	for (page_num_t i = 0; i < sNumPages; i++)
1290 		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1291 
1292 	if (printDetails)
1293 		return 0;
1294 
1295 	// sort the array
1296 	qsort(sCallerInfoTable, sCallerInfoCount, sizeof(caller_info),
1297 		&caller_info_compare_count);
1298 
1299 	kprintf("%" B_PRId32 " different callers\n\n", sCallerInfoCount);
1300 
1301 	size_t totalAllocationCount = 0;
1302 
1303 	kprintf("     count      caller\n");
1304 	kprintf("----------------------------------\n");
1305 	for (int32 i = 0; i < sCallerInfoCount; i++) {
1306 		caller_info& info = sCallerInfoTable[i];
1307 		kprintf("%10" B_PRIuSIZE "  %p", info.count, (void*)info.caller);
1308 
1309 		const char* symbol;
1310 		const char* imageName;
1311 		bool exactMatch;
1312 		addr_t baseAddress;
1313 
1314 		if (elf_debug_lookup_symbol_address(info.caller, &baseAddress, &symbol,
1315 				&imageName, &exactMatch) == B_OK) {
1316 			kprintf("  %s + %#" B_PRIxADDR " (%s)%s\n", symbol,
1317 				info.caller - baseAddress, imageName,
1318 				exactMatch ? "" : " (nearest)");
1319 		} else
1320 			kprintf("\n");
1321 
1322 		totalAllocationCount += info.count;
1323 	}
1324 
1325 	kprintf("\ntotal page allocations: %" B_PRIuSIZE "\n",
1326 		totalAllocationCount);
1327 
1328 	return 0;
1329 }
1330 
1331 
1332 static int
1333 dump_page_allocation_infos(int argc, char** argv)
1334 {
1335 	page_num_t pageFilter = 0;
1336 	team_id teamFilter = -1;
1337 	thread_id threadFilter = -1;
1338 	bool printStackTraces = false;
1339 
1340 	for (int32 i = 1; i < argc; i++) {
1341 		if (strcmp(argv[i], "--stacktrace") == 0)
1342 			printStackTraces = true;
1343 		else if (strcmp(argv[i], "-p") == 0) {
1344 			uint64 pageNumber;
1345 			if (++i >= argc
1346 				|| !evaluate_debug_expression(argv[i], &pageNumber, true)) {
1347 				print_debugger_command_usage(argv[0]);
1348 				return 0;
1349 			}
1350 
1351 			pageFilter = pageNumber;
1352 		} else if (strcmp(argv[i], "--team") == 0) {
1353 			uint64 team;
1354 			if (++i >= argc
1355 				|| !evaluate_debug_expression(argv[i], &team, true)) {
1356 				print_debugger_command_usage(argv[0]);
1357 				return 0;
1358 			}
1359 
1360 			teamFilter = team;
1361 		} else if (strcmp(argv[i], "--thread") == 0) {
1362 			uint64 thread;
1363 			if (++i >= argc
1364 				|| !evaluate_debug_expression(argv[i], &thread, true)) {
1365 				print_debugger_command_usage(argv[0]);
1366 				return 0;
1367 			}
1368 
1369 			threadFilter = thread;
1370 		} else {
1371 			print_debugger_command_usage(argv[0]);
1372 			return 0;
1373 		}
1374 	}
1375 
1376 	AllocationInfoPrinterCallback callback(printStackTraces, pageFilter,
1377 		teamFilter, threadFilter);
1378 
1379 	for (page_num_t i = 0; i < sNumPages; i++)
1380 		callback.ProcessTrackingInfo(&sPages[i].allocation_tracking_info, i);
1381 
1382 	return 0;
1383 }
1384 
1385 #endif	// VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1386 
1387 
1388 #ifdef TRACK_PAGE_USAGE_STATS
1389 
1390 static void
1391 track_page_usage(vm_page* page)
1392 {
1393 	if (page->WiredCount() == 0) {
1394 		sNextPageUsage[(int32)page->usage_count + 128]++;
1395 		sNextPageUsagePageCount++;
1396 	}
1397 }
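
// The usage histogram is offset by 128: the signed usage_count values in
// [-128, 127] map to bucket indices [0, 255]. dump_page_usage_stats() below
// prints the buckets shifted back by 128 accordingly.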
1398 
1399 
1400 static void
1401 update_page_usage_stats()
1402 {
1403 	std::swap(sPageUsage, sNextPageUsage);
1404 	sPageUsagePageCount = sNextPageUsagePageCount;
1405 
1406 	memset(sNextPageUsage, 0, sizeof(page_num_t) * 256);
1407 	sNextPageUsagePageCount = 0;
1408 
1409 	// compute average
1410 	if (sPageUsagePageCount > 0) {
1411 		int64 sum = 0;
1412 		for (int32 i = 0; i < 256; i++)
1413 			sum += (int64)sPageUsage[i] * (i - 128);
1414 
1415 		TRACE_DAEMON("average page usage: %f (%lu pages)\n",
1416 			(float)sum / sPageUsagePageCount, sPageUsagePageCount);
1417 	}
1418 }
1419 
1420 
1421 static int
1422 dump_page_usage_stats(int argc, char** argv)
1423 {
1424 	kprintf("distribution of page usage counts (%lu pages):",
1425 		sPageUsagePageCount);
1426 
1427 	int64 sum = 0;
1428 	for (int32 i = 0; i < 256; i++) {
1429 		if (i % 8 == 0)
1430 			kprintf("\n%4ld:", i - 128);
1431 
1432 		int64 count = sPageUsage[i];
1433 		sum += count * (i - 128);
1434 
1435 		kprintf("  %9llu", count);
1436 	}
1437 
1438 	kprintf("\n\n");
1439 
1440 	kprintf("average usage count: %f\n",
1441 		sPageUsagePageCount > 0 ? (float)sum / sPageUsagePageCount : 0);
1442 
1443 	return 0;
1444 }
1445 
1446 #endif	// TRACK_PAGE_USAGE_STATS
1447 
1448 
1449 // #pragma mark - vm_page
1450 
1451 
1452 inline void
1453 vm_page::InitState(uint8 newState)
1454 {
1455 	state = newState;
1456 }
1457 
1458 
1459 inline void
1460 vm_page::SetState(uint8 newState)
1461 {
1462 	TPS(SetPageState(this, newState));
1463 
1464 	state = newState;
1465 }
1466 
1467 
1468 // #pragma mark -
1469 
1470 
1471 static void
1472 get_page_stats(page_stats& _pageStats)
1473 {
1474 	_pageStats.totalFreePages = sUnreservedFreePages;
1475 	_pageStats.cachedPages = sCachedPageQueue.Count();
1476 	_pageStats.unsatisfiedReservations = sUnsatisfiedPageReservations;
1477 	// TODO: We don't get an actual snapshot here!
1478 }
1479 
1480 
1481 static bool
1482 do_active_paging(const page_stats& pageStats)
1483 {
1484 	return pageStats.totalFreePages + pageStats.cachedPages
1485 		< pageStats.unsatisfiedReservations
1486 			+ (int32)sFreeOrCachedPagesTarget;
1487 }
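
// Example with illustrative numbers: if sFreeOrCachedPagesTarget is 1024 and
// there are 200 unsatisfied reserved pages, then with 500 free and 600 cached
// pages we get 500 + 600 = 1100 < 200 + 1024 = 1224, so the page daemon
// considers itself under pressure and keeps paging actively.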
1488 
1489 
1490 /*!	Reserves as many pages as possible from \c sUnreservedFreePages up to
1491 	\a count. Doesn't touch the last \a dontTouch pages of
1492 	\c sUnreservedFreePages, though.
1493 	\return The number of actually reserved pages.
1494 */
1495 static uint32
1496 reserve_some_pages(uint32 count, uint32 dontTouch)
1497 {
1498 	while (true) {
1499 		int32 freePages = atomic_get(&sUnreservedFreePages);
1500 		if (freePages <= (int32)dontTouch)
1501 			return 0;
1502 
1503 		int32 toReserve = std::min(count, freePages - dontTouch);
1504 		if (atomic_test_and_set(&sUnreservedFreePages,
1505 					freePages - toReserve, freePages)
1506 				== freePages) {
1507 			return toReserve;
1508 		}
1509 
1510 		// the count changed in the meantime -- retry
1511 	}
1512 }
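
// Typical pairing (see page_scrubber() below): reserve pages via
// reserve_some_pages(), take at most that many pages out of the free/clear
// queues, and give the reservation back with unreserve_pages() when done.
// The atomic_test_and_set() loop above makes the reservation itself
// lock-free; if sUnreservedFreePages changed concurrently, it simply retries.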
1513 
1514 
1515 static void
1516 wake_up_page_reservation_waiters()
1517 {
1518 	MutexLocker pageDeficitLocker(sPageDeficitLock);
1519 
1520 	// TODO: If this is a low priority thread, we might want to disable
1521 	// interrupts or otherwise ensure that we aren't unscheduled. Otherwise
1522 	// high priority threads might be kept waiting while a medium priority thread
1523 	// prevents us from running.
1524 
1525 	while (PageReservationWaiter* waiter = sPageReservationWaiters.Head()) {
1526 		int32 reserved = reserve_some_pages(waiter->missing,
1527 			waiter->dontTouch);
1528 		if (reserved == 0)
1529 			return;
1530 
1531 		atomic_add(&sUnsatisfiedPageReservations, -reserved);
1532 		waiter->missing -= reserved;
1533 
1534 		if (waiter->missing > 0)
1535 			return;
1536 
1537 		sPageReservationWaiters.Remove(waiter);
1538 
1539 		thread_unblock(waiter->thread, B_OK);
1540 	}
1541 }
1542 
1543 
1544 static inline void
1545 unreserve_pages(uint32 count)
1546 {
1547 	atomic_add(&sUnreservedFreePages, count);
1548 	if (atomic_get(&sUnsatisfiedPageReservations) != 0)
1549 		wake_up_page_reservation_waiters();
1550 }
1551 
1552 
1553 static void
1554 free_page(vm_page* page, bool clear)
1555 {
1556 	DEBUG_PAGE_ACCESS_CHECK(page);
1557 
1558 	PAGE_ASSERT(page, !page->IsMapped());
1559 
1560 	VMPageQueue* fromQueue;
1561 
1562 	switch (page->State()) {
1563 		case PAGE_STATE_ACTIVE:
1564 			fromQueue = &sActivePageQueue;
1565 			break;
1566 		case PAGE_STATE_INACTIVE:
1567 			fromQueue = &sInactivePageQueue;
1568 			break;
1569 		case PAGE_STATE_MODIFIED:
1570 			fromQueue = &sModifiedPageQueue;
1571 			break;
1572 		case PAGE_STATE_CACHED:
1573 			fromQueue = &sCachedPageQueue;
1574 			break;
1575 		case PAGE_STATE_FREE:
1576 		case PAGE_STATE_CLEAR:
1577 			panic("free_page(): page %p already free", page);
1578 			return;
1579 		case PAGE_STATE_WIRED:
1580 		case PAGE_STATE_UNUSED:
1581 			fromQueue = NULL;
1582 			break;
1583 		default:
1584 			panic("free_page(): page %p in invalid state %d",
1585 				page, page->State());
1586 			return;
1587 	}
1588 
1589 	if (page->CacheRef() != NULL)
1590 		panic("to be freed page %p has cache", page);
1591 	if (page->IsMapped())
1592 		panic("to be freed page %p has mappings", page);
1593 
1594 	if (fromQueue != NULL)
1595 		fromQueue->RemoveUnlocked(page);
1596 
1597 	TA(FreePage(page->physical_page_number));
1598 
1599 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
1600 	page->allocation_tracking_info.Clear();
1601 #endif
1602 
1603 	ReadLocker locker(sFreePageQueuesLock);
1604 
1605 	DEBUG_PAGE_ACCESS_END(page);
1606 
1607 	if (clear) {
1608 		page->SetState(PAGE_STATE_CLEAR);
1609 		sClearPageQueue.PrependUnlocked(page);
1610 	} else {
1611 		page->SetState(PAGE_STATE_FREE);
1612 		sFreePageQueue.PrependUnlocked(page);
1613 		sFreePageCondition.NotifyAll();
1614 	}
1615 
1616 	locker.Unlock();
1617 }
1618 
1619 
1620 /*!	The caller must make sure that no-one else tries to change the page's state
1621 	while the function is called. If the page has a cache, this can be done by
1622 	locking the cache.
1623 */
1624 static void
1625 set_page_state(vm_page *page, int pageState)
1626 {
1627 	DEBUG_PAGE_ACCESS_CHECK(page);
1628 
1629 	if (pageState == page->State())
1630 		return;
1631 
1632 	VMPageQueue* fromQueue;
1633 
1634 	switch (page->State()) {
1635 		case PAGE_STATE_ACTIVE:
1636 			fromQueue = &sActivePageQueue;
1637 			break;
1638 		case PAGE_STATE_INACTIVE:
1639 			fromQueue = &sInactivePageQueue;
1640 			break;
1641 		case PAGE_STATE_MODIFIED:
1642 			fromQueue = &sModifiedPageQueue;
1643 			break;
1644 		case PAGE_STATE_CACHED:
1645 			fromQueue = &sCachedPageQueue;
1646 			break;
1647 		case PAGE_STATE_FREE:
1648 		case PAGE_STATE_CLEAR:
1649 			panic("set_page_state(): page %p is free/clear", page);
1650 			return;
1651 		case PAGE_STATE_WIRED:
1652 		case PAGE_STATE_UNUSED:
1653 			fromQueue = NULL;
1654 			break;
1655 		default:
1656 			panic("set_page_state(): page %p in invalid state %d",
1657 				page, page->State());
1658 			return;
1659 	}
1660 
1661 	VMPageQueue* toQueue;
1662 
1663 	switch (pageState) {
1664 		case PAGE_STATE_ACTIVE:
1665 			toQueue = &sActivePageQueue;
1666 			break;
1667 		case PAGE_STATE_INACTIVE:
1668 			toQueue = &sInactivePageQueue;
1669 			break;
1670 		case PAGE_STATE_MODIFIED:
1671 			toQueue = &sModifiedPageQueue;
1672 			break;
1673 		case PAGE_STATE_CACHED:
1674 			PAGE_ASSERT(page, !page->IsMapped());
1675 			PAGE_ASSERT(page, !page->modified);
1676 			toQueue = &sCachedPageQueue;
1677 			break;
1678 		case PAGE_STATE_FREE:
1679 		case PAGE_STATE_CLEAR:
1680 			panic("set_page_state(): target state is free/clear");
1681 			return;
1682 		case PAGE_STATE_WIRED:
1683 		case PAGE_STATE_UNUSED:
1684 			toQueue = NULL;
1685 			break;
1686 		default:
1687 			panic("set_page_state(): invalid target state %d", pageState);
1688 			return;
1689 	}
1690 
1691 	VMCache* cache = page->Cache();
1692 	if (cache != NULL && cache->temporary) {
1693 		if (pageState == PAGE_STATE_MODIFIED)
1694 			atomic_add(&sModifiedTemporaryPages, 1);
1695 		else if (page->State() == PAGE_STATE_MODIFIED)
1696 			atomic_add(&sModifiedTemporaryPages, -1);
1697 	}
1698 
1699 	// move the page
1700 	if (toQueue == fromQueue) {
1701 		// Note: Theoretically we are required to lock when changing the page
1702 		// state, even if we don't change the queue. We actually don't have to
1703 		// do this, though, since only for the active queue there are different
1704 		// page states and active pages have a cache that must be locked at
1705 		// this point. So we rely on the fact that everyone must lock the cache
1706 		// before trying to change/interpret the page state.
1707 		PAGE_ASSERT(page, cache != NULL);
1708 		cache->AssertLocked();
1709 		page->SetState(pageState);
1710 	} else {
1711 		if (fromQueue != NULL)
1712 			fromQueue->RemoveUnlocked(page);
1713 
1714 		page->SetState(pageState);
1715 
1716 		if (toQueue != NULL)
1717 			toQueue->AppendUnlocked(page);
1718 	}
1719 }
1720 
1721 
1722 /*! Moves a previously modified page into a now appropriate queue.
1723 	The page queues must not be locked.
1724 */
1725 static void
1726 move_page_to_appropriate_queue(vm_page *page)
1727 {
1728 	DEBUG_PAGE_ACCESS_CHECK(page);
1729 
1730 	// Note, this logic must be in sync with what the page daemon does.
1731 	int32 state;
1732 	if (page->IsMapped())
1733 		state = PAGE_STATE_ACTIVE;
1734 	else if (page->modified)
1735 		state = PAGE_STATE_MODIFIED;
1736 	else
1737 		state = PAGE_STATE_CACHED;
1738 
1739 // TODO: If free + cached pages are low, we might directly want to free the
1740 // page.
1741 	set_page_state(page, state);
1742 }
1743 
1744 
1745 static void
1746 clear_page(struct vm_page *page)
1747 {
1748 	vm_memset_physical(page->physical_page_number << PAGE_SHIFT, 0,
1749 		B_PAGE_SIZE);
1750 }
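
// vm_memset_physical() zeroes the page through its physical address, so the
// page does not need to be mapped into the kernel address space for scrubbing.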
1751 
1752 
1753 static status_t
1754 mark_page_range_in_use(page_num_t startPage, page_num_t length, bool wired)
1755 {
1756 	TRACE(("mark_page_range_in_use: start %#" B_PRIxPHYSADDR ", len %#"
1757 		B_PRIxPHYSADDR "\n", startPage, length));
1758 
1759 	if (sPhysicalPageOffset > startPage) {
1760 		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1761 			"): start page is before free list\n", startPage, length);
1762 		if (sPhysicalPageOffset - startPage >= length)
1763 			return B_OK;
1764 		length -= sPhysicalPageOffset - startPage;
1765 		startPage = sPhysicalPageOffset;
1766 	}
1767 
1768 	startPage -= sPhysicalPageOffset;
1769 
1770 	if (startPage + length > sNumPages) {
1771 		dprintf("mark_page_range_in_use(%#" B_PRIxPHYSADDR ", %#" B_PRIxPHYSADDR
1772 			"): range would extend past free list\n", startPage, length);
1773 		if (startPage >= sNumPages)
1774 			return B_OK;
1775 		length = sNumPages - startPage;
1776 	}
1777 
1778 	WriteLocker locker(sFreePageQueuesLock);
1779 
1780 	for (page_num_t i = 0; i < length; i++) {
1781 		vm_page *page = &sPages[startPage + i];
1782 		switch (page->State()) {
1783 			case PAGE_STATE_FREE:
1784 			case PAGE_STATE_CLEAR:
1785 			{
1786 // TODO: This violates the page reservation policy, since we remove pages from
1787 // the free/clear queues without having reserved them before. This should happen
1788 // in the early boot process only, though.
1789 				DEBUG_PAGE_ACCESS_START(page);
1790 				VMPageQueue& queue = page->State() == PAGE_STATE_FREE
1791 					? sFreePageQueue : sClearPageQueue;
1792 				queue.Remove(page);
1793 				page->SetState(wired ? PAGE_STATE_WIRED : PAGE_STATE_UNUSED);
1794 				page->busy = false;
1795 				atomic_add(&sUnreservedFreePages, -1);
1796 				DEBUG_PAGE_ACCESS_END(page);
1797 				break;
1798 			}
1799 			case PAGE_STATE_WIRED:
1800 			case PAGE_STATE_UNUSED:
1801 				break;
1802 			case PAGE_STATE_ACTIVE:
1803 			case PAGE_STATE_INACTIVE:
1804 			case PAGE_STATE_MODIFIED:
1805 			case PAGE_STATE_CACHED:
1806 			default:
1807 				// uh
1808 				dprintf("mark_page_range_in_use: page %#" B_PRIxPHYSADDR
1809 					" in non-free state %d!\n", startPage + i, page->State());
1810 				break;
1811 		}
1812 	}
1813 
1814 	return B_OK;
1815 }
1816 
1817 
1818 /*!
1819 	This is a background thread that wakes up when its condition is notified
1820 	and moves some pages from the free queue over to the clear queue.
1821 	Given enough time, it will clear out all pages from the free queue - we
1822 	could probably slow it down after having reached a certain threshold.
1823 */
1824 static int32
1825 page_scrubber(void *unused)
1826 {
1827 	(void)(unused);
1828 
1829 	TRACE(("page_scrubber starting...\n"));
1830 
1831 	ConditionVariableEntry entry;
1832 	for (;;) {
1833 		while (sFreePageQueue.Count() == 0
1834 				|| atomic_get(&sUnreservedFreePages)
1835 					< (int32)sFreePagesTarget) {
1836 			sFreePageCondition.Add(&entry);
1837 			entry.Wait();
1838 		}
1839 
1840 		// Since we temporarily remove pages from the free pages reserve,
1841 		// we must make sure we don't cause a violation of the page
1842 		// reservation guarantee. The following is usually stricter than
1843 		// necessary, because we don't have information on how many of the
1844 		// reserved pages have already been allocated.
1845 		int32 reserved = reserve_some_pages(SCRUB_SIZE,
1846 			kPageReserveForPriority[VM_PRIORITY_USER]);
1847 		if (reserved == 0)
1848 			continue;
1849 
1850 		// get some pages from the free queue, mostly sorted
1851 		ReadLocker locker(sFreePageQueuesLock);
1852 
1853 		vm_page *page[SCRUB_SIZE];
1854 		int32 scrubCount = 0;
1855 		for (int32 i = 0; i < reserved; i++) {
1856 			page[i] = sFreePageQueue.RemoveHeadUnlocked();
1857 			if (page[i] == NULL)
1858 				break;
1859 
1860 			DEBUG_PAGE_ACCESS_START(page[i]);
1861 
1862 			page[i]->SetState(PAGE_STATE_ACTIVE);
1863 			page[i]->busy = true;
1864 			scrubCount++;
1865 		}
1866 
1867 		locker.Unlock();
1868 
1869 		if (scrubCount == 0) {
1870 			unreserve_pages(reserved);
1871 			continue;
1872 		}
1873 
1874 		TA(ScrubbingPages(scrubCount));
1875 
1876 		// clear them
1877 		for (int32 i = 0; i < scrubCount; i++)
1878 			clear_page(page[i]);
1879 
1880 		locker.Lock();
1881 
1882 		// and put them into the clear queue
1883 		// process the array reversed when prepending to preserve sequential order
1884 		for (int32 i = scrubCount - 1; i >= 0; i--) {
1885 			page[i]->SetState(PAGE_STATE_CLEAR);
1886 			page[i]->busy = false;
1887 			DEBUG_PAGE_ACCESS_END(page[i]);
1888 			sClearPageQueue.PrependUnlocked(page[i]);
1889 		}
1890 
1891 		locker.Unlock();
1892 
1893 		unreserve_pages(reserved);
1894 
1895 		TA(ScrubbedPages(scrubCount));
1896 
1897 		// wait at least 100ms between runs
1898 		snooze(100 * 1000);
1899 	}
1900 
1901 	return 0;
1902 }
1903 
1904 
1905 static void
1906 init_page_marker(vm_page &marker)
1907 {
1908 	marker.SetCacheRef(NULL);
1909 	marker.InitState(PAGE_STATE_UNUSED);
1910 	marker.busy = true;
1911 #if DEBUG_PAGE_QUEUE
1912 	marker.queue = NULL;
1913 #endif
1914 #if DEBUG_PAGE_ACCESS
1915 	marker.accessing_thread = thread_get_current_thread_id();
1916 #endif
1917 }
1918 
1919 
1920 static void
1921 remove_page_marker(struct vm_page &marker)
1922 {
1923 	DEBUG_PAGE_ACCESS_CHECK(&marker);
1924 
1925 	if (marker.State() < PAGE_STATE_FIRST_UNQUEUED)
1926 		sPageQueues[marker.State()].RemoveUnlocked(&marker);
1927 
1928 	marker.SetState(PAGE_STATE_UNUSED);
1929 }
1930 
1931 
1932 static vm_page*
1933 next_modified_page(page_num_t& maxPagesToSee)
1934 {
1935 	InterruptsSpinLocker locker(sModifiedPageQueue.GetLock());
1936 
1937 	while (maxPagesToSee > 0) {
1938 		vm_page* page = sModifiedPageQueue.Head();
1939 		if (page == NULL)
1940 			return NULL;
1941 
1942 		sModifiedPageQueue.Requeue(page, true);
1943 
1944 		maxPagesToSee--;
1945 
1946 		if (!page->busy)
1947 			return page;
1948 	}
1949 
1950 	return NULL;
1951 }
1952 
1953 
1954 // #pragma mark -
1955 
1956 
1957 class PageWriteTransfer;
1958 class PageWriteWrapper;
1959 
1960 
1961 class PageWriterRun {
1962 public:
1963 	status_t Init(uint32 maxPages);
1964 
1965 	void PrepareNextRun();
1966 	void AddPage(vm_page* page);
1967 	uint32 Go();
1968 
1969 	void PageWritten(PageWriteTransfer* transfer, status_t status,
1970 		bool partialTransfer, size_t bytesTransferred);
1971 
1972 private:
1973 	uint32				fMaxPages;
1974 	uint32				fWrapperCount;
1975 	uint32				fTransferCount;
1976 	int32				fPendingTransfers;
1977 	PageWriteWrapper*	fWrappers;
1978 	PageWriteTransfer*	fTransfers;
1979 	ConditionVariable	fAllFinishedCondition;
1980 };
1981 
1982 
1983 class PageWriteTransfer : public AsyncIOCallback {
1984 public:
1985 	void SetTo(PageWriterRun* run, vm_page* page, int32 maxPages);
1986 	bool AddPage(vm_page* page);
1987 
1988 	status_t Schedule(uint32 flags);
1989 
1990 	void SetStatus(status_t status, size_t transferred);
1991 
1992 	status_t Status() const	{ return fStatus; }
1993 	struct VMCache* Cache() const { return fCache; }
1994 	uint32 PageCount() const { return fPageCount; }
1995 
1996 	virtual void IOFinished(status_t status, bool partialTransfer,
1997 		generic_size_t bytesTransferred);
1998 
1999 private:
2000 	PageWriterRun*		fRun;
2001 	struct VMCache*		fCache;
2002 	off_t				fOffset;
2003 	uint32				fPageCount;
2004 	int32				fMaxPages;
2005 	status_t			fStatus;
2006 	uint32				fVecCount;
2007 	generic_io_vec		fVecs[32]; // TODO: make dynamic/configurable
2008 };
2009 
2010 
2011 class PageWriteWrapper {
2012 public:
2013 	PageWriteWrapper();
2014 	~PageWriteWrapper();
2015 	void SetTo(vm_page* page);
2016 	bool Done(status_t result);
2017 
2018 private:
2019 	vm_page*			fPage;
2020 	struct VMCache*		fCache;
2021 	bool				fIsActive;
2022 };
2023 
2024 
2025 PageWriteWrapper::PageWriteWrapper()
2026 	:
2027 	fIsActive(false)
2028 {
2029 }
2030 
2031 
2032 PageWriteWrapper::~PageWriteWrapper()
2033 {
2034 	if (fIsActive)
2035 		panic("page write wrapper going out of scope but isn't completed");
2036 }
2037 
2038 
2039 /*!	The page's cache must be locked.
2040 */
2041 void
2042 PageWriteWrapper::SetTo(vm_page* page)
2043 {
2044 	DEBUG_PAGE_ACCESS_CHECK(page);
2045 
2046 	if (page->busy)
2047 		panic("setting page write wrapper to busy page");
2048 
2049 	if (fIsActive)
2050 		panic("re-setting page write wrapper that isn't completed");
2051 
2052 	fPage = page;
2053 	fCache = page->Cache();
2054 	fIsActive = true;
2055 
2056 	fPage->busy = true;
2057 	fPage->busy_writing = true;
2058 
2059 	// We have a modified page -- however, while we're writing it back,
2060 	// the page might still be mapped. In order not to lose any changes to the
2061 	// page, we mark it clean before actually writing it back; if
2062 	// writing the page fails for some reason, we'll just keep it in the
2063 	// modified page list, but that should happen only rarely.
2064 
2065 	// If the page is changed after we cleared the dirty flag, but before we
2066 	// had the chance to write it back, then we'll write it again later -- that
2067 	// will probably not happen that often, though.
2068 
2069 	vm_clear_map_flags(fPage, PAGE_MODIFIED);
2070 }
2071 
2072 
2073 /*!	The page's cache must be locked.
2074 	The page queues must not be locked.
2075 	\return \c true if the page was written successfully or could otherwise
2076 		be handled, \c false otherwise.
2077 */
2078 bool
2079 PageWriteWrapper::Done(status_t result)
2080 {
2081 	if (!fIsActive)
2082 		panic("completing page write wrapper that is not active");
2083 
2084 	DEBUG_PAGE_ACCESS_START(fPage);
2085 
2086 	fPage->busy = false;
2087 		// Set unbusy and notify later by hand, since we might free the page.
2088 
2089 	bool success = true;
2090 
2091 	if (result == B_OK) {
2092 		// put it into the active/inactive queue
2093 		move_page_to_appropriate_queue(fPage);
2094 		fPage->busy_writing = false;
2095 		DEBUG_PAGE_ACCESS_END(fPage);
2096 	} else {
2097 		// Writing the page failed. One reason would be that the cache has been
2098 		// shrunk and the page no longer belongs to the file. Otherwise the
2099 		// actual I/O failed, in which case we'll simply keep the page modified.
2100 
2101 		if (!fPage->busy_writing) {
2102 			// The busy_writing flag was cleared. That means the cache has been
2103 			// shrunk while we were trying to write the page and we have to free
2104 			// it now.
2105 			vm_remove_all_page_mappings(fPage);
2106 // TODO: Unmapping should already happen when resizing the cache!
2107 			fCache->RemovePage(fPage);
2108 			free_page(fPage, false);
2109 			unreserve_pages(1);
2110 		} else {
2111 			// Writing the page failed -- mark the page modified again. To
2112 			// avoid retrying the write over and over again, temporary pages
2113 			// are moved to the active or inactive queue instead of the
2114 			// modified queue. Non-temporary pages stay in the modified
2115 			// queue, though, so they don't get lost in the inactive queue.
2116 			dprintf("PageWriteWrapper: Failed to write page %p: %s\n", fPage,
2117 				strerror(result));
2118 
2119 			fPage->modified = true;
2120 			if (!fCache->temporary)
2121 				set_page_state(fPage, PAGE_STATE_MODIFIED);
2122 			else if (fPage->IsMapped())
2123 				set_page_state(fPage, PAGE_STATE_ACTIVE);
2124 			else
2125 				set_page_state(fPage, PAGE_STATE_INACTIVE);
2126 
2127 			fPage->busy_writing = false;
2128 			DEBUG_PAGE_ACCESS_END(fPage);
2129 
2130 			success = false;
2131 		}
2132 	}
2133 
2134 	fCache->NotifyPageEvents(fPage, PAGE_EVENT_NOT_BUSY);
2135 	fIsActive = false;
2136 
2137 	return success;
2138 }
2139 
2140 
2141 /*!	The page's cache must be locked.
2142 */
2143 void
2144 PageWriteTransfer::SetTo(PageWriterRun* run, vm_page* page, int32 maxPages)
2145 {
2146 	fRun = run;
2147 	fCache = page->Cache();
2148 	fOffset = page->cache_offset;
2149 	fPageCount = 1;
2150 	fMaxPages = maxPages;
2151 	fStatus = B_OK;
2152 
2153 	fVecs[0].base = (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2154 	fVecs[0].length = B_PAGE_SIZE;
2155 	fVecCount = 1;
2156 }
2157 
2158 
2159 /*!	The page's cache must be locked.
2160 */
2161 bool
2162 PageWriteTransfer::AddPage(vm_page* page)
2163 {
2164 	if (page->Cache() != fCache
2165 		|| (fMaxPages >= 0 && fPageCount >= (uint32)fMaxPages))
2166 		return false;
2167 
2168 	phys_addr_t nextBase = fVecs[fVecCount - 1].base
2169 		+ fVecs[fVecCount - 1].length;
2170 
2171 	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2172 		&& (off_t)page->cache_offset == fOffset + fPageCount) {
2173 		// append to last iovec
2174 		fVecs[fVecCount - 1].length += B_PAGE_SIZE;
2175 		fPageCount++;
2176 		return true;
2177 	}
2178 
2179 	nextBase = fVecs[0].base - B_PAGE_SIZE;
2180 	if ((phys_addr_t)page->physical_page_number << PAGE_SHIFT == nextBase
2181 		&& (off_t)page->cache_offset == fOffset - 1) {
2182 		// prepend to first iovec and adjust offset
2183 		fVecs[0].base = nextBase;
2184 		fVecs[0].length += B_PAGE_SIZE;
2185 		fOffset = page->cache_offset;
2186 		fPageCount++;
2187 		return true;
2188 	}
2189 
2190 	if (((off_t)page->cache_offset == fOffset + fPageCount
2191 			|| (off_t)page->cache_offset == fOffset - 1)
2192 		&& fVecCount < sizeof(fVecs) / sizeof(fVecs[0])) {
2193 		// not physically contiguous or not in the right order
2194 		uint32 vectorIndex;
2195 		if ((off_t)page->cache_offset < fOffset) {
2196 			// we are pre-pending another vector, move the other vecs
2197 			for (uint32 i = fVecCount; i > 0; i--)
2198 				fVecs[i] = fVecs[i - 1];
2199 
2200 			fOffset = page->cache_offset;
2201 			vectorIndex = 0;
2202 		} else
2203 			vectorIndex = fVecCount;
2204 
2205 		fVecs[vectorIndex].base
2206 			= (phys_addr_t)page->physical_page_number << PAGE_SHIFT;
2207 		fVecs[vectorIndex].length = B_PAGE_SIZE;
2208 
2209 		fVecCount++;
2210 		fPageCount++;
2211 		return true;
2212 	}
2213 
2214 	return false;
2215 }
2216 
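/*	An illustration of the coalescing done by PageWriteTransfer::AddPage()
	above (hypothetical numbers): assume a transfer that currently covers
	cache offsets 10-11 with a single vec for physical pages 0x1200-0x1201
	(fOffset == 10, fPageCount == 2).
	 - A page at cache offset 12 backed by physical page 0x1202 simply extends
	   fVecs[0] by B_PAGE_SIZE.
	 - A page at cache offset 12 backed by physical page 0x1c00 starts a new
	   vec, provided fVecs isn't full yet.
	 - A page at cache offset 14, or a page belonging to a different cache, is
	   rejected and will end up in a separate transfer.
*/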
2217 
2218 status_t
2219 PageWriteTransfer::Schedule(uint32 flags)
2220 {
2221 	off_t writeOffset = (off_t)fOffset << PAGE_SHIFT;
2222 	generic_size_t writeLength = (phys_size_t)fPageCount << PAGE_SHIFT;
2223 
2224 	if (fRun != NULL) {
2225 		return fCache->WriteAsync(writeOffset, fVecs, fVecCount, writeLength,
2226 			flags | B_PHYSICAL_IO_REQUEST, this);
2227 	}
2228 
2229 	status_t status = fCache->Write(writeOffset, fVecs, fVecCount,
2230 		flags | B_PHYSICAL_IO_REQUEST, &writeLength);
2231 
2232 	SetStatus(status, writeLength);
2233 	return fStatus;
2234 }
2235 
2236 
2237 void
2238 PageWriteTransfer::SetStatus(status_t status, size_t transferred)
2239 {
2240 	// only succeed if all pages up to the last one have been written fully
2241 	// and the last page has at least been written partially
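	// For instance, a transfer of four pages only counts as successful if
	// more than 3 * B_PAGE_SIZE bytes have been transferred.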
2242 	if (status == B_OK && transferred <= (fPageCount - 1) * B_PAGE_SIZE)
2243 		status = B_ERROR;
2244 
2245 	fStatus = status;
2246 }
2247 
2248 
2249 void
2250 PageWriteTransfer::IOFinished(status_t status, bool partialTransfer,
2251 	generic_size_t bytesTransferred)
2252 {
2253 	SetStatus(status, bytesTransferred);
2254 	fRun->PageWritten(this, fStatus, partialTransfer, bytesTransferred);
2255 }
2256 
2257 
2258 status_t
2259 PageWriterRun::Init(uint32 maxPages)
2260 {
2261 	fMaxPages = maxPages;
2262 	fWrapperCount = 0;
2263 	fTransferCount = 0;
2264 	fPendingTransfers = 0;
2265 
2266 	fWrappers = new(std::nothrow) PageWriteWrapper[maxPages];
2267 	fTransfers = new(std::nothrow) PageWriteTransfer[maxPages];
2268 	if (fWrappers == NULL || fTransfers == NULL)
2269 		return B_NO_MEMORY;
2270 
2271 	return B_OK;
2272 }
2273 
2274 
2275 void
2276 PageWriterRun::PrepareNextRun()
2277 {
2278 	fWrapperCount = 0;
2279 	fTransferCount = 0;
2280 	fPendingTransfers = 0;
2281 }
2282 
2283 
2284 /*!	The page's cache must be locked.
2285 */
2286 void
2287 PageWriterRun::AddPage(vm_page* page)
2288 {
2289 	fWrappers[fWrapperCount++].SetTo(page);
2290 
2291 	if (fTransferCount == 0 || !fTransfers[fTransferCount - 1].AddPage(page)) {
2292 		fTransfers[fTransferCount++].SetTo(this, page,
2293 			page->Cache()->MaxPagesPerAsyncWrite());
2294 	}
2295 }
2296 
2297 
2298 /*!	Writes all pages previously added.
2299 	\return The number of pages that could not be written or otherwise handled.
2300 */
2301 uint32
2302 PageWriterRun::Go()
2303 {
2304 	atomic_set(&fPendingTransfers, fTransferCount);
2305 
2306 	fAllFinishedCondition.Init(this, "page writer wait for I/O");
2307 	ConditionVariableEntry waitEntry;
2308 	fAllFinishedCondition.Add(&waitEntry);
2309 
2310 	// schedule writes
2311 	for (uint32 i = 0; i < fTransferCount; i++)
2312 		fTransfers[i].Schedule(B_VIP_IO_REQUEST);
2313 
2314 	// wait until all pages have been written
2315 	waitEntry.Wait();
2316 
2317 	// mark pages depending on whether they could be written or not
2318 
2319 	uint32 failedPages = 0;
2320 	uint32 wrapperIndex = 0;
2321 	for (uint32 i = 0; i < fTransferCount; i++) {
2322 		PageWriteTransfer& transfer = fTransfers[i];
2323 		transfer.Cache()->Lock();
2324 
2325 		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2326 			if (!fWrappers[wrapperIndex++].Done(transfer.Status()))
2327 				failedPages++;
2328 		}
2329 
2330 		transfer.Cache()->Unlock();
2331 	}
2332 
2333 	ASSERT(wrapperIndex == fWrapperCount);
2334 
2335 	for (uint32 i = 0; i < fTransferCount; i++) {
2336 		PageWriteTransfer& transfer = fTransfers[i];
2337 		struct VMCache* cache = transfer.Cache();
2338 
2339 		// We've acquired a reference for each page
2340 		for (uint32 j = 0; j < transfer.PageCount(); j++) {
2341 			// We release the cache references after all pages were made
2342 			// unbusy again - otherwise releasing a vnode could deadlock.
2343 			cache->ReleaseStoreRef();
2344 			cache->ReleaseRef();
2345 		}
2346 	}
2347 
2348 	return failedPages;
2349 }
2350 
2351 
2352 void
2353 PageWriterRun::PageWritten(PageWriteTransfer* transfer, status_t status,
2354 	bool partialTransfer, size_t bytesTransferred)
2355 {
2356 	if (atomic_add(&fPendingTransfers, -1) == 1)
2357 		fAllFinishedCondition.NotifyAll();
2358 }
2359 
2360 
2361 /*!	The page writer continuously takes some pages from the modified
2362 	queue, writes them back, and moves them back to the active queue.
2363 	It runs in its own thread, and is only there to keep the number
2364 	of modified pages low, so that more pages can be reused at a
2365 	lower cost.
2366 */
2367 status_t
2368 page_writer(void* /*unused*/)
2369 {
2370 	const uint32 kNumPages = 256;
2371 #ifdef TRACE_VM_PAGE
2372 	uint32 writtenPages = 0;
2373 	bigtime_t lastWrittenTime = 0;
2374 	bigtime_t pageCollectionTime = 0;
2375 	bigtime_t pageWritingTime = 0;
2376 #endif
2377 
2378 	PageWriterRun run;
2379 	if (run.Init(kNumPages) != B_OK) {
2380 		panic("page writer: Failed to init PageWriterRun!");
2381 		return B_ERROR;
2382 	}
2383 
2384 	page_num_t pagesSinceLastSuccessfulWrite = 0;
2385 
2386 	while (true) {
2387 // TODO: Maybe wait shorter when memory is low!
2388 		if (sModifiedPageQueue.Count() < kNumPages) {
2389 			sPageWriterCondition.Wait(3000000, true);
2390 				// every 3 seconds when no one triggers us
2391 		}
2392 
2393 		page_num_t modifiedPages = sModifiedPageQueue.Count();
2394 		if (modifiedPages == 0)
2395 			continue;
2396 
2397 		if (modifiedPages <= pagesSinceLastSuccessfulWrite) {
2398 			// We ran through the whole queue without being able to write a
2399 			// single page. Take a break.
2400 			snooze(500000);
2401 			pagesSinceLastSuccessfulWrite = 0;
2402 		}
2403 
2404 #if ENABLE_SWAP_SUPPORT
2405 		page_stats pageStats;
2406 		get_page_stats(pageStats);
2407 		bool activePaging = do_active_paging(pageStats);
2408 #endif
2409 
2410 		// depending on how urgent it becomes to get pages to disk, we adjust
2411 		// our I/O priority
2412 		uint32 lowPagesState = low_resource_state(B_KERNEL_RESOURCE_PAGES);
2413 		int32 ioPriority = B_IDLE_PRIORITY;
2414 		if (lowPagesState >= B_LOW_RESOURCE_CRITICAL
2415 			|| modifiedPages > MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD) {
2416 			ioPriority = MAX_PAGE_WRITER_IO_PRIORITY;
2417 		} else {
2418 			ioPriority = (uint64)MAX_PAGE_WRITER_IO_PRIORITY * modifiedPages
2419 				/ MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD;
2420 		}
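		// For example, when about half the threshold's worth of pages is
		// modified, the page writer runs at roughly half the maximum I/O
		// priority.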
2421 
2422 		thread_set_io_priority(ioPriority);
2423 
2424 		uint32 numPages = 0;
2425 		run.PrepareNextRun();
2426 
2427 		// TODO: make this laptop friendly, too (ie. only start doing
2428 		// something if someone else did something or there is really
2429 		// enough to do).
2430 
2431 		// collect pages to be written
2432 #ifdef TRACE_VM_PAGE
2433 		pageCollectionTime -= system_time();
2434 #endif
2435 
2436 		page_num_t maxPagesToSee = modifiedPages;
2437 
2438 		while (numPages < kNumPages && maxPagesToSee > 0) {
2439 			vm_page *page = next_modified_page(maxPagesToSee);
2440 			if (page == NULL)
2441 				break;
2442 
2443 			PageCacheLocker cacheLocker(page, false);
2444 			if (!cacheLocker.IsLocked())
2445 				continue;
2446 
2447 			VMCache *cache = page->Cache();
2448 
2449 			// If the page is busy or its state has changed while we were
2450 			// locking the cache, just ignore it.
2451 			if (page->busy || page->State() != PAGE_STATE_MODIFIED)
2452 				continue;
2453 
2454 			DEBUG_PAGE_ACCESS_START(page);
2455 
2456 			// Don't write back wired (locked) pages.
2457 			if (page->WiredCount() > 0) {
2458 				set_page_state(page, PAGE_STATE_ACTIVE);
2459 				DEBUG_PAGE_ACCESS_END(page);
2460 				continue;
2461 			}
2462 
2463 			// Write back temporary pages only when we're actively paging.
2464 			if (cache->temporary
2465 #if ENABLE_SWAP_SUPPORT
2466 				&& (!activePaging
2467 					|| !cache->CanWritePage(
2468 							(off_t)page->cache_offset << PAGE_SHIFT))
2469 #endif
2470 				) {
2471 				// We can't/don't want to do anything with this page, so move it
2472 				// to one of the other queues.
2473 				if (page->mappings.IsEmpty())
2474 					set_page_state(page, PAGE_STATE_INACTIVE);
2475 				else
2476 					set_page_state(page, PAGE_STATE_ACTIVE);
2477 
2478 				DEBUG_PAGE_ACCESS_END(page);
2479 				continue;
2480 			}
2481 
2482 			// We need our own reference to the store, as it might currently be
2483 			// destroyed.
2484 			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
2485 				DEBUG_PAGE_ACCESS_END(page);
2486 				cacheLocker.Unlock();
2487 				thread_yield();
2488 				continue;
2489 			}
2490 
2491 			run.AddPage(page);
2492 				// TODO: We're possibly adding pages of different caches and
2493 				// thus maybe of different underlying file systems here. This
2494 				// is a potential problem for loop file systems/devices, since
2495 				// we could mark a page busy that would need to be accessed
2496 				// when writing back another page, thus causing a deadlock.
2497 
2498 			DEBUG_PAGE_ACCESS_END(page);
2499 
2500 			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
2501 			TPW(WritePage(page));
2502 
2503 			cache->AcquireRefLocked();
2504 			numPages++;
2505 		}
2506 
2507 #ifdef TRACE_VM_PAGE
2508 		pageCollectionTime += system_time();
2509 #endif
2510 		if (numPages == 0)
2511 			continue;
2512 
2513 		// write pages to disk and do all the cleanup
2514 #ifdef TRACE_VM_PAGE
2515 		pageWritingTime -= system_time();
2516 #endif
2517 		uint32 failedPages = run.Go();
2518 #ifdef TRACE_VM_PAGE
2519 		pageWritingTime += system_time();
2520 
2521 		// debug output only...
2522 		writtenPages += numPages;
2523 		if (writtenPages >= 1024) {
2524 			bigtime_t now = system_time();
2525 			TRACE(("page writer: wrote 1024 pages (total: %" B_PRIu64 " ms, "
2526 				"collect: %" B_PRIu64 " ms, write: %" B_PRIu64 " ms)\n",
2527 				(now - lastWrittenTime) / 1000,
2528 				pageCollectionTime / 1000, pageWritingTime / 1000));
2529 			lastWrittenTime = now;
2530 
2531 			writtenPages -= 1024;
2532 			pageCollectionTime = 0;
2533 			pageWritingTime = 0;
2534 		}
2535 #endif
2536 
2537 		if (failedPages == numPages)
2538 			pagesSinceLastSuccessfulWrite += modifiedPages - maxPagesToSee;
2539 		else
2540 			pagesSinceLastSuccessfulWrite = 0;
2541 	}
2542 
2543 	return B_OK;
2544 }
2545 
2546 
2547 // #pragma mark -
2548 
2549 
2550 // TODO: This should be done in the page daemon!
2551 #if 0
2552 #if ENABLE_SWAP_SUPPORT
2553 static bool
2554 free_page_swap_space(int32 index)
2555 {
2556 	vm_page *page = vm_page_at_index(index);
2557 	PageCacheLocker locker(page);
2558 	if (!locker.IsLocked())
2559 		return false;
2560 
2561 	DEBUG_PAGE_ACCESS_START(page);
2562 
2563 	VMCache* cache = page->Cache();
2564 	if (cache->temporary && page->WiredCount() == 0
2565 			&& cache->HasPage(page->cache_offset << PAGE_SHIFT)
2566 			&& page->usage_count > 0) {
2567 		// TODO: how to judge a page is highly active?
2568 		if (swap_free_page_swap_space(page)) {
2569 			// We need to mark the page modified, since otherwise it could be
2570 			// stolen and we'd lose its data.
2571 			vm_page_set_state(page, PAGE_STATE_MODIFIED);
2572 			TD(FreedPageSwap(page));
2573 			DEBUG_PAGE_ACCESS_END(page);
2574 			return true;
2575 		}
2576 	}
2577 	DEBUG_PAGE_ACCESS_END(page);
2578 	return false;
2579 }
2580 #endif
2581 #endif	// 0
2582 
2583 
2584 static vm_page *
2585 find_cached_page_candidate(struct vm_page &marker)
2586 {
2587 	DEBUG_PAGE_ACCESS_CHECK(&marker);
2588 
2589 	InterruptsSpinLocker locker(sCachedPageQueue.GetLock());
2590 	vm_page *page;
2591 
2592 	if (marker.State() == PAGE_STATE_UNUSED) {
2593 		// Get the first page of the cached queue
2594 		page = sCachedPageQueue.Head();
2595 	} else {
2596 		// Get the next page of the current queue
2597 		if (marker.State() != PAGE_STATE_CACHED) {
2598 			panic("invalid marker %p state", &marker);
2599 			return NULL;
2600 		}
2601 
2602 		page = sCachedPageQueue.Next(&marker);
2603 		sCachedPageQueue.Remove(&marker);
2604 		marker.SetState(PAGE_STATE_UNUSED);
2605 	}
2606 
2607 	while (page != NULL) {
2608 		if (!page->busy) {
2609 			// we found a candidate, insert marker
2610 			marker.SetState(PAGE_STATE_CACHED);
2611 			sCachedPageQueue.InsertAfter(page, &marker);
2612 			return page;
2613 		}
2614 
2615 		page = sCachedPageQueue.Next(page);
2616 	}
2617 
2618 	return NULL;
2619 }
2620 
2621 
2622 static bool
2623 free_cached_page(vm_page *page, bool dontWait)
2624 {
2625 	// try to lock the page's cache
2626 	if (vm_cache_acquire_locked_page_cache(page, dontWait) == NULL)
2627 		return false;
2628 	VMCache* cache = page->Cache();
2629 
2630 	AutoLocker<VMCache> cacheLocker(cache, true);
2631 	MethodDeleter<VMCache, void, &VMCache::ReleaseRefLocked> _2(cache);
2632 
2633 	// check again if that page is still a candidate
2634 	if (page->busy || page->State() != PAGE_STATE_CACHED)
2635 		return false;
2636 
2637 	DEBUG_PAGE_ACCESS_START(page);
2638 
2639 	PAGE_ASSERT(page, !page->IsMapped());
2640 	PAGE_ASSERT(page, !page->modified);
2641 
2642 	// we can now steal this page
2643 
2644 	cache->RemovePage(page);
2645 		// Now the page doesn't have a cache anymore, so no one else (e.g.
2646 		// vm_page_allocate_page_run()) can pick it up, since they would be
2647 		// required to lock the cache first, which would fail.
2648 
2649 	sCachedPageQueue.RemoveUnlocked(page);
2650 	return true;
2651 }
2652 
2653 
2654 static uint32
2655 free_cached_pages(uint32 pagesToFree, bool dontWait)
2656 {
2657 	vm_page marker;
2658 	init_page_marker(marker);
2659 
2660 	uint32 pagesFreed = 0;
2661 
2662 	while (pagesFreed < pagesToFree) {
2663 		vm_page *page = find_cached_page_candidate(marker);
2664 		if (page == NULL)
2665 			break;
2666 
2667 		if (free_cached_page(page, dontWait)) {
2668 			ReadLocker locker(sFreePageQueuesLock);
2669 			page->SetState(PAGE_STATE_FREE);
2670 			DEBUG_PAGE_ACCESS_END(page);
2671 			sFreePageQueue.PrependUnlocked(page);
2672 			locker.Unlock();
2673 
2674 			TA(StolenPage());
2675 
2676 			pagesFreed++;
2677 		}
2678 	}
2679 
2680 	remove_page_marker(marker);
2681 
2682 	sFreePageCondition.NotifyAll();
2683 
2684 	return pagesFreed;
2685 }
2686 
2687 
2688 static void
2689 idle_scan_active_pages(page_stats& pageStats)
2690 {
2691 	VMPageQueue& queue = sActivePageQueue;
2692 
2693 	// We want to scan the whole queue in roughly kIdleRunsForFullQueue runs.
2694 	uint32 maxToScan = queue.Count() / kIdleRunsForFullQueue + 1;
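	// For example, a 10000 page active queue is scanned roughly 500 pages at
	// a time per idle run.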
2695 
2696 	while (maxToScan > 0) {
2697 		maxToScan--;
2698 
2699 		// Get the next page. Note that we don't bother to lock here. We go with
2700 		// the assumption that on all architectures reading/writing pointers is
2701 		// atomic. Beyond that it doesn't really matter. We have to unlock the
2702 		// queue anyway to lock the page's cache, and we'll recheck afterwards.
2703 		vm_page* page = queue.Head();
2704 		if (page == NULL)
2705 			break;
2706 
2707 		// lock the page's cache
2708 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2709 		if (cache == NULL)
2710 			continue;
2711 
2712 		if (page->State() != PAGE_STATE_ACTIVE) {
2713 			// page is no longer in the cache or in this queue
2714 			cache->ReleaseRefAndUnlock();
2715 			continue;
2716 		}
2717 
2718 		if (page->busy) {
2719 			// page is busy -- requeue at the end
2720 			vm_page_requeue(page, true);
2721 			cache->ReleaseRefAndUnlock();
2722 			continue;
2723 		}
2724 
2725 		DEBUG_PAGE_ACCESS_START(page);
2726 
2727 		// Get the page active/modified flags and update the page's usage count.
2728 		// We completely unmap inactive temporary pages. This saves us from
2729 		// having to iterate through the inactive list, since we'll be notified
2730 		// via page fault whenever such an inactive page is used again.
2731 		// We don't remove the mappings of non-temporary pages, since we
2732 		// wouldn't notice when those would become unused and could thus be
2733 		// moved to the cached list.
2734 		int32 usageCount;
2735 		if (page->WiredCount() > 0 || page->usage_count > 0
2736 			|| !cache->temporary) {
2737 			usageCount = vm_clear_page_mapping_accessed_flags(page);
2738 		} else
2739 			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2740 
2741 		if (usageCount > 0) {
2742 			usageCount += page->usage_count + kPageUsageAdvance;
2743 			if (usageCount > kPageUsageMax)
2744 				usageCount = kPageUsageMax;
2745 // TODO: This would probably also be the place to reclaim swap space.
2746 		} else {
2747 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2748 			if (usageCount < 0) {
2749 				usageCount = 0;
2750 				set_page_state(page, PAGE_STATE_INACTIVE);
2751 			}
2752 		}
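		// For instance, a page with usage_count 5 whose accessed flag was set
		// in one mapping ends up at 5 + 1 + kPageUsageAdvance, while an
		// unaccessed one drops to 5 - kPageUsageDecline; only pages whose
		// count would fall below zero are moved to the inactive queue here.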
2753 
2754 		page->usage_count = usageCount;
2755 
2756 		DEBUG_PAGE_ACCESS_END(page);
2757 
2758 		cache->ReleaseRefAndUnlock();
2759 	}
2760 }
2761 
2762 
2763 static void
2764 full_scan_inactive_pages(page_stats& pageStats, int32 despairLevel)
2765 {
2766 	int32 pagesToFree = pageStats.unsatisfiedReservations
2767 		+ sFreeOrCachedPagesTarget
2768 		- (pageStats.totalFreePages + pageStats.cachedPages);
2769 	if (pagesToFree <= 0)
2770 		return;
2771 
2772 	bigtime_t time = system_time();
2773 	uint32 pagesScanned = 0;
2774 	uint32 pagesToCached = 0;
2775 	uint32 pagesToModified = 0;
2776 	uint32 pagesToActive = 0;
2777 
2778 	// Determine how many pages at maximum to send to the modified queue. Since
2779 	// it is relatively expensive to page out pages, we do that on a grander
2780 	// scale only when things get desperate.
2781 	uint32 maxToFlush = despairLevel <= 1 ? 32 : 10000;
2782 
2783 	vm_page marker;
2784 	init_page_marker(marker);
2785 
2786 	VMPageQueue& queue = sInactivePageQueue;
2787 	InterruptsSpinLocker queueLocker(queue.GetLock());
2788 	uint32 maxToScan = queue.Count();
2789 
2790 	vm_page* nextPage = queue.Head();
2791 
2792 	while (pagesToFree > 0 && maxToScan > 0) {
2793 		maxToScan--;
2794 
2795 		// get the next page
2796 		vm_page* page = nextPage;
2797 		if (page == NULL)
2798 			break;
2799 		nextPage = queue.Next(page);
2800 
2801 		if (page->busy)
2802 			continue;
2803 
2804 		// mark the position
2805 		queue.InsertAfter(page, &marker);
2806 		queueLocker.Unlock();
2807 
2808 		// lock the page's cache
2809 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2810 		if (cache == NULL || page->busy
2811 				|| page->State() != PAGE_STATE_INACTIVE) {
2812 			if (cache != NULL)
2813 				cache->ReleaseRefAndUnlock();
2814 			queueLocker.Lock();
2815 			nextPage = queue.Next(&marker);
2816 			queue.Remove(&marker);
2817 			continue;
2818 		}
2819 
2820 		pagesScanned++;
2821 
2822 		DEBUG_PAGE_ACCESS_START(page);
2823 
2824 		// Get the accessed count, clear the accessed/modified flags and
2825 		// unmap the page, if it hasn't been accessed.
2826 		int32 usageCount;
2827 		if (page->WiredCount() > 0)
2828 			usageCount = vm_clear_page_mapping_accessed_flags(page);
2829 		else
2830 			usageCount = vm_remove_all_page_mappings_if_unaccessed(page);
2831 
2832 		// update usage count
2833 		if (usageCount > 0) {
2834 			usageCount += page->usage_count + kPageUsageAdvance;
2835 			if (usageCount > kPageUsageMax)
2836 				usageCount = kPageUsageMax;
2837 		} else {
2838 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2839 			if (usageCount < 0)
2840 				usageCount = 0;
2841 		}
2842 
2843 		page->usage_count = usageCount;
2844 
2845 		// Move to fitting queue or requeue:
2846 		// * Active mapped pages go to the active queue.
2847 		// * Inactive mapped (i.e. wired) pages are requeued.
2848 		// * The remaining pages are cachable. Thus, if unmodified they go to
2849 		//   the cached queue, otherwise to the modified queue (up to a limit).
2850 		//   Note that, unlike in the idle scan, we don't exempt pages of
2851 		//   temporary caches here. Apparently we really need memory, so we
2852 		//   better page out memory as well.
2853 		bool isMapped = page->IsMapped();
2854 		if (usageCount > 0) {
2855 			if (isMapped) {
2856 				set_page_state(page, PAGE_STATE_ACTIVE);
2857 				pagesToActive++;
2858 			} else
2859 				vm_page_requeue(page, true);
2860 		} else if (isMapped) {
2861 			vm_page_requeue(page, true);
2862 		} else if (!page->modified) {
2863 			set_page_state(page, PAGE_STATE_CACHED);
2864 			pagesToFree--;
2865 			pagesToCached++;
2866 		} else if (maxToFlush > 0) {
2867 			set_page_state(page, PAGE_STATE_MODIFIED);
2868 			maxToFlush--;
2869 			pagesToModified++;
2870 		} else
2871 			vm_page_requeue(page, true);
2872 
2873 		DEBUG_PAGE_ACCESS_END(page);
2874 
2875 		cache->ReleaseRefAndUnlock();
2876 
2877 		// remove the marker
2878 		queueLocker.Lock();
2879 		nextPage = queue.Next(&marker);
2880 		queue.Remove(&marker);
2881 	}
2882 
2883 	queueLocker.Unlock();
2884 
2885 	time = system_time() - time;
2886 	TRACE_DAEMON("  -> inactive scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2887 		", moved: %" B_PRIu32 " -> cached, %" B_PRIu32 " -> modified, %"
2888 		B_PRIu32 " -> active\n", time, pagesScanned, pagesToCached,
2889 		pagesToModified, pagesToActive);
2890 
2891 	// wake up the page writer, if we tossed it some pages
2892 	if (pagesToModified > 0)
2893 		sPageWriterCondition.WakeUp();
2894 }
2895 
2896 
2897 static void
2898 full_scan_active_pages(page_stats& pageStats, int32 despairLevel)
2899 {
2900 	vm_page marker;
2901 	init_page_marker(marker);
2902 
2903 	VMPageQueue& queue = sActivePageQueue;
2904 	InterruptsSpinLocker queueLocker(queue.GetLock());
2905 	uint32 maxToScan = queue.Count();
2906 
2907 	int32 pagesToDeactivate = pageStats.unsatisfiedReservations
2908 		+ sFreeOrCachedPagesTarget
2909 		- (pageStats.totalFreePages + pageStats.cachedPages)
2910 		+ std::max((int32)sInactivePagesTarget - (int32)maxToScan, (int32)0);
2911 	if (pagesToDeactivate <= 0)
2912 		return;
2913 
2914 	bigtime_t time = system_time();
2915 	uint32 pagesAccessed = 0;
2916 	uint32 pagesToInactive = 0;
2917 	uint32 pagesScanned = 0;
2918 
2919 	vm_page* nextPage = queue.Head();
2920 
2921 	while (pagesToDeactivate > 0 && maxToScan > 0) {
2922 		maxToScan--;
2923 
2924 		// get the next page
2925 		vm_page* page = nextPage;
2926 		if (page == NULL)
2927 			break;
2928 		nextPage = queue.Next(page);
2929 
2930 		if (page->busy)
2931 			continue;
2932 
2933 		// mark the position
2934 		queue.InsertAfter(page, &marker);
2935 		queueLocker.Unlock();
2936 
2937 		// lock the page's cache
2938 		VMCache* cache = vm_cache_acquire_locked_page_cache(page, true);
2939 		if (cache == NULL || page->busy || page->State() != PAGE_STATE_ACTIVE) {
2940 			if (cache != NULL)
2941 				cache->ReleaseRefAndUnlock();
2942 			queueLocker.Lock();
2943 			nextPage = queue.Next(&marker);
2944 			queue.Remove(&marker);
2945 			continue;
2946 		}
2947 
2948 		pagesScanned++;
2949 
2950 		DEBUG_PAGE_ACCESS_START(page);
2951 
2952 		// Get the page active/modified flags and update the page's usage count.
2953 		int32 usageCount = vm_clear_page_mapping_accessed_flags(page);
2954 
2955 		if (usageCount > 0) {
2956 			usageCount += page->usage_count + kPageUsageAdvance;
2957 			if (usageCount > kPageUsageMax)
2958 				usageCount = kPageUsageMax;
2959 			pagesAccessed++;
2960 // TODO: This would probably also be the place to reclaim swap space.
2961 		} else {
2962 			usageCount += page->usage_count - (int32)kPageUsageDecline;
2963 			if (usageCount <= 0) {
2964 				usageCount = 0;
2965 				set_page_state(page, PAGE_STATE_INACTIVE);
2966 				pagesToInactive++;
2967 			}
2968 		}
2969 
2970 		page->usage_count = usageCount;
2971 
2972 		DEBUG_PAGE_ACCESS_END(page);
2973 
2974 		cache->ReleaseRefAndUnlock();
2975 
2976 		// remove the marker
2977 		queueLocker.Lock();
2978 		nextPage = queue.Next(&marker);
2979 		queue.Remove(&marker);
2980 	}
2981 
2982 	time = system_time() - time;
2983 	TRACE_DAEMON("  ->   active scan (%7" B_PRId64 " us): scanned: %7" B_PRIu32
2984 		", moved: %" B_PRIu32 " -> inactive, encountered %" B_PRIu32 " accessed"
2985 		" ones\n", time, pagesScanned, pagesToInactive, pagesAccessed);
2986 }
2987 
2988 
2989 static void
2990 page_daemon_idle_scan(page_stats& pageStats)
2991 {
2992 	TRACE_DAEMON("page daemon: idle run\n");
2993 
2994 	if (pageStats.totalFreePages < (int32)sFreePagesTarget) {
2995 		// We want more actually free pages, so free some from the cached
2996 		// ones.
2997 		uint32 freed = free_cached_pages(
2998 			sFreePagesTarget - pageStats.totalFreePages, false);
2999 		if (freed > 0)
3000 			unreserve_pages(freed);
3001 		get_page_stats(pageStats);
3002 	}
3003 
3004 	// Walk the active list and move pages to the inactive queue.
3005 	get_page_stats(pageStats);
3006 	idle_scan_active_pages(pageStats);
3007 }
3008 
3009 
3010 static void
3011 page_daemon_full_scan(page_stats& pageStats, int32 despairLevel)
3012 {
3013 	TRACE_DAEMON("page daemon: full run: free: %" B_PRIu32 ", cached: %"
3014 		B_PRIu32 ", to free: %" B_PRIu32 "\n", pageStats.totalFreePages,
3015 		pageStats.cachedPages, pageStats.unsatisfiedReservations
3016 			+ sFreeOrCachedPagesTarget
3017 			- (pageStats.totalFreePages + pageStats.cachedPages));
3018 
3019 	// Walk the inactive list and transfer pages to the cached and modified
3020 	// queues.
3021 	full_scan_inactive_pages(pageStats, despairLevel);
3022 
3023 	// Free cached pages. Also wake up reservation waiters.
3024 	get_page_stats(pageStats);
3025 	int32 pagesToFree = pageStats.unsatisfiedReservations + sFreePagesTarget
3026 		- (pageStats.totalFreePages);
3027 	if (pagesToFree > 0) {
3028 		uint32 freed = free_cached_pages(pagesToFree, true);
3029 		if (freed > 0)
3030 			unreserve_pages(freed);
3031 	}
3032 
3033 	// Walk the active list and move pages to the inactive queue.
3034 	get_page_stats(pageStats);
3035 	full_scan_active_pages(pageStats, despairLevel);
3036 }
3037 
3038 
3039 static status_t
3040 page_daemon(void* /*unused*/)
3041 {
3042 	int32 despairLevel = 0;
3043 
3044 	while (true) {
3045 		sPageDaemonCondition.ClearActivated();
3046 
3047 		// evaluate the free pages situation
3048 		page_stats pageStats;
3049 		get_page_stats(pageStats);
3050 
3051 		if (!do_active_paging(pageStats)) {
3052 			// Things look good -- just maintain statistics and keep the pool
3053 			// of actually free pages full enough.
3054 			despairLevel = 0;
3055 			page_daemon_idle_scan(pageStats);
3056 			sPageDaemonCondition.Wait(kIdleScanWaitInterval, false);
3057 		} else {
3058 			// Not enough free pages. We need to do some real work.
3059 			despairLevel = std::max(despairLevel + 1, (int32)3);
3060 			page_daemon_full_scan(pageStats, despairLevel);
3061 
3062 			// Don't wait after the first full scan, but rather immediately
3063 			// check whether we were successful in freeing enough pages and
3064 			// re-run with increased despair level. The first scan is
3065 			// conservative with respect to moving inactive modified pages to
3066 			// the modified list to avoid thrashing. The second scan, however,
3067 			// will not hold back.
3068 			if (despairLevel > 1)
3069 				snooze(kBusyScanWaitInterval);
3070 		}
3071 	}
3072 
3073 	return B_OK;
3074 }
3075 
3076 
3077 /*!	Returns how many pages could *not* be reserved.
3078 */
3079 static uint32
3080 reserve_pages(uint32 count, int priority, bool dontWait)
3081 {
3082 	int32 dontTouch = kPageReserveForPriority[priority];
3083 
3084 	while (true) {
3085 		count -= reserve_some_pages(count, dontTouch);
3086 		if (count == 0)
3087 			return 0;
3088 
3089 		if (sUnsatisfiedPageReservations == 0) {
3090 			count -= free_cached_pages(count, dontWait);
3091 			if (count == 0)
3092 				return count;
3093 		}
3094 
3095 		if (dontWait)
3096 			return count;
3097 
3098 		// we need to wait for pages to become available
3099 
3100 		MutexLocker pageDeficitLocker(sPageDeficitLock);
3101 
3102 		bool notifyDaemon = sUnsatisfiedPageReservations == 0;
3103 		sUnsatisfiedPageReservations += count;
3104 
3105 		if (atomic_get(&sUnreservedFreePages) > dontTouch) {
3106 			// the situation changed
3107 			sUnsatisfiedPageReservations -= count;
3108 			continue;
3109 		}
3110 
3111 		PageReservationWaiter waiter;
3112 		waiter.dontTouch = dontTouch;
3113 		waiter.missing = count;
3114 		waiter.thread = thread_get_current_thread();
3115 		waiter.threadPriority = waiter.thread->priority;
3116 
3117 		// insert ordered (i.e. after all waiters with higher or equal priority)
3118 		PageReservationWaiter* otherWaiter = NULL;
3119 		for (PageReservationWaiterList::Iterator it
3120 				= sPageReservationWaiters.GetIterator();
3121 			(otherWaiter = it.Next()) != NULL;) {
3122 			if (waiter < *otherWaiter)
3123 				break;
3124 		}
3125 
3126 		sPageReservationWaiters.InsertBefore(otherWaiter, &waiter);
3127 
3128 		thread_prepare_to_block(waiter.thread, 0, THREAD_BLOCK_TYPE_OTHER,
3129 			"waiting for pages");
3130 
3131 		if (notifyDaemon)
3132 			sPageDaemonCondition.WakeUp();
3133 
3134 		pageDeficitLocker.Unlock();
3135 
3136 		low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
3137 		thread_block();
3138 
3139 		pageDeficitLocker.Lock();
3140 
3141 		return 0;
3142 	}
3143 }
3144 
3145 
3146 //	#pragma mark - private kernel API
3147 
3148 
3149 /*!	Writes a range of modified pages of a cache to disk.
3150 	You need to hold the VMCache lock when calling this function.
3151 	Note that the cache lock is released in this function.
3152 	\param cache The cache.
3153 	\param firstPage Offset (in page size units) of the first page in the range.
3154 	\param endPage End offset (in page size units) of the page range. The page
3155 		at this offset is not included.
3156 */
3157 status_t
3158 vm_page_write_modified_page_range(struct VMCache* cache, uint32 firstPage,
3159 	uint32 endPage)
3160 {
3161 	static const int32 kMaxPages = 256;
3162 	int32 maxPages = cache->MaxPagesPerWrite();
3163 	if (maxPages < 0 || maxPages > kMaxPages)
3164 		maxPages = kMaxPages;
3165 
3166 	const uint32 allocationFlags = HEAP_DONT_WAIT_FOR_MEMORY
3167 		| HEAP_DONT_LOCK_KERNEL_SPACE;
3168 
3169 	PageWriteWrapper stackWrappersPool[2];
3170 	PageWriteWrapper* stackWrappers[1];
3171 	PageWriteWrapper* wrapperPool
3172 		= new(malloc_flags(allocationFlags)) PageWriteWrapper[maxPages + 1];
3173 	PageWriteWrapper** wrappers
3174 		= new(malloc_flags(allocationFlags)) PageWriteWrapper*[maxPages];
3175 	if (wrapperPool == NULL || wrappers == NULL) {
3176 		// don't fail, just limit our capabilities
3177 		delete[] wrapperPool;
3178 		delete[] wrappers;
3179 		wrapperPool = stackWrappersPool;
3180 		wrappers = stackWrappers;
3181 		maxPages = 1;
3182 	}
3183 
3184 	int32 nextWrapper = 0;
3185 	int32 usedWrappers = 0;
3186 
3187 	PageWriteTransfer transfer;
3188 	bool transferEmpty = true;
3189 
3190 	VMCachePagesTree::Iterator it
3191 		= cache->pages.GetIterator(firstPage, true, true);
3192 
3193 	while (true) {
3194 		vm_page* page = it.Next();
3195 		if (page == NULL || page->cache_offset >= endPage) {
3196 			if (transferEmpty)
3197 				break;
3198 
3199 			page = NULL;
3200 		}
3201 
3202 		if (page != NULL) {
3203 			if (page->busy
3204 				|| (page->State() != PAGE_STATE_MODIFIED
3205 					&& !vm_test_map_modification(page))) {
3206 				page = NULL;
3207 			}
3208 		}
3209 
3210 		PageWriteWrapper* wrapper = NULL;
3211 		if (page != NULL) {
3212 			wrapper = &wrapperPool[nextWrapper++];
3213 			if (nextWrapper > maxPages)
3214 				nextWrapper = 0;
3215 
3216 			DEBUG_PAGE_ACCESS_START(page);
3217 
3218 			wrapper->SetTo(page);
3219 
3220 			if (transferEmpty || transfer.AddPage(page)) {
3221 				if (transferEmpty) {
3222 					transfer.SetTo(NULL, page, maxPages);
3223 					transferEmpty = false;
3224 				}
3225 
3226 				DEBUG_PAGE_ACCESS_END(page);
3227 
3228 				wrappers[usedWrappers++] = wrapper;
3229 				continue;
3230 			}
3231 
3232 			DEBUG_PAGE_ACCESS_END(page);
3233 		}
3234 
3235 		if (transferEmpty)
3236 			continue;
3237 
3238 		cache->Unlock();
3239 		status_t status = transfer.Schedule(0);
3240 		cache->Lock();
3241 
3242 		for (int32 i = 0; i < usedWrappers; i++)
3243 			wrappers[i]->Done(status);
3244 
3245 		usedWrappers = 0;
3246 
3247 		if (page != NULL) {
3248 			transfer.SetTo(NULL, page, maxPages);
3249 			wrappers[usedWrappers++] = wrapper;
3250 		} else
3251 			transferEmpty = true;
3252 	}
3253 
3254 	if (wrapperPool != stackWrappersPool) {
3255 		delete[] wrapperPool;
3256 		delete[] wrappers;
3257 	}
3258 
3259 	return B_OK;
3260 }
3261 
3262 
3263 /*!	You need to hold the VMCache lock when calling this function.
3264 	Note that the cache lock is released in this function.
3265 */
3266 status_t
3267 vm_page_write_modified_pages(VMCache *cache)
3268 {
3269 	return vm_page_write_modified_page_range(cache, 0,
3270 		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
3271 }
3272 
3273 
3274 /*!	Schedules the page writer to write back the specified \a page.
3275 	Note, however, that it might not do this immediately, and it can well
3276 	take several seconds until the page is actually written out.
3277 */
3278 void
3279 vm_page_schedule_write_page(vm_page *page)
3280 {
3281 	PAGE_ASSERT(page, page->State() == PAGE_STATE_MODIFIED);
3282 
3283 	vm_page_requeue(page, false);
3284 
3285 	sPageWriterCondition.WakeUp();
3286 }
3287 
3288 
3289 /*!	Cache must be locked.
3290 */
3291 void
3292 vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
3293 	uint32 endPage)
3294 {
3295 	uint32 modified = 0;
3296 	for (VMCachePagesTree::Iterator it
3297 				= cache->pages.GetIterator(firstPage, true, true);
3298 			vm_page *page = it.Next();) {
3299 		if (page->cache_offset >= endPage)
3300 			break;
3301 
3302 		if (!page->busy && page->State() == PAGE_STATE_MODIFIED) {
3303 			DEBUG_PAGE_ACCESS_START(page);
3304 			vm_page_requeue(page, false);
3305 			modified++;
3306 			DEBUG_PAGE_ACCESS_END(page);
3307 		}
3308 	}
3309 
3310 	if (modified > 0)
3311 		sPageWriterCondition.WakeUp();
3312 }
3313 
3314 
3315 void
3316 vm_page_init_num_pages(kernel_args *args)
3317 {
3318 	// calculate the size of memory by looking at the physical_memory_range array
3319 	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
3320 	page_num_t physicalPagesEnd = sPhysicalPageOffset
3321 		+ args->physical_memory_range[0].size / B_PAGE_SIZE;
3322 
3323 	sNonExistingPages = 0;
3324 	sIgnoredPages = args->ignored_physical_memory / B_PAGE_SIZE;
3325 
3326 	for (uint32 i = 1; i < args->num_physical_memory_ranges; i++) {
3327 		page_num_t start = args->physical_memory_range[i].start / B_PAGE_SIZE;
3328 		if (start > physicalPagesEnd)
3329 			sNonExistingPages += start - physicalPagesEnd;
3330 		physicalPagesEnd = start
3331 			+ args->physical_memory_range[i].size / B_PAGE_SIZE;
3332 
3333 #ifdef LIMIT_AVAILABLE_MEMORY
3334 		page_num_t available
3335 			= physicalPagesEnd - sPhysicalPageOffset - sNonExistingPages;
3336 		if (available > LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE)) {
3337 			physicalPagesEnd = sPhysicalPageOffset + sNonExistingPages
3338 				+ LIMIT_AVAILABLE_MEMORY * (1024 * 1024 / B_PAGE_SIZE);
3339 			break;
3340 		}
3341 #endif
3342 	}
3343 
3344 	TRACE(("first phys page = %#" B_PRIxPHYSADDR ", end %#" B_PRIxPHYSADDR "\n",
3345 		sPhysicalPageOffset, physicalPagesEnd));
3346 
3347 	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
3348 }
3349 
3350 
3351 status_t
3352 vm_page_init(kernel_args *args)
3353 {
3354 	TRACE(("vm_page_init: entry\n"));
3355 
3356 	// init page queues
3357 	sModifiedPageQueue.Init("modified pages queue");
3358 	sInactivePageQueue.Init("inactive pages queue");
3359 	sActivePageQueue.Init("active pages queue");
3360 	sCachedPageQueue.Init("cached pages queue");
3361 	sFreePageQueue.Init("free pages queue");
3362 	sClearPageQueue.Init("clear pages queue");
3363 
3364 	new (&sPageReservationWaiters) PageReservationWaiterList;
3365 
3366 	// map in the new free page table
3367 	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
3368 		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA, 0);
3369 
3370 	TRACE(("vm_init: putting free_page_table @ %p, # ents %" B_PRIuPHYSADDR
3371 		" (size %#" B_PRIxPHYSADDR ")\n", sPages, sNumPages,
3372 		(phys_addr_t)(sNumPages * sizeof(vm_page))));
3373 
3374 	// initialize the free page table
3375 	for (uint32 i = 0; i < sNumPages; i++) {
3376 		sPages[i].Init(sPhysicalPageOffset + i);
3377 		sFreePageQueue.Append(&sPages[i]);
3378 
3379 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3380 		sPages[i].allocation_tracking_info.Clear();
3381 #endif
3382 	}
3383 
3384 	sUnreservedFreePages = sNumPages;
3385 
3386 	TRACE(("initialized table\n"));
3387 
3388 	// mark the ranges between usable physical memory unused
3389 	phys_addr_t previousEnd = 0;
3390 	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
3391 		phys_addr_t base = args->physical_memory_range[i].start;
3392 		phys_size_t size = args->physical_memory_range[i].size;
3393 		if (base > previousEnd) {
3394 			mark_page_range_in_use(previousEnd / B_PAGE_SIZE,
3395 				(base - previousEnd) / B_PAGE_SIZE, false);
3396 		}
3397 		previousEnd = base + size;
3398 	}
3399 
3400 	// mark the allocated physical page ranges wired
3401 	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
3402 		mark_page_range_in_use(
3403 			args->physical_allocated_range[i].start / B_PAGE_SIZE,
3404 			args->physical_allocated_range[i].size / B_PAGE_SIZE, true);
3405 	}
3406 
3407 	// prevent future allocations from the kernel args ranges
3408 	args->num_physical_allocated_ranges = 0;
3409 
3410 	// The target of actually free pages. This must be at least the system
3411 	// reserve, but should be a few more pages, so we don't have to extract
3412 	// a cached page with each allocation.
3413 	sFreePagesTarget = VM_PAGE_RESERVE_USER
3414 		+ std::max((page_num_t)32, (sNumPages - sNonExistingPages) / 1024);
3415 
3416 	// The target of free + cached and inactive pages. On low-memory machines
3417 	// keep things tight. free + cached is the pool of immediately allocatable
3418 	// pages. We want a few inactive pages, so when we're actually paging, we
3419 	// have a reasonably large set of pages to work with.
3420 	if (sUnreservedFreePages < 16 * 1024) {
3421 		sFreeOrCachedPagesTarget = sFreePagesTarget + 128;
3422 		sInactivePagesTarget = sFreePagesTarget / 3;
3423 	} else {
3424 		sFreeOrCachedPagesTarget = 2 * sFreePagesTarget;
3425 		sInactivePagesTarget = sFreePagesTarget / 2;
3426 	}
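	// Example: on a machine with 1 GiB of RAM (256 k pages of 4 KiB each and
	// no non-existing pages), the free target is VM_PAGE_RESERVE_USER + 256
	// pages; since more than 16 k pages are available, the free + cached
	// target is twice the free target and the inactive target is half of it.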
3427 
3428 	TRACE(("vm_page_init: exit\n"));
3429 
3430 	return B_OK;
3431 }
3432 
3433 
3434 status_t
3435 vm_page_init_post_area(kernel_args *args)
3436 {
3437 	void *dummy;
3438 
3439 	dummy = sPages;
3440 	create_area("page structures", &dummy, B_EXACT_ADDRESS,
3441 		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
3442 		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
3443 
3444 	add_debugger_command("list_pages", &dump_page_list,
3445 		"List physical pages");
3446 	add_debugger_command("page_stats", &dump_page_stats,
3447 		"Dump statistics about page usage");
3448 	add_debugger_command_etc("page", &dump_page_long,
3449 		"Dump page info",
3450 		"[ \"-p\" | \"-v\" ] [ \"-m\" ] <address>\n"
3451 		"Prints information for the physical page. If neither \"-p\" nor\n"
3452 		"\"-v\" are given, the provided address is interpreted as address of\n"
3453 		"the vm_page data structure for the page in question. If \"-p\" is\n"
3454 		"given, the address is the physical address of the page. If \"-v\" is\n"
3455 		"given, the address is interpreted as virtual address in the current\n"
3456 		"thread's address space and for the page it is mapped to (if any)\n"
3457 		"information are printed. If \"-m\" is specified, the command will\n"
3458 		"search all known address spaces for mappings to that page and print\n"
3459 		"them.\n", 0);
3460 	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
3461 	add_debugger_command("find_page", &find_page,
3462 		"Find out which queue a page is actually in");
3463 
3464 #ifdef TRACK_PAGE_USAGE_STATS
3465 	add_debugger_command_etc("page_usage", &dump_page_usage_stats,
3466 		"Dumps statistics about page usage counts",
3467 		"\n"
3468 		"Dumps statistics about page usage counts.\n",
3469 		B_KDEBUG_DONT_PARSE_ARGUMENTS);
3470 #endif
3471 
3472 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3473 	add_debugger_command_etc("page_allocations_per_caller",
3474 		&dump_page_allocations_per_caller,
3475 		"Dump current page allocations summed up per caller",
3476 		"[ -d <caller> ] [ -r ]\n"
3477 		"The current allocations will be summed up by caller (their count),\n"
3478 		"printed in decreasing order by count.\n"
3479 		"If \"-d\" is given, each allocation for caller <caller> is printed\n"
3480 		"including the respective stack trace.\n"
3481 		"If \"-r\" is given, the allocation infos are reset after gathering\n"
3482 		"the information, so the next command invocation will only show the\n"
3483 		"allocations made after the reset.\n", 0);
3484 	add_debugger_command_etc("page_allocation_infos",
3485 		&dump_page_allocation_infos,
3486 		"Dump current page allocations",
3487 		"[ --stacktrace ] [ -p <page number> ] [ --team <team ID> ] "
3488 		"[ --thread <thread ID> ]\n"
3489 		"The current allocations filtered by optional values will be printed.\n"
3490 		"The optional \"-p\" page number filters for a specific page,\n"
3491 		"with \"--team\" and \"--thread\" allocations by specific teams\n"
3492 		"and/or threads can be filtered (these only work if a corresponding\n"
3493 		"tracing entry is still available).\n"
3494 		"If \"--stacktrace\" is given, then stack traces of the allocation\n"
3495 		"callers are printed, where available.\n", 0);
3496 #endif
3497 
3498 	return B_OK;
3499 }
3500 
3501 
3502 status_t
3503 vm_page_init_post_thread(kernel_args *args)
3504 {
3505 	new (&sFreePageCondition) ConditionVariable;
3506 
3507 	// create a kernel thread to clear out pages
3508 
3509 	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
3510 		B_LOWEST_ACTIVE_PRIORITY, NULL);
3511 	resume_thread(thread);
3512 
3513 	// start page writer
3514 
3515 	sPageWriterCondition.Init("page writer");
3516 
3517 	thread = spawn_kernel_thread(&page_writer, "page writer",
3518 		B_NORMAL_PRIORITY + 1, NULL);
3519 	resume_thread(thread);
3520 
3521 	// start page daemon
3522 
3523 	sPageDaemonCondition.Init("page daemon");
3524 
3525 	thread = spawn_kernel_thread(&page_daemon, "page daemon",
3526 		B_NORMAL_PRIORITY, NULL);
3527 	resume_thread(thread);
3528 
3529 	return B_OK;
3530 }
3531 
3532 
3533 status_t
3534 vm_mark_page_inuse(page_num_t page)
3535 {
3536 	return vm_mark_page_range_inuse(page, 1);
3537 }
3538 
3539 
3540 status_t
3541 vm_mark_page_range_inuse(page_num_t startPage, page_num_t length)
3542 {
3543 	return mark_page_range_in_use(startPage, length, false);
3544 }
3545 
3546 
3547 /*!	Unreserve pages previously reserved with vm_page_reserve_pages().
3548 */
3549 void
3550 vm_page_unreserve_pages(vm_page_reservation* reservation)
3551 {
3552 	uint32 count = reservation->count;
3553 	reservation->count = 0;
3554 
3555 	if (count == 0)
3556 		return;
3557 
3558 	TA(UnreservePages(count));
3559 
3560 	unreserve_pages(count);
3561 }
3562 
3563 
3564 /*!	With this call, you can reserve a number of free pages in the system.
3565 	They will only be handed out to someone who has actually reserved them.
3566 	This call returns as soon as the number of requested pages has been
3567 	reached.
3568 	The caller must not hold any cache lock or the function might deadlock.
3569 */
3570 void
3571 vm_page_reserve_pages(vm_page_reservation* reservation, uint32 count,
3572 	int priority)
3573 {
3574 	reservation->count = count;
3575 
3576 	if (count == 0)
3577 		return;
3578 
3579 	TA(ReservePages(count));
3580 
3581 	reserve_pages(count, priority, false);
3582 }
3583 
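/*	Illustrative sketch of the typical calling pattern for the reservation API
	above. VM_PRIORITY_USER, PAGE_STATE_WIRED, and VM_PAGE_ALLOC_CLEAR are the
	usual constants from the VM headers (assumed here, they are not defined in
	this file):

		vm_page_reservation reservation;
		vm_page_reserve_pages(&reservation, 4, VM_PRIORITY_USER);
			// may block until 4 pages are available

		for (int i = 0; i < 4; i++) {
			vm_page* page = vm_page_allocate_page(&reservation,
				PAGE_STATE_WIRED | VM_PAGE_ALLOC_CLEAR);
			// ... map and use the page ...
		}

		vm_page_unreserve_pages(&reservation);
			// releases whatever part of the reservation wasn't consumed
*/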
3584 
3585 bool
3586 vm_page_try_reserve_pages(vm_page_reservation* reservation, uint32 count,
3587 	int priority)
3588 {
3589 	if (count == 0) {
3590 		reservation->count = count;
3591 		return true;
3592 	}
3593 
3594 	uint32 remaining = reserve_pages(count, priority, true);
3595 	if (remaining == 0) {
3596 		TA(ReservePages(count));
3597 		reservation->count = count;
3598 		return true;
3599 	}
3600 
3601 	unreserve_pages(count - remaining);
3602 
3603 	return false;
3604 }
3605 
3606 
3607 vm_page *
3608 vm_page_allocate_page(vm_page_reservation* reservation, uint32 flags)
3609 {
3610 	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3611 	ASSERT(pageState != PAGE_STATE_FREE);
3612 	ASSERT(pageState != PAGE_STATE_CLEAR);
3613 
3614 	ASSERT(reservation->count > 0);
3615 	reservation->count--;
3616 
3617 	VMPageQueue* queue;
3618 	VMPageQueue* otherQueue;
3619 
3620 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3621 		queue = &sClearPageQueue;
3622 		otherQueue = &sFreePageQueue;
3623 	} else {
3624 		queue = &sFreePageQueue;
3625 		otherQueue = &sClearPageQueue;
3626 	}
3627 
3628 	ReadLocker locker(sFreePageQueuesLock);
3629 
3630 	vm_page* page = queue->RemoveHeadUnlocked();
3631 	if (page == NULL) {
3632 		// if the primary queue was empty, grab the page from the
3633 		// secondary queue
3634 		page = otherQueue->RemoveHeadUnlocked();
3635 
3636 		if (page == NULL) {
3637 			// Unlikely, but possible: the page we have reserved has moved
3638 			// between the queues after we checked the first queue. Grab the
3639 			// write locker to make sure this doesn't happen again.
3640 			locker.Unlock();
3641 			WriteLocker writeLocker(sFreePageQueuesLock);
3642 
3643 			page = queue->RemoveHead();
3644 			if (page == NULL)
3645 				page = otherQueue->RemoveHead();
3646 
3647 			if (page == NULL) {
3648 				panic("Had reserved page, but there is none!");
3649 				return NULL;
3650 			}
3651 
3652 			// downgrade to read lock
3653 			locker.Lock();
3654 		}
3655 	}
3656 
3657 	if (page->CacheRef() != NULL)
3658 		panic("supposed to be free page %p has cache @! page %p; cache _cache", page, page);
3659 
3660 	DEBUG_PAGE_ACCESS_START(page);
3661 
3662 	int oldPageState = page->State();
3663 	page->SetState(pageState);
3664 	page->busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3665 	page->usage_count = 0;
3666 	page->accessed = false;
3667 	page->modified = false;
3668 
3669 	locker.Unlock();
3670 
3671 	if (pageState < PAGE_STATE_FIRST_UNQUEUED)
3672 		sPageQueues[pageState].AppendUnlocked(page);
3673 
3674 	// clear the page, if we had to take it from the free queue and a clear
3675 	// page was requested
3676 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0 && oldPageState != PAGE_STATE_CLEAR)
3677 		clear_page(page);
3678 
3679 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3680 	page->allocation_tracking_info.Init(
3681 		TA(AllocatePage(page->physical_page_number)));
3682 #else
3683 	TA(AllocatePage(page->physical_page_number));
3684 #endif
3685 
3686 	return page;
3687 }
3688 
3689 
3690 static void
3691 allocate_page_run_cleanup(VMPageQueue::PageList& freePages,
3692 	VMPageQueue::PageList& clearPages)
3693 {
3694 	// Page lists are sorted, so remove tails before prepending to the respective queue.
3695 
3696 	while (vm_page* page = freePages.RemoveTail()) {
3697 		page->busy = false;
3698 		page->SetState(PAGE_STATE_FREE);
3699 		DEBUG_PAGE_ACCESS_END(page);
3700 		sFreePageQueue.PrependUnlocked(page);
3701 	}
3702 
3703 	while (vm_page* page = clearPages.RemoveTail()) {
3704 		page->busy = false;
3705 		page->SetState(PAGE_STATE_CLEAR);
3706 		DEBUG_PAGE_ACCESS_END(page);
3707 		sClearPageQueue.PrependUnlocked(page);
3708 	}
3709 
3710 	sFreePageCondition.NotifyAll();
3711 }
3712 
3713 
3714 /*!	Tries to allocate a contiguous run of \a length pages starting at
3715 	index \a start.
3716 
3717 	The caller must have write-locked the free/clear page queues. The function
3718 	will unlock regardless of whether it succeeds or fails.
3719 
3720 	If the function fails, it cleans up after itself, i.e. it will free all
3721 	pages it managed to allocate.
3722 
3723 	\param start The start index (into \c sPages) of the run.
3724 	\param length The number of pages to allocate.
3725 	\param flags Page allocation flags. Encodes the state the function shall
3726 		set the allocated pages to, whether the pages shall be marked busy
3727 		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3728 		(VM_PAGE_ALLOC_CLEAR).
3729 	\param freeClearQueueLocker WriteLocker for the free/clear page queues,
3730 		passed in locked state. Will be unlocked by the function.
3731 	\return The index of the first page that could not be allocated. \a length
3732 		is returned when the function was successful.
3733 */
3734 static page_num_t
3735 allocate_page_run(page_num_t start, page_num_t length, uint32 flags,
3736 	WriteLocker& freeClearQueueLocker)
3737 {
3738 	uint32 pageState = flags & VM_PAGE_ALLOC_STATE;
3739 	ASSERT(pageState != PAGE_STATE_FREE);
3740 	ASSERT(pageState != PAGE_STATE_CLEAR);
3741 	ASSERT(start + length <= sNumPages);
3742 
3743 	// Pull the free/clear pages out of their respective queues. Cached pages
3744 	// are allocated later.
3745 	page_num_t cachedPages = 0;
3746 	VMPageQueue::PageList freePages;
3747 	VMPageQueue::PageList clearPages;
3748 	page_num_t i = 0;
3749 	for (; i < length; i++) {
3750 		bool pageAllocated = true;
3751 		bool noPage = false;
3752 		vm_page& page = sPages[start + i];
3753 		switch (page.State()) {
3754 			case PAGE_STATE_CLEAR:
3755 				DEBUG_PAGE_ACCESS_START(&page);
3756 				sClearPageQueue.Remove(&page);
3757 				clearPages.Add(&page);
3758 				break;
3759 			case PAGE_STATE_FREE:
3760 				DEBUG_PAGE_ACCESS_START(&page);
3761 				sFreePageQueue.Remove(&page);
3762 				freePages.Add(&page);
3763 				break;
3764 			case PAGE_STATE_CACHED:
3765 				// We allocate cached pages later.
3766 				cachedPages++;
3767 				pageAllocated = false;
3768 				break;
3769 
3770 			default:
3771 				// Probably a page was cached when our caller checked. Now it's
3772 				// gone and we have to abort.
3773 				noPage = true;
3774 				break;
3775 		}
3776 
3777 		if (noPage)
3778 			break;
3779 
3780 		if (pageAllocated) {
3781 			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3782 			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3783 			page.usage_count = 0;
3784 			page.accessed = false;
3785 			page.modified = false;
3786 		}
3787 	}
3788 
3789 	if (i < length) {
3790 		// failed to allocate a page -- free all that we've got
3791 		allocate_page_run_cleanup(freePages, clearPages);
3792 		return i;
3793 	}
3794 
3795 	freeClearQueueLocker.Unlock();
3796 
3797 	if (cachedPages > 0) {
3798 		// allocate the pages that weren't free but cached
3799 		page_num_t freedCachedPages = 0;
3800 		page_num_t nextIndex = start;
3801 		vm_page* freePage = freePages.Head();
3802 		vm_page* clearPage = clearPages.Head();
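		// Walk the run index by index: indices already covered by the sorted
		// freePages/clearPages lists are skipped; every remaining index must
		// still refer to a cached page, which is freed from its cache below.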
3803 		while (cachedPages > 0) {
3804 			// skip, if we've already got the page
3805 			if (freePage != NULL && size_t(freePage - sPages) == nextIndex) {
3806 				freePage = freePages.GetNext(freePage);
3807 				nextIndex++;
3808 				continue;
3809 			}
3810 			if (clearPage != NULL && size_t(clearPage - sPages) == nextIndex) {
3811 				clearPage = clearPages.GetNext(clearPage);
3812 				nextIndex++;
3813 				continue;
3814 			}
3815 
3816 			// free the page, if it is still cached
3817 			vm_page& page = sPages[nextIndex];
3818 			if (!free_cached_page(&page, false)) {
3819 				// TODO: if the page turns out to have been freed already,
3820 				// there would be no need to fail
3821 				break;
3822 			}
3823 
3824 			page.SetState(flags & VM_PAGE_ALLOC_STATE);
3825 			page.busy = (flags & VM_PAGE_ALLOC_BUSY) != 0;
3826 			page.usage_count = 0;
3827 			page.accessed = false;
3828 			page.modified = false;
3829 
3830 			freePages.InsertBefore(freePage, &page);
3831 			freedCachedPages++;
3832 			cachedPages--;
3833 			nextIndex++;
3834 		}
3835 
3836 		// If we freed cached pages, credit them back to the unreserved page count.
3837 		if (freedCachedPages > 0)
3838 			unreserve_pages(freedCachedPages);
3839 
3840 		if (nextIndex - start < length) {
3841 			// failed to allocate all cached pages -- free all that we've got
3842 			freeClearQueueLocker.Lock();
3843 			allocate_page_run_cleanup(freePages, clearPages);
3844 			freeClearQueueLocker.Unlock();
3845 
3846 			return nextIndex - start;
3847 		}
3848 	}
3849 
3850 	// clear pages, if requested
3851 	if ((flags & VM_PAGE_ALLOC_CLEAR) != 0) {
3852 		for (VMPageQueue::PageList::Iterator it = freePages.GetIterator();
3853 				vm_page* page = it.Next();) {
3854 			clear_page(page);
3855 		}
3856 	}
3857 
3858 	// add pages to target queue
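	// (states from PAGE_STATE_FIRST_UNQUEUED on, i.e. wired and unused, are
	// not kept in any queue)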
3859 	if (pageState < PAGE_STATE_FIRST_UNQUEUED) {
3860 		freePages.MoveFrom(&clearPages);
3861 		sPageQueues[pageState].AppendUnlocked(freePages, length);
3862 	}
3863 
3864 	// Note: We don't unreserve the pages since we pulled them out of the
3865 	// free/clear queues without adjusting sUnreservedFreePages.
3866 
3867 #if VM_PAGE_ALLOCATION_TRACKING_AVAILABLE
3868 	AbstractTraceEntryWithStackTrace* traceEntry
3869 		= TA(AllocatePageRun(start, length));
3870 
3871 	for (page_num_t i = start; i < start + length; i++)
3872 		sPages[i].allocation_tracking_info.Init(traceEntry);
3873 #else
3874 	TA(AllocatePageRun(start, length));
3875 #endif
3876 
3877 	return length;
3878 }
3879 
3880 
3881 /*! Allocate a physically contiguous range of pages.
3882 
3883 	\param flags Page allocation flags. Encodes the state the function shall
3884 		set the allocated pages to, whether the pages shall be marked busy
3885 		(VM_PAGE_ALLOC_BUSY), and whether the pages shall be cleared
3886 		(VM_PAGE_ALLOC_CLEAR).
3887 	\param length The number of contiguous pages to allocate.
3888 	\param restrictions Restrictions to the physical addresses of the page run
3889 		to allocate, including \c low_address, the first acceptable physical
3890 		address where the page run may start, \c high_address, the last
3891 		acceptable physical address where the page run may end (i.e. it must
3892 		hold \code runStartAddress + length * B_PAGE_SIZE <= high_address \endcode),
3893 		\c alignment, the alignment of the page run start address, and
3894 		\c boundary, multiples of which the page run must not cross.
3895 		Values set to \c 0 are ignored.
3896 	\param priority The page reservation priority (as passed to
3897 		vm_page_reserve_pages()).
3898 	\return The first page of the allocated page run on success; \c NULL
3899 		when the allocation failed.
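
	A hypothetical usage sketch (a driver allocating a 16 page DMA buffer below
	16 MB, 64 KB aligned; all values are merely illustrative):
	\code
	physical_address_restrictions restrictions = {};
	restrictions.high_address = 16 * 1024 * 1024;
	restrictions.alignment = 64 * 1024;

	vm_page* firstPage = vm_page_allocate_page_run(
		PAGE_STATE_WIRED | VM_PAGE_ALLOC_BUSY | VM_PAGE_ALLOC_CLEAR, 16,
		&restrictions, VM_PRIORITY_SYSTEM);
	if (firstPage == NULL)
		return B_NO_MEMORY;

	phys_addr_t physicalBase
		= (phys_addr_t)firstPage->physical_page_number * B_PAGE_SIZE;
	\endcode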
3900 */
3901 vm_page*
3902 vm_page_allocate_page_run(uint32 flags, page_num_t length,
3903 	const physical_address_restrictions* restrictions, int priority)
3904 {
3905 	// compute start and end page index
3906 	page_num_t requestedStart
3907 		= std::max(restrictions->low_address / B_PAGE_SIZE, sPhysicalPageOffset)
3908 			- sPhysicalPageOffset;
3909 	page_num_t start = requestedStart;
3910 	page_num_t end;
3911 	if (restrictions->high_address > 0) {
3912 		end = std::max(restrictions->high_address / B_PAGE_SIZE,
3913 				sPhysicalPageOffset)
3914 			- sPhysicalPageOffset;
3915 		end = std::min(end, sNumPages);
3916 	} else
3917 		end = sNumPages;
3918 
3919 	// compute alignment mask
3920 	page_num_t alignmentMask
3921 		= std::max(restrictions->alignment / B_PAGE_SIZE, (phys_addr_t)1) - 1;
3922 	ASSERT(((alignmentMask + 1) & alignmentMask) == 0);
3923 		// alignment must be a power of 2
3924 
3925 	// compute the boundary mask
3926 	uint32 boundaryMask = 0;
3927 	if (restrictions->boundary != 0) {
3928 		page_num_t boundary = restrictions->boundary / B_PAGE_SIZE;
3929 		// boundary must be a power of two and not less than alignment and
3930 		// length
3931 		ASSERT(((boundary - 1) & boundary) == 0);
3932 		ASSERT(boundary >= alignmentMask + 1);
3933 		ASSERT(boundary >= length);
3934 
3935 		boundaryMask = -boundary;
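		// For example, with 4 KB pages a 64 KB boundary yields boundary == 16
		// and boundaryMask == ~(uint32)15: two page offsets lie in the same
		// boundary window iff they agree in all bits selected by the mask.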
3936 	}
3937 
3938 	vm_page_reservation reservation;
3939 	vm_page_reserve_pages(&reservation, length, priority);
3940 
3941 	WriteLocker freeClearQueueLocker(sFreePageQueuesLock);
3942 
3943 	// First we try to get a run with free pages only. If that fails, we also
3944 	// consider cached pages. If there are only a few free pages and many cached
3945 	// ones, the odds are that we won't find enough contiguous ones, so we skip
3946 	// the first iteration in this case.
3947 	int32 freePages = sUnreservedFreePages;
3948 	int useCached = freePages > 0 && (page_num_t)freePages > 2 * length ? 0 : 1;
3949 
3950 	for (;;) {
3951 		if (alignmentMask != 0 || boundaryMask != 0) {
3952 			page_num_t offsetStart = start + sPhysicalPageOffset;
3953 
3954 			// enforce alignment
3955 			if ((offsetStart & alignmentMask) != 0)
3956 				offsetStart = (offsetStart + alignmentMask) & ~alignmentMask;
3957 
3958 			// enforce boundary
3959 			if (boundaryMask != 0 && ((offsetStart ^ (offsetStart
3960 				+ length - 1)) & boundaryMask) != 0) {
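				// The run would cross a boundary; move its start up to the
				// next boundary window (boundary >= length, so it fits there).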
3961 				offsetStart = (offsetStart + length - 1) & boundaryMask;
3962 			}
3963 
3964 			start = offsetStart - sPhysicalPageOffset;
3965 		}
3966 
3967 		if (start + length > end) {
3968 			if (useCached == 0) {
3969 				// The first iteration with free pages only was unsuccessful.
3970 				// Try again also considering cached pages.
3971 				useCached = 1;
3972 				start = requestedStart;
3973 				continue;
3974 			}
3975 
3976 			dprintf("vm_page_allocate_page_run(): Failed to allocate run of "
3977 				"length %" B_PRIuPHYSADDR " (%" B_PRIuPHYSADDR " %"
3978 				B_PRIuPHYSADDR ") in second iteration (align: %" B_PRIuPHYSADDR
3979 				" boundary: %" B_PRIuPHYSADDR ")!\n", length, requestedStart,
3980 				end, restrictions->alignment, restrictions->boundary);
3981 
3982 			freeClearQueueLocker.Unlock();
3983 			vm_page_unreserve_pages(&reservation);
3984 			return NULL;
3985 		}
3986 
3987 		bool foundRun = true;
3988 		page_num_t i;
3989 		for (i = 0; i < length; i++) {
3990 			uint32 pageState = sPages[start + i].State();
3991 			if (pageState != PAGE_STATE_FREE
3992 				&& pageState != PAGE_STATE_CLEAR
3993 				&& (pageState != PAGE_STATE_CACHED || useCached == 0)) {
3994 				foundRun = false;
3995 				break;
3996 			}
3997 		}
3998 
3999 		if (foundRun) {
4000 			i = allocate_page_run(start, length, flags, freeClearQueueLocker);
4001 			if (i == length) {
4002 				reservation.count = 0;
4003 				return &sPages[start];
4004 			}
4005 
4006 			// apparently a cached page couldn't be allocated -- skip it and
4007 			// continue
4008 			freeClearQueueLocker.Lock();
4009 		}
4010 
4011 		start += i + 1;
4012 	}
4013 }
4014 
4015 
4016 vm_page *
4017 vm_page_at_index(int32 index)
4018 {
4019 	return &sPages[index];
4020 }
4021 
4022 
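/*!	Returns the vm_page structure for the physical page with the given number
	(physical address divided by \c B_PAGE_SIZE), or \c NULL if the number
	lies outside the range covered by \c sPages.
*/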
4023 vm_page *
4024 vm_lookup_page(page_num_t pageNumber)
4025 {
4026 	if (pageNumber < sPhysicalPageOffset)
4027 		return NULL;
4028 
4029 	pageNumber -= sPhysicalPageOffset;
4030 	if (pageNumber >= sNumPages)
4031 		return NULL;
4032 
4033 	return &sPages[pageNumber];
4034 }
4035 
4036 
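/*!	Returns whether the given page does not belong to the physical page array
	\c sPages, i.e. whether it is a dummy page that does not describe actual
	physical memory.
*/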
4037 bool
4038 vm_page_is_dummy(struct vm_page *page)
4039 {
4040 	return page < sPages || page >= sPages + sNumPages;
4041 }
4042 
4043 
4044 /*!	Free the page that belonged to a certain cache.
4045 	You can use vm_page_set_state() manually if you prefer, but only
4046 	if the page's state is not PAGE_STATE_MODIFIED.
4047 
4048 	\param cache The cache the page was previously owned by or NULL. The page
4049 		must have been removed from its cache before calling this method in
4050 		either case.
4051 	\param page The page to free.
4052 	\param reservation If not NULL, the page count of the reservation will be
4053 		incremented, thus allowing another page to be allocated in place of the
4054 		freed one at a later time.
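
	A hypothetical sketch of the documented contract (assuming \a cache is
	locked and \a reservation was set up via vm_page_reserve_pages()):
	\code
	cache->RemovePage(page);
	vm_page_free_etc(cache, page, &reservation);
	// Per the contract above, the reservation can later back another
	// allocation:
	vm_page* newPage = vm_page_allocate_page(&reservation, PAGE_STATE_ACTIVE);
	\endcode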
4055 */
4056 void
4057 vm_page_free_etc(VMCache* cache, vm_page* page,
4058 	vm_page_reservation* reservation)
4059 {
4060 	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
4061 		&& page->State() != PAGE_STATE_CLEAR);
4062 
4063 	if (page->State() == PAGE_STATE_MODIFIED && cache->temporary)
4064 		atomic_add(&sModifiedTemporaryPages, -1);
4065 
4066 	free_page(page, false);
4067 	if (reservation == NULL)
4068 		unreserve_pages(1);
4069 }
4070 
4071 
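/*!	Moves \a page to the given state and the corresponding page queue. If the
	new state is \c PAGE_STATE_FREE or \c PAGE_STATE_CLEAR, the page is freed
	and one page is returned to the unreserved count.
	The page must not currently be free or clear.
*/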
4072 void
4073 vm_page_set_state(vm_page *page, int pageState)
4074 {
4075 	PAGE_ASSERT(page, page->State() != PAGE_STATE_FREE
4076 		&& page->State() != PAGE_STATE_CLEAR);
4077 
4078 	if (pageState == PAGE_STATE_FREE || pageState == PAGE_STATE_CLEAR) {
4079 		free_page(page, pageState == PAGE_STATE_CLEAR);
4080 		unreserve_pages(1);
4081 	} else
4082 		set_page_state(page, pageState);
4083 }
4084 
4085 
4086 /*!	Moves a page to either the tail or the head of its current queue,
4087 	depending on \a tail.
4088 	The page must have a cache and the cache must be locked!
4089 */
4090 void
4091 vm_page_requeue(struct vm_page *page, bool tail)
4092 {
4093 	PAGE_ASSERT(page, page->Cache() != NULL);
4094 	page->Cache()->AssertLocked();
4095 	// DEBUG_PAGE_ACCESS_CHECK(page);
4096 		// TODO: This assertion cannot be satisfied by idle_scan_active_pages()
4097 		// when it requeues busy pages. The reason is that vm_soft_fault()
4098 		// (respectively fault_get_page()) and the file cache keep newly
4099 		// allocated pages accessed while they are reading them from disk. It
4100 		// would probably be better to change that code and reenable this
4101 		// check.
4102 
4103 	VMPageQueue *queue = NULL;
4104 
4105 	switch (page->State()) {
4106 		case PAGE_STATE_ACTIVE:
4107 			queue = &sActivePageQueue;
4108 			break;
4109 		case PAGE_STATE_INACTIVE:
4110 			queue = &sInactivePageQueue;
4111 			break;
4112 		case PAGE_STATE_MODIFIED:
4113 			queue = &sModifiedPageQueue;
4114 			break;
4115 		case PAGE_STATE_CACHED:
4116 			queue = &sCachedPageQueue;
4117 			break;
4118 		case PAGE_STATE_FREE:
4119 		case PAGE_STATE_CLEAR:
4120 			panic("vm_page_requeue() called for free/clear page %p", page);
4121 			return;
4122 		case PAGE_STATE_WIRED:
4123 		case PAGE_STATE_UNUSED:
4124 			return;
4125 		default:
4126 			panic("vm_page_requeue: vm_page %p in invalid state %d\n",
4127 				page, page->State());
4128 			break;
4129 	}
4130 
4131 	queue->RequeueUnlocked(page, tail);
4132 }
4133 
4134 
4135 page_num_t
4136 vm_page_num_pages(void)
4137 {
4138 	return sNumPages - sNonExistingPages;
4139 }
4140 
4141 
4142 /*! There is a subtle distinction between the page counts returned by
4143 	this function and vm_page_num_free_pages():
4144 	The latter returns the number of pages that are completely uncommitted,
4145 	whereas this one also counts pages that can be made available by
4146 	reclaiming them (i.e. it additionally factors in things like cached
4147 	pages).
4148 */
4149 page_num_t
4150 vm_page_num_available_pages(void)
4151 {
4152 	return vm_available_memory() / B_PAGE_SIZE;
4153 }
4154 
4155 
4156 page_num_t
4157 vm_page_num_free_pages(void)
4158 {
4159 	int32 count = sUnreservedFreePages + sCachedPageQueue.Count();
4160 	return count > 0 ? count : 0;
4161 }
4162 
4163 
4164 page_num_t
4165 vm_page_num_unused_pages(void)
4166 {
4167 	int32 count = sUnreservedFreePages;
4168 	return count > 0 ? count : 0;
4169 }
4170 
4171 
4172 void
4173 vm_page_get_stats(system_info *info)
4174 {
4175 	// Note: there's no locking protecting any of the queues or counters here,
4176 	// so we run the risk of getting bogus values when evaluating them
4177 	// throughout this function. As these stats are for informational purposes
4178 	// only, it is not really worth introducing such locking. Therefore we just
4179 	// ensure that we don't under- or overflow any of the values.
4180 
4181 	// The pages used for the block cache buffers. Those should not be counted
4182 	// as used but as cached pages.
4183 	// TODO: We should subtract the blocks that are in use ATM, since those
4184 	// can't really be freed in a low memory situation.
4185 	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
4186 	info->block_cache_pages = blockCachePages;
4187 
4188 	// Non-temporary modified pages are special as they represent pages that
4189 	// can be written back and thus freed if necessary, which effectively
4190 	// makes them cached pages with a higher freeing overhead. The
4191 	// modified queue count is therefore split into temporary and non-temporary
4192 	// counts that are then added to the corresponding number.
4193 	page_num_t modifiedNonTemporaryPages
4194 		= (sModifiedPageQueue.Count() - sModifiedTemporaryPages);
4195 
4196 	info->max_pages = vm_page_num_pages();
4197 	info->cached_pages = sCachedPageQueue.Count() + modifiedNonTemporaryPages
4198 		+ blockCachePages;
4199 
4200 	// max_pages is composed of:
4201 	//	active + inactive + unused + wired + modified + cached + free + clear
4202 	// So taking out the cached (including modified non-temporary), free and
4203 	// clear ones leaves us with all used pages.
4204 	uint32 subtractPages = info->cached_pages + sFreePageQueue.Count()
4205 		+ sClearPageQueue.Count();
4206 	info->used_pages = subtractPages > info->max_pages
4207 		? 0 : info->max_pages - subtractPages;
4208 
4209 	if (info->used_pages + info->cached_pages > info->max_pages) {
4210 		// Something was shuffled around while we were summing up the counts.
4211 		// Make the values sane, preferring the worse case of more used pages.
4212 		info->cached_pages = info->max_pages - info->used_pages;
4213 	}
4214 
4215 	info->page_faults = vm_num_page_faults();
4216 	info->ignored_pages = sIgnoredPages;
4217 
4218 	// TODO: We don't consider pages used for page directories/tables yet.
4219 }
4220 
4221 
4222 /*!	Returns the greatest address within the last page of accessible physical
4223 	memory.
4224 	The value is inclusive, i.e. in case of a 32 bit phys_addr_t 0xffffffff
4225 	means that the last page ends at exactly 4 GB.
4226 */
4227 phys_addr_t
4228 vm_page_max_address()
4229 {
4230 	return ((phys_addr_t)sPhysicalPageOffset + sNumPages) * B_PAGE_SIZE - 1;
4231 }
4232 
4233 
4234 RANGE_MARKER_FUNCTION_END(vm_page)
4235