xref: /haiku/src/system/kernel/vm/vm_page.cpp (revision 22dcb7a5bf5795b97fd88ba84994e887be7a9e89)
1 /*
2  * Copyright 2002-2008, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  *
5  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
6  * Distributed under the terms of the NewOS License.
7  */
8 
9 #include <signal.h>
10 #include <string.h>
11 #include <stdlib.h>
12 
13 #include <KernelExport.h>
14 #include <OS.h>
15 
16 #include <arch/cpu.h>
17 #include <arch/vm_translation_map.h>
18 #include <boot/kernel_args.h>
19 #include <condition_variable.h>
20 #include <kernel.h>
21 #include <thread.h>
22 #include <tracing.h>
23 #include <util/AutoLock.h>
24 #include <vm.h>
25 #include <vm_address_space.h>
26 #include <vm_low_memory.h>
27 #include <vm_priv.h>
28 #include <vm_page.h>
29 #include <vm_cache.h>
30 
31 #include "PageCacheLocker.h"
32 
33 
34 //#define TRACE_VM_PAGE
35 #ifdef TRACE_VM_PAGE
36 #	define TRACE(x) dprintf x
37 #else
38 #	define TRACE(x) ;
39 #endif
40 
41 #define SCRUB_SIZE 16
42 	// this many pages will be cleared at once in the page scrubber thread
43 
44 typedef struct page_queue {
45 	vm_page *head;
46 	vm_page *tail;
47 	uint32	count;
48 } page_queue;
49 
50 static page_queue sFreePageQueue;
51 static page_queue sClearPageQueue;
52 static page_queue sModifiedPageQueue;
53 static page_queue sInactivePageQueue;
54 static page_queue sActivePageQueue;
55 
56 static vm_page *sPages;
57 static addr_t sPhysicalPageOffset;
58 static size_t sNumPages;
59 static size_t sReservedPages;
60 static vint32 sPageDeficit;
61 static size_t sModifiedTemporaryPages;
62 
63 static ConditionVariable sFreePageCondition;
64 static spinlock sPageLock;
65 
66 static sem_id sWriterWaitSem;
67 
68 
69 #if PAGE_ALLOCATION_TRACING
70 
71 namespace PageAllocationTracing {
72 
73 class ReservePages : public AbstractTraceEntry {
74 	public:
75 		ReservePages(uint32 count)
76 			:
77 			fCount(count)
78 		{
79 			Initialized();
80 		}
81 
82 		virtual void AddDump(TraceOutput& out)
83 		{
84 			out.Print("page reserve:   %lu", fCount);
85 		}
86 
87 	private:
88 		uint32		fCount;
89 };
90 
91 
92 class UnreservePages : public AbstractTraceEntry {
93 	public:
94 		UnreservePages(uint32 count)
95 			:
96 			fCount(count)
97 		{
98 			Initialized();
99 		}
100 
101 		virtual void AddDump(TraceOutput& out)
102 		{
103 			out.Print("page unreserve: %lu", fCount);
104 		}
105 
106 	private:
107 		uint32		fCount;
108 };
109 
110 
111 class AllocatePage : public AbstractTraceEntry {
112 	public:
113 		AllocatePage(bool reserved)
114 			:
115 			fReserved(reserved)
116 		{
117 			Initialized();
118 		}
119 
120 		virtual void AddDump(TraceOutput& out)
121 		{
122 			out.Print("page alloc");
123 			if (fReserved)
124 				out.Print(" reserved");
125 		}
126 
127 	private:
128 		bool		fReserved;
129 };
130 
131 
132 class AllocatePageRun : public AbstractTraceEntry {
133 	public:
134 		AllocatePageRun(uint32 length)
135 			:
136 			fLength(length)
137 		{
138 			Initialized();
139 		}
140 
141 		virtual void AddDump(TraceOutput& out)
142 		{
143 			out.Print("page alloc run: length: %ld", fLength);
144 		}
145 
146 	private:
147 		uint32		fLength;
148 };
149 
150 
151 class FreePage : public AbstractTraceEntry {
152 	public:
153 		FreePage()
154 		{
155 			Initialized();
156 		}
157 
158 		virtual void AddDump(TraceOutput& out)
159 		{
160 			out.Print("page free");
161 		}
162 };
163 
164 
165 class ScrubbingPages : public AbstractTraceEntry {
166 	public:
167 		ScrubbingPages(uint32 count)
168 			:
169 			fCount(count)
170 		{
171 			Initialized();
172 		}
173 
174 		virtual void AddDump(TraceOutput& out)
175 		{
176 			out.Print("page scrubbing: %lu", fCount);
177 		}
178 
179 	private:
180 		uint32		fCount;
181 };
182 
183 
184 class ScrubbedPages : public AbstractTraceEntry {
185 	public:
186 		ScrubbedPages(uint32 count)
187 			:
188 			fCount(count)
189 		{
190 			Initialized();
191 		}
192 
193 		virtual void AddDump(TraceOutput& out)
194 		{
195 			out.Print("page scrubbed:  %lu", fCount);
196 		}
197 
198 	private:
199 		uint32		fCount;
200 };
201 
202 
203 class StolenPage : public AbstractTraceEntry {
204 	public:
205 		StolenPage()
206 		{
207 			Initialized();
208 		}
209 
210 		virtual void AddDump(TraceOutput& out)
211 		{
212 			out.Print("page stolen");
213 		}
214 };
215 
216 }	// namespace PageAllocationTracing
217 
218 #	define T(x)	new(std::nothrow) PageAllocationTracing::x
219 
220 #else
221 #	define T(x)
222 #endif	// PAGE_ALLOCATION_TRACING
223 
224 
225 /*!	Dequeues a page from the head of the given queue */
226 static vm_page *
227 dequeue_page(page_queue *queue)
228 {
229 	vm_page *page;
230 
231 	page = queue->head;
232 	if (page != NULL) {
233 		if (queue->tail == page)
234 			queue->tail = NULL;
235 		if (page->queue_next != NULL)
236 			page->queue_next->queue_prev = NULL;
237 
238 		queue->head = page->queue_next;
239 		queue->count--;
240 
241 #ifdef DEBUG_PAGE_QUEUE
242 		if (page->queue != queue) {
243 			panic("dequeue_page(queue: %p): page %p thinks it is in queue "
244 				"%p", queue, page, page->queue);
245 		}
246 
247 		page->queue = NULL;
248 #endif	// DEBUG_PAGE_QUEUE
249 	}
250 
251 	return page;
252 }
253 
254 
255 /*!	Enqueues a page to the tail of the given queue */
256 static void
257 enqueue_page(page_queue *queue, vm_page *page)
258 {
259 #ifdef DEBUG_PAGE_QUEUE
260 	if (page->queue != NULL) {
261 		panic("enqueue_page(queue: %p, page: %p): page thinks it is "
262 			"already in queue %p", queue, page, page->queue);
263 	}
264 #endif	// DEBUG_PAGE_QUEUE
265 
266 	if (queue->tail != NULL)
267 		queue->tail->queue_next = page;
268 	page->queue_prev = queue->tail;
269 	queue->tail = page;
270 	page->queue_next = NULL;
271 	if (queue->head == NULL)
272 		queue->head = page;
273 	queue->count++;
274 
275 #ifdef DEBUG_PAGE_QUEUE
276 	page->queue = queue;
277 #endif
278 }
279 
280 
281 /*!	Enqueues a page to the head of the given queue */
282 static void
283 enqueue_page_to_head(page_queue *queue, vm_page *page)
284 {
285 #ifdef DEBUG_PAGE_QUEUE
286 	if (page->queue != NULL) {
287 		panic("enqueue_page_to_head(queue: %p, page: %p): page thinks it is "
288 			"already in queue %p", queue, page, page->queue);
289 	}
290 #endif	// DEBUG_PAGE_QUEUE
291 
292 	if (queue->head != NULL)
293 		queue->head->queue_prev = page;
294 	page->queue_next = queue->head;
295 	queue->head = page;
296 	page->queue_prev = NULL;
297 	if (queue->tail == NULL)
298 		queue->tail = page;
299 	queue->count++;
300 
301 #ifdef DEBUG_PAGE_QUEUE
302 	page->queue = queue;
303 #endif
304 }
305 
306 
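/*!	Removes \a page from \a queue; the page must currently be enqueued there. */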
307 static void
308 remove_page_from_queue(page_queue *queue, vm_page *page)
309 {
310 #ifdef DEBUG_PAGE_QUEUE
311 	if (page->queue != queue) {
312 		panic("remove_page_from_queue(queue: %p, page: %p): page thinks it "
313 			"is in queue %p", queue, page, page->queue);
314 	}
315 #endif	// DEBUG_PAGE_QUEUE
316 
317 	if (page->queue_next != NULL)
318 		page->queue_next->queue_prev = page->queue_prev;
319 	else
320 		queue->tail = page->queue_prev;
321 
322 	if (page->queue_prev != NULL)
323 		page->queue_prev->queue_next = page->queue_next;
324 	else
325 		queue->head = page->queue_next;
326 
327 	queue->count--;
328 
329 #ifdef DEBUG_PAGE_QUEUE
330 	page->queue = NULL;
331 #endif
332 }
333 
334 
335 /*!	Moves a page to the tail of the given queue, but only does so if
336 	the page is currently in another queue.
337 */
338 static void
339 move_page_to_queue(page_queue *fromQueue, page_queue *toQueue, vm_page *page)
340 {
341 	if (fromQueue != toQueue) {
342 		remove_page_from_queue(fromQueue, page);
343 		enqueue_page(toQueue, page);
344 	}
345 }
346 
347 
348 /*! Inserts \a page after the \a before page in the \a queue. */
349 static void
350 insert_page_after(page_queue *queue, vm_page *before, vm_page *page)
351 {
352 #ifdef DEBUG_PAGE_QUEUE
353 	if (page->queue != NULL) {
354 		panic("insert_page_after(queue: %p, page: %p): page thinks it is "
355 			"already in queue %p", queue, page, page->queue);
356 	}
357 #endif	// DEBUG_PAGE_QUEUE
358 
359 	if (before == NULL) {
360 		enqueue_page(queue, page);
361 		return;
362 	}
363 
364 	page->queue_next = before->queue_next;
365 	if (page->queue_next != NULL)
366 		page->queue_next->queue_prev = page;
367 	page->queue_prev = before;
368 	before->queue_next = page;
369 
370 	if (queue->tail == before)
371 		queue->tail = page;
372 
373 	queue->count++;
374 
375 #ifdef DEBUG_PAGE_QUEUE
376 	page->queue = queue;
377 #endif
378 }
379 
380 
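/*!	Debugger command: walks the page queues and reports which queue (if any)
	contains the vm_page structure at the given address.
*/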
381 static int
382 find_page(int argc, char **argv)
383 {
384 	struct vm_page *page;
385 	addr_t address;
386 	int32 index = 1;
387 	int i;
388 
389 	struct {
390 		const char*	name;
391 		page_queue*	queue;
392 	} pageQueueInfos[] = {
393 		{ "free",		&sFreePageQueue },
394 		{ "clear",		&sClearPageQueue },
395 		{ "modified",	&sModifiedPageQueue },
396 		{ "active",		&sActivePageQueue },
		{ "inactive",	&sInactivePageQueue },
397 		{ NULL, NULL }
398 	};
399 
400 	if (argc < 2
401 		|| strlen(argv[index]) <= 2
402 		|| argv[index][0] != '0'
403 		|| argv[index][1] != 'x') {
404 		kprintf("usage: find_page <address>\n");
405 		return 0;
406 	}
407 
408 	address = strtoul(argv[index], NULL, 0);
409 	page = (vm_page*)address;
410 
411 	for (i = 0; pageQueueInfos[i].name; i++) {
412 		vm_page* p = pageQueueInfos[i].queue->head;
413 		while (p) {
414 			if (p == page) {
415 				kprintf("found page %p in queue %p (%s)\n", page,
416 					pageQueueInfos[i].queue, pageQueueInfos[i].name);
417 				return 0;
418 			}
419 			p = p->queue_next;
420 		}
421 	}
422 
423 	kprintf("page %p isn't in any queue\n", page);
424 
425 	return 0;
426 }
427 
428 
429 const char *
430 page_state_to_string(int state)
431 {
432 	switch(state) {
433 		case PAGE_STATE_ACTIVE:
434 			return "active";
435 		case PAGE_STATE_INACTIVE:
436 			return "inactive";
437 		case PAGE_STATE_BUSY:
438 			return "busy";
439 		case PAGE_STATE_MODIFIED:
440 			return "modified";
441 		case PAGE_STATE_FREE:
442 			return "free";
443 		case PAGE_STATE_CLEAR:
444 			return "clear";
445 		case PAGE_STATE_WIRED:
446 			return "wired";
447 		case PAGE_STATE_UNUSED:
448 			return "unused";
449 		default:
450 			return "unknown";
451 	}
452 }
453 
454 
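/*!	Debugger command: dumps the fields of a vm_page structure. By default the
	argument is the address of the structure itself; with -v or -p a virtual
	or physical address is translated to the corresponding page first.
*/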
455 static int
456 dump_page(int argc, char **argv)
457 {
458 	struct vm_page *page;
459 	addr_t address;
460 	bool physical = false;
461 	int32 index = 1;
462 
463 	if (argc > 2) {
464 		if (!strcmp(argv[1], "-p")) {
465 			physical = true;
466 			index++;
467 		} else if (!strcmp(argv[1], "-v"))
468 			index++;
469 	}
470 
471 	if (argc < 2
472 		|| strlen(argv[index]) <= 2
473 		|| argv[index][0] != '0'
474 		|| argv[index][1] != 'x') {
475 		kprintf("usage: page [-p|-v] <address>\n"
476 			"  -v looks up a virtual address for the page, -p a physical address.\n"
477 			"  Default is to look for the page structure address directly.\n");
478 		return 0;
479 	}
480 
481 	address = strtoul(argv[index], NULL, 0);
482 
483 	if (index == 2) {
484 		if (!physical) {
485 			vm_address_space *addressSpace = vm_kernel_address_space();
486 			uint32 flags;
487 
488 			if (thread_get_current_thread()->team->address_space != NULL)
489 				addressSpace = thread_get_current_thread()->team->address_space;
490 
491 			addressSpace->translation_map.ops->query_interrupt(
492 				&addressSpace->translation_map, address, &address, &flags);
493 		}
494 		page = vm_lookup_page(address / B_PAGE_SIZE);
495 	} else
496 		page = (struct vm_page *)address;
497 
498 	kprintf("PAGE: %p\n", page);
499 	kprintf("queue_next,prev: %p, %p\n", page->queue_next, page->queue_prev);
500 	kprintf("hash_next:       %p\n", page->hash_next);
501 	kprintf("physical_number: %lx\n", page->physical_page_number);
502 	kprintf("cache:           %p\n", page->cache);
503 	kprintf("cache_offset:    %ld\n", page->cache_offset);
504 	kprintf("cache_next,prev: %p, %p\n", page->cache_next, page->cache_prev);
505 	kprintf("type:            %d\n", page->type);
506 	kprintf("state:           %s\n", page_state_to_string(page->state));
507 	kprintf("wired_count:     %d\n", page->wired_count);
508 	kprintf("usage_count:     %d\n", page->usage_count);
509 	kprintf("busy_writing:    %d\n", page->busy_writing);
510 	#ifdef DEBUG_PAGE_QUEUE
511 		kprintf("queue:           %p\n", page->queue);
512 	#endif
513 	#ifdef DEBUG_PAGE_CACHE_TRANSITIONS
514 		kprintf("debug_flags:     0x%lx\n", page->debug_flags);
515 		kprintf("collided page:   %p\n", page->collided_page);
516 	#endif	// DEBUG_PAGE_CACHE_TRANSITIONS
517 	kprintf("area mappings:\n");
518 
519 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
520 	vm_page_mapping *mapping;
521 	while ((mapping = iterator.Next()) != NULL) {
522 		kprintf("  %p (%#lx)\n", mapping->area, mapping->area->id);
524 	}
525 
526 	return 0;
527 }
528 
529 
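/*!	Debugger command: prints head, tail, and count of the page queue given by
	name or address; with an extra argument it also lists the queued pages.
*/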
530 static int
531 dump_page_queue(int argc, char **argv)
532 {
533 	struct page_queue *queue;
534 
535 	if (argc < 2) {
536 		kprintf("usage: page_queue <address/name> [list]\n");
537 		return 0;
538 	}
539 
540 	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
541 		queue = (struct page_queue *)strtoul(argv[1], NULL, 16);
542 	else if (!strcmp(argv[1], "free"))
543 		queue = &sFreePageQueue;
544 	else if (!strcmp(argv[1], "clear"))
545 		queue = &sClearPageQueue;
546 	else if (!strcmp(argv[1], "modified"))
547 		queue = &sModifiedPageQueue;
548 	else if (!strcmp(argv[1], "active"))
549 		queue = &sActivePageQueue;
550 	else if (!strcmp(argv[1], "inactive"))
551 		queue = &sInactivePageQueue;
552 	else {
553 		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
554 		return 0;
555 	}
556 
557 	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %ld\n",
558 		queue, queue->head, queue->tail, queue->count);
559 
560 	if (argc == 3) {
561 		struct vm_page *page = queue->head;
562 		int i;
563 
564 		kprintf("page        cache       type       state  wired  usage\n");
565 		for (i = 0; page; i++, page = page->queue_next) {
566 			const char *type = "none";
567 			if (page->cache != NULL) {
568 				switch (page->cache->type) {
569 					case CACHE_TYPE_RAM:
570 						type = "RAM";
571 						break;
572 					case CACHE_TYPE_DEVICE:
573 						type = "device";
574 						break;
575 					case CACHE_TYPE_VNODE:
576 						type = "vnode";
577 						break;
578 					case CACHE_TYPE_NULL:
579 						type = "null";
580 						break;
581 					default:
582 						type = "???";
583 						break;
584 				}
585 			}
586 
587 			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->cache,
588 				type, page_state_to_string(page->state),
589 				page->wired_count, page->usage_count);
590 		}
591 	}
592 	return 0;
593 }
594 
595 
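/*!	Debugger command: prints how many pages are in each state, the reserved
	page count, the page deficit, and the sizes of all page queues.
*/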
596 static int
597 dump_page_stats(int argc, char **argv)
598 {
599 	uint32 counter[8];
601 	addr_t i;
602 
603 	memset(counter, 0, sizeof(counter));
604 
605 	for (i = 0; i < sNumPages; i++) {
606 		if (sPages[i].state > 7)
607 			panic("page %li at %p has invalid state!\n", i, &sPages[i]);
608 
609 		counter[sPages[i].state]++;
610 	}
611 
612 	kprintf("page stats:\n");
613 	kprintf("active: %lu\ninactive: %lu\nbusy: %lu\nunused: %lu\n",
614 		counter[PAGE_STATE_ACTIVE], counter[PAGE_STATE_INACTIVE],
615 		counter[PAGE_STATE_BUSY], counter[PAGE_STATE_UNUSED]);
616 	kprintf("wired: %lu\nmodified: %lu\nfree: %lu\nclear: %lu\n",
617 		counter[PAGE_STATE_WIRED], counter[PAGE_STATE_MODIFIED],
618 		counter[PAGE_STATE_FREE], counter[PAGE_STATE_CLEAR]);
619 	kprintf("reserved pages: %lu\n", sReservedPages);
620 	kprintf("page deficit: %ld\n", sPageDeficit);
621 
622 	kprintf("\nfree queue: %p, count = %ld\n", &sFreePageQueue,
623 		sFreePageQueue.count);
624 	kprintf("clear queue: %p, count = %ld\n", &sClearPageQueue,
625 		sClearPageQueue.count);
626 	kprintf("modified queue: %p, count = %ld (%ld temporary)\n",
627 		&sModifiedPageQueue, sModifiedPageQueue.count, sModifiedTemporaryPages);
628 	kprintf("active queue: %p, count = %ld\n", &sActivePageQueue,
629 		sActivePageQueue.count);
630 	kprintf("inactive queue: %p, count = %ld\n", &sInactivePageQueue,
631 		sInactivePageQueue.count);
632 	return 0;
633 }
634 
635 
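/*!	Returns the total number of pages in the free and clear queues, i.e. the
	pool from which reservations and new allocations are satisfied.
*/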
636 static inline size_t
637 free_page_queue_count(void)
638 {
639 	return sFreePageQueue.count + sClearPageQueue.count;
640 }
641 
642 
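/*!	Sets the state of \a page and moves it into the queue that corresponds to
	the new state. The caller must hold sPageLock with interrupts disabled.
*/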
643 static status_t
644 set_page_state_nolock(vm_page *page, int pageState)
645 {
646 	if (pageState == page->state)
647 		return B_OK;
648 
649 	page_queue *fromQueue = NULL;
650 	page_queue *toQueue = NULL;
651 
652 	switch (page->state) {
653 		case PAGE_STATE_BUSY:
654 		case PAGE_STATE_ACTIVE:
655 		case PAGE_STATE_WIRED:
656 		case PAGE_STATE_UNUSED:
657 			fromQueue = &sActivePageQueue;
658 			break;
659 		case PAGE_STATE_INACTIVE:
660 			fromQueue = &sInactivePageQueue;
661 			break;
662 		case PAGE_STATE_MODIFIED:
663 			fromQueue = &sModifiedPageQueue;
664 			break;
665 		case PAGE_STATE_FREE:
666 			fromQueue = &sFreePageQueue;
667 			break;
668 		case PAGE_STATE_CLEAR:
669 			fromQueue = &sClearPageQueue;
670 			break;
671 		default:
672 			panic("vm_page_set_state: vm_page %p in invalid state %d\n",
673 				page, page->state);
674 			break;
675 	}
676 
677 	if (page->state == PAGE_STATE_CLEAR || page->state == PAGE_STATE_FREE) {
678 		if (page->cache != NULL)
679 			panic("free page %p has cache", page);
680 	}
681 
682 	switch (pageState) {
683 		case PAGE_STATE_BUSY:
684 		case PAGE_STATE_ACTIVE:
685 		case PAGE_STATE_WIRED:
686 		case PAGE_STATE_UNUSED:
687 			toQueue = &sActivePageQueue;
688 			break;
689 		case PAGE_STATE_INACTIVE:
690 			toQueue = &sInactivePageQueue;
691 			break;
692 		case PAGE_STATE_MODIFIED:
693 			toQueue = &sModifiedPageQueue;
694 			break;
695 		case PAGE_STATE_FREE:
696 			toQueue = &sFreePageQueue;
697 			break;
698 		case PAGE_STATE_CLEAR:
699 			toQueue = &sClearPageQueue;
700 			break;
701 		default:
702 			panic("vm_page_set_state: invalid target state %d\n", pageState);
703 	}
704 
705 	if (pageState == PAGE_STATE_CLEAR || pageState == PAGE_STATE_FREE
706 		|| pageState == PAGE_STATE_INACTIVE) {
707 		if (sPageDeficit > 0)
708 			sFreePageCondition.NotifyOne();
709 
710 		if (pageState != PAGE_STATE_INACTIVE && page->cache != NULL)
711 			panic("to be freed page %p has cache", page);
712 	}
713 	if (page->cache != NULL && page->cache->temporary) {
714 		if (pageState == PAGE_STATE_MODIFIED)
715 			sModifiedTemporaryPages++;
716 		else if (page->state == PAGE_STATE_MODIFIED)
717 			sModifiedTemporaryPages--;
718 	}
719 
720 #if PAGE_ALLOCATION_TRACING
721 	if ((pageState == PAGE_STATE_CLEAR || pageState == PAGE_STATE_FREE)
722 		&& page->state != PAGE_STATE_CLEAR && page->state != PAGE_STATE_FREE) {
723 		T(FreePage());
724 	}
725 #endif	// PAGE_ALLOCATION_TRACING
726 
727 	page->state = pageState;
728 	move_page_to_queue(fromQueue, toQueue, page);
729 
730 	return B_OK;
731 }
732 
733 
734 /*! Moves a modified page into either the active or the inactive page queue,
735 	depending on its mappings, usage count, and wiring.
736 */
737 static void
738 move_page_to_active_or_inactive_queue(vm_page *page, bool dequeued)
739 {
740 	// Note, this logic must be in sync with what the page daemon does
741 	int32 state;
742 	if (!page->mappings.IsEmpty() || page->usage_count >= 0
743 		|| page->wired_count)
744 		state = PAGE_STATE_ACTIVE;
745 	else
746 		state = PAGE_STATE_INACTIVE;
747 
748 	if (dequeued) {
749 		page->state = state;
750 		enqueue_page(state == PAGE_STATE_ACTIVE
751 			? &sActivePageQueue : &sInactivePageQueue, page);
752 		if (page->cache->temporary)
753 			sModifiedTemporaryPages--;
754 	} else
755 		set_page_state_nolock(page, state);
756 }
757 
758 
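/*!	Temporarily maps the page's physical memory into the kernel address space
	and fills it with zeroes.
*/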
759 static void
760 clear_page(struct vm_page *page)
761 {
762 	addr_t virtualAddress;
763 	vm_get_physical_page(page->physical_page_number << PAGE_SHIFT,
764 		&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
765 
766 	memset((void *)virtualAddress, 0, B_PAGE_SIZE);
767 
768 	vm_put_physical_page(virtualAddress);
769 }
770 
771 
772 /*!
773 	This is a background thread that wakes up every 100 ms, clears a few
774 	pages taken from the free queue, and moves them over to the clear queue.
775 	Given enough time, it will clear out all pages from the free queue - we
776 	could probably slow it down after having reached a certain threshold.
777 */
778 static int32
779 page_scrubber(void *unused)
780 {
781 	(void)(unused);
782 
783 	TRACE(("page_scrubber starting...\n"));
784 
785 	for (;;) {
786 		snooze(100000); // 100ms
787 
788 		if (sFreePageQueue.count > 0) {
790 			vm_page *page[SCRUB_SIZE];
791 			int32 i, scrubCount;
792 
793 			// get some pages from the free queue
794 
795 			InterruptsSpinLocker locker(sPageLock);
796 
797 			// Since we temporarily remove pages from the free pages reserve,
798 			// we must make sure we don't cause a violation of the page
799 			// reservation guarantee. The following is usually stricter than
800 			// necessary, because we don't have information on how many of the
801 			// reserved pages have already been allocated.
802 			scrubCount = SCRUB_SIZE;
803 			uint32 freeCount = free_page_queue_count();
804 			if (freeCount < sReservedPages)
805 				scrubCount = 0;
806 			else if ((uint32)scrubCount > freeCount - sReservedPages)
807 				scrubCount = freeCount - sReservedPages;
808 
809 			for (i = 0; i < scrubCount; i++) {
810 				page[i] = dequeue_page(&sFreePageQueue);
811 				if (page[i] == NULL)
812 					break;
813 				page[i]->state = PAGE_STATE_BUSY;
814 			}
815 
816 			scrubCount = i;
817 
818 			if (scrubCount > 0) {
819 				T(ScrubbingPages(scrubCount));
820 			}
821 
822 			locker.Unlock();
823 
824 			// clear them
825 
826 			for (i = 0; i < scrubCount; i++) {
827 				clear_page(page[i]);
828 			}
829 
830 			locker.Lock();
831 
832 			// and put them into the clear queue
833 
834 			for (i = 0; i < scrubCount; i++) {
835 				page[i]->state = PAGE_STATE_CLEAR;
836 				enqueue_page(&sClearPageQueue, page[i]);
837 			}
838 
839 			if (scrubCount > 0) {
840 				T(ScrubbedPages(scrubCount));
841 			}
842 		}
843 	}
844 
845 	return 0;
846 }
847 
848 
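/*!	Writes the single \a page back to its cache's store. Returns an error if
	the store could not be written to, or if it wrote back zero bytes.
*/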
849 static status_t
850 write_page(vm_page *page, bool fsReenter)
851 {
852 	vm_store *store = page->cache->store;
853 	size_t length = B_PAGE_SIZE;
854 	status_t status;
855 	iovec vecs[1];
856 
857 	TRACE(("write_page(page = %p): offset = %Ld\n", page, (off_t)page->cache_offset << PAGE_SHIFT));
858 
859 	status = vm_get_physical_page(page->physical_page_number * B_PAGE_SIZE,
860 		(addr_t *)&vecs[0].iov_base, PHYSICAL_PAGE_CAN_WAIT);
861 	if (status < B_OK)
862 		panic("could not map page!");
863 	vecs->iov_len = B_PAGE_SIZE;
864 
865 	status = store->ops->write(store, (off_t)page->cache_offset << PAGE_SHIFT,
866 		vecs, 1, &length, fsReenter);
867 
868 	vm_put_physical_page((addr_t)vecs[0].iov_base);
869 #if 0
870 	if (status < B_OK) {
871 		dprintf("write_page(page = %p): offset = %lx, status = %ld\n",
872 			page, page->cache_offset, status);
873 	}
874 #endif
875 	if (status == B_OK && length == 0)
876 		status = B_ERROR;
877 
878 	return status;
879 }
880 
881 
882 static inline bool
883 is_marker_page(struct vm_page *page)
884 {
885 	return page->type == PAGE_TYPE_DUMMY;
886 }
887 
888 
889 static void
890 remove_page_marker(struct vm_page &marker)
891 {
892 	if (marker.state == PAGE_STATE_UNUSED)
893 		return;
894 
895 	page_queue *queue;
896 
897 	switch (marker.state) {
898 		case PAGE_STATE_ACTIVE:
899 			queue = &sActivePageQueue;
900 			break;
901 		case PAGE_STATE_INACTIVE:
902 			queue = &sInactivePageQueue;
903 			break;
904 		case PAGE_STATE_MODIFIED:
905 			queue = &sModifiedPageQueue;
906 			break;
907 
908 		default:
909 			return;
910 	}
911 
912 	InterruptsSpinLocker locker(sPageLock);
913 	remove_page_from_queue(queue, &marker);
914 
915 	marker.state = PAGE_STATE_UNUSED;
916 }
917 
918 
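/*!	Returns the next non-busy, non-marker page in the modified queue - either
	the page following \a marker, or the queue's head if the marker is not
	enqueued - and re-inserts the marker right after the returned page so that
	the iteration can be continued later.
*/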
919 static vm_page *
920 next_modified_page(struct vm_page &marker)
921 {
922 	InterruptsSpinLocker locker(sPageLock);
923 	vm_page *page;
924 
925 	if (marker.state == PAGE_STATE_MODIFIED) {
926 		page = marker.queue_next;
927 		remove_page_from_queue(&sModifiedPageQueue, &marker);
928 		marker.state = PAGE_STATE_UNUSED;
929 	} else
930 		page = sModifiedPageQueue.head;
931 
932 	for (; page != NULL; page = page->queue_next) {
933 		if (!is_marker_page(page) && page->state != PAGE_STATE_BUSY) {
934 			// insert marker
935 			marker.state = PAGE_STATE_MODIFIED;
936 			insert_page_after(&sModifiedPageQueue, page, &marker);
937 			return page;
938 		}
939 	}
940 
941 	return NULL;
942 }
943 
944 
945 /*!	The page writer continuously takes some pages from the modified
946 	queue, writes them back, and moves them to the active or inactive queue.
947 	It runs in its own thread, and is only there to keep the number
948 	of modified pages low, so that more pages can be reused at a
949 	lower cost.
950 */
951 status_t
952 page_writer(void* /*unused*/)
953 {
954 	vm_page marker;
955 	marker.type = PAGE_TYPE_DUMMY;
956 	marker.cache = NULL;
957 	marker.state = PAGE_STATE_UNUSED;
958 
959 	while (true) {
960 		if (sModifiedPageQueue.count - sModifiedTemporaryPages < 1024) {
961 			int32 count = 0;
962 			get_sem_count(sWriterWaitSem, &count);
963 			if (count == 0)
964 				count = 1;
965 
966 			acquire_sem_etc(sWriterWaitSem, count, B_RELATIVE_TIMEOUT, 3000000);
967 				// wake up every 3 seconds when no one triggers us
968 		}
969 
970 		const uint32 kNumPages = 32;
971 		ConditionVariable busyConditions[kNumPages];
972 		union {
973 			vm_page *pages[kNumPages];
974 			vm_cache *caches[kNumPages];
975 		} u;
976 		uint32 numPages = 0;
977 
978 		// TODO: once the I/O scheduler is there, we should write
979 		// a lot more pages back.
980 		// TODO: make this laptop friendly, too (ie. only start doing
981 		// something if someone else did something or there is really
982 		// enough to do).
983 
984 		// collect pages to be written
985 
986 		while (numPages < kNumPages) {
987 			vm_page *page = next_modified_page(marker);
988 			if (page == NULL)
989 				break;
990 
991 			PageCacheLocker cacheLocker(page, false);
992 			if (!cacheLocker.IsLocked())
993 				continue;
994 
995 			vm_cache *cache = page->cache;
996 			// TODO: write back temporary ones as soon as we have swap file support
997 			if (cache->temporary/* && vm_low_memory_state() == B_NO_LOW_MEMORY*/)
998 				continue;
999 
1000 			if (cache->store->ops->acquire_unreferenced_ref != NULL) {
1001 				// we need our own reference to the store, as it might
1002 				// currently be in the process of being destroyed
1003 				if (cache->store->ops->acquire_unreferenced_ref(cache->store)
1004 						!= B_OK) {
1005 					cacheLocker.Unlock();
1006 					thread_yield(true);
1007 					continue;
1008 				}
1009 			}
1010 
1011 			InterruptsSpinLocker locker(sPageLock);
1012 
1013 			// the state might have changed while we were locking the cache
1014 			if (page->state != PAGE_STATE_MODIFIED)
1015 				continue;
1016 
1017 			remove_page_from_queue(&sModifiedPageQueue, page);
1018 			page->state = PAGE_STATE_BUSY;
1019 			page->busy_writing = true;
1020 
1021 			busyConditions[numPages].Publish(page, "page");
1022 
1023 			locker.Unlock();
1024 
1025 			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
1026 			vm_clear_map_flags(page, PAGE_MODIFIED);
1027 			vm_cache_acquire_ref(cache);
1028 			u.pages[numPages++] = page;
1029 		}
1030 
1031 		if (numPages == 0)
1032 			continue;
1033 
1034 		// write pages to disk
1035 
1036 		// TODO: put this as requests into the I/O scheduler
1037 		status_t writeStatus[kNumPages];
1038 		for (uint32 i = 0; i < numPages; i++) {
1039 			writeStatus[i] = write_page(u.pages[i], false);
1040 		}
1041 
1042 		// mark pages depending on whether they could be written or not
1043 
1044 		for (uint32 i = 0; i < numPages; i++) {
1045 			vm_cache *cache = u.pages[i]->cache;
1046 			mutex_lock(&cache->lock);
1047 
1048 			if (writeStatus[i] == B_OK) {
1049 				// put it into the active queue
1050 				InterruptsSpinLocker locker(sPageLock);
1051 				move_page_to_active_or_inactive_queue(u.pages[i], true);
1052 				u.pages[i]->busy_writing = false;
1053 			} else {
1054 				// We don't have to put the PAGE_MODIFIED bit back, as it's
1055 				// still in the modified pages list.
1056 				{
1057 					InterruptsSpinLocker locker(sPageLock);
1058 					u.pages[i]->state = PAGE_STATE_MODIFIED;
1059 					enqueue_page(&sModifiedPageQueue, u.pages[i]);
1060 				}
1061 				if (!u.pages[i]->busy_writing) {
1062 					// someone has cleared the busy_writing flag which tells
1063 					// us our page has gone invalid
1064 					vm_cache_remove_page(cache, u.pages[i]);
1065 				} else
1066 					u.pages[i]->busy_writing = false;
1067 			}
1068 
1069 			busyConditions[i].Unpublish();
1070 
1071 			u.caches[i] = cache;
1072 			mutex_unlock(&cache->lock);
1073 		}
1074 
1075 		for (uint32 i = 0; i < numPages; i++) {
1076 			vm_cache *cache = u.caches[i];
1077 
1078 			// We release the cache references after all pages were made
1079 			// unbusy again - otherwise releasing a vnode could deadlock.
1080 			if (cache->store->ops->release_ref != NULL)
1081 				cache->store->ops->release_ref(cache->store);
1082 			vm_cache_release_ref(cache);
1083 		}
1084 	}
1085 
1086 	remove_page_marker(marker);
1087 	return B_OK;
1088 }
1089 
1090 
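/*!	Looks for the next page that may be stolen, starting either at the head of
	the inactive queue or right after \a marker, and inserts the marker behind
	the candidate found. Pages of the active queue are only considered if
	\a stealActive is true (and only if they are not wired).
*/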
1091 static vm_page *
1092 find_page_candidate(struct vm_page &marker, bool stealActive)
1093 {
1094 	InterruptsSpinLocker locker(sPageLock);
1095 	page_queue *queue;
1096 	vm_page *page;
1097 
1098 	if (marker.state == PAGE_STATE_UNUSED) {
1099 		// Get the first page of the (in)active queue
1100 		queue = &sInactivePageQueue;
1101 		page = sInactivePageQueue.head;
1102 		if (page == NULL && stealActive) {
1103 			queue = &sActivePageQueue;
1104 			page = sActivePageQueue.head;
1105 		}
1106 	} else {
1107 		// Get the next page of the current queue
1108 		if (marker.state == PAGE_STATE_INACTIVE)
1109 			queue = &sInactivePageQueue;
1110 		else if (marker.state == PAGE_STATE_ACTIVE)
1111 			queue = &sActivePageQueue;
1112 		else {
1113 			panic("invalid marker %p state", &marker);
1114 			queue = NULL;
1115 		}
1116 
1117 		page = marker.queue_next;
1118 		remove_page_from_queue(queue, &marker);
1119 		marker.state = PAGE_STATE_UNUSED;
1120 	}
1121 
1122 	while (page != NULL) {
1123 		if (!is_marker_page(page)
1124 			&& (page->state == PAGE_STATE_INACTIVE
1125 				|| (stealActive && page->state == PAGE_STATE_ACTIVE
1126 					&& page->wired_count == 0))) {
1127 			// we found a candidate, insert marker
1128 			marker.state = queue == &sActivePageQueue
1129 				? PAGE_STATE_ACTIVE : PAGE_STATE_INACTIVE;
1130 			insert_page_after(queue, page, &marker);
1131 			return page;
1132 		}
1133 
1134 		page = page->queue_next;
1135 		if (page == NULL && stealActive && queue != &sActivePageQueue) {
1136 			queue = &sActivePageQueue;
1137 			page = sActivePageQueue.head;
1138 		}
1139 	}
1140 
1141 	return NULL;
1142 }
1143 
1144 
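/*!	Tries to steal the given candidate page from its cache: it locks the
	cache, re-checks that the page is still a candidate, unmaps it, and - if
	the page was neither modified nor accessed in the meantime - removes it
	from its cache and its page queue. Returns whether the page was stolen.
*/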
1145 static bool
1146 steal_page(vm_page *page, bool stealActive)
1147 {
1148 	// try to lock the page's cache
1149 
1150 	class PageCacheTryLocker {
1151 	public:
1152 		PageCacheTryLocker(vm_page *page)
1153 			:
1154 			fIsLocked(false),
1155 			fOwnsLock(false)
1156 		{
1157 			fCache = vm_cache_acquire_page_cache_ref(page);
1158 			if (fCache != NULL) {
1159 				if (mutex_trylock(&fCache->lock) != B_OK)
1160 					return;
1161 
1162 				fOwnsLock = true;
1163 
1164 				if (fCache == page->cache)
1165 					fIsLocked = true;
1166 			}
1167 		}
1168 
1169 		~PageCacheTryLocker()
1170 		{
1171 			if (fOwnsLock)
1172 				mutex_unlock(&fCache->lock);
1173 			if (fCache != NULL)
1174 				vm_cache_release_ref(fCache);
1175 		}
1176 
1177 		bool IsLocked() { return fIsLocked; }
1178 
1179 	private:
1180 		vm_cache *fCache;
1181 		bool fIsLocked;
1182 		bool fOwnsLock;
1183 	} cacheLocker(page);
1184 
1185 	if (!cacheLocker.IsLocked())
1186 		return false;
1187 
1188 	// check again if that page is still a candidate
1189 	if (page->state != PAGE_STATE_INACTIVE
1190 		&& (!stealActive || page->state != PAGE_STATE_ACTIVE
1191 			|| page->wired_count != 0))
1192 		return false;
1193 
1194 	// recheck for any last-minute changes
1195 	uint32 flags;
1196 	vm_remove_all_page_mappings(page, &flags);
1197 	if ((flags & PAGE_MODIFIED) != 0) {
1198 		// page was modified, don't steal it
1199 		vm_page_set_state(page, PAGE_STATE_MODIFIED);
1200 		return false;
1201 	} else if ((flags & PAGE_ACCESSED) != 0) {
1202 		// page is in active use, don't steal it
1203 		vm_page_set_state(page, PAGE_STATE_ACTIVE);
1204 		return false;
1205 	}
1206 
1207 	// we can now steal this page
1208 
1209 	//dprintf("  steal page %p from cache %p%s\n", page, page->cache,
1210 	//	page->state == PAGE_STATE_INACTIVE ? "" : " (ACTIVE)");
1211 
1212 	vm_cache_remove_page(page->cache, page);
1213 
1214 	InterruptsSpinLocker _(sPageLock);
1215 	remove_page_from_queue(page->state == PAGE_STATE_ACTIVE
1216 		? &sActivePageQueue : &sInactivePageQueue, page);
1217 	return true;
1218 }
1219 
1220 
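/*!	Steals up to \a count pages. If \a reserve is true, the stolen pages are
	put back onto the free queue (to back a reservation); otherwise the stolen
	pages are returned via the \a pages array. If not enough pages can be
	stolen right away, the function notifies the low memory handlers and waits
	on the free page condition variable.
*/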
1221 static size_t
1222 steal_pages(vm_page **pages, size_t count, bool reserve)
1223 {
1224 	size_t maxCount = count;
1225 
1226 	while (true) {
1227 		vm_page marker;
1228 		marker.type = PAGE_TYPE_DUMMY;
1229 		marker.cache = NULL;
1230 		marker.state = PAGE_STATE_UNUSED;
1231 
1232 		bool tried = false;
1233 		size_t stolen = 0;
1234 
1235 		while (count > 0) {
1236 			vm_page *page = find_page_candidate(marker, false);
1237 			if (page == NULL)
1238 				break;
1239 
1240 			if (steal_page(page, false)) {
1241 				if (reserve || stolen >= maxCount) {
1242 					InterruptsSpinLocker _(sPageLock);
1243 					enqueue_page(&sFreePageQueue, page);
1244 					page->state = PAGE_STATE_FREE;
1245 
1246 					T(StolenPage());
1247 				} else if (stolen < maxCount) {
1248 					pages[stolen] = page;
1249 				}
1250 				stolen++;
1251 				count--;
1252 			} else
1253 				tried = true;
1254 		}
1255 
1256 		remove_page_marker(marker);
1257 
1258 		InterruptsSpinLocker locker(sPageLock);
1259 
1260 		if ((reserve && sReservedPages <= free_page_queue_count())
1261 			|| count == 0
1262 			|| (!reserve && (sInactivePageQueue.count > 0
1263 				|| free_page_queue_count() > sReservedPages)))
1264 			return stolen;
1265 
1266 		if (stolen && !tried && sInactivePageQueue.count > 0) {
1267 			count++;
1268 			continue;
1269 		}
1270 		if (tried) {
1271 			// We tried all potential pages, but one or more couldn't be stolen
1272 			// at that time (likely because their cache was locked). No one
1273 			// else will have any better luck, so we'll just retry a little
1274 			// later.
1275 			// TODO: Think about better strategies. E.g. if our condition
1276 			// variables had timeouts, we could just wait with timeout on
1277 			// the free page queue condition variable, which might
1278 			// succeed earlier.
1279 			locker.Unlock();
1280 			snooze(10000);
1281 			continue;
1282 		}
1283 
1284 		// we need to wait for pages to become inactive
1285 
1286 		ConditionVariableEntry freeConditionEntry;
1287 		sPageDeficit++;
1288 		freeConditionEntry.Add(&sFreePageQueue);
1289 		locker.Unlock();
1290 
1291 		vm_low_memory(count);
1292 		//snooze(50000);
1293 			// sleep for 50ms
1294 
1295 		freeConditionEntry.Wait();
1296 
1297 		locker.Lock();
1298 		sPageDeficit--;
1299 
1300 		if (reserve && sReservedPages <= free_page_queue_count())
1301 			return stolen;
1302 	}
1303 }
1304 
1305 
1306 //	#pragma mark - private kernel API
1307 
1308 
1309 /*!	Writes a range of modified pages of a cache to disk.
1310 	You need to hold the vm_cache lock when calling this function.
1311 	Note that the cache lock is temporarily released in this function.
1312 	\param cache The cache.
1313 	\param firstPage Offset (in page size units) of the first page in the range.
1314 	\param endPage End offset (in page size units) of the page range. The page
1315 		at this offset is not included.
1316 */
1317 status_t
1318 vm_page_write_modified_page_range(struct vm_cache *cache, uint32 firstPage,
1319 	uint32 endPage, bool fsReenter)
1320 {
1321 	// TODO: join adjacent pages into one vec list
1322 
1323 	for (vm_page *page = cache->page_list; page; page = page->cache_next) {
1324 		bool dequeuedPage = false;
1325 
1326 		if (page->cache_offset < firstPage || page->cache_offset >= endPage)
1327 			continue;
1328 
1329 		if (page->state == PAGE_STATE_MODIFIED) {
1330 			InterruptsSpinLocker locker(&sPageLock);
1331 			remove_page_from_queue(&sModifiedPageQueue, page);
1332 			dequeuedPage = true;
1333 		} else if (page->state == PAGE_STATE_BUSY
1334 				|| !vm_test_map_modification(page)) {
1335 			continue;
1336 		}
1337 
1338 		page->state = PAGE_STATE_BUSY;
1339 		page->busy_writing = true;
1340 
1341 		ConditionVariable busyCondition;
1342 		busyCondition.Publish(page, "page");
1343 
1344 		// We have a modified page - however, while we're writing it back,
1345 		// the page is still mapped. In order not to lose any changes to the
1346 		// page, we mark it clean before actually writing it back; if writing
1347 		// the page fails for some reason, we just keep it in the modified page
1348 		// list, but that should happen only rarely.
1349 
1350 		// If the page is changed after we cleared the dirty flag, but before we
1351 		// had the chance to write it back, then we'll write it again later -
1352 		// that will probably not happen that often, though.
1353 
1354 		// clear the modified flag
1355 		vm_clear_map_flags(page, PAGE_MODIFIED);
1356 
1357 		mutex_unlock(&cache->lock);
1358 		status_t status = write_page(page, fsReenter);
1359 		mutex_lock(&cache->lock);
1360 
1361 		InterruptsSpinLocker locker(&sPageLock);
1362 
1363 		if (status == B_OK) {
1364 			// put it into the active/inactive queue
1365 			move_page_to_active_or_inactive_queue(page, dequeuedPage);
1366 			page->busy_writing = false;
1367 		} else {
1368 			// We don't have to put the PAGE_MODIFIED bit back, as it's still
1369 			// in the modified pages list.
1370 			if (dequeuedPage) {
1371 				page->state = PAGE_STATE_MODIFIED;
1372 				enqueue_page(&sModifiedPageQueue, page);
1373 			}
1374 
1375 			if (!page->busy_writing) {
1376 				// someone has cleared the busy_writing flag which tells
1377 				// us our page has gone invalid
1378 				vm_cache_remove_page(cache, page);
1379 			} else {
1380 				if (!dequeuedPage)
1381 					set_page_state_nolock(page, PAGE_STATE_MODIFIED);
1382 
1383 				page->busy_writing = false;
1384 			}
1385 		}
1386 
1387 		busyCondition.Unpublish();
1388 	}
1389 
1390 	return B_OK;
1391 }
1392 
1393 
1394 /*!	You need to hold the vm_cache lock when calling this function.
1395 	Note that the cache lock is temporarily released in this function.
1396 */
1397 status_t
1398 vm_page_write_modified_pages(vm_cache *cache, bool fsReenter)
1399 {
1400 	return vm_page_write_modified_page_range(cache, 0,
1401 		(cache->virtual_size + B_PAGE_SIZE - 1) >> PAGE_SHIFT, fsReenter);
1402 }
1403 
1404 
1405 /*!	Schedules the page writer to write back the specified \a page.
1406 	Note, however, that it might not do this immediately, and it can well
1407 	take several seconds until the page is actually written out.
1408 */
1409 void
1410 vm_page_schedule_write_page(vm_page *page)
1411 {
1412 	ASSERT(page->state == PAGE_STATE_MODIFIED);
1413 
1414 	vm_page_requeue(page, false);
1415 
1416 	release_sem_etc(sWriterWaitSem, 1, B_DO_NOT_RESCHEDULE);
1417 }
1418 
1419 
1420 /*!	Cache must be locked.
1421 */
1422 void
1423 vm_page_schedule_write_page_range(struct vm_cache *cache, uint32 firstPage,
1424 	uint32 endPage)
1425 {
1426 	uint32 modified = 0;
1427 	for (vm_page *page = cache->page_list; page; page = page->cache_next) {
1430 		if (page->cache_offset >= firstPage && page->cache_offset < endPage
1431 			&& page->state == PAGE_STATE_MODIFIED) {
1432 			vm_page_requeue(page, false);
1433 			modified++;
1434 		}
1435 	}
1436 
1437 	if (modified > 0)
1438 		release_sem_etc(sWriterWaitSem, 1, B_DO_NOT_RESCHEDULE);
1439 }
1440 
1441 
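/*!	Computes the first physical page number and the total number of physical
	pages this allocator will manage, based on the physical memory ranges
	reported by the boot loader.
*/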
1442 void
1443 vm_page_init_num_pages(kernel_args *args)
1444 {
1445 	uint32 i;
1446 
1447 	// calculate the size of memory by looking at the physical_memory_range array
1448 	addr_t physicalPagesEnd = 0;
1449 	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
1450 
1451 	for (i = 0; i < args->num_physical_memory_ranges; i++) {
1452 		physicalPagesEnd = (args->physical_memory_range[i].start
1453 			+ args->physical_memory_range[i].size) / B_PAGE_SIZE;
1454 	}
1455 
1456 	TRACE(("first phys page = 0x%lx, end 0x%lx\n", sPhysicalPageOffset,
1457 		physicalPagesEnd));
1458 
1459 	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
1460 }
1461 
1462 
1463 status_t
1464 vm_page_init(kernel_args *args)
1465 {
1466 	TRACE(("vm_page_init: entry\n"));
1467 
1468 	// map in the new free page table
1469 	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
1470 		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
1471 
1472 	TRACE(("vm_init: putting free_page_table @ %p, # ents %d (size 0x%x)\n",
1473 		sPages, sNumPages, (unsigned int)(sNumPages * sizeof(vm_page))));
1474 
1475 	// initialize the free page table
1476 	for (uint32 i = 0; i < sNumPages; i++) {
1477 		sPages[i].physical_page_number = sPhysicalPageOffset + i;
1478 		sPages[i].type = PAGE_TYPE_PHYSICAL;
1479 		sPages[i].state = PAGE_STATE_FREE;
1480 		new(&sPages[i].mappings) vm_page_mappings();
1481 		sPages[i].wired_count = 0;
1482 		sPages[i].usage_count = 0;
1483 		sPages[i].busy_writing = false;
1484 		sPages[i].cache = NULL;
1485 		#ifdef DEBUG_PAGE_QUEUE
1486 			sPages[i].queue = NULL;
1487 		#endif
1488 		#ifdef DEBUG_PAGE_CACHE_TRANSITIONS
1489 			sPages[i].debug_flags = 0;
1490 			sPages[i].collided_page = NULL;
1491 		#endif	// DEBUG_PAGE_CACHE_TRANSITIONS
1492 		enqueue_page(&sFreePageQueue, &sPages[i]);
1493 	}
1494 
1495 	TRACE(("initialized table\n"));
1496 
1497 	// mark some of the page ranges inuse
1498 	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
1499 		vm_mark_page_range_inuse(args->physical_allocated_range[i].start / B_PAGE_SIZE,
1500 			args->physical_allocated_range[i].size / B_PAGE_SIZE);
1501 	}
1502 
1503 	TRACE(("vm_page_init: exit\n"));
1504 
1505 	return B_OK;
1506 }
1507 
1508 
1509 status_t
1510 vm_page_init_post_area(kernel_args *args)
1511 {
1512 	void *dummy;
1513 
1514 	dummy = sPages;
1515 	create_area("page structures", &dummy, B_EXACT_ADDRESS,
1516 		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
1517 		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
1518 
1519 	add_debugger_command("page_stats", &dump_page_stats, "Dump statistics about page usage");
1520 	add_debugger_command("page", &dump_page, "Dump page info");
1521 	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
1522 	add_debugger_command("find_page", &find_page,
1523 		"Find out which queue a page is actually in");
1524 
1525 	return B_OK;
1526 }
1527 
1528 
1529 status_t
1530 vm_page_init_post_thread(kernel_args *args)
1531 {
1532 	new (&sFreePageCondition) ConditionVariable;
1533 	sFreePageCondition.Publish(&sFreePageQueue, "free page");
1534 
1535 	// create a kernel thread to clear out pages
1536 
1537 	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
1538 		B_LOWEST_ACTIVE_PRIORITY, NULL);
1539 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
1540 
1541 	// start page writer
1542 
1543 	sWriterWaitSem = create_sem(0, "page writer");
1544 
1545 	thread = spawn_kernel_thread(&page_writer, "page writer",
1546 		B_NORMAL_PRIORITY + 1, NULL);
1547 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
1548 
1549 	return B_OK;
1550 }
1551 
1552 
1553 status_t
1554 vm_mark_page_inuse(addr_t page)
1555 {
1556 	return vm_mark_page_range_inuse(page, 1);
1557 }
1558 
1559 
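/*!	Marks the given range of physical pages (page numbers, length in pages)
	as in use, i.e. takes free and clear pages in the range out of the free
	pool by setting their state to PAGE_STATE_UNUSED.
*/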
1560 status_t
1561 vm_mark_page_range_inuse(addr_t startPage, addr_t length)
1562 {
1563 	TRACE(("vm_mark_page_range_inuse: start 0x%lx, len 0x%lx\n",
1564 		startPage, length));
1565 
1566 	if (sPhysicalPageOffset > startPage) {
1567 		TRACE(("vm_mark_page_range_inuse: start page %ld is before free list\n",
1568 			startPage));
1569 		return B_BAD_VALUE;
1570 	}
1571 	startPage -= sPhysicalPageOffset;
1572 	if (startPage + length > sNumPages) {
1573 		TRACE(("vm_mark_page_range_inuse: range would extend past free list\n"));
1574 		return B_BAD_VALUE;
1575 	}
1576 
1577 	cpu_status state = disable_interrupts();
1578 	acquire_spinlock(&sPageLock);
1579 
1580 	for (addr_t i = 0; i < length; i++) {
1581 		vm_page *page = &sPages[startPage + i];
1582 		switch (page->state) {
1583 			case PAGE_STATE_FREE:
1584 			case PAGE_STATE_CLEAR:
1585 				set_page_state_nolock(page, PAGE_STATE_UNUSED);
1586 				break;
1587 			case PAGE_STATE_WIRED:
1588 				break;
1589 			case PAGE_STATE_ACTIVE:
1590 			case PAGE_STATE_INACTIVE:
1591 			case PAGE_STATE_BUSY:
1592 			case PAGE_STATE_MODIFIED:
1593 			case PAGE_STATE_UNUSED:
1594 			default:
1595 				// uh
1596 				dprintf("vm_mark_page_range_inuse: page 0x%lx in non-free state %d!\n",
1597 					startPage + i, page->state);
1598 				break;
1599 		}
1600 	}
1601 
1602 	release_spinlock(&sPageLock);
1603 	restore_interrupts(state);
1604 
1605 	return B_OK;
1606 }
1607 
1608 
1609 /*!	Unreserve pages previously reserved with vm_page_reserve_pages().
1610 	Note, you specify the same \a count here that you specified when
1611 	reserving the pages - you don't need to keep track how many pages
1612 	you actually needed of that upfront allocation.
1613 */
1614 void
1615 vm_page_unreserve_pages(uint32 count)
1616 {
1617 	if (count == 0)
1618 		return;
1619 
1620 	InterruptsSpinLocker locker(sPageLock);
1621 	ASSERT(sReservedPages >= count);
1622 
1623 	T(UnreservePages(count));
1624 
1625 	sReservedPages -= count;
1626 
1627 	if (sPageDeficit > 0)
1628 		sFreePageCondition.NotifyAll();
1629 }
1630 
1631 
1632 /*!	With this call, you can reserve a number of free pages in the system.
1633 	They will only be handed out to someone who has actually reserved them.
1634 	This call returns as soon as the number of requested pages has been
1635 	reached.
1636 */
1637 void
1638 vm_page_reserve_pages(uint32 count)
1639 {
1640 	if (count == 0)
1641 		return;
1642 
1643 	InterruptsSpinLocker locker(sPageLock);
1644 
1645 	T(ReservePages(count));
1646 
1647 	sReservedPages += count;
1648 	size_t freePages = free_page_queue_count();
1649 	if (sReservedPages <= freePages)
1650 		return;
1651 
1652 	locker.Unlock();
1653 
1654 	steal_pages(NULL, count + 1, true);
1655 		// we get one more, just in case we can do something someone
1656 		// else can't
1657 }
1658 
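// A minimal usage sketch for the reservation API above (for illustration
// only; this sequence does not appear in this file): a caller that needs a
// few pages up front reserves them, allocates with the "reserved" flag set,
// and later unreserves the same count it originally reserved, no matter how
// many pages it actually used.
//
//	vm_page_reserve_pages(4);
//	for (int32 i = 0; i < 4; i++) {
//		vm_page *page = vm_page_allocate_page(PAGE_STATE_CLEAR, true);
//		// ... map and use the page ...
//	}
//	vm_page_unreserve_pages(4);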
1659 
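/*!	Allocates a single page, preferring the queue that matches \a pageState
	(free or clear); the page is zeroed if PAGE_STATE_CLEAR was requested.
	The returned page is marked busy and put into the active queue. If
	\a reserved is true, the caller must have reserved the page beforehand
	via vm_page_reserve_pages(); otherwise the function may steal pages or
	wait until a page becomes available.
*/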
1660 vm_page *
1661 vm_page_allocate_page(int pageState, bool reserved)
1662 {
1663 	ConditionVariableEntry freeConditionEntry;
1664 	page_queue *queue;
1665 	page_queue *otherQueue;
1666 
1667 	switch (pageState) {
1668 		case PAGE_STATE_FREE:
1669 			queue = &sFreePageQueue;
1670 			otherQueue = &sClearPageQueue;
1671 			break;
1672 		case PAGE_STATE_CLEAR:
1673 			queue = &sClearPageQueue;
1674 			otherQueue = &sFreePageQueue;
1675 			break;
1676 		default:
1677 			return NULL; // invalid
1678 	}
1679 
1680 	InterruptsSpinLocker locker(sPageLock);
1681 
1682 	T(AllocatePage(reserved));
1683 
1684 	vm_page *page = NULL;
1685 	while (true) {
1686 		if (reserved || sReservedPages < free_page_queue_count()) {
1687 			page = dequeue_page(queue);
1688 			if (page == NULL) {
1689 #ifdef DEBUG
1690 				if (queue->count != 0)
1691 					panic("queue %p corrupted, count = %d\n", queue, queue->count);
1692 #endif
1693 
1694 				// if the primary queue was empty, grab the page from the
1695 				// secondary queue
1696 				page = dequeue_page(otherQueue);
1697 			}
1698 		}
1699 
1700 		if (page != NULL)
1701 			break;
1702 
1703 		if (reserved)
1704 			panic("Had reserved page, but there is none!");
1705 
1706 		// steal one from the inactive list
1707 		locker.Unlock();
1708 		size_t stolen = steal_pages(&page, 1, false);
1709 		locker.Lock();
1710 
1711 		if (stolen > 0)
1712 			break;
1713 	}
1714 
1715 	if (page->cache != NULL)
1716 		panic("supposed to be free page %p has cache\n", page);
1717 
1718 	int oldPageState = page->state;
1719 	page->state = PAGE_STATE_BUSY;
1720 	page->usage_count = 2;
1721 
1722 	enqueue_page(&sActivePageQueue, page);
1723 
1724 	locker.Unlock();
1725 
1726 	// if a clear page was requested, but the one we got was not yet cleared, zero it out now
1727 	if (pageState == PAGE_STATE_CLEAR && oldPageState != PAGE_STATE_CLEAR)
1728 		clear_page(page);
1729 
1730 	return page;
1731 }
1732 
1733 
1734 /*!	Allocates a number of pages and puts their pointers into the provided
1735 	array. All pages are marked busy.
1736 	Returns B_OK on success, and B_NO_MEMORY when there aren't any free
1737 	pages left to allocate.
1738 */
1739 status_t
1740 vm_page_allocate_pages(int pageState, vm_page **pages, uint32 numPages)
1741 {
1742 	uint32 i;
1743 
1744 	for (i = 0; i < numPages; i++) {
1745 		pages[i] = vm_page_allocate_page(pageState, false);
1746 		if (pages[i] == NULL) {
1747 			// allocation failed, we need to free what we already have
1748 			while (i-- > 0)
1749 				vm_page_set_state(pages[i], pageState);
1750 
1751 			return B_NO_MEMORY;
1752 		}
1753 	}
1754 
1755 	return B_OK;
1756 }
1757 
1758 
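/*!	Allocates a physically contiguous run of \a length pages. All pages of
	the run are marked busy; if \a pageState is PAGE_STATE_CLEAR, pages that
	did not already come from the clear queue are zeroed out. Returns NULL if
	no suitable run could be found.
*/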
1759 vm_page *
1760 vm_page_allocate_page_run(int pageState, addr_t length)
1761 {
1762 	vm_page *firstPage = NULL;
1763 	uint32 start = 0;
1764 
1765 	InterruptsSpinLocker locker(sPageLock);
1766 
1767 	if (sFreePageQueue.count + sClearPageQueue.count < length + sReservedPages) {
1768 		// TODO: add more tries, ie. free some inactive, ...
1769 		// no free space
1770 		return NULL;
1771 	}
1772 
1773 	for (;;) {
1774 		bool foundRun = true;
1775 		if (start + length > sNumPages)
1776 			break;
1777 
1778 		uint32 i;
1779 		for (i = 0; i < length; i++) {
1780 			if (sPages[start + i].state != PAGE_STATE_FREE
1781 				&& sPages[start + i].state != PAGE_STATE_CLEAR) {
1782 				foundRun = false;
1783 				i++;
1784 				break;
1785 			}
1786 		}
1787 		if (foundRun) {
1788 			// pull the pages out of the appropriate queues
1789 			for (i = 0; i < length; i++) {
1790 				sPages[start + i].is_cleared
1791 					= sPages[start + i].state == PAGE_STATE_CLEAR;
1792 				set_page_state_nolock(&sPages[start + i], PAGE_STATE_BUSY);
1793 				sPages[start + i].usage_count = 2;
1794 			}
1795 			firstPage = &sPages[start];
1796 			break;
1797 		} else {
1798 			start += i;
1799 		}
1800 	}
1801 
1802 	T(AllocatePageRun(length));
1803 
1804 	locker.Unlock();
1805 
1806 	if (firstPage != NULL && pageState == PAGE_STATE_CLEAR) {
1807 		for (uint32 i = 0; i < length; i++) {
1808 			if (!sPages[start + i].is_cleared)
1809 	 			clear_page(&sPages[start + i]);
1810 		}
1811 	}
1812 
1813 	return firstPage;
1814 }
1815 
1816 
1817 vm_page *
1818 vm_page_at_index(int32 index)
1819 {
1820 	return &sPages[index];
1821 }
1822 
1823 
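/*!	Returns the vm_page structure for the given physical page number, or NULL
	if the number lies outside the range managed by this allocator.
*/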
1824 vm_page *
1825 vm_lookup_page(addr_t pageNumber)
1826 {
1827 	if (pageNumber < sPhysicalPageOffset)
1828 		return NULL;
1829 
1830 	pageNumber -= sPhysicalPageOffset;
1831 	if (pageNumber >= sNumPages)
1832 		return NULL;
1833 
1834 	return &sPages[pageNumber];
1835 }
1836 
1837 
1838 /*!	Frees a page that belonged to a certain cache.
1839 	You can use vm_page_set_state() manually if you prefer, but only
1840 	if the page's state is not PAGE_STATE_MODIFIED.
1841 */
1842 void
1843 vm_page_free(vm_cache *cache, vm_page *page)
1844 {
1845 	InterruptsSpinLocker _(sPageLock);
1846 
1847 	if (page->cache == NULL && page->state == PAGE_STATE_MODIFIED
1848 		&& cache->temporary)
1849 		sModifiedTemporaryPages--;
1850 
1851 	set_page_state_nolock(page, PAGE_STATE_FREE);
1852 }
1853 
1854 
1855 status_t
1856 vm_page_set_state(vm_page *page, int pageState)
1857 {
1858 	InterruptsSpinLocker _(sPageLock);
1859 
1860 	return set_page_state_nolock(page, pageState);
1861 }
1862 
1863 
1864 /*!	Moves a page to either the tail or the head of its current queue,
1865 	depending on \a tail.
1866 */
1867 void
1868 vm_page_requeue(struct vm_page *page, bool tail)
1869 {
1870 	InterruptsSpinLocker _(sPageLock);
1871 	page_queue *queue = NULL;
1872 
1873 	switch (page->state) {
1874 		case PAGE_STATE_BUSY:
1875 		case PAGE_STATE_ACTIVE:
1876 		case PAGE_STATE_WIRED:
1877 		case PAGE_STATE_UNUSED:
1878 			queue = &sActivePageQueue;
1879 			break;
1880 		case PAGE_STATE_INACTIVE:
1881 			queue = &sInactivePageQueue;
1882 			break;
1883 		case PAGE_STATE_MODIFIED:
1884 			queue = &sModifiedPageQueue;
1885 			break;
1886 		case PAGE_STATE_FREE:
1887 			queue = &sFreePageQueue;
1888 			break;
1889 		case PAGE_STATE_CLEAR:
1890 			queue = &sClearPageQueue;
1891 			break;
1892 		default:
1893 			panic("vm_page_requeue: vm_page %p in invalid state %d\n",
1894 				page, page->state);
1895 			break;
1896 	}
1897 
1898 	remove_page_from_queue(queue, page);
1899 
1900 	if (tail)
1901 		enqueue_page(queue, page);
1902 	else
1903 		enqueue_page_to_head(queue, page);
1904 }
1905 
1906 
1907 size_t
1908 vm_page_num_pages(void)
1909 {
1910 	return sNumPages;
1911 }
1912 
1913 
1914 /*! There is a subtle distinction between the page counts returned by
1915 	this function and vm_page_num_free_pages():
1916 	The latter returns the number of pages that are completely uncommitted,
1917 	whereas this one also counts pages that can be made available by
1918 	reclaiming them (in other words, it treats things like cache pages
1919 	as available, too).
1920 */
1921 size_t
1922 vm_page_num_available_pages(void)
1923 {
1924 	return vm_available_memory() / B_PAGE_SIZE;
1925 }
1926 
1927 
1928 size_t
1929 vm_page_num_free_pages(void)
1930 {
1931 	size_t reservedPages = sReservedPages;
1932 	size_t count = free_page_queue_count() + sInactivePageQueue.count;
1933 	if (reservedPages > count)
1934 		return 0;
1935 
1936 	return count - reservedPages;
1937 }
1938 
1939