xref: /haiku/src/system/kernel/vm/vm_page.cpp (revision f75a7bf508f3156d63a14f8fd77c5e0ca4d08c42)
1 /*
2  * Copyright 2002-2008, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  *
5  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
6  * Distributed under the terms of the NewOS License.
7  */
8 
9 #include <signal.h>
10 #include <string.h>
11 #include <stdlib.h>
12 
13 #include <KernelExport.h>
14 #include <OS.h>
15 
16 #include <AutoDeleter.h>
17 
18 #include <arch/cpu.h>
19 #include <arch/vm_translation_map.h>
20 #include <block_cache.h>
21 #include <boot/kernel_args.h>
22 #include <condition_variable.h>
23 #include <kernel.h>
24 #include <low_resource_manager.h>
25 #include <thread.h>
26 #include <tracing.h>
27 #include <util/AutoLock.h>
28 #include <vfs.h>
29 #include <vm.h>
30 #include <vm_address_space.h>
31 #include <vm_priv.h>
32 #include <vm_page.h>
33 #include <vm_cache.h>
34 
35 #include "PageCacheLocker.h"
36 #include "io_requests.h"
37 
38 
39 //#define TRACE_VM_PAGE
40 #ifdef TRACE_VM_PAGE
41 #	define TRACE(x) dprintf x
42 #else
43 #	define TRACE(x) ;
44 #endif
45 
46 #define SCRUB_SIZE 16
47 	// this many pages will be cleared at once in the page scrubber thread
48 
49 // for debugging: limit the amount of available RAM (in MB)
50 //#define LIMIT_AVAILABLE_MEMORY	256
51 
52 
53 typedef struct page_queue {
54 	vm_page *head;
55 	vm_page *tail;
56 	uint32	count;
57 } page_queue;
58 
59 int32 gMappedPagesCount;
60 
61 static page_queue sFreePageQueue;
62 static page_queue sClearPageQueue;
63 static page_queue sModifiedPageQueue;
64 static page_queue sInactivePageQueue;
65 static page_queue sActivePageQueue;
66 
67 static vm_page *sPages;
68 static addr_t sPhysicalPageOffset;
69 static size_t sNumPages;
70 static size_t sReservedPages;
71 static vint32 sPageDeficit;
72 static size_t sModifiedTemporaryPages;
73 
74 static ConditionVariable sFreePageCondition;
75 static spinlock sPageLock;
76 
77 static sem_id sWriterWaitSem;
78 
79 
80 #if PAGE_ALLOCATION_TRACING
81 
82 namespace PageAllocationTracing {
83 
84 class ReservePages : public AbstractTraceEntry {
85 	public:
86 		ReservePages(uint32 count)
87 			:
88 			fCount(count)
89 		{
90 			Initialized();
91 		}
92 
93 		virtual void AddDump(TraceOutput& out)
94 		{
95 			out.Print("page reserve:   %lu", fCount);
96 		}
97 
98 	private:
99 		uint32		fCount;
100 };
101 
102 
103 class UnreservePages : public AbstractTraceEntry {
104 	public:
105 		UnreservePages(uint32 count)
106 			:
107 			fCount(count)
108 		{
109 			Initialized();
110 		}
111 
112 		virtual void AddDump(TraceOutput& out)
113 		{
114 			out.Print("page unreserve: %lu", fCount);
115 		}
116 
117 	private:
118 		uint32		fCount;
119 };
120 
121 
122 class AllocatePage : public AbstractTraceEntry {
123 	public:
124 		AllocatePage(bool reserved)
125 			:
126 			fReserved(reserved)
127 		{
128 			Initialized();
129 		}
130 
131 		virtual void AddDump(TraceOutput& out)
132 		{
133 			out.Print("page alloc");
134 			if (fReserved)
135 				out.Print(" reserved");
136 		}
137 
138 	private:
139 		bool		fReserved;
140 };
141 
142 
143 class AllocatePageRun : public AbstractTraceEntry {
144 	public:
145 		AllocatePageRun(uint32 length)
146 			:
147 			fLength(length)
148 		{
149 			Initialized();
150 		}
151 
152 		virtual void AddDump(TraceOutput& out)
153 		{
154 			out.Print("page alloc run: length: %ld", fLength);
155 		}
156 
157 	private:
158 		uint32		fLength;
159 };
160 
161 
162 class FreePage : public AbstractTraceEntry {
163 	public:
164 		FreePage()
165 		{
166 			Initialized();
167 		}
168 
169 		virtual void AddDump(TraceOutput& out)
170 		{
171 			out.Print("page free");
172 		}
173 };
174 
175 
176 class ScrubbingPages : public AbstractTraceEntry {
177 	public:
178 		ScrubbingPages(uint32 count)
179 			:
180 			fCount(count)
181 		{
182 			Initialized();
183 		}
184 
185 		virtual void AddDump(TraceOutput& out)
186 		{
187 			out.Print("page scrubbing: %lu", fCount);
188 		}
189 
190 	private:
191 		uint32		fCount;
192 };
193 
194 
195 class ScrubbedPages : public AbstractTraceEntry {
196 	public:
197 		ScrubbedPages(uint32 count)
198 			:
199 			fCount(count)
200 		{
201 			Initialized();
202 		}
203 
204 		virtual void AddDump(TraceOutput& out)
205 		{
206 			out.Print("page scrubbed:  %lu", fCount);
207 		}
208 
209 	private:
210 		uint32		fCount;
211 };
212 
213 
214 class StolenPage : public AbstractTraceEntry {
215 	public:
216 		StolenPage()
217 		{
218 			Initialized();
219 		}
220 
221 		virtual void AddDump(TraceOutput& out)
222 		{
223 			out.Print("page stolen");
224 		}
225 };
226 
227 }	// namespace PageAllocationTracing
228 
229 #	define T(x)	new(std::nothrow) PageAllocationTracing::x
230 
231 #else
232 #	define T(x)
233 #endif	// PAGE_ALLOCATION_TRACING
234 
235 
236 /*!	Dequeues a page from the head of the given queue */
237 static vm_page *
238 dequeue_page(page_queue *queue)
239 {
240 	vm_page *page;
241 
242 	page = queue->head;
243 	if (page != NULL) {
244 		if (queue->tail == page)
245 			queue->tail = NULL;
246 		if (page->queue_next != NULL)
247 			page->queue_next->queue_prev = NULL;
248 
249 		queue->head = page->queue_next;
250 		if (page->type != PAGE_TYPE_DUMMY)
251 			queue->count--;
252 
253 #ifdef DEBUG_PAGE_QUEUE
254 		if (page->queue != queue) {
255 			panic("dequeue_page(queue: %p): page %p thinks it is in queue "
256 				"%p", queue, page, page->queue);
257 		}
258 
259 		page->queue = NULL;
260 #endif	// DEBUG_PAGE_QUEUE
261 	}
262 
263 	return page;
264 }
265 
266 
267 /*!	Enqueues a page to the tail of the given queue */
268 static void
269 enqueue_page(page_queue *queue, vm_page *page)
270 {
271 #ifdef DEBUG_PAGE_QUEUE
272 	if (page->queue != NULL) {
273 		panic("enqueue_page(queue: %p, page: %p): page thinks it is "
274 			"already in queue %p", queue, page, page->queue);
275 	}
276 #endif	// DEBUG_PAGE_QUEUE
277 
278 	if (queue->tail != NULL)
279 		queue->tail->queue_next = page;
280 	page->queue_prev = queue->tail;
281 	queue->tail = page;
282 	page->queue_next = NULL;
283 	if (queue->head == NULL)
284 		queue->head = page;
285 	if (page->type != PAGE_TYPE_DUMMY)
286 		queue->count++;
287 
288 #ifdef DEBUG_PAGE_QUEUE
289 	page->queue = queue;
290 #endif
291 }
292 
293 
294 /*!	Enqueues a page to the head of the given queue */
295 static void
296 enqueue_page_to_head(page_queue *queue, vm_page *page)
297 {
298 #ifdef DEBUG_PAGE_QUEUE
299 	if (page->queue != NULL) {
300 		panic("enqueue_page_to_head(queue: %p, page: %p): page thinks it is "
301 			"already in queue %p", queue, page, page->queue);
302 	}
303 #endif	// DEBUG_PAGE_QUEUE
304 
305 	if (queue->head != NULL)
306 		queue->head->queue_prev = page;
307 	page->queue_next = queue->head;
308 	queue->head = page;
309 	page->queue_prev = NULL;
310 	if (queue->tail == NULL)
311 		queue->tail = page;
312 	if (page->type != PAGE_TYPE_DUMMY)
313 		queue->count++;
314 
315 #ifdef DEBUG_PAGE_QUEUE
316 	page->queue = queue;
317 #endif
318 }
319 
320 
321 static void
322 remove_page_from_queue(page_queue *queue, vm_page *page)
323 {
324 #ifdef DEBUG_PAGE_QUEUE
325 	if (page->queue != queue) {
326 		panic("remove_page_from_queue(queue: %p, page: %p): page thinks it "
327 			"is in queue %p", queue, page, page->queue);
328 	}
329 #endif	// DEBUG_PAGE_QUEUE
330 
331 	if (page->queue_next != NULL)
332 		page->queue_next->queue_prev = page->queue_prev;
333 	else
334 		queue->tail = page->queue_prev;
335 
336 	if (page->queue_prev != NULL)
337 		page->queue_prev->queue_next = page->queue_next;
338 	else
339 		queue->head = page->queue_next;
340 
341 	if (page->type != PAGE_TYPE_DUMMY)
342 		queue->count--;
343 
344 #ifdef DEBUG_PAGE_QUEUE
345 	page->queue = NULL;
346 #endif
347 }
348 
349 
350 /*!	Moves a page to the tail of the given queue, but only does so if
351 	the page is currently in another queue.
352 */
353 static void
354 move_page_to_queue(page_queue *fromQueue, page_queue *toQueue, vm_page *page)
355 {
356 	if (fromQueue != toQueue) {
357 		remove_page_from_queue(fromQueue, page);
358 		enqueue_page(toQueue, page);
359 	}
360 }
361 
362 
363 /*! Inserts \a page after the \a before page in the \a queue. */
364 static void
365 insert_page_after(page_queue *queue, vm_page *before, vm_page *page)
366 {
367 #ifdef DEBUG_PAGE_QUEUE
368 	if (page->queue != NULL) {
369 		panic("enqueue_page(queue: %p, page: %p): page thinks it is "
370 			"already in queue %p", queue, page, page->queue);
371 	}
372 #endif	// DEBUG_PAGE_QUEUE
373 
374 	if (before == NULL) {
375 		enqueue_page(queue, page);
376 		return;
377 	}
378 
379 	page->queue_next = before->queue_next;
380 	if (page->queue_next != NULL)
381 		page->queue_next->queue_prev = page;
382 	page->queue_prev = before;
383 	before->queue_next = page;
384 
385 	if (queue->tail == before)
386 		queue->tail = page;
387 
388 	if (page->type != PAGE_TYPE_DUMMY)
389 		queue->count++;
390 
391 #ifdef DEBUG_PAGE_QUEUE
392 	page->queue = queue;
393 #endif
394 }
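// (A minimal illustrative sketch, not taken from this file, of how the queue
// helpers above fit together; "somePage" and "markerPage" are hypothetical
// pages. The queues are intrusive doubly-linked lists threaded through
// vm_page::queue_next/queue_prev, and dummy marker pages may be linked in
// but are never counted:)
//
//     page_queue queue = { NULL, NULL, 0 };
//     enqueue_page(&queue, somePage);          // queue.count == 1
//     markerPage->type = PAGE_TYPE_DUMMY;
//     enqueue_page(&queue, markerPage);        // dummy page: count stays 1
//     vm_page *head = dequeue_page(&queue);    // returns somePage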
395 
396 
397 static int
398 find_page(int argc, char **argv)
399 {
400 	struct vm_page *page;
401 	addr_t address;
402 	int32 index = 1;
403 	int i;
404 
405 	struct {
406 		const char*	name;
407 		page_queue*	queue;
408 	} pageQueueInfos[] = {
409 		{ "free",		&sFreePageQueue },
410 		{ "clear",		&sClearPageQueue },
411 		{ "modified",	&sModifiedPageQueue },
412 		{ "active",		&sActivePageQueue },
413 		{ NULL, NULL }
414 	};
415 
416 	if (argc < 2
417 		|| strlen(argv[index]) <= 2
418 		|| argv[index][0] != '0'
419 		|| argv[index][1] != 'x') {
420 		kprintf("usage: find_page <address>\n");
421 		return 0;
422 	}
423 
424 	address = strtoul(argv[index], NULL, 0);
425 	page = (vm_page*)address;
426 
427 	for (i = 0; pageQueueInfos[i].name; i++) {
428 		vm_page* p = pageQueueInfos[i].queue->head;
429 		while (p) {
430 			if (p == page) {
431 				kprintf("found page %p in queue %p (%s)\n", page,
432 					pageQueueInfos[i].queue, pageQueueInfos[i].name);
433 				return 0;
434 			}
435 			p = p->queue_next;
436 		}
437 	}
438 
439 	kprintf("page %p isn't in any queue\n", page);
440 
441 	return 0;
442 }
443 
444 
445 const char *
446 page_state_to_string(int state)
447 {
448 	switch(state) {
449 		case PAGE_STATE_ACTIVE:
450 			return "active";
451 		case PAGE_STATE_INACTIVE:
452 			return "inactive";
453 		case PAGE_STATE_BUSY:
454 			return "busy";
455 		case PAGE_STATE_MODIFIED:
456 			return "modified";
457 		case PAGE_STATE_FREE:
458 			return "free";
459 		case PAGE_STATE_CLEAR:
460 			return "clear";
461 		case PAGE_STATE_WIRED:
462 			return "wired";
463 		case PAGE_STATE_UNUSED:
464 			return "unused";
465 		default:
466 			return "unknown";
467 	}
468 }
469 
470 
471 static int
472 dump_page(int argc, char **argv)
473 {
474 	struct vm_page *page;
475 	addr_t address;
476 	bool physical = false;
477 	int32 index = 1;
478 
479 	if (argc > 2) {
480 		if (!strcmp(argv[1], "-p")) {
481 			physical = true;
482 			index++;
483 		} else if (!strcmp(argv[1], "-v"))
484 			index++;
485 	}
486 
487 	if (argc < 2
488 		|| strlen(argv[index]) <= 2
489 		|| argv[index][0] != '0'
490 		|| argv[index][1] != 'x') {
491 		kprintf("usage: page [-p|-v] <address>\n"
492 			"  -v looks up a virtual address for the page, -p a physical address.\n"
493 			"  Default is to look for the page structure address directly.\n");
494 		return 0;
495 	}
496 
497 	address = strtoul(argv[index], NULL, 0);
498 
499 	if (index == 2) {
500 		if (!physical) {
501 			vm_address_space *addressSpace = vm_kernel_address_space();
502 			uint32 flags;
503 
504 			if (thread_get_current_thread()->team->address_space != NULL)
505 				addressSpace = thread_get_current_thread()->team->address_space;
506 
507 			addressSpace->translation_map.ops->query_interrupt(
508 				&addressSpace->translation_map, address, &address, &flags);
509 		}
510 		page = vm_lookup_page(address / B_PAGE_SIZE);
511 	} else
512 		page = (struct vm_page *)address;
513 
514 	kprintf("PAGE: %p\n", page);
515 	kprintf("queue_next,prev: %p, %p\n", page->queue_next, page->queue_prev);
516 	kprintf("physical_number: %lx\n", page->physical_page_number);
517 	kprintf("cache:           %p\n", page->cache);
518 	kprintf("cache_offset:    %ld\n", page->cache_offset);
519 	kprintf("cache_next:      %p\n", page->cache_next);
520 	kprintf("type:            %d\n", page->type);
521 	kprintf("state:           %s\n", page_state_to_string(page->state));
522 	kprintf("wired_count:     %d\n", page->wired_count);
523 	kprintf("usage_count:     %d\n", page->usage_count);
524 	kprintf("busy_writing:    %d\n", page->busy_writing);
525 	#ifdef DEBUG_PAGE_QUEUE
526 		kprintf("queue:           %p\n", page->queue);
527 	#endif
528 	#ifdef DEBUG_PAGE_CACHE_TRANSITIONS
529 		kprintf("debug_flags:     0x%lx\n", page->debug_flags);
530 		kprintf("collided page:   %p\n", page->collided_page);
531 	#endif	// DEBUG_PAGE_CACHE_TRANSITIONS
532 	kprintf("area mappings:\n");
533 
534 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
535 	vm_page_mapping *mapping;
536 	while ((mapping = iterator.Next()) != NULL) {
537 		kprintf("  %p (%#lx)\n", mapping->area, mapping->area->id);
538 		mapping = mapping->page_link.next;
539 	}
540 
541 	return 0;
542 }
543 
544 
545 static int
546 dump_page_queue(int argc, char **argv)
547 {
548 	struct page_queue *queue;
549 
550 	if (argc < 2) {
551 		kprintf("usage: page_queue <address/name> [list]\n");
552 		return 0;
553 	}
554 
555 	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
556 		queue = (struct page_queue *)strtoul(argv[1], NULL, 16);
557 	if (!strcmp(argv[1], "free"))
558 		queue = &sFreePageQueue;
559 	else if (!strcmp(argv[1], "clear"))
560 		queue = &sClearPageQueue;
561 	else if (!strcmp(argv[1], "modified"))
562 		queue = &sModifiedPageQueue;
563 	else if (!strcmp(argv[1], "active"))
564 		queue = &sActivePageQueue;
565 	else if (!strcmp(argv[1], "inactive"))
566 		queue = &sInactivePageQueue;
567 	else {
568 		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
569 		return 0;
570 	}
571 
572 	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %ld\n",
573 		queue, queue->head, queue->tail, queue->count);
574 
575 	if (argc == 3) {
576 		struct vm_page *page = queue->head;
577 		const char *type = "none";
578 		int i;
579 
580 		if (page != NULL && page->cache != NULL) {
581 			switch (page->cache->type) {
582 				case CACHE_TYPE_RAM:
583 					type = "RAM";
584 					break;
585 				case CACHE_TYPE_DEVICE:
586 					type = "device";
587 					break;
588 				case CACHE_TYPE_VNODE:
589 					type = "vnode";
590 					break;
591 				case CACHE_TYPE_NULL:
592 					type = "null";
593 					break;
594 				default:
595 					type = "???";
596 					break;
597 			}
598 		}
599 
600 		kprintf("page        cache       type       state  wired  usage\n");
601 		for (i = 0; page; i++, page = page->queue_next) {
602 			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->cache,
603 				type, page_state_to_string(page->state),
604 				page->wired_count, page->usage_count);
605 		}
606 	}
607 	return 0;
608 }
609 
610 
611 static int
612 dump_page_stats(int argc, char **argv)
613 {
614 	page_num_t swappableModified = 0;
615 	page_num_t swappableModifiedInactive = 0;
616 	uint32 counter[8];
617 	addr_t i;
618 
619 	memset(counter, 0, sizeof(counter));
620 
621 	for (i = 0; i < sNumPages; i++) {
622 		if (sPages[i].state > 7)
623 			panic("page %li at %p has invalid state!\n", i, &sPages[i]);
624 
625 		counter[sPages[i].state]++;
626 
627 		if (sPages[i].state == PAGE_STATE_MODIFIED && sPages[i].cache != NULL
628 			&& sPages[i].cache->temporary && sPages[i].wired_count == 0) {
629 			swappableModified++;
630 			if (sPages[i].usage_count < 0)
631 				swappableModifiedInactive++;
632 		}
633 	}
634 
635 	kprintf("page stats:\n");
636 	kprintf("total: %lu\n", sNumPages);
637 	kprintf("active: %lu\ninactive: %lu\nbusy: %lu\nunused: %lu\n",
638 		counter[PAGE_STATE_ACTIVE], counter[PAGE_STATE_INACTIVE],
639 		counter[PAGE_STATE_BUSY], counter[PAGE_STATE_UNUSED]);
640 	kprintf("wired: %lu\nmodified: %lu\nfree: %lu\nclear: %lu\n",
641 		counter[PAGE_STATE_WIRED], counter[PAGE_STATE_MODIFIED],
642 		counter[PAGE_STATE_FREE], counter[PAGE_STATE_CLEAR]);
643 	kprintf("reserved pages: %lu\n", sReservedPages);
644 	kprintf("page deficit: %lu\n", sPageDeficit);
645 	kprintf("mapped pages: %lu\n", gMappedPagesCount);
646 
647 	kprintf("\nfree queue: %p, count = %ld\n", &sFreePageQueue,
648 		sFreePageQueue.count);
649 	kprintf("clear queue: %p, count = %ld\n", &sClearPageQueue,
650 		sClearPageQueue.count);
651 	kprintf("modified queue: %p, count = %ld (%ld temporary, %lu swappable, "
652 		"inactive: %lu)\n", &sModifiedPageQueue, sModifiedPageQueue.count,
653 		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
654 	kprintf("active queue: %p, count = %ld\n", &sActivePageQueue,
655 		sActivePageQueue.count);
656 	kprintf("inactive queue: %p, count = %ld\n", &sInactivePageQueue,
657 		sInactivePageQueue.count);
658 	return 0;
659 }
660 
661 
662 static inline size_t
663 free_page_queue_count(void)
664 {
665 	return sFreePageQueue.count + sClearPageQueue.count;
666 }
667 
668 
669 static status_t
670 set_page_state_nolock(vm_page *page, int pageState)
671 {
672 	if (pageState == page->state)
673 		return B_OK;
674 
675 	page_queue *fromQueue = NULL;
676 	page_queue *toQueue = NULL;
677 
678 	switch (page->state) {
679 		case PAGE_STATE_BUSY:
680 		case PAGE_STATE_ACTIVE:
681 		case PAGE_STATE_WIRED:
682 		case PAGE_STATE_UNUSED:
683 			fromQueue = &sActivePageQueue;
684 			break;
685 		case PAGE_STATE_INACTIVE:
686 			fromQueue = &sInactivePageQueue;
687 			break;
688 		case PAGE_STATE_MODIFIED:
689 			fromQueue = &sModifiedPageQueue;
690 			break;
691 		case PAGE_STATE_FREE:
692 			fromQueue = &sFreePageQueue;
693 			break;
694 		case PAGE_STATE_CLEAR:
695 			fromQueue = &sClearPageQueue;
696 			break;
697 		default:
698 			panic("vm_page_set_state: vm_page %p in invalid state %d\n",
699 				page, page->state);
700 			break;
701 	}
702 
703 	if (page->state == PAGE_STATE_CLEAR || page->state == PAGE_STATE_FREE) {
704 		if (page->cache != NULL)
705 			panic("free page %p has cache", page);
706 	}
707 
708 	switch (pageState) {
709 		case PAGE_STATE_BUSY:
710 		case PAGE_STATE_ACTIVE:
711 		case PAGE_STATE_WIRED:
712 		case PAGE_STATE_UNUSED:
713 			toQueue = &sActivePageQueue;
714 			break;
715 		case PAGE_STATE_INACTIVE:
716 			toQueue = &sInactivePageQueue;
717 			break;
718 		case PAGE_STATE_MODIFIED:
719 			toQueue = &sModifiedPageQueue;
720 			break;
721 		case PAGE_STATE_FREE:
722 			toQueue = &sFreePageQueue;
723 			break;
724 		case PAGE_STATE_CLEAR:
725 			toQueue = &sClearPageQueue;
726 			break;
727 		default:
728 			panic("vm_page_set_state: invalid target state %d\n", pageState);
729 	}
730 
731 	if (pageState == PAGE_STATE_CLEAR || pageState == PAGE_STATE_FREE
732 		|| pageState == PAGE_STATE_INACTIVE) {
733 		if (sPageDeficit > 0)
734 			sFreePageCondition.NotifyOne();
735 
736 		if (pageState != PAGE_STATE_INACTIVE && page->cache != NULL)
737 			panic("to be freed page %p has cache", page);
738 	}
739 	if (page->cache != NULL && page->cache->temporary) {
740 		if (pageState == PAGE_STATE_MODIFIED)
741 			sModifiedTemporaryPages++;
742 		else if (page->state == PAGE_STATE_MODIFIED)
743 			sModifiedTemporaryPages--;
744 	}
745 
746 #ifdef PAGE_ALLOCATION_TRACING
747 	if ((pageState == PAGE_STATE_CLEAR || pageState == PAGE_STATE_FREE)
748 		&& page->state != PAGE_STATE_CLEAR && page->state != PAGE_STATE_FREE) {
749 		T(FreePage());
750 	}
751 #endif	// PAGE_ALLOCATION_TRACING
752 
753 	page->state = pageState;
754 	move_page_to_queue(fromQueue, toQueue, page);
755 
756 	return B_OK;
757 }
758 
759 
760 /*! Moves a modified page into either the active or inactive page queue
761 	depending on its usage count and wiring.
762 */
763 static void
764 move_page_to_active_or_inactive_queue(vm_page *page, bool dequeued)
765 {
766 	// Note, this logic must be in sync with what the page daemon does
767 	int32 state;
768 	if (!page->mappings.IsEmpty() || page->usage_count >= 0
769 		|| page->wired_count)
770 		state = PAGE_STATE_ACTIVE;
771 	else
772 		state = PAGE_STATE_INACTIVE;
773 
774 	if (dequeued) {
775 		page->state = state;
776 		enqueue_page(state == PAGE_STATE_ACTIVE
777 			? &sActivePageQueue : &sInactivePageQueue, page);
778 		if (page->cache->temporary)
779 			sModifiedTemporaryPages--;
780 	} else
781 		set_page_state_nolock(page, state);
782 }
783 
784 
785 static void
786 clear_page(struct vm_page *page)
787 {
788 	addr_t virtualAddress;
789 	vm_get_physical_page(page->physical_page_number << PAGE_SHIFT,
790 		&virtualAddress, PHYSICAL_PAGE_CAN_WAIT);
791 
792 	memset((void *)virtualAddress, 0, B_PAGE_SIZE);
793 
794 	vm_put_physical_page(virtualAddress);
795 }
796 
797 
798 /*!
799 	This is a background thread that wakes up periodically (every 100ms)
800 	and moves some pages from the free queue over to the clear queue.
801 	Given enough time, it will clear out all pages from the free queue - we
802 	could probably slow it down after having reached a certain threshold.
803 */
804 static int32
805 page_scrubber(void *unused)
806 {
807 	(void)(unused);
808 
809 	TRACE(("page_scrubber starting...\n"));
810 
811 	for (;;) {
812 		snooze(100000); // 100ms
813 
814 		if (sFreePageQueue.count > 0) {
815 			vm_page *page[SCRUB_SIZE];
816 			int32 i, scrubCount;
817 
818 			// get some pages from the free queue
819 
820 			InterruptsSpinLocker locker(sPageLock);
821 
822 			// Since we temporarily remove pages from the free pages reserve,
823 			// we must make sure we don't cause a violation of the page
824 			// reservation guarantee. The following is usually stricter than
825 			// necessary, because we don't have information on how many of the
826 			// reserved pages have already been allocated.
827 			scrubCount = SCRUB_SIZE;
828 			uint32 freeCount = free_page_queue_count();
829 			if (freeCount < sReservedPages)
830 				scrubCount = 0;
831 			else if ((uint32)scrubCount > freeCount - sReservedPages)
832 				scrubCount = freeCount - sReservedPages;
833 
834 			for (i = 0; i < scrubCount; i++) {
835 				page[i] = dequeue_page(&sFreePageQueue);
836 				if (page[i] == NULL)
837 					break;
838 				page[i]->state = PAGE_STATE_BUSY;
839 			}
840 
841 			scrubCount = i;
842 
843 			if (scrubCount > 0) {
844 				T(ScrubbingPages(scrubCount));
845 			}
846 
847 			locker.Unlock();
848 
849 			// clear them
850 
851 			for (i = 0; i < scrubCount; i++) {
852 				clear_page(page[i]);
853 			}
854 
855 			locker.Lock();
856 
857 			// and put them into the clear queue
858 
859 			for (i = 0; i < scrubCount; i++) {
860 				page[i]->state = PAGE_STATE_CLEAR;
861 				enqueue_page(&sClearPageQueue, page[i]);
862 			}
863 
864 			if (scrubCount > 0) {
865 				T(ScrubbedPages(scrubCount));
866 			}
867 		}
868 	}
869 
870 	return 0;
871 }
872 
873 
874 static status_t
875 write_page(vm_page *page, uint32 flags, AsyncIOCallback* callback)
876 {
877 	TRACE(("write_page(page = %p): offset = %Ld\n", page, (off_t)page->cache_offset << PAGE_SHIFT));
878 
879 	off_t offset = (off_t)page->cache_offset << PAGE_SHIFT;
880 
881 	iovec vecs[1];
882 	vecs->iov_base = (void*)(addr_t)(page->physical_page_number * B_PAGE_SIZE);
883 	vecs->iov_len = B_PAGE_SIZE;
884 
885 	if (callback != NULL) {
886 		// asynchronous I/O
887 		return page->cache->WriteAsync(offset, vecs, 1, B_PAGE_SIZE,
888 			flags | B_PHYSICAL_IO_REQUEST, callback);
889 	}
890 
891 	// synchronous I/O
892 	size_t length = B_PAGE_SIZE;
893 	status_t status = page->cache->Write(offset, vecs, 1,
894 		flags | B_PHYSICAL_IO_REQUEST, &length);
895 
896 	if (status == B_OK && length == 0)
897 		status = B_ERROR;
898 
899 	return status;
900 }
901 
902 
903 static inline bool
904 is_marker_page(struct vm_page *page)
905 {
906 	return page->type == PAGE_TYPE_DUMMY;
907 }
908 
909 
910 static void
911 remove_page_marker(struct vm_page &marker)
912 {
913 	if (marker.state == PAGE_STATE_UNUSED)
914 		return;
915 
916 	page_queue *queue;
917 
918 	switch (marker.state) {
919 		case PAGE_STATE_ACTIVE:
920 			queue = &sActivePageQueue;
921 			break;
922 		case PAGE_STATE_INACTIVE:
923 			queue = &sInactivePageQueue;
924 			break;
925 		case PAGE_STATE_MODIFIED:
926 			queue = &sModifiedPageQueue;
927 			break;
928 
929 		default:
930 			return;
931 	}
932 
933 	InterruptsSpinLocker locker(sPageLock);
934 	remove_page_from_queue(queue, &marker);
935 
936 	marker.state = PAGE_STATE_UNUSED;
937 }
938 
939 
940 static vm_page *
941 next_modified_page(struct vm_page &marker)
942 {
943 	InterruptsSpinLocker locker(sPageLock);
944 	vm_page *page;
945 
946 	if (marker.state == PAGE_STATE_MODIFIED) {
947 		page = marker.queue_next;
948 		remove_page_from_queue(&sModifiedPageQueue, &marker);
949 		marker.state = PAGE_STATE_UNUSED;
950 	} else
951 		page = sModifiedPageQueue.head;
952 
953 	for (; page != NULL; page = page->queue_next) {
954 		if (!is_marker_page(page) && page->state != PAGE_STATE_BUSY) {
955 			// insert marker
956 			marker.state = PAGE_STATE_MODIFIED;
957 			insert_page_after(&sModifiedPageQueue, page, &marker);
958 			return page;
959 		}
960 	}
961 
962 	return NULL;
963 }
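// (A minimal sketch of the marker technique used above; hypothetical caller,
// not taken from this file. A dummy page is linked into the modified queue
// right after the page just returned, so iteration can resume later even
// though sPageLock is dropped in between:)
//
//     vm_page marker;
//     marker.type = PAGE_TYPE_DUMMY;
//     marker.cache = NULL;
//     marker.state = PAGE_STATE_UNUSED;
//
//     while (vm_page *page = next_modified_page(marker)) {
//         // ... work on "page" without holding sPageLock ...
//     }
//     remove_page_marker(marker);
//
// This is the pattern page_writer() below follows; is_marker_page() lets
// other queue walkers skip such dummies.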
964 
965 
966 class PageWriterCallback;
967 
968 
969 class PageWriterRun {
970 public:
971 	status_t Init(uint32 maxPages);
972 
973 	void PrepareNextRun();
974 	void AddPage(vm_page* page);
975 	void Go();
976 
977 	void PageWritten(PageWriterCallback* callback, status_t status,
978 		bool partialTransfer, size_t bytesTransferred);
979 
980 private:
981 	uint32				fMaxPages;
982 	uint32				fPageCount;
983 	vint32				fPendingPages;
984 	PageWriterCallback*	fCallbacks;
985 	ConditionVariable	fAllFinishedCondition;
986 };
987 
988 
989 class PageWriterCallback : public AsyncIOCallback {
990 public:
991 	void SetTo(PageWriterRun* run, vm_page* page)
992 	{
993 		fRun = run;
994 		fPage = page;
995 		fCache = page->cache;
996 		fStatus = B_OK;
997 		fBusyCondition.Publish(page, "page");
998 	}
999 
1000 	vm_page* Page() const	{ return fPage; }
1001 	VMCache* Cache() const	{ return fCache; }
1002 	status_t Status() const	{ return fStatus; }
1003 
1004 	ConditionVariable& BusyCondition()	{ return fBusyCondition; }
1005 
1006 	virtual void IOFinished(status_t status, bool partialTransfer,
1007 		size_t bytesTransferred)
1008 	{
1009 		fStatus = status == B_OK && bytesTransferred == 0 ? B_ERROR : status;
1010 		fRun->PageWritten(this, status, partialTransfer, bytesTransferred);
1011 	}
1012 
1013 private:
1014 	PageWriterRun*		fRun;
1015 	vm_page*			fPage;
1016 	VMCache*			fCache;
1017 	status_t			fStatus;
1018 	ConditionVariable	fBusyCondition;
1019 };
1020 
1021 
1022 status_t
1023 PageWriterRun::Init(uint32 maxPages)
1024 {
1025 	fMaxPages = maxPages;
1026 	fPageCount = 0;
1027 	fPendingPages = 0;
1028 
1029 	fCallbacks = new(std::nothrow) PageWriterCallback[maxPages];
1030 	if (fCallbacks == NULL)
1031 		return B_NO_MEMORY;
1032 
1033 	return B_OK;
1034 }
1035 
1036 
1037 void
1038 PageWriterRun::PrepareNextRun()
1039 {
1040 	fPageCount = 0;
1041 	fPendingPages = 0;
1042 }
1043 
1044 
1045 void
1046 PageWriterRun::AddPage(vm_page* page)
1047 {
1048 	page->state = PAGE_STATE_BUSY;
1049 	page->busy_writing = true;
1050 
1051 	fCallbacks[fPageCount].SetTo(this, page);
1052 	fPageCount++;
1053 }
1054 
1055 
1056 void
1057 PageWriterRun::Go()
1058 {
1059 	fPendingPages = fPageCount;
1060 
1061 	fAllFinishedCondition.Init(this, "page writer wait for I/O");
1062 	ConditionVariableEntry waitEntry;
1063 	fAllFinishedCondition.Add(&waitEntry);
1064 
1065 	// schedule writes
1066 	for (uint32 i = 0; i < fPageCount; i++) {
1067 		PageWriterCallback& callback = fCallbacks[i];
1068 		write_page(callback.Page(), B_VIP_IO_REQUEST, &callback);
1069 	}
1070 
1071 	// wait until all pages have been written
1072 	waitEntry.Wait();
1073 
1074 	// mark pages depending on whether they could be written or not
1075 
1076 	for (uint32 i = 0; i < fPageCount; i++) {
1077 		PageWriterCallback& callback = fCallbacks[i];
1078 		vm_page* page = callback.Page();
1079 		vm_cache* cache = callback.Cache();
1080 		cache->Lock();
1081 
1082 		if (callback.Status() == B_OK) {
1083 			// put it into the active queue
1084 			InterruptsSpinLocker locker(sPageLock);
1085 			move_page_to_active_or_inactive_queue(page, true);
1086 			page->busy_writing = false;
1087 		} else {
1088 			// We don't have to put the PAGE_MODIFIED bit back, as it's
1089 			// still in the modified pages list.
1090 			{
1091 				InterruptsSpinLocker locker(sPageLock);
1092 				page->state = PAGE_STATE_MODIFIED;
1093 				enqueue_page(&sModifiedPageQueue, page);
1094 			}
1095 			if (!page->busy_writing) {
1096 				// someone has cleared the busy_writing flag which tells
1097 				// us our page has gone invalid
1098 				cache->RemovePage(page);
1099 			} else
1100 				page->busy_writing = false;
1101 		}
1102 
1103 		callback.BusyCondition().Unpublish();
1104 
1105 		cache->Unlock();
1106 	}
1107 
1108 	for (uint32 i = 0; i < fPageCount; i++) {
1109 		vm_cache* cache = fCallbacks[i].Cache();
1110 
1111 		// We release the cache references after all pages were made
1112 		// unbusy again - otherwise releasing a vnode could deadlock.
1113 		cache->ReleaseStoreRef();
1114 		cache->ReleaseRef();
1115 	}
1116 }
1117 
1118 
1119 void
1120 PageWriterRun::PageWritten(PageWriterCallback* callback, status_t status,
1121 	bool partialTransfer, size_t bytesTransferred)
1122 {
1123 	if (atomic_add(&fPendingPages, -1) == 1)
1124 		fAllFinishedCondition.NotifyAll();
1125 }
1126 
1127 
1128 /*!	The page writer continuously takes some pages from the modified
1129 	queue, writes them back, and moves them back to the active queue.
1130 	It runs in its own thread, and is only there to keep the number
1131 	of modified pages low, so that more pages can be reused with
1132 	fewer costs.
1133 */
1134 status_t
1135 page_writer(void* /*unused*/)
1136 {
1137 	const uint32 kNumPages = 256;
1138 	uint32 writtenPages = 0;
1139 	bigtime_t lastWrittenTime = 0;
1140 	bigtime_t pageCollectionTime = 0;
1141 	bigtime_t pageWritingTime = 0;
1142 
1143 	PageWriterRun run;
1144 	if (run.Init(kNumPages) != B_OK) {
1145 		panic("page writer: Failed to init PageWriterRun!");
1146 		return B_ERROR;
1147 	}
1148 
1149 	vm_page marker;
1150 	marker.type = PAGE_TYPE_DUMMY;
1151 	marker.cache = NULL;
1152 	marker.state = PAGE_STATE_UNUSED;
1153 
1154 	while (true) {
1155 		if (sModifiedPageQueue.count - sModifiedTemporaryPages < 1024) {
1156 			int32 count = 0;
1157 			get_sem_count(sWriterWaitSem, &count);
1158 			if (count == 0)
1159 				count = 1;
1160 
1161 			acquire_sem_etc(sWriterWaitSem, count, B_RELATIVE_TIMEOUT, 3000000);
1162 				// every 3 seconds when no one triggers us
1163 		}
1164 
1165 		uint32 numPages = 0;
1166 		run.PrepareNextRun();
1167 
1168 		// TODO: make this laptop friendly, too (ie. only start doing
1169 		// something if someone else did something or there is really
1170 		// enough to do).
1171 
1172 		// collect pages to be written
1173 #if ENABLE_SWAP_SUPPORT
1174 		bool lowOnPages = low_resource_state(B_KERNEL_RESOURCE_PAGES)
1175 			!= B_NO_LOW_RESOURCE;
1176 #endif
1177 
1178 		pageCollectionTime -= system_time();
1179 
1180 		while (numPages < kNumPages) {
1181 			vm_page *page = next_modified_page(marker);
1182 			if (page == NULL)
1183 				break;
1184 
1185 			PageCacheLocker cacheLocker(page, false);
1186 			if (!cacheLocker.IsLocked())
1187 				continue;
1188 
1189 			vm_cache *cache = page->cache;
1190 
1191 			// Don't write back wired (locked) pages and don't write RAM pages
1192 			// until we're low on pages. Also avoid writing temporary pages that
1193 			// are active.
1194 			if (page->wired_count > 0
1195 				|| cache->temporary
1196 #if ENABLE_SWAP_SUPPORT
1197 					&& (!lowOnPages /*|| page->usage_count > 0*/)
1198 #endif
1199 				) {
1200 				continue;
1201 			}
1202 
1203 			// we need our own reference to the store, as it might
1204 			// currently be in the process of being destroyed
1205 			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
1206 				cacheLocker.Unlock();
1207 				thread_yield(true);
1208 				continue;
1209 			}
1210 
1211 			InterruptsSpinLocker locker(sPageLock);
1212 
1213 			// state might have changed while we were locking the cache
1214 			if (page->state != PAGE_STATE_MODIFIED) {
1215 				// release the cache reference
1216 				locker.Unlock();
1217 				cache->ReleaseStoreRef();
1218 				continue;
1219 			}
1220 
1221 			remove_page_from_queue(&sModifiedPageQueue, page);
1222 
1223 			run.AddPage(page);
1224 
1225 			locker.Unlock();
1226 
1227 			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
1228 			vm_clear_map_flags(page, PAGE_MODIFIED);
1229 			cache->AcquireRefLocked();
1230 			numPages++;
1231 		}
1232 
1233 		pageCollectionTime += system_time();
1234 
1235 		if (numPages == 0)
1236 			continue;
1237 
1238 		// write pages to disk and do all the cleanup
1239 		pageWritingTime -= system_time();
1240 		run.Go();
1241 		pageWritingTime += system_time();
1242 
1243 		// debug output only...
1244 		writtenPages += numPages;
1245 		if (writtenPages >= 1024) {
1246 			bigtime_t now = system_time();
1247 			dprintf("page writer: wrote 1024 pages (total: %llu ms, "
1248 				"collect: %llu ms, write: %llu ms)\n",
1249 				(now - lastWrittenTime) / 1000,
1250 				pageCollectionTime / 1000, pageWritingTime / 1000);
1251 			writtenPages -= 1024;
1252 			lastWrittenTime = now;
1253 			pageCollectionTime = 0;
1254 			pageWritingTime = 0;
1255 		}
1256 	}
1257 
1258 	remove_page_marker(marker);
1259 	return B_OK;
1260 }
1261 
1262 
1263 static vm_page *
1264 find_page_candidate(struct vm_page &marker, bool stealActive)
1265 {
1266 	InterruptsSpinLocker locker(sPageLock);
1267 	page_queue *queue;
1268 	vm_page *page;
1269 
1270 	if (marker.state == PAGE_STATE_UNUSED) {
1271 		// Get the first page of the (in)active queue
1272 		queue = &sInactivePageQueue;
1273 		page = sInactivePageQueue.head;
1274 		if (page == NULL && stealActive) {
1275 			queue = &sActivePageQueue;
1276 			page = sActivePageQueue.head;
1277 		}
1278 	} else {
1279 		// Get the next page of the current queue
1280 		if (marker.state == PAGE_STATE_INACTIVE)
1281 			queue = &sInactivePageQueue;
1282 		else if (marker.state == PAGE_STATE_ACTIVE)
1283 			queue = &sActivePageQueue;
1284 		else {
1285 			panic("invalid marker %p state", &marker);
1286 			queue = NULL;
1287 		}
1288 
1289 		page = marker.queue_next;
1290 		remove_page_from_queue(queue, &marker);
1291 		marker.state = PAGE_STATE_UNUSED;
1292 	}
1293 
1294 	while (page != NULL) {
1295 		if (!is_marker_page(page)
1296 			&& (page->state == PAGE_STATE_INACTIVE
1297 				|| (stealActive && page->state == PAGE_STATE_ACTIVE
1298 					&& page->wired_count == 0))) {
1299 			// we found a candidate, insert marker
1300 			marker.state = queue == &sActivePageQueue
1301 				? PAGE_STATE_ACTIVE : PAGE_STATE_INACTIVE;
1302 			insert_page_after(queue, page, &marker);
1303 			return page;
1304 		}
1305 
1306 		page = page->queue_next;
1307 		if (page == NULL && stealActive && queue != &sActivePageQueue) {
1308 			queue = &sActivePageQueue;
1309 			page = sActivePageQueue.head;
1310 		}
1311 	}
1312 
1313 	return NULL;
1314 }
1315 
1316 
1317 static bool
1318 steal_page(vm_page *page, bool stealActive)
1319 {
1320 	// try to lock the page's cache
1321 	if (vm_cache_acquire_locked_page_cache(page, false) == NULL)
1322 		return false;
1323 
1324 	AutoLocker<VMCache> cacheLocker(page->cache, true, false);
1325 	MethodDeleter<VMCache> _2(page->cache, &VMCache::ReleaseRefLocked);
1326 
1327 	// check again if that page is still a candidate
1328 	if (page->state != PAGE_STATE_INACTIVE
1329 		&& (!stealActive || page->state != PAGE_STATE_ACTIVE
1330 			|| page->wired_count != 0))
1331 		return false;
1332 
1333 	// check again for any last-minute changes
1334 	uint32 flags;
1335 	vm_remove_all_page_mappings(page, &flags);
1336 	if ((flags & PAGE_MODIFIED) != 0) {
1337 		// page was modified, don't steal it
1338 		vm_page_set_state(page, PAGE_STATE_MODIFIED);
1339 		return false;
1340 	} else if ((flags & PAGE_ACCESSED) != 0) {
1341 		// page is in active use, don't steal it
1342 		vm_page_set_state(page, PAGE_STATE_ACTIVE);
1343 		return false;
1344 	}
1345 
1346 	// we can now steal this page
1347 
1348 	//dprintf("  steal page %p from cache %p%s\n", page, page->cache,
1349 	//	page->state == PAGE_STATE_INACTIVE ? "" : " (ACTIVE)");
1350 
1351 	page->cache->RemovePage(page);
1352 
1353 	InterruptsSpinLocker _(sPageLock);
1354 	remove_page_from_queue(page->state == PAGE_STATE_ACTIVE
1355 		? &sActivePageQueue : &sInactivePageQueue, page);
1356 	return true;
1357 }
1358 
1359 
1360 static size_t
1361 steal_pages(vm_page **pages, size_t count, bool reserve)
1362 {
1363 	size_t maxCount = count;
1364 
1365 	while (true) {
1366 		vm_page marker;
1367 		marker.type = PAGE_TYPE_DUMMY;
1368 		marker.cache = NULL;
1369 		marker.state = PAGE_STATE_UNUSED;
1370 
1371 		bool tried = false;
1372 		size_t stolen = 0;
1373 
1374 		while (count > 0) {
1375 			vm_page *page = find_page_candidate(marker, false);
1376 			if (page == NULL)
1377 				break;
1378 
1379 			if (steal_page(page, false)) {
1380 				if (reserve || stolen >= maxCount) {
1381 					InterruptsSpinLocker _(sPageLock);
1382 					enqueue_page(&sFreePageQueue, page);
1383 					page->state = PAGE_STATE_FREE;
1384 
1385 					T(StolenPage());
1386 				} else if (stolen < maxCount) {
1387 					pages[stolen] = page;
1388 				}
1389 				stolen++;
1390 				count--;
1391 			} else
1392 				tried = true;
1393 		}
1394 
1395 		remove_page_marker(marker);
1396 
1397 		InterruptsSpinLocker locker(sPageLock);
1398 
1399 		if ((reserve && sReservedPages <= free_page_queue_count())
1400 			|| count == 0
1401 			|| (!reserve && (sInactivePageQueue.count > 0
1402 				|| free_page_queue_count() > sReservedPages)))
1403 			return stolen;
1404 
1405 		if (stolen && !tried && sInactivePageQueue.count > 0) {
1406 			count++;
1407 			continue;
1408 		}
1409 
1410 		// we need to wait for pages to become inactive
1411 
1412 		ConditionVariableEntry freeConditionEntry;
1413 		sPageDeficit++;
1414 		freeConditionEntry.Add(&sFreePageQueue);
1415 		locker.Unlock();
1416 
1417 		if (tried) {
1418 			// We tried all potential pages, but one or more couldn't be stolen
1419 			// at that time (likely because their cache was locked). No one
1420 			// else will have any better luck, so we'll just retry a little
1421 			// later.
1422 			freeConditionEntry.Wait(B_RELATIVE_TIMEOUT, 10000);
1423 		} else {
1424 			low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
1425 			//snooze(50000);
1426 				// sleep for 50ms
1427 
1428 			freeConditionEntry.Wait();
1429 		}
1430 
1431 		locker.Lock();
1432 		sPageDeficit--;
1433 
1434 		if (reserve && sReservedPages <= free_page_queue_count())
1435 			return stolen;
1436 	}
1437 }
1438 
1439 
1440 //	#pragma mark - private kernel API
1441 
1442 
1443 /*!	Writes a range of modified pages of a cache to disk.
1444 	You need to hold the vm_cache lock when calling this function.
1445 	Note that the cache lock is temporarily released within this function.
1446 	\param cache The cache.
1447 	\param firstPage Offset (in page size units) of the first page in the range.
1448 	\param endPage End offset (in page size units) of the page range. The page
1449 		at this offset is not included.
1450 */
1451 status_t
1452 vm_page_write_modified_page_range(struct VMCache *cache, uint32 firstPage,
1453 	uint32 endPage)
1454 {
1455 	// TODO: join adjacent pages into one vec list
1456 
1457 	for (VMCachePagesTree::Iterator it
1458 				= cache->pages.GetIterator(firstPage, true, true);
1459 			vm_page *page = it.Next();) {
1460 		bool dequeuedPage = false;
1461 
1462 		if (page->cache_offset >= endPage)
1463 			break;
1464 
1465 		if (page->state == PAGE_STATE_MODIFIED) {
1466 			InterruptsSpinLocker locker(&sPageLock);
1467 			remove_page_from_queue(&sModifiedPageQueue, page);
1468 			dequeuedPage = true;
1469 		} else if (page->state == PAGE_STATE_BUSY
1470 				|| !vm_test_map_modification(page)) {
1471 			continue;
1472 		}
1473 
1474 		page->state = PAGE_STATE_BUSY;
1475 		page->busy_writing = true;
1476 
1477 		ConditionVariable busyCondition;
1478 		busyCondition.Publish(page, "page");
1479 
1480 		// We have a modified page - however, while we're writing it back,
1481 		// the page is still mapped. In order not to lose any changes to the
1482 		// page, we mark it clean before actually writing it back; if writing
1483 		// the page fails for some reason, we just keep it in the modified page
1484 		// list, but that should happen only rarely.
1485 
1486 		// If the page is changed after we cleared the dirty flag, but before we
1487 		// had the chance to write it back, then we'll write it again later -
1488 		// that will probably not happen that often, though.
1489 
1490 		// clear the modified flag
1491 		vm_clear_map_flags(page, PAGE_MODIFIED);
1492 
1493 		cache->Unlock();
1494 		status_t status = write_page(page, 0, NULL);
1495 		cache->Lock();
1496 
1497 		InterruptsSpinLocker locker(&sPageLock);
1498 
1499 		if (status == B_OK) {
1500 			// put it into the active/inactive queue
1501 			move_page_to_active_or_inactive_queue(page, dequeuedPage);
1502 			page->busy_writing = false;
1503 		} else {
1504 			// We don't have to put the PAGE_MODIFIED bit back, as it's still
1505 			// in the modified pages list.
1506 			if (dequeuedPage) {
1507 				page->state = PAGE_STATE_MODIFIED;
1508 				enqueue_page(&sModifiedPageQueue, page);
1509 			}
1510 
1511 			if (!page->busy_writing) {
1512 				// someone has cleared the busy_writing flag which tells
1513 				// us our page has gone invalid
1514 				cache->RemovePage(page);
1515 			} else {
1516 				if (!dequeuedPage)
1517 					set_page_state_nolock(page, PAGE_STATE_MODIFIED);
1518 
1519 				page->busy_writing = false;
1520 			}
1521 		}
1522 
1523 		busyCondition.Unpublish();
1524 	}
1525 
1526 	return B_OK;
1527 }
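// (A minimal caller sketch, not taken from this file; it assumes "cache" is a
// VMCache the caller has locked. The cache lock must be held on entry and is
// temporarily dropped while each page is written back:)
//
//     cache->Lock();
//     status_t status = vm_page_write_modified_page_range(cache, 0,
//         (cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
//     cache->Unlock();
//
// vm_page_write_modified_pages() below is exactly this convenience wrapper
// for a cache's full page range.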
1528 
1529 
1530 /*!	You need to hold the vm_cache lock when calling this function.
1531 	Note that the cache lock is released in this function.
1532 */
1533 status_t
1534 vm_page_write_modified_pages(vm_cache *cache)
1535 {
1536 	return vm_page_write_modified_page_range(cache, 0,
1537 		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
1538 }
1539 
1540 
1541 /*!	Schedules the page writer to write back the specified \a page.
1542 	Note, however, that it might not do this immediately, and it can well
1543 	take several seconds until the page is actually written out.
1544 */
1545 void
1546 vm_page_schedule_write_page(vm_page *page)
1547 {
1548 	ASSERT(page->state == PAGE_STATE_MODIFIED);
1549 
1550 	vm_page_requeue(page, false);
1551 
1552 	release_sem_etc(sWriterWaitSem, 1, B_DO_NOT_RESCHEDULE);
1553 }
1554 
1555 
1556 /*!	Cache must be locked.
1557 */
1558 void
1559 vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
1560 	uint32 endPage)
1561 {
1562 	uint32 modified = 0;
1563 	for (VMCachePagesTree::Iterator it
1564 				= cache->pages.GetIterator(firstPage, true, true);
1565 			vm_page *page = it.Next();) {
1566 		if (page->cache_offset >= endPage)
1567 			break;
1568 
1569 		if (page->state == PAGE_STATE_MODIFIED) {
1570 			vm_page_requeue(page, false);
1571 			modified++;
1572 		}
1573 	}
1574 
1575 	if (modified > 0)
1576 		release_sem_etc(sWriterWaitSem, 1, B_DO_NOT_RESCHEDULE);
1577 }
1578 
1579 
1580 void
1581 vm_page_init_num_pages(kernel_args *args)
1582 {
1583 	uint32 i;
1584 
1585 	// calculate the size of memory by looking at the physical_memory_range array
1586 	addr_t physicalPagesEnd = 0;
1587 	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
1588 
1589 	for (i = 0; i < args->num_physical_memory_ranges; i++) {
1590 		physicalPagesEnd = (args->physical_memory_range[i].start
1591 			+ args->physical_memory_range[i].size) / B_PAGE_SIZE;
1592 	}
1593 
1594 	TRACE(("first phys page = 0x%lx, end 0x%x\n", sPhysicalPageOffset,
1595 		physicalPagesEnd));
1596 
1597 	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
1598 
1599 #ifdef LIMIT_AVAILABLE_MEMORY
1600 	if (sNumPages > LIMIT_AVAILABLE_MEMORY * 256)
1601 		sNumPages = LIMIT_AVAILABLE_MEMORY * 256;
1602 #endif
1603 }
1604 
1605 
1606 status_t
1607 vm_page_init(kernel_args *args)
1608 {
1609 	TRACE(("vm_page_init: entry\n"));
1610 
1611 	// map in the new free page table
1612 	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
1613 		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
1614 
1615 	TRACE(("vm_init: putting free_page_table @ %p, # ents %d (size 0x%x)\n",
1616 		sPages, sNumPages, (unsigned int)(sNumPages * sizeof(vm_page))));
1617 
1618 	// initialize the free page table
1619 	for (uint32 i = 0; i < sNumPages; i++) {
1620 		sPages[i].physical_page_number = sPhysicalPageOffset + i;
1621 		sPages[i].type = PAGE_TYPE_PHYSICAL;
1622 		sPages[i].state = PAGE_STATE_FREE;
1623 		new(&sPages[i].mappings) vm_page_mappings();
1624 		sPages[i].wired_count = 0;
1625 		sPages[i].usage_count = 0;
1626 		sPages[i].busy_writing = false;
1627 		sPages[i].merge_swap = false;
1628 		sPages[i].cache = NULL;
1629 		#ifdef DEBUG_PAGE_QUEUE
1630 			sPages[i].queue = NULL;
1631 		#endif
1632 		#ifdef DEBUG_PAGE_CACHE_TRANSITIONS
1633 			sPages[i].debug_flags = 0;
1634 			sPages[i].collided_page = NULL;
1635 		#endif	// DEBUG_PAGE_CACHE_TRANSITIONS
1636 		enqueue_page(&sFreePageQueue, &sPages[i]);
1637 	}
1638 
1639 	TRACE(("initialized table\n"));
1640 
1641 	// mark some of the page ranges inuse
1642 	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
1643 		vm_mark_page_range_inuse(args->physical_allocated_range[i].start / B_PAGE_SIZE,
1644 			args->physical_allocated_range[i].size / B_PAGE_SIZE);
1645 	}
1646 
1647 	TRACE(("vm_page_init: exit\n"));
1648 
1649 	return B_OK;
1650 }
1651 
1652 
1653 status_t
1654 vm_page_init_post_area(kernel_args *args)
1655 {
1656 	void *dummy;
1657 
1658 	dummy = sPages;
1659 	create_area("page structures", &dummy, B_EXACT_ADDRESS,
1660 		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
1661 		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
1662 
1663 	add_debugger_command("page_stats", &dump_page_stats, "Dump statistics about page usage");
1664 	add_debugger_command("page", &dump_page, "Dump page info");
1665 	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
1666 	add_debugger_command("find_page", &find_page,
1667 		"Find out which queue a page is actually in");
1668 
1669 	return B_OK;
1670 }
1671 
1672 
1673 status_t
1674 vm_page_init_post_thread(kernel_args *args)
1675 {
1676 	new (&sFreePageCondition) ConditionVariable;
1677 	sFreePageCondition.Publish(&sFreePageQueue, "free page");
1678 
1679 	// create a kernel thread to clear out pages
1680 
1681 	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
1682 		B_LOWEST_ACTIVE_PRIORITY, NULL);
1683 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
1684 
1685 	// start page writer
1686 
1687 	sWriterWaitSem = create_sem(0, "page writer");
1688 
1689 	thread = spawn_kernel_thread(&page_writer, "page writer",
1690 		B_NORMAL_PRIORITY + 1, NULL);
1691 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
1692 
1693 	return B_OK;
1694 }
1695 
1696 
1697 status_t
1698 vm_mark_page_inuse(addr_t page)
1699 {
1700 	return vm_mark_page_range_inuse(page, 1);
1701 }
1702 
1703 
1704 status_t
1705 vm_mark_page_range_inuse(addr_t startPage, addr_t length)
1706 {
1707 	TRACE(("vm_mark_page_range_inuse: start 0x%lx, len 0x%lx\n",
1708 		startPage, length));
1709 
1710 	if (sPhysicalPageOffset > startPage) {
1711 		TRACE(("vm_mark_page_range_inuse: start page %ld is before free list\n",
1712 			startPage));
1713 		return B_BAD_VALUE;
1714 	}
1715 	startPage -= sPhysicalPageOffset;
1716 	if (startPage + length > sNumPages) {
1717 		TRACE(("vm_mark_page_range_inuse: range would extend past free list\n"));
1718 		return B_BAD_VALUE;
1719 	}
1720 
1721 	cpu_status state = disable_interrupts();
1722 	acquire_spinlock(&sPageLock);
1723 
1724 	for (addr_t i = 0; i < length; i++) {
1725 		vm_page *page = &sPages[startPage + i];
1726 		switch (page->state) {
1727 			case PAGE_STATE_FREE:
1728 			case PAGE_STATE_CLEAR:
1729 				set_page_state_nolock(page, PAGE_STATE_UNUSED);
1730 				break;
1731 			case PAGE_STATE_WIRED:
1732 				break;
1733 			case PAGE_STATE_ACTIVE:
1734 			case PAGE_STATE_INACTIVE:
1735 			case PAGE_STATE_BUSY:
1736 			case PAGE_STATE_MODIFIED:
1737 			case PAGE_STATE_UNUSED:
1738 			default:
1739 				// uh
1740 				dprintf("vm_mark_page_range_inuse: page 0x%lx in non-free state %d!\n",
1741 					startPage + i, page->state);
1742 				break;
1743 		}
1744 	}
1745 
1746 	release_spinlock(&sPageLock);
1747 	restore_interrupts(state);
1748 
1749 	return B_OK;
1750 }
1751 
1752 
1753 /*!	Unreserve pages previously reserved with vm_page_reserve_pages().
1754 	Note, you specify the same \a count here that you specified when
1755 	reserving the pages - you don't need to keep track of how many pages
1756 	you actually needed of that upfront allocation.
1757 */
1758 void
1759 vm_page_unreserve_pages(uint32 count)
1760 {
1761 	if (count == 0)
1762 		return;
1763 
1764 	InterruptsSpinLocker locker(sPageLock);
1765 	ASSERT(sReservedPages >= count);
1766 
1767 	T(UnreservePages(count));
1768 
1769 	sReservedPages -= count;
1770 
1771 	if (sPageDeficit > 0)
1772 		sFreePageCondition.NotifyAll();
1773 }
1774 
1775 
1776 /*!	With this call, you can reserve a number of free pages in the system.
1777 	They will only be handed out to someone who has actually reserved them.
1778 	This call returns as soon as the number of requested pages has been
1779 	reached.
1780 */
1781 void
1782 vm_page_reserve_pages(uint32 count)
1783 {
1784 	if (count == 0)
1785 		return;
1786 
1787 	InterruptsSpinLocker locker(sPageLock);
1788 
1789 	T(ReservePages(count));
1790 
1791 	sReservedPages += count;
1792 	size_t freePages = free_page_queue_count();
1793 	if (sReservedPages <= freePages)
1794 		return;
1795 
1796 	locker.Unlock();
1797 
1798 	steal_pages(NULL, count + 1, true);
1799 		// we get one more, just in case we can do something someone
1800 		// else can't
1801 }
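// (A minimal sketch of the reservation protocol; hypothetical caller, not
// taken from this file: reserve up front, allocate with "reserved" set to
// true, and later unreserve the very same count, independently of how many
// pages were actually taken:)
//
//     vm_page_reserve_pages(4);
//     vm_page *page = vm_page_allocate_page(PAGE_STATE_CLEAR, true);
//         // does not return NULL - the reservation guarantees a page
//     // ... use the page ...
//     vm_page_unreserve_pages(4);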
1802 
1803 
1804 bool
1805 vm_page_try_reserve_pages(uint32 count)
1806 {
1807 	if (count == 0)
1808 		return true;
1809 
1810 	InterruptsSpinLocker locker(sPageLock);
1811 
1812 	T(ReservePages(count));
1813 
1814 	size_t freePages = free_page_queue_count();
1815 	if (sReservedPages + count > freePages)
1816 		return false;
1817 
1818 	sReservedPages += count;
1819 	return true;
1820 }
1821 
1822 
1823 vm_page *
1824 vm_page_allocate_page(int pageState, bool reserved)
1825 {
1826 	page_queue *queue;
1827 	page_queue *otherQueue;
1828 
1829 	switch (pageState) {
1830 		case PAGE_STATE_FREE:
1831 			queue = &sFreePageQueue;
1832 			otherQueue = &sClearPageQueue;
1833 			break;
1834 		case PAGE_STATE_CLEAR:
1835 			queue = &sClearPageQueue;
1836 			otherQueue = &sFreePageQueue;
1837 			break;
1838 		default:
1839 			return NULL; // invalid
1840 	}
1841 
1842 	InterruptsSpinLocker locker(sPageLock);
1843 
1844 	T(AllocatePage(reserved));
1845 
1846 	vm_page *page = NULL;
1847 	while (true) {
1848 		if (reserved || sReservedPages < free_page_queue_count()) {
1849 			page = dequeue_page(queue);
1850 			if (page == NULL) {
1851 #ifdef DEBUG
1852 				if (queue->count != 0)
1853 					panic("queue %p corrupted, count = %d\n", queue, queue->count);
1854 #endif
1855 
1856 				// if the primary queue was empty, grap the page from the
1857 				// secondary queue
1858 				page = dequeue_page(otherQueue);
1859 			}
1860 		}
1861 
1862 		if (page != NULL)
1863 			break;
1864 
1865 		if (reserved)
1866 			panic("Had reserved page, but there is none!");
1867 
1868 		// steal one from the inactive list
1869 		locker.Unlock();
1870 		size_t stolen = steal_pages(&page, 1, false);
1871 		locker.Lock();
1872 
1873 		if (stolen > 0)
1874 			break;
1875 	}
1876 
1877 	if (page->cache != NULL)
1878 		panic("supposed to be free page %p has cache\n", page);
1879 
1880 	int oldPageState = page->state;
1881 	page->state = PAGE_STATE_BUSY;
1882 	page->usage_count = 2;
1883 
1884 	enqueue_page(&sActivePageQueue, page);
1885 
1886 	locker.Unlock();
1887 
1888 	// if needed take the page from the free queue and zero it out
1889 	if (pageState == PAGE_STATE_CLEAR && oldPageState != PAGE_STATE_CLEAR)
1890 		clear_page(page);
1891 
1892 	return page;
1893 }
1894 
1895 
1896 /*!	Allocates a number of pages and puts their pointers into the provided
1897 	array. All pages are marked busy.
1898 	Returns B_OK on success, and B_NO_MEMORY when there aren't any free
1899 	pages left to allocate.
1900 */
1901 status_t
1902 vm_page_allocate_pages(int pageState, vm_page **pages, uint32 numPages)
1903 {
1904 	uint32 i;
1905 
1906 	for (i = 0; i < numPages; i++) {
1907 		pages[i] = vm_page_allocate_page(pageState, false);
1908 		if (pages[i] == NULL) {
1909 			// allocation failed, we need to free what we already have
1910 			while (i-- > 0)
1911 				vm_page_set_state(pages[i], pageState);
1912 
1913 			return B_NO_MEMORY;
1914 		}
1915 	}
1916 
1917 	return B_OK;
1918 }
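// (A minimal caller sketch, hypothetical, not taken from this file. On
// B_NO_MEMORY there is nothing to clean up, since the function already put
// any partially allocated pages back into the requested state:)
//
//     vm_page *pages[8];
//     if (vm_page_allocate_pages(PAGE_STATE_FREE, pages, 8) != B_OK)
//         return B_NO_MEMORY;
//     // ... use the 8 busy pages; when done, release them again:
//     for (uint32 i = 0; i < 8; i++)
//         vm_page_set_state(pages[i], PAGE_STATE_FREE);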
1919 
1920 
1921 vm_page *
1922 vm_page_allocate_page_run(int pageState, addr_t base, addr_t length)
1923 {
1924 	vm_page *firstPage = NULL;
1925 	uint32 start = base >> PAGE_SHIFT;
1926 
1927 	InterruptsSpinLocker locker(sPageLock);
1928 
1929 	if (sFreePageQueue.count + sClearPageQueue.count - sReservedPages
1930 			< length) {
1931 		// TODO: add more tries, ie. free some inactive, ...
1932 		// no free space
1933 		return NULL;
1934 	}
1935 
1936 	for (;;) {
1937 		bool foundRun = true;
1938 		if (start + length > sNumPages)
1939 			break;
1940 
1941 		uint32 i;
1942 		for (i = 0; i < length; i++) {
1943 			if (sPages[start + i].state != PAGE_STATE_FREE
1944 				&& sPages[start + i].state != PAGE_STATE_CLEAR) {
1945 				foundRun = false;
1946 				i++;
1947 				break;
1948 			}
1949 		}
1950 		if (foundRun) {
1951 			// pull the pages out of the appropriate queues
1952 			for (i = 0; i < length; i++) {
1953 				sPages[start + i].is_cleared
1954 					= sPages[start + i].state == PAGE_STATE_CLEAR;
1955 				set_page_state_nolock(&sPages[start + i], PAGE_STATE_BUSY);
1956 				sPages[start + i].usage_count = 2;
1957 			}
1958 			firstPage = &sPages[start];
1959 			break;
1960 		} else {
1961 			start += i;
1962 		}
1963 	}
1964 
1965 	T(AllocatePageRun(length));
1966 
1967 	locker.Unlock();
1968 
1969 	if (firstPage != NULL && pageState == PAGE_STATE_CLEAR) {
1970 		for (uint32 i = 0; i < length; i++) {
1971 			if (!sPages[start + i].is_cleared)
1972 	 			clear_page(&sPages[start + i]);
1973 		}
1974 	}
1975 
1976 	return firstPage;
1977 }
1978 
1979 
1980 vm_page *
1981 vm_page_at_index(int32 index)
1982 {
1983 	return &sPages[index];
1984 }
1985 
1986 
1987 vm_page *
1988 vm_lookup_page(addr_t pageNumber)
1989 {
1990 	if (pageNumber < sPhysicalPageOffset)
1991 		return NULL;
1992 
1993 	pageNumber -= sPhysicalPageOffset;
1994 	if (pageNumber >= sNumPages)
1995 		return NULL;
1996 
1997 	return &sPages[pageNumber];
1998 }
1999 
2000 
2001 /*!	Free the page that belonged to a certain cache.
2002 	You can use vm_page_set_state() manually if you prefer, but only
2003 	if the page's state is not PAGE_STATE_MODIFIED.
2004 */
2005 void
2006 vm_page_free(vm_cache *cache, vm_page *page)
2007 {
2008 	InterruptsSpinLocker _(sPageLock);
2009 
2010 	if (page->cache == NULL && page->state == PAGE_STATE_MODIFIED
2011 		&& cache->temporary) {
2012 		sModifiedTemporaryPages--;
2013 	}
2014 
2015 	set_page_state_nolock(page, PAGE_STATE_FREE);
2016 }
2017 
2018 
2019 status_t
2020 vm_page_set_state(vm_page *page, int pageState)
2021 {
2022 	InterruptsSpinLocker _(sPageLock);
2023 
2024 	return set_page_state_nolock(page, pageState);
2025 }
2026 
2027 
2028 /*!	Moves a page to either the tail of the head of its current queue,
2029 	depending on \a tail.
2030 */
2031 void
2032 vm_page_requeue(struct vm_page *page, bool tail)
2033 {
2034 	InterruptsSpinLocker _(sPageLock);
2035 	page_queue *queue = NULL;
2036 
2037 	switch (page->state) {
2038 		case PAGE_STATE_BUSY:
2039 		case PAGE_STATE_ACTIVE:
2040 		case PAGE_STATE_WIRED:
2041 		case PAGE_STATE_UNUSED:
2042 			queue = &sActivePageQueue;
2043 			break;
2044 		case PAGE_STATE_INACTIVE:
2045 			queue = &sInactivePageQueue;
2046 			break;
2047 		case PAGE_STATE_MODIFIED:
2048 			queue = &sModifiedPageQueue;
2049 			break;
2050 		case PAGE_STATE_FREE:
2051 			queue = &sFreePageQueue;
2052 			break;
2053 		case PAGE_STATE_CLEAR:
2054 			queue = &sClearPageQueue;
2055 			break;
2056 		default:
2057 			panic("vm_page_touch: vm_page %p in invalid state %d\n",
2058 				page, page->state);
2059 			break;
2060 	}
2061 
2062 	remove_page_from_queue(queue, page);
2063 
2064 	if (tail)
2065 		enqueue_page(queue, page);
2066 	else
2067 		enqueue_page_to_head(queue, page);
2068 }
2069 
2070 
2071 size_t
2072 vm_page_num_pages(void)
2073 {
2074 	return sNumPages;
2075 }
2076 
2077 
2078 /*! There is a subtle distinction between the page counts returned by
2079 	this function and vm_page_num_free_pages():
2080 	The latter returns the number of pages that are completely uncommitted,
2081 	whereas this one also counts pages that can be made available by
2082 	reclaiming them (IOW it factors in things like cache pages as
2083 	available).
2084 */
2085 size_t
2086 vm_page_num_available_pages(void)
2087 {
2088 	return vm_available_memory() / B_PAGE_SIZE;
2089 }
2090 
2091 
2092 size_t
2093 vm_page_num_free_pages(void)
2094 {
2095 	size_t reservedPages = sReservedPages;
2096 	size_t count = free_page_queue_count() + sInactivePageQueue.count;
2097 	if (reservedPages > count)
2098 		return 0;
2099 
2100 	return count - reservedPages;
2101 }
2102 
2103 
2104 void
2105 vm_page_get_stats(system_info *info)
2106 {
2107 	// Get free pages count -- not really exact, since we don't know how many
2108 	// of the reserved pages have already been allocated, but good citizens
2109 	// unreserve chunk-wise as they are allocating the pages, if they have
2110 	// reserved a larger quantity.
2111 	page_num_t reserved = sReservedPages;
2112 	page_num_t free = free_page_queue_count();
2113 	free = free > reserved ? free - reserved : 0;
2114 
2115 	// The pages used for the block cache buffers. Those should not be counted
2116 	// as used but as cached pages.
2117 	// TODO: We should subtract the blocks that are in use ATM, since those
2118 	// can't really be freed in a low memory situation.
2119 	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
2120 
2121 	info->max_pages = sNumPages;
2122 	info->used_pages = gMappedPagesCount - blockCachePages;
2123 	info->cached_pages = sNumPages >= free + info->used_pages
2124 		? sNumPages - free - info->used_pages : 0;
2125 
2126 	// TODO: We don't consider pages used for page directories/tables yet.
2127 }
2128