xref: /haiku/src/system/kernel/vm/vm_page.cpp (revision c9ad965c81b08802fed0827fd1dd16f45297928a)
1 /*
2  * Copyright 2002-2009, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  *
5  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
6  * Distributed under the terms of the NewOS License.
7  */
8 
9 #include <signal.h>
10 #include <string.h>
11 #include <stdlib.h>
12 
13 #include <KernelExport.h>
14 #include <OS.h>
15 
16 #include <AutoDeleter.h>
17 
18 #include <arch/cpu.h>
19 #include <arch/vm_translation_map.h>
20 #include <block_cache.h>
21 #include <boot/kernel_args.h>
22 #include <condition_variable.h>
23 #include <heap.h>
24 #include <kernel.h>
25 #include <low_resource_manager.h>
26 #include <thread.h>
27 #include <tracing.h>
28 #include <util/AutoLock.h>
29 #include <vfs.h>
30 #include <vm.h>
31 #include <vm_address_space.h>
32 #include <vm_priv.h>
33 #include <vm_page.h>
34 #include <vm_cache.h>
35 
36 #include "VMAnonymousCache.h"
37 #include "IORequest.h"
38 #include "PageCacheLocker.h"
39 
40 
41 //#define TRACE_VM_PAGE
42 #ifdef TRACE_VM_PAGE
43 #	define TRACE(x) dprintf x
44 #else
45 #	define TRACE(x) ;
46 #endif
47 
48 #define SCRUB_SIZE 16
49 	// this many pages will be cleared at once in the page scrubber thread
50 
51 #define MAX_PAGE_WRITER_IO_PRIORITY				B_URGENT_DISPLAY_PRIORITY
52 	// maximum I/O priority of the page writer
53 #define MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD	10000
54 	// the maximum I/O priority shall be reached when this many pages need to
55 	// be written
56 
57 
58 typedef struct page_queue {
59 	vm_page *head;
60 	vm_page *tail;
61 	uint32	count;
62 } page_queue;
63 
64 int32 gMappedPagesCount;
65 
66 static page_queue sFreePageQueue;
67 static page_queue sClearPageQueue;
68 static page_queue sModifiedPageQueue;
69 static page_queue sInactivePageQueue;
70 static page_queue sActivePageQueue;
71 
72 static vm_page *sPages;
73 static addr_t sPhysicalPageOffset;
74 static size_t sNumPages;
75 static size_t sReservedPages;
76 static vint32 sPageDeficit;
77 static size_t sModifiedTemporaryPages;
78 
79 static ConditionVariable sFreePageCondition;
80 static mutex sPageLock = MUTEX_INITIALIZER("pages");
81 
82 static sem_id sWriterWaitSem;
83 
84 
85 #if PAGE_ALLOCATION_TRACING
86 
87 namespace PageAllocationTracing {
88 
89 class ReservePages : public AbstractTraceEntry {
90 	public:
91 		ReservePages(uint32 count)
92 			:
93 			fCount(count)
94 		{
95 			Initialized();
96 		}
97 
98 		virtual void AddDump(TraceOutput& out)
99 		{
100 			out.Print("page reserve:   %lu", fCount);
101 		}
102 
103 	private:
104 		uint32		fCount;
105 };
106 
107 
108 class UnreservePages : public AbstractTraceEntry {
109 	public:
110 		UnreservePages(uint32 count)
111 			:
112 			fCount(count)
113 		{
114 			Initialized();
115 		}
116 
117 		virtual void AddDump(TraceOutput& out)
118 		{
119 			out.Print("page unreserve: %lu", fCount);
120 		}
121 
122 	private:
123 		uint32		fCount;
124 };
125 
126 
127 class AllocatePage : public AbstractTraceEntry {
128 	public:
129 		AllocatePage(bool reserved)
130 			:
131 			fReserved(reserved)
132 		{
133 			Initialized();
134 		}
135 
136 		virtual void AddDump(TraceOutput& out)
137 		{
138 			out.Print("page alloc");
139 			if (fReserved)
140 				out.Print(" reserved");
141 		}
142 
143 	private:
144 		bool		fReserved;
145 };
146 
147 
148 class AllocatePageRun : public AbstractTraceEntry {
149 	public:
150 		AllocatePageRun(uint32 length)
151 			:
152 			fLength(length)
153 		{
154 			Initialized();
155 		}
156 
157 		virtual void AddDump(TraceOutput& out)
158 		{
159 			out.Print("page alloc run: length: %ld", fLength);
160 		}
161 
162 	private:
163 		uint32		fLength;
164 };
165 
166 
167 class FreePage : public AbstractTraceEntry {
168 	public:
169 		FreePage()
170 		{
171 			Initialized();
172 		}
173 
174 		virtual void AddDump(TraceOutput& out)
175 		{
176 			out.Print("page free");
177 		}
178 };
179 
180 
181 class ScrubbingPages : public AbstractTraceEntry {
182 	public:
183 		ScrubbingPages(uint32 count)
184 			:
185 			fCount(count)
186 		{
187 			Initialized();
188 		}
189 
190 		virtual void AddDump(TraceOutput& out)
191 		{
192 			out.Print("page scrubbing: %lu", fCount);
193 		}
194 
195 	private:
196 		uint32		fCount;
197 };
198 
199 
200 class ScrubbedPages : public AbstractTraceEntry {
201 	public:
202 		ScrubbedPages(uint32 count)
203 			:
204 			fCount(count)
205 		{
206 			Initialized();
207 		}
208 
209 		virtual void AddDump(TraceOutput& out)
210 		{
211 			out.Print("page scrubbed:  %lu", fCount);
212 		}
213 
214 	private:
215 		uint32		fCount;
216 };
217 
218 
219 class StolenPage : public AbstractTraceEntry {
220 	public:
221 		StolenPage()
222 		{
223 			Initialized();
224 		}
225 
226 		virtual void AddDump(TraceOutput& out)
227 		{
228 			out.Print("page stolen");
229 		}
230 };
231 
232 }	// namespace PageAllocationTracing
233 
234 #	define T(x)	new(std::nothrow) PageAllocationTracing::x
235 
236 #else
237 #	define T(x)
238 #endif	// PAGE_ALLOCATION_TRACING
239 
240 
241 #if PAGE_WRITER_TRACING
242 
243 namespace PageWriterTracing {
244 
245 class WritePage : public AbstractTraceEntry {
246 	public:
247 		WritePage(vm_page* page)
248 			:
249 			fCache(page->cache),
250 			fPage(page)
251 		{
252 			Initialized();
253 		}
254 
255 		virtual void AddDump(TraceOutput& out)
256 		{
257 			out.Print("page write: %p, cache: %p", fPage, fCache);
258 		}
259 
260 	private:
261 		VMCache*	fCache;
262 		vm_page*	fPage;
263 };
264 
265 }	// namespace PageWriterTracing
266 
267 #	define TPW(x)	new(std::nothrow) PageWriterTracing::x
268 
269 #else
270 #	define TPW(x)
271 #endif	// PAGE_WRITER_TRACING
272 
273 
274 /*!	Dequeues a page from the head of the given queue */
275 static vm_page *
276 dequeue_page(page_queue *queue)
277 {
278 	vm_page *page;
279 
280 	page = queue->head;
281 	if (page != NULL) {
282 		if (queue->tail == page)
283 			queue->tail = NULL;
284 		if (page->queue_next != NULL)
285 			page->queue_next->queue_prev = NULL;
286 
287 		queue->head = page->queue_next;
288 		if (page->type != PAGE_TYPE_DUMMY)
289 			queue->count--;
290 
291 #if DEBUG_PAGE_QUEUE
292 		if (page->queue != queue) {
293 			panic("dequeue_page(queue: %p): page %p thinks it is in queue "
294 				"%p", queue, page, page->queue);
295 		}
296 
297 		page->queue = NULL;
298 #endif	// DEBUG_PAGE_QUEUE
299 	}
300 
301 	return page;
302 }
303 
304 
305 /*!	Enqueues a page to the tail of the given queue */
306 static void
307 enqueue_page(page_queue *queue, vm_page *page)
308 {
309 #if DEBUG_PAGE_QUEUE
310 	if (page->queue != NULL) {
311 		panic("enqueue_page(queue: %p, page: %p): page thinks it is "
312 			"already in queue %p", queue, page, page->queue);
313 	}
314 #endif	// DEBUG_PAGE_QUEUE
315 
316 	if (queue->tail != NULL)
317 		queue->tail->queue_next = page;
318 	page->queue_prev = queue->tail;
319 	queue->tail = page;
320 	page->queue_next = NULL;
321 	if (queue->head == NULL)
322 		queue->head = page;
323 	if (page->type != PAGE_TYPE_DUMMY)
324 		queue->count++;
325 
326 #if DEBUG_PAGE_QUEUE
327 	page->queue = queue;
328 #endif
329 }
330 
331 
332 /*!	Enqueues a page to the head of the given queue */
333 static void
334 enqueue_page_to_head(page_queue *queue, vm_page *page)
335 {
336 #if DEBUG_PAGE_QUEUE
337 	if (page->queue != NULL) {
338 		panic("enqueue_page_to_head(queue: %p, page: %p): page thinks it is "
339 			"already in queue %p", queue, page, page->queue);
340 	}
341 #endif	// DEBUG_PAGE_QUEUE
342 
343 	if (queue->head != NULL)
344 		queue->head->queue_prev = page;
345 	page->queue_next = queue->head;
346 	queue->head = page;
347 	page->queue_prev = NULL;
348 	if (queue->tail == NULL)
349 		queue->tail = page;
350 	if (page->type != PAGE_TYPE_DUMMY)
351 		queue->count++;
352 
353 #if DEBUG_PAGE_QUEUE
354 	page->queue = queue;
355 #endif
356 }
357 
358 
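/*!	Removes \a page from the given queue. The page must actually be linked
	into that queue; its neighbours and the queue's head/tail and count are
	updated accordingly (dummy marker pages are not counted).
*/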
359 static void
360 remove_page_from_queue(page_queue *queue, vm_page *page)
361 {
362 #if DEBUG_PAGE_QUEUE
363 	if (page->queue != queue) {
364 		panic("remove_page_from_queue(queue: %p, page: %p): page thinks it "
365 			"is in queue %p", queue, page, page->queue);
366 	}
367 #endif	// DEBUG_PAGE_QUEUE
368 
369 	if (page->queue_next != NULL)
370 		page->queue_next->queue_prev = page->queue_prev;
371 	else
372 		queue->tail = page->queue_prev;
373 
374 	if (page->queue_prev != NULL)
375 		page->queue_prev->queue_next = page->queue_next;
376 	else
377 		queue->head = page->queue_next;
378 
379 	if (page->type != PAGE_TYPE_DUMMY)
380 		queue->count--;
381 
382 #if DEBUG_PAGE_QUEUE
383 	page->queue = NULL;
384 #endif
385 }
386 
387 
388 /*!	Moves a page to the tail of the given queue, but only does so if
389 	the page is currently in another queue.
390 */
391 static void
392 move_page_to_queue(page_queue *fromQueue, page_queue *toQueue, vm_page *page)
393 {
394 	if (fromQueue != toQueue) {
395 		remove_page_from_queue(fromQueue, page);
396 		enqueue_page(toQueue, page);
397 	}
398 }
399 
400 
401 /*! Inserts \a page after the \a before page in the \a queue. */
402 static void
403 insert_page_after(page_queue *queue, vm_page *before, vm_page *page)
404 {
405 #if DEBUG_PAGE_QUEUE
406 	if (page->queue != NULL) {
407 		panic("insert_page_after(queue: %p, page: %p): page thinks it is "
408 			"already in queue %p", queue, page, page->queue);
409 	}
410 #endif	// DEBUG_PAGE_QUEUE
411 
412 	if (before == NULL) {
413 		enqueue_page(queue, page);
414 		return;
415 	}
416 
417 	page->queue_next = before->queue_next;
418 	if (page->queue_next != NULL)
419 		page->queue_next->queue_prev = page;
420 	page->queue_prev = before;
421 	before->queue_next = page;
422 
423 	if (queue->tail == before)
424 		queue->tail = page;
425 
426 	if (page->type != PAGE_TYPE_DUMMY)
427 		queue->count++;
428 
429 #if DEBUG_PAGE_QUEUE
430 	page->queue = queue;
431 #endif
432 }
433 
434 
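/*!	Debugger command: searches the page queues for the given vm_page
	structure address and prints which queue (if any) contains it.
*/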
435 static int
436 find_page(int argc, char **argv)
437 {
438 	struct vm_page *page;
439 	addr_t address;
440 	int32 index = 1;
441 	int i;
442 
443 	struct {
444 		const char*	name;
445 		page_queue*	queue;
446 	} pageQueueInfos[] = {
447 		{ "free",		&sFreePageQueue },
448 		{ "clear",		&sClearPageQueue },
449 		{ "modified",	&sModifiedPageQueue },
450 		{ "active",		&sActivePageQueue },
		{ "inactive",	&sInactivePageQueue },
451 		{ NULL, NULL }
452 	};
453 
454 	if (argc < 2
455 		|| strlen(argv[index]) <= 2
456 		|| argv[index][0] != '0'
457 		|| argv[index][1] != 'x') {
458 		kprintf("usage: find_page <address>\n");
459 		return 0;
460 	}
461 
462 	address = strtoul(argv[index], NULL, 0);
463 	page = (vm_page*)address;
464 
465 	for (i = 0; pageQueueInfos[i].name; i++) {
466 		vm_page* p = pageQueueInfos[i].queue->head;
467 		while (p) {
468 			if (p == page) {
469 				kprintf("found page %p in queue %p (%s)\n", page,
470 					pageQueueInfos[i].queue, pageQueueInfos[i].name);
471 				return 0;
472 			}
473 			p = p->queue_next;
474 		}
475 	}
476 
477 	kprintf("page %p isn't in any queue\n", page);
478 
479 	return 0;
480 }
481 
482 
483 const char *
484 page_state_to_string(int state)
485 {
486 	switch(state) {
487 		case PAGE_STATE_ACTIVE:
488 			return "active";
489 		case PAGE_STATE_INACTIVE:
490 			return "inactive";
491 		case PAGE_STATE_BUSY:
492 			return "busy";
493 		case PAGE_STATE_MODIFIED:
494 			return "modified";
495 		case PAGE_STATE_FREE:
496 			return "free";
497 		case PAGE_STATE_CLEAR:
498 			return "clear";
499 		case PAGE_STATE_WIRED:
500 			return "wired";
501 		case PAGE_STATE_UNUSED:
502 			return "unused";
503 		default:
504 			return "unknown";
505 	}
506 }
507 
508 
509 static int
510 dump_page(int argc, char **argv)
511 {
512 	struct vm_page *page;
513 	addr_t address;
514 	bool physical = false;
515 	int32 index = 1;
516 
517 	if (argc > 2) {
518 		if (!strcmp(argv[1], "-p")) {
519 			physical = true;
520 			index++;
521 		} else if (!strcmp(argv[1], "-v"))
522 			index++;
523 	}
524 
525 	if (argc < 2
526 		|| strlen(argv[index]) <= 2
527 		|| argv[index][0] != '0'
528 		|| argv[index][1] != 'x') {
529 		kprintf("usage: page [-p|-v] <address>\n"
530 			"  -v looks up a virtual address for the page, -p a physical address.\n"
531 			"  Default is to look for the page structure address directly.\n");
532 		return 0;
533 	}
534 
535 	address = strtoul(argv[index], NULL, 0);
536 
537 	if (index == 2) {
538 		if (!physical) {
539 			vm_address_space *addressSpace = vm_kernel_address_space();
540 			uint32 flags;
541 
542 			if (thread_get_current_thread()->team->address_space != NULL)
543 				addressSpace = thread_get_current_thread()->team->address_space;
544 
545 			addressSpace->translation_map.ops->query_interrupt(
546 				&addressSpace->translation_map, address, &address, &flags);
547 		}
548 		page = vm_lookup_page(address / B_PAGE_SIZE);
549 	} else
550 		page = (struct vm_page *)address;
551 
552 	kprintf("PAGE: %p\n", page);
553 	kprintf("queue_next,prev: %p, %p\n", page->queue_next, page->queue_prev);
554 	kprintf("physical_number: %lx\n", page->physical_page_number);
555 	kprintf("cache:           %p\n", page->cache);
556 	kprintf("cache_offset:    %ld\n", page->cache_offset);
557 	kprintf("cache_next:      %p\n", page->cache_next);
558 	kprintf("type:            %d\n", page->type);
559 	kprintf("state:           %s\n", page_state_to_string(page->state));
560 	kprintf("wired_count:     %d\n", page->wired_count);
561 	kprintf("usage_count:     %d\n", page->usage_count);
562 	kprintf("busy_writing:    %d\n", page->busy_writing);
563 	#if DEBUG_PAGE_QUEUE
564 		kprintf("queue:           %p\n", page->queue);
565 	#endif
566 	#if DEBUG_PAGE_CACHE_TRANSITIONS
567 		kprintf("debug_flags:     0x%lx\n", page->debug_flags);
568 		kprintf("collided page:   %p\n", page->collided_page);
569 	#endif	// DEBUG_PAGE_CACHE_TRANSITIONS
570 	kprintf("area mappings:\n");
571 
572 	vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
573 	vm_page_mapping *mapping;
574 	while ((mapping = iterator.Next()) != NULL) {
575 		// the iterator already advances to the next mapping
576 		kprintf("  %p (%#lx)\n", mapping->area, mapping->area->id);
577 	}
578 
579 	return 0;
580 }
581 
582 
583 static int
584 dump_page_queue(int argc, char **argv)
585 {
586 	struct page_queue *queue;
587 
588 	if (argc < 2) {
589 		kprintf("usage: page_queue <address/name> [list]\n");
590 		return 0;
591 	}
592 
593 	if (strlen(argv[1]) >= 2 && argv[1][0] == '0' && argv[1][1] == 'x')
594 		queue = (struct page_queue *)strtoul(argv[1], NULL, 16);
595 	else if (!strcmp(argv[1], "free"))
596 		queue = &sFreePageQueue;
597 	else if (!strcmp(argv[1], "clear"))
598 		queue = &sClearPageQueue;
599 	else if (!strcmp(argv[1], "modified"))
600 		queue = &sModifiedPageQueue;
601 	else if (!strcmp(argv[1], "active"))
602 		queue = &sActivePageQueue;
603 	else if (!strcmp(argv[1], "inactive"))
604 		queue = &sInactivePageQueue;
605 	else {
606 		kprintf("page_queue: unknown queue \"%s\".\n", argv[1]);
607 		return 0;
608 	}
609 
610 	kprintf("queue = %p, queue->head = %p, queue->tail = %p, queue->count = %ld\n",
611 		queue, queue->head, queue->tail, queue->count);
612 
613 	if (argc == 3) {
614 		struct vm_page *page = queue->head;
615 		int i;
616 		kprintf("page        cache       type       state  wired  usage\n");
617 		for (i = 0; page; i++, page = page->queue_next) {
618 			// determine the cache type per page instead of only once for the
619 			// queue head, and don't crash on an empty queue or a NULL cache
620 			const char *type = "none";
621 			if (page->cache != NULL) {
622 				switch (page->cache->type) {
623 					case CACHE_TYPE_RAM:
624 						type = "RAM";
625 						break;
626 					case CACHE_TYPE_DEVICE:
627 						type = "device";
628 						break;
629 					case CACHE_TYPE_VNODE:
630 						type = "vnode";
631 						break;
632 					case CACHE_TYPE_NULL:
633 						type = "null";
634 						break;
635 					default:
636 						type = "???";
637 						break;
638 				}
639 			}
640 			kprintf("%p  %p  %-7s %8s  %5d  %5d\n", page, page->cache,
641 				type, page_state_to_string(page->state),
642 				page->wired_count, page->usage_count);
643 		}
644 	}
645 	return 0;
646 }
647 
648 
649 static int
650 dump_page_stats(int argc, char **argv)
651 {
652 	page_num_t swappableModified = 0;
653 	page_num_t swappableModifiedInactive = 0;
654 	uint32 counter[8];
655 	addr_t i;
656 
657 	memset(counter, 0, sizeof(counter));
658 
659 	for (i = 0; i < sNumPages; i++) {
660 		if (sPages[i].state > 7)
661 			panic("page %li at %p has invalid state!\n", i, &sPages[i]);
662 
663 		counter[sPages[i].state]++;
664 
665 		if (sPages[i].state == PAGE_STATE_MODIFIED && sPages[i].cache != NULL
666 			&& sPages[i].cache->temporary && sPages[i].wired_count == 0) {
667 			swappableModified++;
668 			if (sPages[i].usage_count < 0)
669 				swappableModifiedInactive++;
670 		}
671 	}
672 
673 	kprintf("page stats:\n");
674 	kprintf("total: %lu\n", sNumPages);
675 	kprintf("active: %lu\ninactive: %lu\nbusy: %lu\nunused: %lu\n",
676 		counter[PAGE_STATE_ACTIVE], counter[PAGE_STATE_INACTIVE],
677 		counter[PAGE_STATE_BUSY], counter[PAGE_STATE_UNUSED]);
678 	kprintf("wired: %lu\nmodified: %lu\nfree: %lu\nclear: %lu\n",
679 		counter[PAGE_STATE_WIRED], counter[PAGE_STATE_MODIFIED],
680 		counter[PAGE_STATE_FREE], counter[PAGE_STATE_CLEAR]);
681 	kprintf("reserved pages: %lu\n", sReservedPages);
682 	kprintf("page deficit: %ld\n", sPageDeficit);
683 	kprintf("mapped pages: %ld\n", gMappedPagesCount);
684 
685 	kprintf("\nfree queue: %p, count = %ld\n", &sFreePageQueue,
686 		sFreePageQueue.count);
687 	kprintf("clear queue: %p, count = %ld\n", &sClearPageQueue,
688 		sClearPageQueue.count);
689 	kprintf("modified queue: %p, count = %ld (%ld temporary, %lu swappable, "
690 		"inactive: %lu)\n", &sModifiedPageQueue, sModifiedPageQueue.count,
691 		sModifiedTemporaryPages, swappableModified, swappableModifiedInactive);
692 	kprintf("active queue: %p, count = %ld\n", &sActivePageQueue,
693 		sActivePageQueue.count);
694 	kprintf("inactive queue: %p, count = %ld\n", &sInactivePageQueue,
695 		sInactivePageQueue.count);
696 	return 0;
697 }
698 
699 
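/*!	Returns the combined number of pages in the free and clear queues. */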
700 static inline size_t
701 free_page_queue_count(void)
702 {
703 	return sFreePageQueue.count + sClearPageQueue.count;
704 }
705 
706 
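/*!	Moves \a page into the queue corresponding to \a pageState and updates
	its state. Sanity checks the transition and notifies waiters when pages
	become free. The caller must hold sPageLock.
*/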
707 static status_t
708 set_page_state_nolock(vm_page *page, int pageState)
709 {
710 	if (pageState == page->state)
711 		return B_OK;
712 
713 	page_queue *fromQueue = NULL;
714 	page_queue *toQueue = NULL;
715 
716 	switch (page->state) {
717 		case PAGE_STATE_BUSY:
718 		case PAGE_STATE_ACTIVE:
719 		case PAGE_STATE_WIRED:
720 		case PAGE_STATE_UNUSED:
721 			fromQueue = &sActivePageQueue;
722 			break;
723 		case PAGE_STATE_INACTIVE:
724 			fromQueue = &sInactivePageQueue;
725 			break;
726 		case PAGE_STATE_MODIFIED:
727 			fromQueue = &sModifiedPageQueue;
728 			break;
729 		case PAGE_STATE_FREE:
730 			fromQueue = &sFreePageQueue;
731 			break;
732 		case PAGE_STATE_CLEAR:
733 			fromQueue = &sClearPageQueue;
734 			break;
735 		default:
736 			panic("vm_page_set_state: vm_page %p in invalid state %d\n",
737 				page, page->state);
738 			break;
739 	}
740 
741 	if (page->state == PAGE_STATE_CLEAR || page->state == PAGE_STATE_FREE) {
742 		if (page->cache != NULL)
743 			panic("free page %p has cache", page);
744 	}
745 
746 	switch (pageState) {
747 		case PAGE_STATE_BUSY:
748 		case PAGE_STATE_ACTIVE:
749 		case PAGE_STATE_WIRED:
750 		case PAGE_STATE_UNUSED:
751 			toQueue = &sActivePageQueue;
752 			break;
753 		case PAGE_STATE_INACTIVE:
754 			toQueue = &sInactivePageQueue;
755 			break;
756 		case PAGE_STATE_MODIFIED:
757 			toQueue = &sModifiedPageQueue;
758 			break;
759 		case PAGE_STATE_FREE:
760 			toQueue = &sFreePageQueue;
761 			break;
762 		case PAGE_STATE_CLEAR:
763 			toQueue = &sClearPageQueue;
764 			break;
765 		default:
766 			panic("vm_page_set_state: invalid target state %d\n", pageState);
767 	}
768 
769 	if (pageState == PAGE_STATE_CLEAR || pageState == PAGE_STATE_FREE
770 		|| pageState == PAGE_STATE_INACTIVE) {
771 		if (sPageDeficit > 0)
772 			sFreePageCondition.NotifyOne();
773 
774 		if (pageState != PAGE_STATE_INACTIVE) {
775 			if (page->cache != NULL)
776 				panic("to be freed page %p has cache", page);
777 			if (!page->mappings.IsEmpty() || page->wired_count > 0)
778 				panic("to be freed page %p has mappings", page);
779 		}
780 	}
781 	if (page->cache != NULL && page->cache->temporary) {
782 		if (pageState == PAGE_STATE_MODIFIED)
783 			sModifiedTemporaryPages++;
784 		else if (page->state == PAGE_STATE_MODIFIED)
785 			sModifiedTemporaryPages--;
786 	}
787 
788 #if PAGE_ALLOCATION_TRACING
789 	if ((pageState == PAGE_STATE_CLEAR || pageState == PAGE_STATE_FREE)
790 		&& page->state != PAGE_STATE_CLEAR && page->state != PAGE_STATE_FREE) {
791 		T(FreePage());
792 	}
793 #endif	// PAGE_ALLOCATION_TRACING
794 
795 	page->state = pageState;
796 	move_page_to_queue(fromQueue, toQueue, page);
797 
798 	return B_OK;
799 }
800 
801 
802 /*! Moves a modified page into either the active or inactive page queue
803 	depending on its usage count and wiring.
804 */
805 static void
806 move_page_to_active_or_inactive_queue(vm_page *page, bool dequeued)
807 {
808 	// Note, this logic must be in sync with what the page daemon does
809 	int32 state;
810 	if (!page->mappings.IsEmpty() || page->usage_count >= 0
811 		|| page->wired_count)
812 		state = PAGE_STATE_ACTIVE;
813 	else
814 		state = PAGE_STATE_INACTIVE;
815 
816 	if (dequeued) {
817 		page->state = state;
818 		enqueue_page(state == PAGE_STATE_ACTIVE
819 			? &sActivePageQueue : &sInactivePageQueue, page);
820 		if (page->cache->temporary)
821 			sModifiedTemporaryPages--;
822 	} else
823 		set_page_state_nolock(page, state);
824 }
825 
826 
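/*!	Zeroes the contents of the given page via its physical address. */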
827 static void
828 clear_page(struct vm_page *page)
829 {
830 	vm_memset_physical(page->physical_page_number << PAGE_SHIFT, 0,
831 		B_PAGE_SIZE);
832 }
833 
834 
835 /*!
836 	This is a background thread that wakes up every now and then (every 100ms)
837 	and moves some pages from the free queue over to the clear queue.
838 	Given enough time, it will clear out all pages from the free queue - we
839 	could probably slow it down after having reached a certain threshold.
840 */
841 static int32
842 page_scrubber(void *unused)
843 {
844 	(void)(unused);
845 
846 	TRACE(("page_scrubber starting...\n"));
847 
848 	for (;;) {
849 		snooze(100000); // 100ms
850 
851 		if (sFreePageQueue.count == 0)
852 			continue;
853 
854 		MutexLocker locker(sPageLock);
855 
856 		// Since we temporarily remove pages from the free pages reserve,
857 		// we must make sure we don't cause a violation of the page
858 		// reservation guarantee. The following is usually stricter than
859 		// necessary, because we don't have information on how many of the
860 		// reserved pages have already been allocated.
861 		int32 scrubCount = SCRUB_SIZE;
862 		uint32 freeCount = free_page_queue_count();
863 		if (freeCount <= sReservedPages)
864 			continue;
865 
866 		if ((uint32)scrubCount > freeCount - sReservedPages)
867 			scrubCount = freeCount - sReservedPages;
868 
869 		// get some pages from the free queue
870 		vm_page *page[SCRUB_SIZE];
871 		for (int32 i = 0; i < scrubCount; i++) {
872 			page[i] = dequeue_page(&sFreePageQueue);
873 			if (page[i] == NULL) {
874 				scrubCount = i;
875 				break;
876 			}
877 
878 			page[i]->state = PAGE_STATE_BUSY;
879 		}
880 
881 		if (scrubCount == 0)
882 			continue;
883 
884 		T(ScrubbingPages(scrubCount));
885 		locker.Unlock();
886 
887 		// clear them
888 		for (int32 i = 0; i < scrubCount; i++)
889 			clear_page(page[i]);
890 
891 		locker.Lock();
892 
893 		// and put them into the clear queue
894 		for (int32 i = 0; i < scrubCount; i++) {
895 			page[i]->state = PAGE_STATE_CLEAR;
896 			enqueue_page(&sClearPageQueue, page[i]);
897 		}
898 
899 		T(ScrubbedPages(scrubCount));
900 	}
901 
902 	return 0;
903 }
904 
905 
906 static inline bool
907 is_marker_page(struct vm_page *page)
908 {
909 	return page->type == PAGE_TYPE_DUMMY;
910 }
911 
912 
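/*!	Removes the dummy marker page from the queue it is currently linked
	into (if any) and resets it to PAGE_STATE_UNUSED.
*/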
913 static void
914 remove_page_marker(struct vm_page &marker)
915 {
916 	if (marker.state == PAGE_STATE_UNUSED)
917 		return;
918 
919 	page_queue *queue;
920 
921 	switch (marker.state) {
922 		case PAGE_STATE_ACTIVE:
923 			queue = &sActivePageQueue;
924 			break;
925 		case PAGE_STATE_INACTIVE:
926 			queue = &sInactivePageQueue;
927 			break;
928 		case PAGE_STATE_MODIFIED:
929 			queue = &sModifiedPageQueue;
930 			break;
931 
932 		default:
933 			return;
934 	}
935 
936 	MutexLocker locker(sPageLock);
937 	remove_page_from_queue(queue, &marker);
938 
939 	marker.state = PAGE_STATE_UNUSED;
940 }
941 
942 
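/*!	Returns the next non-busy, non-marker page of the modified queue,
	starting after \a marker (or at the queue head on the first call), and
	re-inserts the marker right after the returned page so that iteration
	can be resumed later.
*/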
943 static vm_page *
944 next_modified_page(struct vm_page &marker)
945 {
946 	MutexLocker locker(sPageLock);
947 	vm_page *page;
948 
949 	if (marker.state == PAGE_STATE_MODIFIED) {
950 		page = marker.queue_next;
951 		remove_page_from_queue(&sModifiedPageQueue, &marker);
952 		marker.state = PAGE_STATE_UNUSED;
953 	} else
954 		page = sModifiedPageQueue.head;
955 
956 	for (; page != NULL; page = page->queue_next) {
957 		if (!is_marker_page(page) && page->state != PAGE_STATE_BUSY) {
958 			// insert marker
959 			marker.state = PAGE_STATE_MODIFIED;
960 			insert_page_after(&sModifiedPageQueue, page, &marker);
961 			return page;
962 		}
963 	}
964 
965 	return NULL;
966 }
967 
968 
969 // #pragma mark -
970 
971 
972 class PageWriteTransfer;
973 class PageWriteWrapper;
974 
975 
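/*!	Collects the pages of one page writer iteration: AddPage() wraps each
	page and groups it into write transfers, Go() schedules the transfers
	asynchronously and blocks until all of them have finished.
*/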
976 class PageWriterRun {
977 public:
978 	status_t Init(uint32 maxPages);
979 
980 	void PrepareNextRun();
981 	void AddPage(vm_page* page);
982 	void Go();
983 
984 	void PageWritten(PageWriteTransfer* transfer, status_t status,
985 		bool partialTransfer, size_t bytesTransferred);
986 
987 private:
988 	uint32				fMaxPages;
989 	uint32				fWrapperCount;
990 	uint32				fTransferCount;
991 	vint32				fPendingTransfers;
992 	PageWriteWrapper*	fWrappers;
993 	PageWriteTransfer*	fTransfers;
994 	ConditionVariable	fAllFinishedCondition;
995 };
996 
997 
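/*!	Describes one asynchronous write to a single cache: pages that are
	adjacent in the cache are collected, and physically contiguous ones are
	merged into a single iovec (see AddPage()).
*/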
998 class PageWriteTransfer : public AsyncIOCallback {
999 public:
1000 	void SetTo(PageWriterRun* run, vm_page* page, int32 maxPages);
1001 	bool AddPage(vm_page* page);
1002 
1003 	status_t Schedule(uint32 flags);
1004 
1005 	void SetStatus(status_t status, size_t transferred);
1006 
1007 	status_t Status() const	{ return fStatus; }
1008 	struct VMCache* Cache() const { return fCache; }
1009 	uint32 PageCount() const { return fPageCount; }
1010 
1011 	virtual void IOFinished(status_t status, bool partialTransfer,
1012 		size_t bytesTransferred);
1013 private:
1014 	PageWriterRun*		fRun;
1015 	struct VMCache*		fCache;
1016 	off_t				fOffset;
1017 	uint32				fPageCount;
1018 	int32				fMaxPages;
1019 	status_t			fStatus;
1020 	uint32				fVecCount;
1021 	iovec				fVecs[32]; // TODO: make dynamic/configurable
1022 };
1023 
1024 
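/*!	Tracks a single page while it is being written back: SetTo() marks the
	page busy, Done() moves it back into the appropriate queue afterwards,
	or frees it if its cache was shrunk while the write was in progress.
*/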
1025 class PageWriteWrapper {
1026 public:
1027 	PageWriteWrapper();
1028 	~PageWriteWrapper();
1029 	void SetTo(vm_page* page, bool dequeuedPage);
1030 	void ClearModifiedFlag();
1031 	void CheckRemoveFromShrunkenCache();
1032 	void Done(status_t result);
1033 
1034 private:
1035 	vm_page*			fPage;
1036 	struct VMCache*		fCache;
1037 	bool				fDequeuedPage;
1038 	bool				fIsActive;
1039 	int					fOldPageState;
1040 	ConditionVariable	fBusyCondition;
1041 };
1042 
1043 
1044 PageWriteWrapper::PageWriteWrapper()
1045 	:
1046 	fIsActive(false)
1047 {
1048 }
1049 
1050 
1051 PageWriteWrapper::~PageWriteWrapper()
1052 {
1053 	if (fIsActive)
1054 		panic("page write wrapper going out of scope but isn't completed");
1055 }
1056 
1057 
1058 void
1059 PageWriteWrapper::SetTo(vm_page* page, bool dequeuedPage)
1060 {
1061 	if (page->state == PAGE_STATE_BUSY)
1062 		panic("setting page write wrapper to busy page");
1063 
1064 	if (fIsActive)
1065 		panic("re-setting page write wrapper that isn't completed");
1066 
1067 	fPage = page;
1068 	fCache = page->cache;
1069 	fDequeuedPage = dequeuedPage;
1070 	fIsActive = true;
1071 
1072 	fOldPageState = fPage->state;
1073 	fPage->state = PAGE_STATE_BUSY;
1074 	fPage->busy_writing = true;
1075 
1076 	fBusyCondition.Publish(fPage, "page");
1077 }
1078 
1079 
1080 void
1081 PageWriteWrapper::ClearModifiedFlag()
1082 {
1083 	// We have a modified page - however, while we're writing it back,
1084 	// the page is still mapped. In order not to lose any changes to the
1085 	// page, we mark it clean before actually writing it back; if
1086 	// writing the page fails for some reason, we just keep it in the
1087 	// modified page list, but that should happen only rarely.
1088 
1089 	// If the page is changed after we cleared the dirty flag, but
1090 	// before we had the chance to write it back, then we'll write it
1091 	// again later - that will probably not happen that often, though.
1092 
1093 	vm_clear_map_flags(fPage, PAGE_MODIFIED);
1094 }
1095 
1096 
1097 void
1098 PageWriteWrapper::CheckRemoveFromShrunkenCache()
1099 {
1100 	if (fPage->busy_writing)
1101 		return;
1102 
1103 	vm_remove_all_page_mappings(fPage, NULL);
1104 	fCache->RemovePage(fPage);
1105 }
1106 
1107 
1108 void
1109 PageWriteWrapper::Done(status_t result)
1110 {
1111 	if (!fIsActive)
1112 		panic("completing page write wrapper that is not active");
1113 
1114 	if (result == B_OK) {
1115 		// put it into the active/inactive queue
1116 		move_page_to_active_or_inactive_queue(fPage, fDequeuedPage);
1117 		fPage->busy_writing = false;
1118 	} else {
1119 		// Writing the page failed -- move to the modified queue. If we dequeued
1120 		// it from there, just enqueue it again, otherwise set the page state
1121 		// explicitly, which will take care of moving between the queues.
1122 		if (fDequeuedPage) {
1123 			fPage->state = PAGE_STATE_MODIFIED;
1124 			enqueue_page(&sModifiedPageQueue, fPage);
1125 		} else {
1126 			fPage->state = fOldPageState;
1127 			set_page_state_nolock(fPage, PAGE_STATE_MODIFIED);
1128 		}
1129 
1130 		if (!fPage->busy_writing) {
1131 			// The busy_writing flag was cleared. That means the cache has been
1132 			// shrunk while we were trying to write the page and we have to free
1133 			// it now.
1134 
1135 			// Adjust temporary modified pages count, if necessary.
1136 			if (fDequeuedPage && fCache->temporary)
1137 				sModifiedTemporaryPages--;
1138 
1139 			// free the page
1140 			set_page_state_nolock(fPage, PAGE_STATE_FREE);
1141 		} else
1142 			fPage->busy_writing = false;
1143 	}
1144 
1145 	fBusyCondition.Unpublish();
1146 	fIsActive = false;
1147 }
1148 
1149 
1150 void
1151 PageWriteTransfer::SetTo(PageWriterRun* run, vm_page* page, int32 maxPages)
1152 {
1153 	fRun = run;
1154 	fCache = page->cache;
1155 	fOffset = page->cache_offset;
1156 	fPageCount = 1;
1157 	fMaxPages = maxPages;
1158 	fStatus = B_OK;
1159 
1160 	fVecs[0].iov_base = (void*)(page->physical_page_number << PAGE_SHIFT);
1161 	fVecs[0].iov_len = B_PAGE_SIZE;
1162 	fVecCount = 1;
1163 }
1164 
1165 
1166 bool
1167 PageWriteTransfer::AddPage(vm_page* page)
1168 {
1169 	if (page->cache != fCache
1170 		|| (fMaxPages >= 0 && fPageCount >= (uint32)fMaxPages))
1171 		return false;
1172 
1173 	addr_t nextBase
1174 		= (addr_t)fVecs[fVecCount - 1].iov_base + fVecs[fVecCount - 1].iov_len;
1175 
1176 	if (page->physical_page_number << PAGE_SHIFT == nextBase
1177 		&& page->cache_offset == fOffset + fPageCount) {
1178 		// append to last iovec
1179 		fVecs[fVecCount - 1].iov_len += B_PAGE_SIZE;
1180 		fPageCount++;
1181 		return true;
1182 	}
1183 
1184 	nextBase = (addr_t)fVecs[0].iov_base - B_PAGE_SIZE;
1185 	if (page->physical_page_number << PAGE_SHIFT == nextBase
1186 		&& page->cache_offset == fOffset - 1) {
1187 		// prepend to first iovec and adjust offset
1188 		fVecs[0].iov_base = (void*)nextBase;
1189 		fVecs[0].iov_len += B_PAGE_SIZE;
1190 		fOffset = page->cache_offset;
1191 		fPageCount++;
1192 		return true;
1193 	}
1194 
1195 	if ((page->cache_offset == fOffset + fPageCount
1196 			|| page->cache_offset == fOffset - 1)
1197 		&& fVecCount < sizeof(fVecs) / sizeof(fVecs[0])) {
1198 		// not physically contiguous or not in the right order
1199 		uint32 vectorIndex;
1200 		if (page->cache_offset < fOffset) {
1201 			// we are pre-pending another vector, move the other vecs
1202 			for (uint32 i = fVecCount; i > 0; i--)
1203 				fVecs[i] = fVecs[i - 1];
1204 
1205 			fOffset = page->cache_offset;
1206 			vectorIndex = 0;
1207 		} else
1208 			vectorIndex = fVecCount;
1209 
1210 		fVecs[vectorIndex].iov_base
1211 			= (void*)(page->physical_page_number << PAGE_SHIFT);
1212 		fVecs[vectorIndex].iov_len = B_PAGE_SIZE;
1213 
1214 		fVecCount++;
1215 		fPageCount++;
1216 		return true;
1217 	}
1218 
1219 	return false;
1220 }
1221 
1222 
1223 status_t
1224 PageWriteTransfer::Schedule(uint32 flags)
1225 {
1226 	off_t writeOffset = (off_t)fOffset << PAGE_SHIFT;
1227 	size_t writeLength = fPageCount << PAGE_SHIFT;
1228 
1229 	if (fRun != NULL) {
1230 		return fCache->WriteAsync(writeOffset, fVecs, fVecCount, writeLength,
1231 			flags | B_PHYSICAL_IO_REQUEST, this);
1232 	}
1233 
1234 	status_t status = fCache->Write(writeOffset, fVecs, fVecCount,
1235 		flags | B_PHYSICAL_IO_REQUEST, &writeLength);
1236 
1237 	SetStatus(status, writeLength);
1238 	return fStatus;
1239 }
1240 
1241 
1242 void
1243 PageWriteTransfer::SetStatus(status_t status, size_t transferred)
1244 {
1245 	// only succeed if all pages up to the last one have been written fully
1246 	// and the last page has at least been written partially
1247 	if (status == B_OK && transferred <= (fPageCount - 1) * B_PAGE_SIZE)
1248 		status = B_ERROR;
1249 
1250 	fStatus = status;
1251 }
1252 
1253 
1254 void
1255 PageWriteTransfer::IOFinished(status_t status, bool partialTransfer,
1256 	size_t bytesTransferred)
1257 {
1258 	SetStatus(status, bytesTransferred);
1259 	fRun->PageWritten(this, fStatus, partialTransfer, bytesTransferred);
1260 }
1261 
1262 
1263 status_t
1264 PageWriterRun::Init(uint32 maxPages)
1265 {
1266 	fMaxPages = maxPages;
1267 	fWrapperCount = 0;
1268 	fTransferCount = 0;
1269 	fPendingTransfers = 0;
1270 
1271 	fWrappers = new(std::nothrow) PageWriteWrapper[maxPages];
1272 	fTransfers = new(std::nothrow) PageWriteTransfer[maxPages];
1273 	if (fWrappers == NULL || fTransfers == NULL)
1274 		return B_NO_MEMORY;
1275 
1276 	return B_OK;
1277 }
1278 
1279 
1280 void
1281 PageWriterRun::PrepareNextRun()
1282 {
1283 	fWrapperCount = 0;
1284 	fTransferCount = 0;
1285 	fPendingTransfers = 0;
1286 }
1287 
1288 
1289 void
1290 PageWriterRun::AddPage(vm_page* page)
1291 {
1292 	fWrappers[fWrapperCount++].SetTo(page, true);
1293 
1294 	if (fTransferCount == 0 || !fTransfers[fTransferCount - 1].AddPage(page)) {
1295 		fTransfers[fTransferCount++].SetTo(this, page,
1296 			page->cache->MaxPagesPerAsyncWrite());
1297 	}
1298 }
1299 
1300 
1301 void
1302 PageWriterRun::Go()
1303 {
1304 	fPendingTransfers = fTransferCount;
1305 
1306 	fAllFinishedCondition.Init(this, "page writer wait for I/O");
1307 	ConditionVariableEntry waitEntry;
1308 	fAllFinishedCondition.Add(&waitEntry);
1309 
1310 	// schedule writes
1311 	for (uint32 i = 0; i < fTransferCount; i++)
1312 		fTransfers[i].Schedule(B_VIP_IO_REQUEST);
1313 
1314 	// wait until all pages have been written
1315 	waitEntry.Wait();
1316 
1317 	// mark pages depending on whether they could be written or not
1318 
1319 	uint32 wrapperIndex = 0;
1320 	for (uint32 i = 0; i < fTransferCount; i++) {
1321 		PageWriteTransfer& transfer = fTransfers[i];
1322 		transfer.Cache()->Lock();
1323 
1324 		if (transfer.Status() != B_OK) {
1325 			uint32 checkIndex = wrapperIndex;
1326 			for (uint32 j = 0; j < transfer.PageCount(); j++)
1327 				fWrappers[checkIndex++].CheckRemoveFromShrunkenCache();
1328 		}
1329 
1330 		MutexLocker locker(sPageLock);
1331 		for (uint32 j = 0; j < transfer.PageCount(); j++)
1332 			fWrappers[wrapperIndex++].Done(transfer.Status());
1333 
1334 		locker.Unlock();
1335 		transfer.Cache()->Unlock();
1336 	}
1337 
1338 	ASSERT(wrapperIndex == fWrapperCount);
1339 
1340 	for (uint32 i = 0; i < fTransferCount; i++) {
1341 		PageWriteTransfer& transfer = fTransfers[i];
1342 		struct VMCache* cache = transfer.Cache();
1343 
1344 		// We've acquired a reference for each page
1345 		for (uint32 j = 0; j < transfer.PageCount(); j++) {
1346 			// We release the cache references after all pages were made
1347 			// unbusy again - otherwise releasing a vnode could deadlock.
1348 			cache->ReleaseStoreRef();
1349 			cache->ReleaseRef();
1350 		}
1351 	}
1352 }
1353 
1354 
1355 void
1356 PageWriterRun::PageWritten(PageWriteTransfer* transfer, status_t status,
1357 	bool partialTransfer, size_t bytesTransferred)
1358 {
1359 	if (atomic_add(&fPendingTransfers, -1) == 1)
1360 		fAllFinishedCondition.NotifyAll();
1361 }
1362 
1363 
1364 /*!	The page writer continuously takes some pages from the modified
1365 	queue, writes them back, and moves them back to the active queue.
1366 	It runs in its own thread, and is only there to keep the number
1367 	of modified pages low, so that more pages can be reused at a
1368 	lower cost.
1369 */
1370 status_t
1371 page_writer(void* /*unused*/)
1372 {
1373 	const uint32 kNumPages = 256;
1374 	uint32 writtenPages = 0;
1375 	bigtime_t lastWrittenTime = 0;
1376 	bigtime_t pageCollectionTime = 0;
1377 	bigtime_t pageWritingTime = 0;
1378 
1379 	PageWriterRun run;
1380 	if (run.Init(kNumPages) != B_OK) {
1381 		panic("page writer: Failed to init PageWriterRun!");
1382 		return B_ERROR;
1383 	}
1384 
1385 	vm_page marker;
1386 	marker.type = PAGE_TYPE_DUMMY;
1387 	marker.cache = NULL;
1388 	marker.state = PAGE_STATE_UNUSED;
1389 
1390 	while (true) {
1391 		if (sModifiedPageQueue.count - sModifiedTemporaryPages < 1024) {
1392 			int32 count = 0;
1393 			get_sem_count(sWriterWaitSem, &count);
1394 			if (count == 0)
1395 				count = 1;
1396 
1397 			acquire_sem_etc(sWriterWaitSem, count, B_RELATIVE_TIMEOUT, 3000000);
1398 				// wait at most 3 seconds when no one triggers us
1399 		}
1400 
1401 		// depending on how urgent it becomes to get pages to disk, we adjust
1402 		// our I/O priority
1403 		page_num_t modifiedPages = sModifiedPageQueue.count
1404 			- sModifiedTemporaryPages;
1405 		uint32 lowPagesState = low_resource_state(B_KERNEL_RESOURCE_PAGES);
1406 		int32 ioPriority = B_IDLE_PRIORITY;
1407 		if (lowPagesState >= B_LOW_RESOURCE_CRITICAL
1408 			|| modifiedPages > MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD) {
1409 			ioPriority = MAX_PAGE_WRITER_IO_PRIORITY;
1410 		} else {
1411 			ioPriority = (uint64)MAX_PAGE_WRITER_IO_PRIORITY * modifiedPages
1412 				/ MAX_PAGE_WRITER_IO_PRIORITY_THRESHOLD;
1413 		}
1414 
1415 		thread_set_io_priority(ioPriority);
1416 
1417 		uint32 numPages = 0;
1418 		run.PrepareNextRun();
1419 
1420 		// TODO: make this laptop friendly, too (ie. only start doing
1421 		// something if someone else did something or there is really
1422 		// enough to do).
1423 
1424 		// collect pages to be written
1425 #if ENABLE_SWAP_SUPPORT
1426 		bool lowOnPages = lowPagesState != B_NO_LOW_RESOURCE;
1427 #endif
1428 
1429 		pageCollectionTime -= system_time();
1430 
1431 		while (numPages < kNumPages) {
1432 			vm_page *page = next_modified_page(marker);
1433 			if (page == NULL)
1434 				break;
1435 
1436 			PageCacheLocker cacheLocker(page, false);
1437 			if (!cacheLocker.IsLocked())
1438 				continue;
1439 
1440 			vm_cache *cache = page->cache;
1441 
1442 			// Don't write back wired (locked) pages and don't write RAM pages
1443 			// until we're low on pages. Also avoid writing temporary pages that
1444 			// are active.
1445 			if (page->wired_count > 0
1446 				|| (cache->temporary
1447 #if ENABLE_SWAP_SUPPORT
1448 					&& (!lowOnPages /*|| page->usage_count > 0*/
1449 						|| !cache->CanWritePage(
1450 								(off_t)page->cache_offset << PAGE_SHIFT))
1451 #endif
1452 				)) {
1453 				continue;
1454 			}
1455 
1456 			// we need our own reference to the store, as it might currently
1457 			// be in the process of being destroyed
1458 			if (cache->AcquireUnreferencedStoreRef() != B_OK) {
1459 				cacheLocker.Unlock();
1460 				thread_yield(true);
1461 				continue;
1462 			}
1463 
1464 			MutexLocker locker(sPageLock);
1465 
1466 			// state might have changed while we were locking the cache
1467 			if (page->state != PAGE_STATE_MODIFIED) {
1468 				// release the cache reference
1469 				locker.Unlock();
1470 				cache->ReleaseStoreRef();
1471 				continue;
1472 			}
1473 
1474 			remove_page_from_queue(&sModifiedPageQueue, page);
1475 
1476 			run.AddPage(page);
1477 
1478 			locker.Unlock();
1479 
1480 			//dprintf("write page %p, cache %p (%ld)\n", page, page->cache, page->cache->ref_count);
1481 			TPW(WritePage(page));
1482 
1483 			vm_clear_map_flags(page, PAGE_MODIFIED);
1484 			cache->AcquireRefLocked();
1485 			numPages++;
1486 		}
1487 
1488 		pageCollectionTime += system_time();
1489 
1490 		if (numPages == 0)
1491 			continue;
1492 
1493 		// write pages to disk and do all the cleanup
1494 		pageWritingTime -= system_time();
1495 		run.Go();
1496 		pageWritingTime += system_time();
1497 
1498 		// debug output only...
1499 		writtenPages += numPages;
1500 		if (writtenPages >= 1024) {
1501 			bigtime_t now = system_time();
1502 			TRACE(("page writer: wrote 1024 pages (total: %llu ms, "
1503 				"collect: %llu ms, write: %llu ms)\n",
1504 				(now - lastWrittenTime) / 1000,
1505 				pageCollectionTime / 1000, pageWritingTime / 1000));
1506 			writtenPages -= 1024;
1507 			lastWrittenTime = now;
1508 			pageCollectionTime = 0;
1509 			pageWritingTime = 0;
1510 		}
1511 	}
1512 
1513 	remove_page_marker(marker);
1514 	return B_OK;
1515 }
1516 
1517 
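/*!	Finds the next page that could be stolen, starting at the marker's
	current position (or at the head of the inactive queue), optionally also
	searching the active queue, and inserts the marker after the candidate.
*/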
1518 static vm_page *
1519 find_page_candidate(struct vm_page &marker, bool stealActive)
1520 {
1521 	MutexLocker locker(sPageLock);
1522 	page_queue *queue;
1523 	vm_page *page;
1524 
1525 	if (marker.state == PAGE_STATE_UNUSED) {
1526 		// Get the first free pages of the (in)active queue
1527 		queue = &sInactivePageQueue;
1528 		page = sInactivePageQueue.head;
1529 		if (page == NULL && stealActive) {
1530 			queue = &sActivePageQueue;
1531 			page = sActivePageQueue.head;
1532 		}
1533 	} else {
1534 		// Get the next page of the current queue
1535 		if (marker.state == PAGE_STATE_INACTIVE)
1536 			queue = &sInactivePageQueue;
1537 		else if (marker.state == PAGE_STATE_ACTIVE)
1538 			queue = &sActivePageQueue;
1539 		else {
1540 			panic("invalid marker %p state", &marker);
1541 			queue = NULL;
1542 		}
1543 
1544 		page = marker.queue_next;
1545 		remove_page_from_queue(queue, &marker);
1546 		marker.state = PAGE_STATE_UNUSED;
1547 	}
1548 
1549 	while (page != NULL) {
1550 		if (!is_marker_page(page)
1551 			&& (page->state == PAGE_STATE_INACTIVE
1552 				|| (stealActive && page->state == PAGE_STATE_ACTIVE
1553 					&& page->wired_count == 0))) {
1554 			// we found a candidate, insert marker
1555 			marker.state = queue == &sActivePageQueue
1556 				? PAGE_STATE_ACTIVE : PAGE_STATE_INACTIVE;
1557 			insert_page_after(queue, page, &marker);
1558 			return page;
1559 		}
1560 
1561 		page = page->queue_next;
1562 		if (page == NULL && stealActive && queue != &sActivePageQueue) {
1563 			queue = &sActivePageQueue;
1564 			page = sActivePageQueue.head;
1565 		}
1566 	}
1567 
1568 	return NULL;
1569 }
1570 
1571 
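/*!	Tries to steal the given candidate page: locks its cache, re-checks
	that it is still stealable, and removes all of its mappings. If the page
	turns out to have been modified or accessed in the meantime, it is left
	alone; otherwise it is removed from its cache and queue and \c true is
	returned.
*/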
1572 static bool
1573 steal_page(vm_page *page, bool stealActive)
1574 {
1575 	// try to lock the page's cache
1576 	if (vm_cache_acquire_locked_page_cache(page, false) == NULL)
1577 		return false;
1578 
1579 	AutoLocker<VMCache> cacheLocker(page->cache, true, false);
1580 	MethodDeleter<VMCache> _2(page->cache, &VMCache::ReleaseRefLocked);
1581 
1582 	// check again if that page is still a candidate
1583 	if (page->state != PAGE_STATE_INACTIVE
1584 		&& (!stealActive || page->state != PAGE_STATE_ACTIVE
1585 			|| page->wired_count != 0))
1586 		return false;
1587 
1588 	// check for any last-minute changes
1589 	uint32 flags;
1590 	vm_remove_all_page_mappings(page, &flags);
1591 	if ((flags & PAGE_MODIFIED) != 0) {
1592 		// page was modified, don't steal it
1593 		vm_page_set_state(page, PAGE_STATE_MODIFIED);
1594 		return false;
1595 	} else if ((flags & PAGE_ACCESSED) != 0) {
1596 		// page is in active use, don't steal it
1597 		vm_page_set_state(page, PAGE_STATE_ACTIVE);
1598 		return false;
1599 	}
1600 
1601 	// we can now steal this page
1602 
1603 	//dprintf("  steal page %p from cache %p%s\n", page, page->cache,
1604 	//	page->state == PAGE_STATE_INACTIVE ? "" : " (ACTIVE)");
1605 
1606 	page->cache->RemovePage(page);
1607 
1608 	MutexLocker _(sPageLock);
1609 	remove_page_from_queue(page->state == PAGE_STATE_ACTIVE
1610 		? &sActivePageQueue : &sInactivePageQueue, page);
1611 	return true;
1612 }
1613 
1614 
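/*!	Steals up to \a count pages, waiting for pages to become inactive if
	necessary. If \a reserve is \c true the stolen pages are moved to the
	free queue to back a reservation; otherwise they are returned via
	\a pages (up to the originally requested count). Returns the number of
	pages stolen.
*/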
1615 static size_t
1616 steal_pages(vm_page **pages, size_t count, bool reserve)
1617 {
1618 	size_t maxCount = count;
1619 
1620 	while (true) {
1621 		vm_page marker;
1622 		marker.type = PAGE_TYPE_DUMMY;
1623 		marker.cache = NULL;
1624 		marker.state = PAGE_STATE_UNUSED;
1625 
1626 		bool tried = false;
1627 		size_t stolen = 0;
1628 
1629 		while (count > 0) {
1630 			vm_page *page = find_page_candidate(marker, false);
1631 			if (page == NULL)
1632 				break;
1633 
1634 			if (steal_page(page, false)) {
1635 				if (reserve || stolen >= maxCount) {
1636 					MutexLocker _(sPageLock);
1637 					enqueue_page(&sFreePageQueue, page);
1638 					page->state = PAGE_STATE_FREE;
1639 
1640 					T(StolenPage());
1641 				} else if (stolen < maxCount)
1642 					pages[stolen] = page;
1643 
1644 				stolen++;
1645 				count--;
1646 			} else
1647 				tried = true;
1648 		}
1649 
1650 		remove_page_marker(marker);
1651 
1652 		MutexLocker locker(sPageLock);
1653 
1654 		if ((reserve && sReservedPages <= free_page_queue_count())
1655 			|| count == 0
1656 			|| ((!reserve && (sInactivePageQueue.count > 0))
1657 				|| free_page_queue_count() > sReservedPages))
1658 			return stolen;
1659 
1660 		if (stolen && !tried && sInactivePageQueue.count > 0) {
1661 			count++;
1662 			continue;
1663 		}
1664 
1665 		// we need to wait for pages to become inactive
1666 
1667 		ConditionVariableEntry freeConditionEntry;
1668 		sPageDeficit++;
1669 		freeConditionEntry.Add(&sFreePageQueue);
1670 		locker.Unlock();
1671 
1672 		if (tried) {
1673 			// We tried all potential pages, but one or more couldn't be stolen
1674 			// at that time (likely because their cache was locked). No one
1675 			// else will have any better luck, so we'll just retry a little
1676 			// later.
1677 			freeConditionEntry.Wait(B_RELATIVE_TIMEOUT, 10000);
1678 		} else {
1679 			low_resource(B_KERNEL_RESOURCE_PAGES, count, B_RELATIVE_TIMEOUT, 0);
1680 			//snooze(50000);
1681 				// sleep for 50ms
1682 
1683 			freeConditionEntry.Wait();
1684 		}
1685 
1686 		locker.Lock();
1687 		sPageDeficit--;
1688 
1689 		if (reserve && sReservedPages <= free_page_queue_count())
1690 			return stolen;
1691 	}
1692 }
1693 
1694 
1695 //	#pragma mark - private kernel API
1696 
1697 
1698 /*!	Writes a range of modified pages of a cache to disk.
1699 	You need to hold the vm_cache lock when calling this function.
1700 	Note that the cache lock is temporarily released while the pages are
	written back.
1701 	\param cache The cache.
1702 	\param firstPage Offset (in page size units) of the first page in the range.
1703 	\param endPage End offset (in page size units) of the page range. The page
1704 		at this offset is not included.
1705 */
1706 status_t
1707 vm_page_write_modified_page_range(struct VMCache* cache, uint32 firstPage,
1708 	uint32 endPage)
1709 {
1710 	static const int32 kMaxPages = 256;
1711 	int32 maxPages = cache->MaxPagesPerWrite();
1712 	if (maxPages < 0 || maxPages > kMaxPages)
1713 		maxPages = kMaxPages;
1714 
1715 	PageWriteWrapper stackWrappers[2];
1716 	PageWriteWrapper* wrapperPool = new(nogrow) PageWriteWrapper[maxPages + 1];
1717 	if (wrapperPool == NULL) {
1718 		// don't fail, just limit our capabilities
1719 		wrapperPool = stackWrappers;
1720 		maxPages = 1;
1721 	}
1722 
1723 	int32 nextWrapper = 0;
1724 
1725 	PageWriteWrapper* wrappers[maxPages];
1726 	int32 usedWrappers = 0;
1727 
1728 	PageWriteTransfer transfer;
1729 	bool transferEmpty = true;
1730 
1731 	VMCachePagesTree::Iterator it
1732 		= cache->pages.GetIterator(firstPage, true, true);
1733 
1734 	while (true) {
1735 		vm_page* page = it.Next();
1736 		if (page == NULL || page->cache_offset >= endPage) {
1737 			if (transferEmpty)
1738 				break;
1739 
1740 			page = NULL;
1741 		}
1742 
1743 		bool dequeuedPage = false;
1744 		if (page != NULL) {
1745 			if (page->state == PAGE_STATE_MODIFIED) {
1746 				MutexLocker locker(&sPageLock);
1747 				remove_page_from_queue(&sModifiedPageQueue, page);
1748 				dequeuedPage = true;
1749 			} else if (page->state == PAGE_STATE_BUSY
1750 					|| !vm_test_map_modification(page)) {
1751 				page = NULL;
1752 			}
1753 		}
1754 
1755 		PageWriteWrapper* wrapper = NULL;
1756 		if (page != NULL) {
1757 			wrapper = &wrapperPool[nextWrapper++];
1758 			if (nextWrapper > maxPages)
1759 				nextWrapper = 0;
1760 
1761 			wrapper->SetTo(page, dequeuedPage);
1762 			wrapper->ClearModifiedFlag();
1763 
1764 			if (transferEmpty || transfer.AddPage(page)) {
1765 				if (transferEmpty) {
1766 					transfer.SetTo(NULL, page, maxPages);
1767 					transferEmpty = false;
1768 				}
1769 
1770 				wrappers[usedWrappers++] = wrapper;
1771 				continue;
1772 			}
1773 		}
1774 
1775 		if (transferEmpty)
1776 			continue;
1777 
1778 		cache->Unlock();
1779 		status_t status = transfer.Schedule(0);
1780 		cache->Lock();
1781 
1782 		// Before taking the page lock, handle part of the special case that
1783 		// writing the page failed due to the cache having been shrunk. We need
1784 		// to remove such pages from the cache and free them.
1785 		if (status != B_OK) {
1786 			for (int32 i = 0; i < usedWrappers; i++)
1787 				wrappers[i]->CheckRemoveFromShrunkenCache();
1788 		}
1789 
1790 		MutexLocker locker(&sPageLock);
1791 
1792 		for (int32 i = 0; i < usedWrappers; i++)
1793 			wrappers[i]->Done(status);
1794 
1795 		locker.Unlock();
1796 
1797 		usedWrappers = 0;
1798 
1799 		if (page != NULL) {
1800 			transfer.SetTo(NULL, page, maxPages);
1801 			wrappers[usedWrappers++] = wrapper;
1802 		} else
1803 			transferEmpty = true;
1804 	}
1805 
1806 	if (wrapperPool != stackWrappers)
1807 		delete [] wrapperPool;
1808 
1809 	return B_OK;
1810 }
1811 
1812 
1813 /*!	You need to hold the vm_cache lock when calling this function.
1814 	Note that the cache lock is temporarily released while the pages are
	written back.
1815 */
1816 status_t
1817 vm_page_write_modified_pages(vm_cache *cache)
1818 {
1819 	return vm_page_write_modified_page_range(cache, 0,
1820 		(cache->virtual_end + B_PAGE_SIZE - 1) >> PAGE_SHIFT);
1821 }
1822 
1823 
1824 /*!	Schedules the page writer to write back the specified \a page.
1825 	Note, however, that it might not do this immediately, and it can well
1826 	take several seconds until the page is actually written out.
1827 */
1828 void
1829 vm_page_schedule_write_page(vm_page *page)
1830 {
1831 	ASSERT(page->state == PAGE_STATE_MODIFIED);
1832 
1833 	vm_page_requeue(page, false);
1834 
1835 	release_sem_etc(sWriterWaitSem, 1, B_DO_NOT_RESCHEDULE);
1836 }
1837 
1838 
1839 /*!	Cache must be locked.
1840 */
1841 void
1842 vm_page_schedule_write_page_range(struct VMCache *cache, uint32 firstPage,
1843 	uint32 endPage)
1844 {
1845 	uint32 modified = 0;
1846 	for (VMCachePagesTree::Iterator it
1847 				= cache->pages.GetIterator(firstPage, true, true);
1848 			vm_page *page = it.Next();) {
1849 		if (page->cache_offset >= endPage)
1850 			break;
1851 
1852 		if (page->state == PAGE_STATE_MODIFIED) {
1853 			vm_page_requeue(page, false);
1854 			modified++;
1855 		}
1856 	}
1857 
1858 	if (modified > 0)
1859 		release_sem_etc(sWriterWaitSem, 1, B_DO_NOT_RESCHEDULE);
1860 }
1861 
1862 
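/*!	Determines sPhysicalPageOffset and sNumPages from the boot loader's
	physical memory ranges, i.e. the range of physical pages that will be
	covered by the page array.
*/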
1863 void
1864 vm_page_init_num_pages(kernel_args *args)
1865 {
1866 	uint32 i;
1867 
1868 	// calculate the size of memory by looking at the physical_memory_range array
1869 	addr_t physicalPagesEnd = 0;
1870 	sPhysicalPageOffset = args->physical_memory_range[0].start / B_PAGE_SIZE;
1871 
1872 	for (i = 0; i < args->num_physical_memory_ranges; i++) {
1873 		physicalPagesEnd = (args->physical_memory_range[i].start
1874 			+ args->physical_memory_range[i].size) / B_PAGE_SIZE;
1875 	}
1876 
1877 	TRACE(("first phys page = 0x%lx, end 0x%lx\n", sPhysicalPageOffset,
1878 		physicalPagesEnd));
1879 
1880 	sNumPages = physicalPagesEnd - sPhysicalPageOffset;
1881 
1882 #ifdef LIMIT_AVAILABLE_MEMORY
1883 	if (sNumPages > LIMIT_AVAILABLE_MEMORY * 256)
1884 		sNumPages = LIMIT_AVAILABLE_MEMORY * 256;
1885 #endif
1886 }
1887 
1888 
1889 status_t
1890 vm_page_init(kernel_args *args)
1891 {
1892 	TRACE(("vm_page_init: entry\n"));
1893 
1894 	// map in the new free page table
1895 	sPages = (vm_page *)vm_allocate_early(args, sNumPages * sizeof(vm_page),
1896 		~0L, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
1897 
1898 	TRACE(("vm_page_init: putting free_page_table @ %p, # ents %ld (size 0x%x)\n",
1899 		sPages, sNumPages, (unsigned int)(sNumPages * sizeof(vm_page))));
1900 
1901 	// initialize the free page table
1902 	for (uint32 i = 0; i < sNumPages; i++) {
1903 		sPages[i].physical_page_number = sPhysicalPageOffset + i;
1904 		sPages[i].type = PAGE_TYPE_PHYSICAL;
1905 		sPages[i].state = PAGE_STATE_FREE;
1906 		new(&sPages[i].mappings) vm_page_mappings();
1907 		sPages[i].wired_count = 0;
1908 		sPages[i].usage_count = 0;
1909 		sPages[i].busy_writing = false;
1910 		sPages[i].merge_swap = false;
1911 		sPages[i].cache = NULL;
1912 		#if DEBUG_PAGE_QUEUE
1913 			sPages[i].queue = NULL;
1914 		#endif
1915 		#if DEBUG_PAGE_CACHE_TRANSITIONS
1916 			sPages[i].debug_flags = 0;
1917 			sPages[i].collided_page = NULL;
1918 		#endif	// DEBUG_PAGE_CACHE_TRANSITIONS
1919 		enqueue_page(&sFreePageQueue, &sPages[i]);
1920 	}
1921 
1922 	TRACE(("initialized table\n"));
1923 
1924 	// mark some of the page ranges inuse
1925 	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
1926 		vm_mark_page_range_inuse(args->physical_allocated_range[i].start / B_PAGE_SIZE,
1927 			args->physical_allocated_range[i].size / B_PAGE_SIZE);
1928 	}
1929 
1930 	TRACE(("vm_page_init: exit\n"));
1931 
1932 	return B_OK;
1933 }
1934 
1935 
1936 status_t
1937 vm_page_init_post_area(kernel_args *args)
1938 {
1939 	void *dummy;
1940 
1941 	dummy = sPages;
1942 	create_area("page structures", &dummy, B_EXACT_ADDRESS,
1943 		PAGE_ALIGN(sNumPages * sizeof(vm_page)), B_ALREADY_WIRED,
1944 		B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
1945 
1946 	add_debugger_command("page_stats", &dump_page_stats, "Dump statistics about page usage");
1947 	add_debugger_command("page", &dump_page, "Dump page info");
1948 	add_debugger_command("page_queue", &dump_page_queue, "Dump page queue");
1949 	add_debugger_command("find_page", &find_page,
1950 		"Find out which queue a page is actually in");
1951 
1952 	return B_OK;
1953 }
1954 
1955 
1956 status_t
1957 vm_page_init_post_thread(kernel_args *args)
1958 {
1959 	new (&sFreePageCondition) ConditionVariable;
1960 	sFreePageCondition.Publish(&sFreePageQueue, "free page");
1961 
1962 	// create a kernel thread to clear out pages
1963 
1964 	thread_id thread = spawn_kernel_thread(&page_scrubber, "page scrubber",
1965 		B_LOWEST_ACTIVE_PRIORITY, NULL);
1966 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
1967 
1968 	// start page writer
1969 
1970 	sWriterWaitSem = create_sem(0, "page writer");
1971 
1972 	thread = spawn_kernel_thread(&page_writer, "page writer",
1973 		B_NORMAL_PRIORITY + 1, NULL);
1974 	send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
1975 
1976 	return B_OK;
1977 }
1978 
1979 
1980 status_t
1981 vm_mark_page_inuse(addr_t page)
1982 {
1983 	return vm_mark_page_range_inuse(page, 1);
1984 }
1985 
1986 
1987 status_t
1988 vm_mark_page_range_inuse(addr_t startPage, addr_t length)
1989 {
1990 	TRACE(("vm_mark_page_range_inuse: start 0x%lx, len 0x%lx\n",
1991 		startPage, length));
1992 
1993 	if (sPhysicalPageOffset > startPage) {
1994 		TRACE(("vm_mark_page_range_inuse: start page %ld is before free list\n",
1995 			startPage));
1996 		return B_BAD_VALUE;
1997 	}
1998 	startPage -= sPhysicalPageOffset;
1999 	if (startPage + length > sNumPages) {
2000 		TRACE(("vm_mark_page_range_inuse: range would extend past free list\n"));
2001 		return B_BAD_VALUE;
2002 	}
2003 
2004 	MutexLocker _(sPageLock);
2005 
2006 	for (addr_t i = 0; i < length; i++) {
2007 		vm_page *page = &sPages[startPage + i];
2008 		switch (page->state) {
2009 			case PAGE_STATE_FREE:
2010 			case PAGE_STATE_CLEAR:
2011 				set_page_state_nolock(page, PAGE_STATE_UNUSED);
2012 				break;
2013 			case PAGE_STATE_WIRED:
2014 				break;
2015 			case PAGE_STATE_ACTIVE:
2016 			case PAGE_STATE_INACTIVE:
2017 			case PAGE_STATE_BUSY:
2018 			case PAGE_STATE_MODIFIED:
2019 			case PAGE_STATE_UNUSED:
2020 			default:
2021 				// uh
2022 				dprintf("vm_mark_page_range_inuse: page 0x%lx in non-free state %d!\n",
2023 					startPage + i, page->state);
2024 				break;
2025 		}
2026 	}
2027 
2028 	return B_OK;
2029 }
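// Usage sketch (illustrative only; the range is hypothetical): the function
// takes page numbers, not byte addresses, so byte ranges are converted by
// dividing by B_PAGE_SIZE -- just like the boot-time loop in vm_page_init():
//
//	// a hypothetical firmware-reserved range of 16 pages at 1 MB
//	vm_mark_page_range_inuse(0x100000 / B_PAGE_SIZE, 16);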
2030 
2031 
2032 /*!	Unreserve pages previously reserved with vm_page_reserve_pages().
2033 	Note that you specify the same \a count here that you specified when
2034 	reserving the pages -- there is no need to keep track of how many of
2035 	the reserved pages you actually used.
2036 */
2037 void
2038 vm_page_unreserve_pages(uint32 count)
2039 {
2040 	if (count == 0)
2041 		return;
2042 
2043 	MutexLocker locker(sPageLock);
2044 	ASSERT(sReservedPages >= count);
2045 
2046 	T(UnreservePages(count));
2047 
2048 	sReservedPages -= count;
2049 
2050 	if (sPageDeficit > 0)
2051 		sFreePageCondition.NotifyAll();
2052 }
2053 
2054 
2055 /*!	Reserves a number of free pages in the system for the caller.
2056 	Reserved pages are only handed out to callers that allocate with
2057 	reserved == true. This call only returns once the requested number of
2058 	pages has been reached, stealing pages from other queues if necessary.
2059 */
2060 void
2061 vm_page_reserve_pages(uint32 count)
2062 {
2063 	if (count == 0)
2064 		return;
2065 
2066 	MutexLocker locker(sPageLock);
2067 
2068 	T(ReservePages(count));
2069 
2070 	sReservedPages += count;
2071 	size_t freePages = free_page_queue_count();
2072 	if (sReservedPages <= freePages)
2073 		return;
2074 
2075 	count = sReservedPages - freePages;
2076 	locker.Unlock();
2077 
2078 	steal_pages(NULL, count + 1, true);
2079 		// we get one more, just in case we can do something someone
2080 		// else can't
2081 }
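// Usage sketch (illustrative only): a typical reserve/allocate/unreserve
// cycle. Once vm_page_reserve_pages() has returned, allocations made with
// reserved == true are expected to succeed, and the full count is given back
// afterwards no matter how many of the reserved pages were actually taken:
//
//	vm_page_reserve_pages(4);
//	vm_page* page = vm_page_allocate_page(PAGE_STATE_CLEAR, true);
//	// ... map and use the page ...
//	vm_page_unreserve_pages(4);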
2082 
2083 
2084 bool
2085 vm_page_try_reserve_pages(uint32 count)
2086 {
2087 	if (count == 0)
2088 		return true;
2089 
2090 	MutexLocker locker(sPageLock);
2091 
2092 	T(ReservePages(count));
2093 
2094 	size_t freePages = free_page_queue_count();
2095 	if (sReservedPages + count > freePages)
2096 		return false;
2097 
2098 	sReservedPages += count;
2099 	return true;
2100 }
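// Usage sketch (illustrative only; the counts are hypothetical): the
// non-blocking variant lets a caller fall back to a smaller request instead
// of waiting for pages to be stolen:
//
//	uint32 wanted = 32;
//	if (!vm_page_try_reserve_pages(wanted)) {
//		wanted = 8;
//		vm_page_reserve_pages(wanted);
//			// only returns once the pages have been reserved
//	}
//	// ... allocate up to "wanted" pages with reserved == true ...
//	vm_page_unreserve_pages(wanted);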
2101 
2102 
2103 // TODO: you must not have locked a cache when calling this function with
2104 // reserved == false. See vm_cache_acquire_locked_page_cache().
2105 vm_page *
2106 vm_page_allocate_page(int pageState, bool reserved)
2107 {
2108 	page_queue *queue;
2109 	page_queue *otherQueue;
2110 
2111 	switch (pageState) {
2112 		case PAGE_STATE_FREE:
2113 			queue = &sFreePageQueue;
2114 			otherQueue = &sClearPageQueue;
2115 			break;
2116 		case PAGE_STATE_CLEAR:
2117 			queue = &sClearPageQueue;
2118 			otherQueue = &sFreePageQueue;
2119 			break;
2120 		default:
2121 			return NULL; // invalid
2122 	}
2123 
2124 	MutexLocker locker(sPageLock);
2125 
2126 	T(AllocatePage(reserved));
2127 
2128 	vm_page *page = NULL;
2129 	while (true) {
2130 		if (reserved || sReservedPages < free_page_queue_count()) {
2131 			page = dequeue_page(queue);
2132 			if (page == NULL) {
2133 #if DEBUG_PAGE_QUEUE
2134 				if (queue->count != 0)
2135 					panic("queue %p corrupted, count = %d\n", queue, queue->count);
2136 #endif
2137 
2138 				// if the primary queue was empty, grab the page from the
2139 				// secondary queue
2140 				page = dequeue_page(otherQueue);
2141 			}
2142 		}
2143 
2144 		if (page != NULL)
2145 			break;
2146 
2147 		if (reserved)
2148 			panic("Had reserved page, but there is none!");
2149 
2150 		// steal one from the inactive list
2151 		locker.Unlock();
2152 		size_t stolen = steal_pages(&page, 1, false);
2153 		locker.Lock();
2154 
2155 		if (stolen > 0)
2156 			break;
2157 	}
2158 
2159 	if (page->cache != NULL)
2160 		panic("supposed to be free page %p has cache\n", page);
2161 
2162 	int oldPageState = page->state;
2163 	page->state = PAGE_STATE_BUSY;
2164 	page->usage_count = 2;
2165 
2166 	enqueue_page(&sActivePageQueue, page);
2167 
2168 	locker.Unlock();
2169 
2170 	// if a clear page was requested, but the page we got wasn't clear yet, zero it out now
2171 	if (pageState == PAGE_STATE_CLEAR && oldPageState != PAGE_STATE_CLEAR)
2172 		clear_page(page);
2173 
2174 	return page;
2175 }
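// Usage sketch (illustrative only): an unreserved allocation may steal pages
// and, per the TODO above, must not be made while a cache is locked. The
// returned page is busy and sits in the active queue; the NULL check mirrors
// the one in vm_page_allocate_pages() below:
//
//	vm_page* page = vm_page_allocate_page(PAGE_STATE_CLEAR, false);
//	if (page == NULL)
//		return B_NO_MEMORY;
//	// ... insert the page into a cache and map it ...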
2176 
2177 
2178 /*!	Allocates a number of pages and puts their pointers into the provided
2179 	array. All pages are marked busy.
2180 	Returns B_OK on success, and B_NO_MEMORY when there aren't any free
2181 	pages left to allocate.
2182 */
2183 status_t
2184 vm_page_allocate_pages(int pageState, vm_page **pages, uint32 numPages)
2185 {
2186 	uint32 i;
2187 
2188 	for (i = 0; i < numPages; i++) {
2189 		pages[i] = vm_page_allocate_page(pageState, false);
2190 		if (pages[i] == NULL) {
2191 			// allocation failed, we need to free what we already have
2192 			while (i-- > 0)
2193 				vm_page_set_state(pages[i], pageState);
2194 
2195 			return B_NO_MEMORY;
2196 		}
2197 	}
2198 
2199 	return B_OK;
2200 }
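// Usage sketch (illustrative only): allocating a small batch of busy pages
// into a caller-provided array; on failure the partially allocated pages
// have already been returned by the function itself:
//
//	vm_page* pages[4];
//	status_t status = vm_page_allocate_pages(PAGE_STATE_FREE, pages, 4);
//	if (status != B_OK)
//		return status;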
2201 
2202 
2203 vm_page *
2204 vm_page_allocate_page_run(int pageState, addr_t base, addr_t length)
2205 {
2206 	vm_page *firstPage = NULL;
2207 	uint32 start = base >> PAGE_SHIFT;
2208 
2209 	MutexLocker locker(sPageLock);
2210 
2211 	if (free_page_queue_count() - sReservedPages < length) {
2212 		// TODO: add more tries, i.e. free some inactive, ...
2213 		// no free space
2214 		return NULL;
2215 	}
2216 
2217 	for (;;) {
2218 		bool foundRun = true;
2219 		if (start + length > sNumPages)
2220 			break;
2221 
2222 		uint32 i;
2223 		for (i = 0; i < length; i++) {
2224 			if (sPages[start + i].state != PAGE_STATE_FREE
2225 				&& sPages[start + i].state != PAGE_STATE_CLEAR) {
2226 				foundRun = false;
2227 				i++;
2228 				break;
2229 			}
2230 		}
2231 
2232 		if (foundRun) {
2233 			// pull the pages out of the appropriate queues
2234 			for (i = 0; i < length; i++) {
2235 				sPages[start + i].is_cleared
2236 					= sPages[start + i].state == PAGE_STATE_CLEAR;
2237 				set_page_state_nolock(&sPages[start + i], PAGE_STATE_BUSY);
2238 				sPages[start + i].usage_count = 2;
2239 			}
2240 
2241 			firstPage = &sPages[start];
2242 			break;
2243 		} else
2244 			start += i;
2245 	}
2246 
2247 	T(AllocatePageRun(length));
2248 
2249 	locker.Unlock();
2250 
2251 	if (firstPage != NULL && pageState == PAGE_STATE_CLEAR) {
2252 		for (uint32 i = 0; i < length; i++) {
2253 			if (!sPages[start + i].is_cleared)
2254 				clear_page(&sPages[start + i]);
2255 		}
2256 	}
2257 
2258 	return firstPage;
2259 }
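// Usage sketch (illustrative only; the numbers are hypothetical): \a base is
// a byte address -- it is shifted by PAGE_SHIFT above -- at which the search
// for the run starts, and \a length is a page count, so a contiguous 64 KB
// run could be requested like this:
//
//	vm_page* run = vm_page_allocate_page_run(PAGE_STATE_CLEAR, base,
//		64 * 1024 / B_PAGE_SIZE);
//	if (run == NULL)
//		return B_NO_MEMORY;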
2260 
2261 
2262 vm_page *
2263 vm_page_allocate_page_run_no_base(int pageState, addr_t count)
2264 {
2265 	MutexLocker locker(sPageLock);
2266 
2267 	if (free_page_queue_count() - sReservedPages < count) {
2268 		// TODO: add more tries, i.e. free some inactive, ...
2269 		// no free space
2270 		return NULL;
2271 	}
2272 
2273 	page_queue *queue;
2274 	page_queue *otherQueue;
2275 	switch (pageState) {
2276 		case PAGE_STATE_FREE:
2277 			queue = &sFreePageQueue;
2278 			otherQueue = &sClearPageQueue;
2279 			break;
2280 		case PAGE_STATE_CLEAR:
2281 			queue = &sClearPageQueue;
2282 			otherQueue = &sFreePageQueue;
2283 			break;
2284 		default:
2285 			return NULL; // invalid
2286 	}
2287 
2288 	vm_page *firstPage = NULL;
2289 	for (uint32 twice = 0; twice < 2; twice++) {
2290 		vm_page *page = queue->head;
2291 		for (; page != NULL; page = page->queue_next) {
2292 			vm_page *current = page;
2293 			if (current >= &sPages[sNumPages - count])
2294 				continue;
2295 
2296 			bool foundRun = true;
2297 			for (uint32 i = 0; i < count; i++, current++) {
2298 				if (current->state != PAGE_STATE_FREE
2299 					&& current->state != PAGE_STATE_CLEAR) {
2300 					foundRun = false;
2301 					break;
2302 				}
2303 			}
2304 
2305 			if (foundRun) {
2306 				// pull the pages out of the appropriate queues
2307 				current = page;
2308 				for (uint32 i = 0; i < count; i++, current++) {
2309 					current->is_cleared = current->state == PAGE_STATE_CLEAR;
2310 					set_page_state_nolock(current, PAGE_STATE_BUSY);
2311 					current->usage_count = 2;
2312 				}
2313 
2314 				firstPage = page;
2315 				break;
2316 			}
2317 		}
2318 
2319 		if (firstPage != NULL)
2320 			break;
2321 
2322 		queue = otherQueue;
2323 	}
2324 
2325 	T(AllocatePageRun(count));
2326 
2327 	locker.Unlock();
2328 
2329 	if (firstPage != NULL && pageState == PAGE_STATE_CLEAR) {
2330 		vm_page *current = firstPage;
2331 		for (uint32 i = 0; i < count; i++, current++) {
2332 			if (!current->is_cleared)
2333 				clear_page(current);
2334 		}
2335 	}
2336 
2337 	return firstPage;
2338 }
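// Usage sketch (illustrative only): the "no base" variant walks the queue
// matching \a pageState (and then the other one) and takes the first
// physically contiguous run of \a count pages it finds, wherever it lies:
//
//	vm_page* run = vm_page_allocate_page_run_no_base(PAGE_STATE_FREE, 8);
//	if (run == NULL)
//		return B_NO_MEMORY;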
2339 
2340 
2341 vm_page *
2342 vm_page_at_index(int32 index)
2343 {
2344 	return &sPages[index];
2345 }
2346 
2347 
2348 vm_page *
2349 vm_lookup_page(addr_t pageNumber)
2350 {
2351 	if (pageNumber < sPhysicalPageOffset)
2352 		return NULL;
2353 
2354 	pageNumber -= sPhysicalPageOffset;
2355 	if (pageNumber >= sNumPages)
2356 		return NULL;
2357 
2358 	return &sPages[pageNumber];
2359 }
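// Usage sketch (illustrative only; physicalAddress is an assumed
// caller-provided byte address): vm_lookup_page() takes a physical page
// number, not a byte address, and returns NULL for pages outside the range
// managed here; vm_page_at_index() above skips both checks:
//
//	vm_page* page = vm_lookup_page(physicalAddress / B_PAGE_SIZE);
//	if (page == NULL)
//		return B_BAD_VALUE;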
2360 
2361 
2362 /*!	Free the page that belonged to a certain cache.
2363 	You can use vm_page_set_state() manually if you prefer, but only
2364 	if the page's state is not PAGE_STATE_MODIFIED.
2365 */
2366 void
2367 vm_page_free(vm_cache *cache, vm_page *page)
2368 {
2369 	MutexLocker _(sPageLock);
2370 
2371 	if (page->cache == NULL && page->state == PAGE_STATE_MODIFIED
2372 		&& cache->temporary) {
2373 		sModifiedTemporaryPages--;
2374 	}
2375 
2376 	set_page_state_nolock(page, PAGE_STATE_FREE);
2377 }
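// Usage sketch (illustrative only): freeing a page that used to belong to a
// cache. vm_page_free() is preferred over vm_page_set_state() here because
// it also corrects the modified-temporary-pages bookkeeping above:
//
//	// the caller has already removed the page from \a cache
//	vm_page_free(cache, page);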
2378 
2379 
2380 status_t
2381 vm_page_set_state(vm_page *page, int pageState)
2382 {
2383 	MutexLocker _(sPageLock);
2384 
2385 	return set_page_state_nolock(page, pageState);
2386 }
2387 
2388 
2389 /*!	Moves a page to either the tail or the head of its current queue,
2390 	depending on \a tail.
2391 */
2392 void
2393 vm_page_requeue(struct vm_page *page, bool tail)
2394 {
2395 	MutexLocker _(sPageLock);
2396 	page_queue *queue = NULL;
2397 
2398 	switch (page->state) {
2399 		case PAGE_STATE_BUSY:
2400 		case PAGE_STATE_ACTIVE:
2401 		case PAGE_STATE_WIRED:
2402 		case PAGE_STATE_UNUSED:
2403 			queue = &sActivePageQueue;
2404 			break;
2405 		case PAGE_STATE_INACTIVE:
2406 			queue = &sInactivePageQueue;
2407 			break;
2408 		case PAGE_STATE_MODIFIED:
2409 			queue = &sModifiedPageQueue;
2410 			break;
2411 		case PAGE_STATE_FREE:
2412 			queue = &sFreePageQueue;
2413 			break;
2414 		case PAGE_STATE_CLEAR:
2415 			queue = &sClearPageQueue;
2416 			break;
2417 		default:
2418 			panic("vm_page_requeue: vm_page %p in invalid state %d\n",
2419 				page, page->state);
2420 			break;
2421 	}
2422 
2423 	remove_page_from_queue(queue, page);
2424 
2425 	if (tail)
2426 		enqueue_page(queue, page);
2427 	else
2428 		enqueue_page_to_head(queue, page);
2429 }
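// Usage sketch (illustrative only): vm_page_requeue() keeps the page in the
// queue that matches its current state and only changes its position within
// that queue:
//
//	vm_page_requeue(page, true);	// move \a page to the tail of its queue
//	vm_page_requeue(page, false);	// move \a page to the head of its queue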
2430 
2431 
2432 size_t
2433 vm_page_num_pages(void)
2434 {
2435 	return sNumPages;
2436 }
2437 
2438 
2439 /*! There is a subtle distinction between the page counts returned by
2440 	this function and vm_page_num_free_pages():
2441 	The latter returns the number of pages that are completely uncommitted,
2442 	whereas this one also counts pages that can be reclaimed and reused
2443 	(IOW it factors in things like cache pages as available).
2445 */
2446 size_t
2447 vm_page_num_available_pages(void)
2448 {
2449 	return vm_available_memory() / B_PAGE_SIZE;
2450 }
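// Illustrative sketch: reading both counters side by side shows the
// distinction described above -- "available" includes reclaimable pages,
// while "free" only covers the free/inactive queues minus the current
// reservation:
//
//	dprintf("available: %lu pages, free: %lu pages\n",
//		(unsigned long)vm_page_num_available_pages(),
//		(unsigned long)vm_page_num_free_pages());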
2451 
2452 
2453 size_t
2454 vm_page_num_free_pages(void)
2455 {
2456 	size_t reservedPages = sReservedPages;
2457 	size_t count = free_page_queue_count() + sInactivePageQueue.count;
2458 	if (reservedPages >= count)
2459 		return 0;
2460 
2461 	return count - reservedPages;
2462 }
2463 
2464 
2465 size_t
2466 vm_page_num_unused_pages(void)
2467 {
2468 	size_t reservedPages = sReservedPages;
2469 	size_t count = free_page_queue_count();
2470 	if (reservedPages >= count)
2471 		return 0;
2472 
2473 	return count - reservedPages;
2474 }
2475 
2476 
2477 void
2478 vm_page_get_stats(system_info *info)
2479 {
2480 	// Get the free page count -- not really exact, since we don't know how
2481 	// many of the reserved pages have already been allocated; but good
2482 	// citizens that reserve a larger quantity unreserve chunk-wise while
2483 	// they allocate the pages.
2484 	page_num_t reserved = sReservedPages;
2485 	page_num_t free = free_page_queue_count();
2486 	free = free > reserved ? free - reserved : 0;
2487 
2488 	// The pages used for the block cache buffers. Those should not be counted
2489 	// as used but as cached pages.
2490 	// TODO: We should subtract the blocks that are in use ATM, since those
2491 	// can't really be freed in a low memory situation.
2492 	page_num_t blockCachePages = block_cache_used_memory() / B_PAGE_SIZE;
2493 
2494 	info->max_pages = sNumPages;
2495 	info->used_pages = gMappedPagesCount - blockCachePages;
2496 	info->cached_pages = sNumPages >= free + info->used_pages
2497 		? sNumPages - free - info->used_pages : 0;
2498 	info->page_faults = vm_num_page_faults();
2499 
2500 	// TODO: We don't consider pages used for page directories/tables yet.
2501 }
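// Usage sketch (illustrative only): vm_page_get_stats() fills in the page
// related fields of a system_info; a kernel-side caller could log them
// roughly like this (casts avoid format specifier mismatches):
//
//	system_info info;
//	memset(&info, 0, sizeof(info));
//	vm_page_get_stats(&info);
//	dprintf("pages: %lu max, %lu used, %lu cached\n",
//		(unsigned long)info.max_pages, (unsigned long)info.used_pages,
//		(unsigned long)info.cached_pages);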
2502