xref: /haiku/src/system/kernel/arch/x86/arch_vm_translation_map.cpp (revision 50b3e74489a1a46fec88df793e4f6780e4de933c)
1 /*
2  * Copyright 2008-2010, Ingo Weinhold, ingo_weinhold@gmx.de.
3  * Copyright 2002-2007, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
4  * Distributed under the terms of the MIT License.
5  *
6  * Copyright 2001-2002, Travis Geiselbrecht. All rights reserved.
7  * Distributed under the terms of the NewOS License.
8  */
9 
10 
11 #include <arch/vm_translation_map.h>
12 
13 #include <stdlib.h>
14 #include <string.h>
15 
16 #include <AutoDeleter.h>
17 
18 #include <arch_system_info.h>
19 #include <heap.h>
20 #include <int.h>
21 #include <thread.h>
22 #include <slab/Slab.h>
23 #include <smp.h>
24 #include <util/AutoLock.h>
25 #include <util/queue.h>
26 #include <vm/vm_page.h>
27 #include <vm/vm_priv.h>
28 #include <vm/VMAddressSpace.h>
29 #include <vm/VMCache.h>
30 
31 #include "x86_paging.h"
32 #include "x86_physical_page_mapper.h"
33 #include "X86VMTranslationMap.h"
34 
35 
36 //#define TRACE_VM_TMAP
37 #ifdef TRACE_VM_TMAP
38 #	define TRACE(x...) dprintf(x)
39 #else
40 #	define TRACE(x...) ;
41 #endif
42 
43 
44 static page_table_entry *sPageHole = NULL;
45 static page_directory_entry *sPageHolePageDir = NULL;
46 static page_directory_entry *sKernelPhysicalPageDirectory = NULL;
47 static page_directory_entry *sKernelVirtualPageDirectory = NULL;
48 
49 static X86PhysicalPageMapper* sPhysicalPageMapper;
50 static TranslationMapPhysicalPageMapper* sKernelPhysicalPageMapper;
51 
52 
53 // Accessor class to reuse the SinglyLinkedListLink of DeferredDeletable for
54 // vm_translation_map_arch_info.
55 struct ArchTMapGetLink {
56 private:
57 	typedef SinglyLinkedListLink<vm_translation_map_arch_info> Link;
58 
59 public:
60 	inline Link* operator()(vm_translation_map_arch_info* element) const
61 	{
62 		return (Link*)element->GetSinglyLinkedListLink();
63 	}
64 
65 	inline const Link* operator()(
66 		const vm_translation_map_arch_info* element) const
67 	{
68 		return (const Link*)element->GetSinglyLinkedListLink();
69 	}
70 
71 };
72 
73 
74 typedef SinglyLinkedList<vm_translation_map_arch_info, ArchTMapGetLink>
75 	ArchTMapList;
76 
77 
78 static ArchTMapList sTMapList;
79 static spinlock sTMapListLock;
80 
81 #define CHATTY_TMAP 0
82 
83 #define FIRST_USER_PGDIR_ENT    (VADDR_TO_PDENT(USER_BASE))
84 #define NUM_USER_PGDIR_ENTS     (VADDR_TO_PDENT(ROUNDUP(USER_SIZE, \
85 									B_PAGE_SIZE * 1024)))
86 #define FIRST_KERNEL_PGDIR_ENT  (VADDR_TO_PDENT(KERNEL_BASE))
87 #define NUM_KERNEL_PGDIR_ENTS   (VADDR_TO_PDENT(KERNEL_SIZE))
88 #define IS_KERNEL_MAP(map)		(fArchData->pgdir_phys \
89 									== sKernelPhysicalPageDirectory)
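// (VADDR_TO_PDENT()/VADDR_TO_PTENT() are defined in x86_paging.h; with the
// plain 32 bit non-PAE paging used here -- 1024 entries per table, 4 KB
// pages -- they amount to va / 4 MB and (va / 4 KB) % 1024 respectively.
// Illustrative example: for va == 0x80001234 the page directory index is
// 0x200 and the page table index is 0x1.)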
90 
91 
92 vm_translation_map_arch_info::vm_translation_map_arch_info()
93 	:
94 	pgdir_virt(NULL),
95 	ref_count(1)
96 {
97 }
98 
99 
100 vm_translation_map_arch_info::~vm_translation_map_arch_info()
101 {
102 	// free the page dir
103 	free(pgdir_virt);
104 }
105 
106 
107 void
108 vm_translation_map_arch_info::Delete()
109 {
110 	// remove from global list
111 	InterruptsSpinLocker locker(sTMapListLock);
112 	sTMapList.Remove(this);
113 	locker.Unlock();
114 
115 #if 0
116 	// this sanity check can be enabled when corruption due to
117 	// overwriting an active page directory is suspected
118 	addr_t activePageDirectory;
119 	read_cr3(activePageDirectory);
120 	if (activePageDirectory == (addr_t)pgdir_phys)
121 		panic("deleting a still active page directory\n");
122 #endif
123 
124 	if (are_interrupts_enabled())
125 		delete this;
126 	else
127 		deferred_delete(this);
128 }
129 
130 
131 //	#pragma mark -
132 
133 
134 //! TODO: currently assumes this translation map is active
135 static status_t
136 early_query(addr_t va, addr_t *_physicalAddress)
137 {
138 	if ((sPageHolePageDir[VADDR_TO_PDENT(va)] & X86_PDE_PRESENT) == 0) {
139 		// no pagetable here
140 		return B_ERROR;
141 	}
142 
143 	page_table_entry* pentry = sPageHole + va / B_PAGE_SIZE;
144 	if ((*pentry & X86_PTE_PRESENT) == 0) {
145 		// page mapping not valid
146 		return B_ERROR;
147 	}
148 
149 	*_physicalAddress = *pentry & X86_PTE_ADDRESS_MASK;
150 	return B_OK;
151 }
152 
153 
154 static inline uint32
155 memory_type_to_pte_flags(uint32 memoryType)
156 {
157 	// ATM we only handle the uncacheable and write-through types explicitly. For
158 	// all other types we rely on the MTRRs to be set up correctly. Since we set
159 	// the default memory type to write-back and since the uncacheable type in
160 	// the PTE overrides any MTRR attribute (though, as per the specs, that is
161 	// not recommended for performance reasons), this reduces the work we
162 	// actually *have* to do with the MTRRs to setting the remaining types
163 	// (usually only write-combining for the frame buffer).
164 	switch (memoryType) {
165 		case B_MTR_UC:
166 			return X86_PTE_CACHING_DISABLED | X86_PTE_WRITE_THROUGH;
167 
168 		case B_MTR_WC:
169 			// X86_PTE_WRITE_THROUGH would be closer, but the combination with
170 			// MTRR WC is "implementation defined" for Pentium Pro/II.
171 			return 0;
172 
173 		case B_MTR_WT:
174 			return X86_PTE_WRITE_THROUGH;
175 
176 		case B_MTR_WP:
177 		case B_MTR_WB:
178 		default:
179 			return 0;
180 	}
181 }
182 
183 
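/*!	Builds a page table entry for \a physicalAddress from the protection
	\a attributes, the \a memoryType and the \a globalPage flag, and stores it
	in \a entry with a single volatile write.
*/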
184 static void
185 put_page_table_entry_in_pgtable(page_table_entry* entry,
186 	addr_t physicalAddress, uint32 attributes, uint32 memoryType,
187 	bool globalPage)
188 {
189 	page_table_entry page = (physicalAddress & X86_PTE_ADDRESS_MASK)
190 		| X86_PTE_PRESENT | (globalPage ? X86_PTE_GLOBAL : 0)
191 		| memory_type_to_pte_flags(memoryType);
192 
193 	// if the page is user accessible, it's automatically
194 	// accessible in kernel space, too (but with the same
195 	// protection)
196 	if ((attributes & B_USER_PROTECTION) != 0) {
197 		page |= X86_PTE_USER;
198 		if ((attributes & B_WRITE_AREA) != 0)
199 			page |= X86_PTE_WRITABLE;
200 	} else if ((attributes & B_KERNEL_WRITE_AREA) != 0)
201 		page |= X86_PTE_WRITABLE;
202 
203 	// put it in the page table
204 	*(volatile page_table_entry*)entry = page;
205 }
206 
207 
208 //	#pragma mark -
209 
210 
211 void *
212 i386_translation_map_get_pgdir(VMTranslationMap* map)
213 {
214 	return static_cast<X86VMTranslationMap*>(map)->PhysicalPageDir();
215 }
216 
217 
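/*!	Writes the page directory entry \a e into slot \a index of every page
	directory registered in sTMapList, so that a newly created kernel page
	table becomes visible in all address spaces.
*/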
218 void
219 x86_update_all_pgdirs(int index, page_directory_entry e)
220 {
221 	unsigned int state = disable_interrupts();
222 
223 	acquire_spinlock(&sTMapListLock);
224 
225 	ArchTMapList::Iterator it = sTMapList.GetIterator();
226 	while (vm_translation_map_arch_info* info = it.Next())
227 		info->pgdir_virt[index] = e;
228 
229 	release_spinlock(&sTMapListLock);
230 	restore_interrupts(state);
231 }
232 
233 
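/*!	Points the page directory entry \a entry at the page table located at
	physical address \a pgtablePhysical (see the TODO below regarding the
	ignored \a attributes).
*/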
234 void
235 x86_put_pgtable_in_pgdir(page_directory_entry *entry,
236 	addr_t pgtablePhysical, uint32 attributes)
237 {
238 	*entry = (pgtablePhysical & X86_PDE_ADDRESS_MASK)
239 		| X86_PDE_PRESENT
240 		| X86_PDE_WRITABLE
241 		| X86_PDE_USER;
242 		// TODO: we ignore the attributes of the page table - for compatibility
243 		// with BeOS we allow having user accessible areas in the kernel address
244 		// space. This is currently being used by some drivers, mainly for the
245 		// frame buffer. Our current real time data implementation makes use of
246 		// this fact, too.
247 		// We might want to get rid of this possibility one day, especially if
248 		// we intend to port it to a platform that does not support this.
249 }
250 
251 
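/*!	Zeroes the page tables at \a pageTables (one per 4 MB of the range) and
	hooks them, via the boot-time page hole, into the kernel page directory
	for the virtual range starting at \a address and spanning \a size bytes.
*/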
252 void
253 x86_early_prepare_page_tables(page_table_entry* pageTables, addr_t address,
254 	size_t size)
255 {
256 	memset(pageTables, 0, B_PAGE_SIZE * (size / (B_PAGE_SIZE * 1024)));
257 
258 	// Put the array of pgtables directly into the kernel pagedir. These will
259 	// be wired and kept mapped into virtual space so that they are easy to
260 	// get to.
261 	{
262 		addr_t virtualTable = (addr_t)pageTables;
263 
264 		for (size_t i = 0; i < (size / (B_PAGE_SIZE * 1024));
265 				i++, virtualTable += B_PAGE_SIZE) {
266 			addr_t physicalTable = 0;
267 			early_query(virtualTable, &physicalTable);
268 			page_directory_entry* entry = &sPageHolePageDir[
269 				(address / (B_PAGE_SIZE * 1024)) + i];
270 			x86_put_pgtable_in_pgdir(entry, physicalTable,
271 				B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
272 		}
273 	}
274 }
275 
276 
277 // #pragma mark - VM ops
278 
279 
280 X86VMTranslationMap::X86VMTranslationMap()
281 {
282 }
283 
284 
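/*!	Frees all user space page tables still referenced from the page directory
	and drops this map's reference to the vm_translation_map_arch_info (whose
	destructor in turn frees the page directory itself).
*/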
285 X86VMTranslationMap::~X86VMTranslationMap()
286 {
287 	if (fArchData->page_mapper != NULL)
288 		fArchData->page_mapper->Delete();
289 
290 	if (fArchData->pgdir_virt != NULL) {
291 		// cycle through and free all of the user space pgtables
292 		for (uint32 i = VADDR_TO_PDENT(USER_BASE);
293 				i <= VADDR_TO_PDENT(USER_BASE + (USER_SIZE - 1)); i++) {
294 			if ((fArchData->pgdir_virt[i] & X86_PDE_PRESENT) != 0) {
295 				addr_t address = fArchData->pgdir_virt[i]
296 					& X86_PDE_ADDRESS_MASK;
297 				vm_page* page = vm_lookup_page(address / B_PAGE_SIZE);
298 				if (!page)
299 					panic("destroy_tmap: didn't find pgtable page\n");
300 				DEBUG_PAGE_ACCESS_START(page);
301 				vm_page_set_state(page, PAGE_STATE_FREE);
302 			}
303 		}
304 	}
305 
306 	fArchData->RemoveReference();
307 }
308 
309 
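/*!	Initializes the map. For user maps a translation map physical page mapper
	and a page-aligned page directory are allocated, and the kernel portion of
	the page directory is copied from the kernel's one. The map is then added
	to sTMapList, so that later kernel page directory changes are propagated
	to it by x86_update_all_pgdirs().
*/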
310 status_t
311 X86VMTranslationMap::Init(bool kernel)
312 {
313 	TRACE("X86VMTranslationMap::Init()\n");
314 
315 	fArchData = new(std::nothrow) vm_translation_map_arch_info;
316 	if (fArchData == NULL)
317 		return B_NO_MEMORY;
318 
319 	fArchData->active_on_cpus = 0;
320 	fArchData->num_invalidate_pages = 0;
321 	fArchData->page_mapper = NULL;
322 
323 	if (!kernel) {
324 		// user
325 		// allocate a physical page mapper
326 		status_t error = sPhysicalPageMapper
327 			->CreateTranslationMapPhysicalPageMapper(
328 				&fArchData->page_mapper);
329 		if (error != B_OK)
330 			return error;
331 
332 		// allocate a pgdir
333 		fArchData->pgdir_virt = (page_directory_entry *)memalign(
334 			B_PAGE_SIZE, B_PAGE_SIZE);
335 		if (fArchData->pgdir_virt == NULL) {
336 			fArchData->page_mapper->Delete();
337 			return B_NO_MEMORY;
338 		}
339 		vm_get_page_mapping(VMAddressSpace::KernelID(),
340 			(addr_t)fArchData->pgdir_virt,
341 			(addr_t*)&fArchData->pgdir_phys);
342 	} else {
343 		// kernel
344 		// get the physical page mapper
345 		fArchData->page_mapper = sKernelPhysicalPageMapper;
346 
347 		// we already know the kernel pgdir mapping
348 		fArchData->pgdir_virt = sKernelVirtualPageDirectory;
349 		fArchData->pgdir_phys = sKernelPhysicalPageDirectory;
350 	}
351 
352 	// zero out the bottom portion of the new pgdir
353 	memset(fArchData->pgdir_virt + FIRST_USER_PGDIR_ENT, 0,
354 		NUM_USER_PGDIR_ENTS * sizeof(page_directory_entry));
355 
356 	// insert this new map into the map list
357 	{
358 		int state = disable_interrupts();
359 		acquire_spinlock(&sTMapListLock);
360 
361 		// copy the kernel (top) portion of the pgdir from the kernel pgdir
362 		memcpy(fArchData->pgdir_virt + FIRST_KERNEL_PGDIR_ENT,
363 			sKernelVirtualPageDirectory + FIRST_KERNEL_PGDIR_ENT,
364 			NUM_KERNEL_PGDIR_ENTS * sizeof(page_directory_entry));
365 
366 		sTMapList.Add(fArchData);
367 
368 		release_spinlock(&sTMapListLock);
369 		restore_interrupts(state);
370 	}
371 
372 	return B_OK;
373 }
374 
375 
376 status_t
377 X86VMTranslationMap::InitPostSem()
378 {
379 	return B_OK;
380 }
381 
382 
383 /*!	Acquires the map's recursive lock, and resets the invalidate pages counter
384 	in case it's the first locking recursion.
385 */
386 bool
387 X86VMTranslationMap::Lock()
388 {
389 	TRACE("%p->X86VMTranslationMap::Lock()\n", this);
390 
391 	recursive_lock_lock(&fLock);
392 	if (recursive_lock_get_recursion(&fLock) == 1) {
393 		// we were the first one to grab the lock
394 		TRACE("clearing invalidated page count\n");
395 		fArchData->num_invalidate_pages = 0;
396 	}
397 
398 	return true;
399 }
400 
401 
402 /*!	Unlocks the map and, if we are about to release the recursive lock for
403 	the last time, flushes all pending changes of this map (i.e. flushes the
404 	TLB caches as needed).
405 */
406 void
407 X86VMTranslationMap::Unlock()
408 {
409 	TRACE("%p->X86VMTranslationMap::Unlock()\n", this);
410 
411 	if (recursive_lock_get_recursion(&fLock) == 1) {
412 		// we're about to release it for the last time
413 		X86VMTranslationMap::Flush();
414 	}
415 
416 	recursive_lock_unlock(&fLock);
417 }
418 
419 
420 size_t
421 X86VMTranslationMap::MaxPagesNeededToMap(addr_t start, addr_t end) const
422 {
423 	// If start == 0, the actual base address is not yet known to the caller and
424 	// we shall assume the worst case.
425 	if (start == 0) {
426 		// offset the range so it has the worst possible alignment
427 		start = 1023 * B_PAGE_SIZE;
428 		end += 1023 * B_PAGE_SIZE;
429 	}
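	// (The result is the number of page tables the range can touch; e.g. a
	// two-page range that straddles a 4 MB boundary needs two page tables,
	// which is what the worst-case offset above accounts for.)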
430 
431 	return VADDR_TO_PDENT(end) + 1 - VADDR_TO_PDENT(start);
432 }
433 
434 
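/*!	Maps physical page \a pa at virtual address \a va with the given
	protection \a attributes and \a memoryType. If the page directory entry
	for \a va is not present yet, a page table is first allocated from
	\a reservation (and, for kernel addresses, propagated to all other page
	directories via x86_update_all_pgdirs()).
*/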
435 status_t
436 X86VMTranslationMap::Map(addr_t va, addr_t pa, uint32 attributes,
437 	uint32 memoryType, vm_page_reservation* reservation)
438 {
439 	TRACE("map_tmap: entry pa 0x%lx va 0x%lx\n", pa, va);
440 
441 /*
442 	dprintf("pgdir at 0x%x\n", pgdir);
443 	dprintf("index is %d\n", va / B_PAGE_SIZE / 1024);
444 	dprintf("final at 0x%x\n", &pgdir[va / B_PAGE_SIZE / 1024]);
445 	dprintf("value is 0x%x\n", *(int *)&pgdir[va / B_PAGE_SIZE / 1024]);
446 	dprintf("present bit is %d\n", pgdir[va / B_PAGE_SIZE / 1024].present);
447 	dprintf("addr is %d\n", pgdir[va / B_PAGE_SIZE / 1024].addr);
448 */
449 	page_directory_entry* pd = fArchData->pgdir_virt;
450 
451 	// check to see if a page table exists for this range
452 	uint32 index = VADDR_TO_PDENT(va);
453 	if ((pd[index] & X86_PDE_PRESENT) == 0) {
454 		addr_t pgtable;
455 		vm_page *page;
456 
457 		// we need to allocate a pgtable
458 		page = vm_page_allocate_page(reservation,
459 			PAGE_STATE_WIRED | VM_PAGE_ALLOC_CLEAR);
460 
461 		DEBUG_PAGE_ACCESS_END(page);
462 
463 		pgtable = page->physical_page_number * B_PAGE_SIZE;
464 
465 		TRACE("map_tmap: asked for free page for pgtable. 0x%lx\n", pgtable);
466 
467 		// put it in the pgdir
468 		x86_put_pgtable_in_pgdir(&pd[index], pgtable, attributes
469 			| ((attributes & B_USER_PROTECTION) != 0
470 					? B_WRITE_AREA : B_KERNEL_WRITE_AREA));
471 
472 		// update any other page directories, if it maps kernel space
473 		if (index >= FIRST_KERNEL_PGDIR_ENT
474 			&& index < (FIRST_KERNEL_PGDIR_ENT + NUM_KERNEL_PGDIR_ENTS))
475 			x86_update_all_pgdirs(index, pd[index]);
476 
477 		fMapCount++;
478 	}
479 
480 	// now, fill in the pentry
481 	struct thread* thread = thread_get_current_thread();
482 	ThreadCPUPinner pinner(thread);
483 
484 	page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
485 		pd[index] & X86_PDE_ADDRESS_MASK);
486 	index = VADDR_TO_PTENT(va);
487 
488 	ASSERT_PRINT((pt[index] & X86_PTE_PRESENT) == 0,
489 		"virtual address: %#" B_PRIxADDR ", existing pte: %#" B_PRIx32, va,
490 		pt[index]);
491 
492 	put_page_table_entry_in_pgtable(&pt[index], pa, attributes, memoryType,
493 		IS_KERNEL_MAP(map));
494 
495 	pinner.Unlock();
496 
497 	// Note: We don't need to invalidate the TLB for this address, as previously
498 	// the entry was not present and the TLB doesn't cache those entries.
499 
500 	fMapCount++;
501 
502 	return B_OK;
503 }
504 
505 
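/*!	Unmaps the pages in the range \a start to \a end, one page table at a
	time. Entries whose accessed flag was set are queued for TLB invalidation;
	the actual flush happens in Flush() (normally triggered via Unlock()).
*/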
506 status_t
507 X86VMTranslationMap::Unmap(addr_t start, addr_t end)
508 {
509 	page_directory_entry *pd = fArchData->pgdir_virt;
510 
511 	start = ROUNDDOWN(start, B_PAGE_SIZE);
512 	end = ROUNDUP(end, B_PAGE_SIZE);
513 
514 	TRACE("unmap_tmap: asked to free pages 0x%lx to 0x%lx\n", start, end);
515 
516 restart:
517 	if (start >= end)
518 		return B_OK;
519 
520 	int index = VADDR_TO_PDENT(start);
521 	if ((pd[index] & X86_PDE_PRESENT) == 0) {
522 		// no pagetable here, move the start up to access the next page table
523 		start = ROUNDUP(start + 1, B_PAGE_SIZE * 1024);
524 		if (start == 0)
525 			return B_OK;
526 		goto restart;
527 	}
528 
529 	struct thread* thread = thread_get_current_thread();
530 	ThreadCPUPinner pinner(thread);
531 
532 	page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
533 		pd[index]  & X86_PDE_ADDRESS_MASK);
534 
535 	for (index = VADDR_TO_PTENT(start); (index < 1024) && (start < end);
536 			index++, start += B_PAGE_SIZE) {
537 		if ((pt[index] & X86_PTE_PRESENT) == 0) {
538 			// page mapping not valid
539 			continue;
540 		}
541 
542 		TRACE("unmap_tmap: removing page 0x%lx\n", start);
543 
544 		page_table_entry oldEntry = clear_page_table_entry_flags(&pt[index],
545 			X86_PTE_PRESENT);
546 		fMapCount--;
547 
548 		if ((oldEntry & X86_PTE_ACCESSED) != 0) {
549 			// Note that we only need to invalidate the address if the
550 			// accessed flag was set, since only then could the entry have been
551 			// in any TLB.
552 			if (fArchData->num_invalidate_pages
553 					< PAGE_INVALIDATE_CACHE_SIZE) {
554 				fArchData->pages_to_invalidate[
555 					fArchData->num_invalidate_pages] = start;
556 			}
557 
558 			fArchData->num_invalidate_pages++;
559 		}
560 	}
561 
562 	pinner.Unlock();
563 
564 	goto restart;
565 }
566 
567 
568 /*!	Caller must have locked the cache of the page to be unmapped.
569 	This object shouldn't be locked.
570 */
571 status_t
572 X86VMTranslationMap::UnmapPage(VMArea* area, addr_t address,
573 	bool updatePageQueue)
574 {
575 	ASSERT(address % B_PAGE_SIZE == 0);
576 
577 	page_directory_entry* pd = fArchData->pgdir_virt;
578 
579 	TRACE("X86VMTranslationMap::UnmapPage(%#" B_PRIxADDR ")\n", address);
580 
581 	RecursiveLocker locker(fLock);
582 
583 	int index = VADDR_TO_PDENT(address);
584 	if ((pd[index] & X86_PDE_PRESENT) == 0)
585 		return B_ENTRY_NOT_FOUND;
586 
587 	ThreadCPUPinner pinner(thread_get_current_thread());
588 
589 	page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
590 		pd[index] & X86_PDE_ADDRESS_MASK);
591 
592 	index = VADDR_TO_PTENT(address);
593 	page_table_entry oldEntry = clear_page_table_entry(&pt[index]);
594 
595 	pinner.Unlock();
596 
597 	if ((oldEntry & X86_PTE_PRESENT) == 0) {
598 		// page mapping not valid
599 		return B_ENTRY_NOT_FOUND;
600 	}
601 
602 	fMapCount--;
603 
604 	if ((oldEntry & X86_PTE_ACCESSED) != 0) {
605 		// Note that we only need to invalidate the address if the
606 		// accessed flag was set, since only then could the entry have been
607 		// in any TLB.
608 		if (fArchData->num_invalidate_pages
609 				< PAGE_INVALIDATE_CACHE_SIZE) {
610 			fArchData->pages_to_invalidate[fArchData->num_invalidate_pages]
611 				= address;
612 		}
613 
614 		fArchData->num_invalidate_pages++;
615 
616 		Flush();
617 
618 		// NOTE: Between clearing the page table entry and Flush() other
619 		// processors (actually even this processor with another thread of the
620 		// same team) could still access the page in question via their cached
621 		// entry. We can obviously lose a modified flag in this case, with the
622 		// effect that the page looks unmodified (and might thus be recycled),
623 		// but is actually modified.
624 		// In most cases this is harmless, but for vm_remove_all_page_mappings()
625 		// this is actually a problem.
626 		// Interestingly FreeBSD seems to ignore this problem as well
627 		// (cf. pmap_remove_all()), unless I've missed something.
628 	}
629 
630 	if (area->cache_type == CACHE_TYPE_DEVICE)
631 		return B_OK;
632 
633 	// get the page
634 	vm_page* page = vm_lookup_page(
635 		(oldEntry & X86_PTE_ADDRESS_MASK) / B_PAGE_SIZE);
636 	ASSERT(page != NULL);
637 
638 	// transfer the accessed/dirty flags to the page
639 	if ((oldEntry & X86_PTE_ACCESSED) != 0)
640 		page->accessed = true;
641 	if ((oldEntry & X86_PTE_DIRTY) != 0)
642 		page->modified = true;
643 
644 	// remove the mapping object/decrement the wired_count of the page
645 	vm_page_mapping* mapping = NULL;
646 	if (area->wiring == B_NO_LOCK) {
647 		vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
648 		while ((mapping = iterator.Next()) != NULL) {
649 			if (mapping->area == area) {
650 				area->mappings.Remove(mapping);
651 				page->mappings.Remove(mapping);
652 				break;
653 			}
654 		}
655 
656 		ASSERT(mapping != NULL);
657 	} else
658 		page->wired_count--;
659 
660 	locker.Unlock();
661 
662 	if (page->wired_count == 0 && page->mappings.IsEmpty()) {
663 		atomic_add(&gMappedPagesCount, -1);
664 
665 		if (updatePageQueue) {
666 			if (page->Cache()->temporary)
667 				vm_page_set_state(page, PAGE_STATE_INACTIVE);
668 			else if (page->modified)
669 				vm_page_set_state(page, PAGE_STATE_MODIFIED);
670 			else
671 				vm_page_set_state(page, PAGE_STATE_CACHED);
672 		}
673 	}
674 
675 	if (mapping != NULL) {
676 		bool isKernelSpace = area->address_space == VMAddressSpace::Kernel();
677 		object_cache_free(gPageMappingsObjectCache, mapping,
678 			CACHE_DONT_WAIT_FOR_MEMORY
679 				| (isKernelSpace ? CACHE_DONT_LOCK_KERNEL_SPACE : 0));
680 	}
681 
682 	return B_OK;
683 }
684 
685 
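/*!	Unmaps all pages in the range \a base to \a base + \a size of \a area and
	does the same "high level" bookkeeping as UnmapPage(): accessed/dirty
	flags are transferred to the vm_page, the mapping objects are unlinked
	(and freed after the lock is released) and, if \a updatePageQueue is
	\c true, fully unmapped pages are moved to the matching page queue.
*/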
686 void
687 X86VMTranslationMap::UnmapPages(VMArea* area, addr_t base, size_t size,
688 	bool updatePageQueue)
689 {
690 	page_directory_entry* pd = fArchData->pgdir_virt;
691 
692 	addr_t start = base;
693 	addr_t end = base + size;
694 
695 	TRACE("X86VMTranslationMap::UnmapPages(%p, %#" B_PRIxADDR ", %#"
696 		B_PRIxADDR ")\n", area, start, end);
697 
698 	VMAreaMappings queue;
699 
700 	RecursiveLocker locker(fLock);
701 
702 	while (start < end) {
703 		int index = VADDR_TO_PDENT(start);
704 		if ((pd[index] & X86_PDE_PRESENT) == 0) {
705 			// no page table here, move the start up to access the next page
706 			// table
707 			start = ROUNDUP(start + 1, B_PAGE_SIZE * 1024);
708 			if (start == 0)
709 				break;
710 			continue;
711 		}
712 
713 		struct thread* thread = thread_get_current_thread();
714 		ThreadCPUPinner pinner(thread);
715 
716 		page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
717 			pd[index]  & X86_PDE_ADDRESS_MASK);
718 
719 		for (index = VADDR_TO_PTENT(start); (index < 1024) && (start < end);
720 				index++, start += B_PAGE_SIZE) {
721 			page_table_entry oldEntry = clear_page_table_entry(&pt[index]);
722 			if ((oldEntry & X86_PTE_PRESENT) == 0)
723 				continue;
724 
725 			fMapCount--;
726 
727 			if ((oldEntry & X86_PTE_ACCESSED) != 0) {
728 				// Note that we only need to invalidate the address if the
729 				// accessed flag was set, since only then could the entry have
730 				// been in any TLB.
731 				if (fArchData->num_invalidate_pages
732 						< PAGE_INVALIDATE_CACHE_SIZE) {
733 					fArchData->pages_to_invalidate[
734 						fArchData->num_invalidate_pages] = start;
735 				}
736 
737 				fArchData->num_invalidate_pages++;
738 			}
739 
740 			if (area->cache_type != CACHE_TYPE_DEVICE) {
741 				// get the page
742 				vm_page* page = vm_lookup_page(
743 					(oldEntry & X86_PTE_ADDRESS_MASK) / B_PAGE_SIZE);
744 				ASSERT(page != NULL);
745 
746 				DEBUG_PAGE_ACCESS_START(page);
747 
748 				// transfer the accessed/dirty flags to the page
749 				if ((oldEntry & X86_PTE_ACCESSED) != 0)
750 					page->accessed = true;
751 				if ((oldEntry & X86_PTE_DIRTY) != 0)
752 					page->modified = true;
753 
754 				// remove the mapping object/decrement the wired_count of the
755 				// page
756 				if (area->wiring == B_NO_LOCK) {
757 					vm_page_mapping* mapping = NULL;
758 					vm_page_mappings::Iterator iterator
759 						= page->mappings.GetIterator();
760 					while ((mapping = iterator.Next()) != NULL) {
761 						if (mapping->area == area)
762 							break;
763 					}
764 
765 					ASSERT(mapping != NULL);
766 
767 					area->mappings.Remove(mapping);
768 					page->mappings.Remove(mapping);
769 					queue.Add(mapping);
770 				} else
771 					page->wired_count--;
772 
773 				if (page->wired_count == 0 && page->mappings.IsEmpty()) {
774 					atomic_add(&gMappedPagesCount, -1);
775 
776 					if (updatePageQueue) {
777 						if (page->Cache()->temporary)
778 							vm_page_set_state(page, PAGE_STATE_INACTIVE);
779 						else if (page->modified)
780 							vm_page_set_state(page, PAGE_STATE_MODIFIED);
781 						else
782 							vm_page_set_state(page, PAGE_STATE_CACHED);
783 					}
784 				}
785 
786 				DEBUG_PAGE_ACCESS_END(page);
787 			}
788 		}
789 
790 		Flush();
791 			// flush explicitly, since we directly use the lock
792 
793 		pinner.Unlock();
794 	}
795 
796 	// TODO: As in UnmapPage() we can lose page dirty flags here. ATM that's
797 	// not really critical, since in all cases in which this method is used,
798 	// the unmapped range is unmapped for good (resized/cut) and the pages
799 	// will likely be freed.
800 
801 	locker.Unlock();
802 
803 	// free removed mappings
804 	bool isKernelSpace = area->address_space == VMAddressSpace::Kernel();
805 	uint32 freeFlags = CACHE_DONT_WAIT_FOR_MEMORY
806 		| (isKernelSpace ? CACHE_DONT_LOCK_KERNEL_SPACE : 0);
807 	while (vm_page_mapping* mapping = queue.RemoveHead())
808 		object_cache_free(gPageMappingsObjectCache, mapping, freeFlags);
809 }
810 
811 
812 void
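/*!	Unmaps the whole \a area. Device and wired areas are simply passed on to
	UnmapPages(); for everything else the area's mapping objects are iterated
	instead of the page tables. If \a deletingAddressSpace is \c true no TLB
	invalidations are queued, since the address space is going away anyway.
*/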
813 X86VMTranslationMap::UnmapArea(VMArea* area, bool deletingAddressSpace,
814 	bool ignoreTopCachePageFlags)
815 {
816 	if (area->cache_type == CACHE_TYPE_DEVICE || area->wiring != B_NO_LOCK) {
817 		X86VMTranslationMap::UnmapPages(area, area->Base(), area->Size(), true);
818 		return;
819 	}
820 
821 	bool unmapPages = !deletingAddressSpace || !ignoreTopCachePageFlags;
822 
823 	page_directory_entry* pd = fArchData->pgdir_virt;
824 
825 	RecursiveLocker locker(fLock);
826 
827 	VMAreaMappings mappings;
828 	mappings.MoveFrom(&area->mappings);
829 
830 	for (VMAreaMappings::Iterator it = mappings.GetIterator();
831 			vm_page_mapping* mapping = it.Next();) {
832 		vm_page* page = mapping->page;
833 		page->mappings.Remove(mapping);
834 
835 		VMCache* cache = page->Cache();
836 
837 		bool pageFullyUnmapped = false;
838 		if (page->wired_count == 0 && page->mappings.IsEmpty()) {
839 			atomic_add(&gMappedPagesCount, -1);
840 			pageFullyUnmapped = true;
841 		}
842 
843 		if (unmapPages || cache != area->cache) {
844 			addr_t address = area->Base()
845 				+ ((page->cache_offset * B_PAGE_SIZE) - area->cache_offset);
846 
847 			int index = VADDR_TO_PDENT(address);
848 			if ((pd[index] & X86_PDE_PRESENT) == 0) {
849 				panic("page %p has mapping for area %p (%#" B_PRIxADDR "), but "
850 					"has no page dir entry", page, area, address);
851 				continue;
852 			}
853 
854 			ThreadCPUPinner pinner(thread_get_current_thread());
855 
856 			page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
857 				pd[index] & X86_PDE_ADDRESS_MASK);
858 			page_table_entry oldEntry = clear_page_table_entry(
859 				&pt[VADDR_TO_PTENT(address)]);
860 
861 			pinner.Unlock();
862 
863 			if ((oldEntry & X86_PTE_PRESENT) == 0) {
864 				panic("page %p has mapping for area %p (%#" B_PRIxADDR "), but "
865 					"has no page table entry", page, area, address);
866 				continue;
867 			}
868 
869 			// transfer the accessed/dirty flags to the page and invalidate
870 			// the mapping, if necessary
871 			if ((oldEntry & X86_PTE_ACCESSED) != 0) {
872 				page->accessed = true;
873 
874 				if (!deletingAddressSpace) {
875 					if (fArchData->num_invalidate_pages
876 							< PAGE_INVALIDATE_CACHE_SIZE) {
877 						fArchData->pages_to_invalidate[
878 							fArchData->num_invalidate_pages] = address;
879 					}
880 
881 					fArchData->num_invalidate_pages++;
882 				}
883 			}
884 
885 			if ((oldEntry & X86_PTE_DIRTY) != 0)
886 				page->modified = true;
887 
888 			if (pageFullyUnmapped) {
889 				DEBUG_PAGE_ACCESS_START(page);
890 
891 				if (cache->temporary)
892 					vm_page_set_state(page, PAGE_STATE_INACTIVE);
893 				else if (page->modified)
894 					vm_page_set_state(page, PAGE_STATE_MODIFIED);
895 				else
896 					vm_page_set_state(page, PAGE_STATE_CACHED);
897 
898 				DEBUG_PAGE_ACCESS_END(page);
899 			}
900 		}
901 
902 		fMapCount--;
903 	}
904 
905 	Flush();
906 		// flush explicitly, since we directly use the lock
907 
908 	locker.Unlock();
909 
910 	bool isKernelSpace = area->address_space == VMAddressSpace::Kernel();
911 	uint32 freeFlags = CACHE_DONT_WAIT_FOR_MEMORY
912 		| (isKernelSpace ? CACHE_DONT_LOCK_KERNEL_SPACE : 0);
913 	while (vm_page_mapping* mapping = mappings.RemoveHead())
914 		object_cache_free(gPageMappingsObjectCache, mapping, freeFlags);
915 }
916 
917 
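/*!	Looks up the mapping of \a va and returns the physical address in
	\a _physical and the protection/state flags (B_*_AREA, PAGE_PRESENT,
	PAGE_ACCESSED, PAGE_MODIFIED) in \a _flags. Always returns B_OK; callers
	have to check PAGE_PRESENT in \a _flags to see whether a mapping exists.
*/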
918 status_t
919 X86VMTranslationMap::Query(addr_t va, addr_t *_physical, uint32 *_flags)
920 {
921 	// default the flags to not present
922 	*_flags = 0;
923 	*_physical = 0;
924 
925 	int index = VADDR_TO_PDENT(va);
926 	page_directory_entry *pd = fArchData->pgdir_virt;
927 	if ((pd[index] & X86_PDE_PRESENT) == 0) {
928 		// no pagetable here
929 		return B_OK;
930 	}
931 
932 	struct thread* thread = thread_get_current_thread();
933 	ThreadCPUPinner pinner(thread);
934 
935 	page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
936 		pd[index] & X86_PDE_ADDRESS_MASK);
937 	page_table_entry entry = pt[VADDR_TO_PTENT(va)];
938 
939 	*_physical = entry & X86_PDE_ADDRESS_MASK;
940 
941 	// read in the page state flags
942 	if ((entry & X86_PTE_USER) != 0) {
943 		*_flags |= ((entry & X86_PTE_WRITABLE) != 0 ? B_WRITE_AREA : 0)
944 			| B_READ_AREA;
945 	}
946 
947 	*_flags |= ((entry & X86_PTE_WRITABLE) != 0 ? B_KERNEL_WRITE_AREA : 0)
948 		| B_KERNEL_READ_AREA
949 		| ((entry & X86_PTE_DIRTY) != 0 ? PAGE_MODIFIED : 0)
950 		| ((entry & X86_PTE_ACCESSED) != 0 ? PAGE_ACCESSED : 0)
951 		| ((entry & X86_PTE_PRESENT) != 0 ? PAGE_PRESENT : 0);
952 
953 	pinner.Unlock();
954 
955 	TRACE("query_tmap: returning pa 0x%lx for va 0x%lx\n", *_physical, va);
956 
957 	return B_OK;
958 }
959 
960 
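/*!	Version of Query() that is safe to use while interrupts are disabled: the
	page table is accessed via the physical page mapper's
	InterruptGetPageTableAt() and no CPU pinning is done.
*/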
961 status_t
962 X86VMTranslationMap::QueryInterrupt(addr_t va, addr_t *_physical,
963 	uint32 *_flags)
964 {
965 	*_flags = 0;
966 	*_physical = 0;
967 
968 	int index = VADDR_TO_PDENT(va);
969 	page_directory_entry* pd = fArchData->pgdir_virt;
970 	if ((pd[index] & X86_PDE_PRESENT) == 0) {
971 		// no pagetable here
972 		return B_OK;
973 	}
974 
975 	// map page table entry
976 	page_table_entry* pt = sPhysicalPageMapper->InterruptGetPageTableAt(
977 		pd[index] & X86_PDE_ADDRESS_MASK);
978 	page_table_entry entry = pt[VADDR_TO_PTENT(va)];
979 
980 	*_physical = entry & X86_PDE_ADDRESS_MASK;
981 
982 	// read in the page state flags
983 	if ((entry & X86_PTE_USER) != 0) {
984 		*_flags |= ((entry & X86_PTE_WRITABLE) != 0 ? B_WRITE_AREA : 0)
985 			| B_READ_AREA;
986 	}
987 
988 	*_flags |= ((entry & X86_PTE_WRITABLE) != 0 ? B_KERNEL_WRITE_AREA : 0)
989 		| B_KERNEL_READ_AREA
990 		| ((entry & X86_PTE_DIRTY) != 0 ? PAGE_MODIFIED : 0)
991 		| ((entry & X86_PTE_ACCESSED) != 0 ? PAGE_ACCESSED : 0)
992 		| ((entry & X86_PTE_PRESENT) != 0 ? PAGE_PRESENT : 0);
993 
994 	return B_OK;
995 }
996 
997 
998 addr_t
999 X86VMTranslationMap::MappedSize() const
1000 {
1001 	return fMapCount;
1002 }
1003 
1004 
1005 status_t
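/*!	Changes protection and memory type of all mapped pages in the range
	\a start to \a end. The page table entries are updated with a
	compare-and-swap loop so that concurrently set accessed/dirty flags aren't
	lost; entries whose accessed flag was set are queued for TLB invalidation.
*/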
1006 X86VMTranslationMap::Protect(addr_t start, addr_t end, uint32 attributes,
1007 	uint32 memoryType)
1008 {
1009 	page_directory_entry *pd = fArchData->pgdir_virt;
1010 
1011 	start = ROUNDDOWN(start, B_PAGE_SIZE);
1012 
1013 	TRACE("protect_tmap: pages 0x%lx to 0x%lx, attributes %lx\n", start, end,
1014 		attributes);
1015 
1016 	// compute protection flags
1017 	uint32 newProtectionFlags = 0;
1018 	if ((attributes & B_USER_PROTECTION) != 0) {
1019 		newProtectionFlags = X86_PTE_USER;
1020 		if ((attributes & B_WRITE_AREA) != 0)
1021 			newProtectionFlags |= X86_PTE_WRITABLE;
1022 	} else if ((attributes & B_KERNEL_WRITE_AREA) != 0)
1023 		newProtectionFlags = X86_PTE_WRITABLE;
1024 
1025 restart:
1026 	if (start >= end)
1027 		return B_OK;
1028 
1029 	int index = VADDR_TO_PDENT(start);
1030 	if ((pd[index] & X86_PDE_PRESENT) == 0) {
1031 		// no pagetable here, move the start up to access the next page table
1032 		start = ROUNDUP(start + 1, B_PAGE_SIZE * 1024);
1033 		if (start == 0)
1034 			return B_OK;
1035 		goto restart;
1036 	}
1037 
1038 	struct thread* thread = thread_get_current_thread();
1039 	ThreadCPUPinner pinner(thread);
1040 
1041 	page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
1042 		pd[index] & X86_PDE_ADDRESS_MASK);
1043 
1044 	for (index = VADDR_TO_PTENT(start); index < 1024 && start < end;
1045 			index++, start += B_PAGE_SIZE) {
1046 		page_table_entry entry = pt[index];
1047 		if ((entry & X86_PTE_PRESENT) == 0) {
1048 			// page mapping not valid
1049 			continue;
1050 		}
1051 
1052 		TRACE("protect_tmap: protect page 0x%lx\n", start);
1053 
1054 		// set the new protection flags -- we want to do that atomically,
1055 		// without changing the accessed or dirty flag
1056 		page_table_entry oldEntry;
1057 		while (true) {
1058 			oldEntry = test_and_set_page_table_entry(&pt[index],
1059 				(entry & ~(X86_PTE_PROTECTION_MASK | X86_PTE_MEMORY_TYPE_MASK))
1060 					| newProtectionFlags | memory_type_to_pte_flags(memoryType),
1061 				entry);
1062 			if (oldEntry == entry)
1063 				break;
1064 			entry = oldEntry;
1065 		}
1066 
1067 		if ((oldEntry & X86_PTE_ACCESSED) != 0) {
1068 			// Note that we only need to invalidate the address if the
1069 			// accessed flag was set, since only then could the entry have been
1070 			// in any TLB.
1071 			if (fArchData->num_invalidate_pages
1072 					< PAGE_INVALIDATE_CACHE_SIZE) {
1073 				fArchData->pages_to_invalidate[
1074 					fArchData->num_invalidate_pages] = start;
1075 			}
1076 
1077 			fArchData->num_invalidate_pages++;
1078 		}
1079 	}
1080 
1081 	pinner.Unlock();
1082 
1083 	goto restart;
1084 }
1085 
1086 
1087 status_t
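/*!	Clears the accessed and/or modified flag (\a flags may contain
	PAGE_ACCESSED and PAGE_MODIFIED) of the entry mapping \a va and queues a
	TLB invalidation if one of the flags was actually set before.
*/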
1088 X86VMTranslationMap::ClearFlags(addr_t va, uint32 flags)
1089 {
1090 	int index = VADDR_TO_PDENT(va);
1091 	page_directory_entry* pd = fArchData->pgdir_virt;
1092 	if ((pd[index] & X86_PDE_PRESENT) == 0) {
1093 		// no pagetable here
1094 		return B_OK;
1095 	}
1096 
1097 	uint32 flagsToClear = ((flags & PAGE_MODIFIED) ? X86_PTE_DIRTY : 0)
1098 		| ((flags & PAGE_ACCESSED) ? X86_PTE_ACCESSED : 0);
1099 
1100 	struct thread* thread = thread_get_current_thread();
1101 	ThreadCPUPinner pinner(thread);
1102 
1103 	page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
1104 		pd[index] & X86_PDE_ADDRESS_MASK);
1105 	index = VADDR_TO_PTENT(va);
1106 
1107 	// clear out the flags we've been requested to clear
1108 	page_table_entry oldEntry
1109 		= clear_page_table_entry_flags(&pt[index], flagsToClear);
1110 
1111 	pinner.Unlock();
1112 
1113 	if ((oldEntry & flagsToClear) != 0) {
1114 		if (fArchData->num_invalidate_pages < PAGE_INVALIDATE_CACHE_SIZE) {
1115 			fArchData->pages_to_invalidate[
1116 				fArchData->num_invalidate_pages] = va;
1117 		}
1118 
1119 		fArchData->num_invalidate_pages++;
1120 	}
1121 
1122 	return B_OK;
1123 }
1124 
1125 
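/*!	Clears the accessed and modified flags of the entry mapping \a address in
	\a area, returning whether the page was accessed and reporting the
	modified state in \a _modified. If \a unmapIfUnaccessed is \c true and the
	accessed flag wasn't set, the page is unmapped instead (cf. the note below
	about being called by the page daemon).
*/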
1126 bool
1127 X86VMTranslationMap::ClearAccessedAndModified(VMArea* area, addr_t address,
1128 	bool unmapIfUnaccessed, bool& _modified)
1129 {
1130 	ASSERT(address % B_PAGE_SIZE == 0);
1131 
1132 	page_directory_entry* pd = fArchData->pgdir_virt;
1133 
1134 	TRACE("X86VMTranslationMap::ClearAccessedAndModified(%#" B_PRIxADDR ")\n",
1135 		address);
1136 
1137 	RecursiveLocker locker(fLock);
1138 
1139 	int index = VADDR_TO_PDENT(address);
1140 	if ((pd[index] & X86_PDE_PRESENT) == 0)
1141 		return false;
1142 
1143 	ThreadCPUPinner pinner(thread_get_current_thread());
1144 
1145 	page_table_entry* pt = fArchData->page_mapper->GetPageTableAt(
1146 		pd[index] & X86_PDE_ADDRESS_MASK);
1147 
1148 	index = VADDR_TO_PTENT(address);
1149 
1150 	// perform the deed
1151 	page_table_entry oldEntry;
1152 
1153 	if (unmapIfUnaccessed) {
1154 		while (true) {
1155 			oldEntry = pt[index];
1156 			if ((oldEntry & X86_PTE_PRESENT) == 0) {
1157 				// page mapping not valid
1158 				return false;
1159 			}
1160 
1161 			if (oldEntry & X86_PTE_ACCESSED) {
1162 				// page was accessed -- just clear the flags
1163 				oldEntry = clear_page_table_entry_flags(&pt[index],
1164 					X86_PTE_ACCESSED | X86_PTE_DIRTY);
1165 				break;
1166 			}
1167 
1168 			// page hasn't been accessed -- unmap it
1169 			if (test_and_set_page_table_entry(&pt[index], 0, oldEntry)
1170 					== oldEntry) {
1171 				break;
1172 			}
1173 
1174 			// something changed -- check again
1175 		}
1176 	} else {
1177 		oldEntry = clear_page_table_entry_flags(&pt[index],
1178 			X86_PTE_ACCESSED | X86_PTE_DIRTY);
1179 	}
1180 
1181 	pinner.Unlock();
1182 
1183 	_modified = (oldEntry & X86_PTE_DIRTY) != 0;
1184 
1185 	if ((oldEntry & X86_PTE_ACCESSED) != 0) {
1186 		// Note that we only need to invalidate the address if the
1187 		// accessed flag was set, since only then could the entry have been
1188 		// in any TLB.
1189 		if (fArchData->num_invalidate_pages
1190 				< PAGE_INVALIDATE_CACHE_SIZE) {
1191 			fArchData->pages_to_invalidate[fArchData->num_invalidate_pages]
1192 				= address;
1193 		}
1194 
1195 		fArchData->num_invalidate_pages++;
1196 
1197 		Flush();
1198 
1199 		return true;
1200 	}
1201 
1202 	if (!unmapIfUnaccessed)
1203 		return false;
1204 
1205 	// We have unmapped the address. Do the "high level" stuff.
1206 
1207 	fMapCount--;
1208 
1209 	if (area->cache_type == CACHE_TYPE_DEVICE)
1210 		return false;
1211 
1212 	// get the page
1213 	vm_page* page = vm_lookup_page(
1214 		(oldEntry & X86_PTE_ADDRESS_MASK) / B_PAGE_SIZE);
1215 	ASSERT(page != NULL);
1216 
1217 	// remove the mapping object/decrement the wired_count of the page
1218 	vm_page_mapping* mapping = NULL;
1219 	if (area->wiring == B_NO_LOCK) {
1220 		vm_page_mappings::Iterator iterator = page->mappings.GetIterator();
1221 		while ((mapping = iterator.Next()) != NULL) {
1222 			if (mapping->area == area) {
1223 				area->mappings.Remove(mapping);
1224 				page->mappings.Remove(mapping);
1225 				break;
1226 			}
1227 		}
1228 
1229 		ASSERT(mapping != NULL);
1230 	} else
1231 		page->wired_count--;
1232 
1233 	locker.Unlock();
1234 
1235 	if (page->wired_count == 0 && page->mappings.IsEmpty())
1236 		atomic_add(&gMappedPagesCount, -1);
1237 
1238 	if (mapping != NULL) {
1239 		object_cache_free(gPageMappingsObjectCache, mapping,
1240 			CACHE_DONT_WAIT_FOR_MEMORY | CACHE_DONT_LOCK_KERNEL_SPACE);
1241 			// Since this is called by the page daemon, we never want to lock
1242 			// the kernel address space.
1243 	}
1244 
1245 	return false;
1246 }
1247 
1248 
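/*!	Performs the queued TLB invalidations. If more than
	PAGE_INVALIDATE_CACHE_SIZE pages have accumulated, the whole TLB is
	flushed instead (globally for the kernel map, user entries only
	otherwise). Other CPUs this map is active on are notified via ICIs.
*/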
1249 void
1250 X86VMTranslationMap::Flush()
1251 {
1252 	if (fArchData->num_invalidate_pages <= 0)
1253 		return;
1254 
1255 	struct thread* thread = thread_get_current_thread();
1256 	thread_pin_to_current_cpu(thread);
1257 
1258 	if (fArchData->num_invalidate_pages > PAGE_INVALIDATE_CACHE_SIZE) {
1259 		// invalidate all pages
1260 		TRACE("flush_tmap: %d pages to invalidate, invalidate all\n",
1261 			fArchData->num_invalidate_pages);
1262 
1263 		if (IS_KERNEL_MAP(map)) {
1264 			arch_cpu_global_TLB_invalidate();
1265 			smp_send_broadcast_ici(SMP_MSG_GLOBAL_INVALIDATE_PAGES, 0, 0, 0,
1266 				NULL, SMP_MSG_FLAG_SYNC);
1267 		} else {
1268 			cpu_status state = disable_interrupts();
1269 			arch_cpu_user_TLB_invalidate();
1270 			restore_interrupts(state);
1271 
1272 			int cpu = smp_get_current_cpu();
1273 			uint32 cpuMask = fArchData->active_on_cpus
1274 				& ~((uint32)1 << cpu);
1275 			if (cpuMask != 0) {
1276 				smp_send_multicast_ici(cpuMask, SMP_MSG_USER_INVALIDATE_PAGES,
1277 					0, 0, 0, NULL, SMP_MSG_FLAG_SYNC);
1278 			}
1279 		}
1280 	} else {
1281 		TRACE("flush_tmap: %d pages to invalidate, invalidate list\n",
1282 			fArchData->num_invalidate_pages);
1283 
1284 		arch_cpu_invalidate_TLB_list(fArchData->pages_to_invalidate,
1285 			fArchData->num_invalidate_pages);
1286 
1287 		if (IS_KERNEL_MAP(map)) {
1288 			smp_send_broadcast_ici(SMP_MSG_INVALIDATE_PAGE_LIST,
1289 				(uint32)fArchData->pages_to_invalidate,
1290 				fArchData->num_invalidate_pages, 0, NULL,
1291 				SMP_MSG_FLAG_SYNC);
1292 		} else {
1293 			int cpu = smp_get_current_cpu();
1294 			uint32 cpuMask = fArchData->active_on_cpus
1295 				& ~((uint32)1 << cpu);
1296 			if (cpuMask != 0) {
1297 				smp_send_multicast_ici(cpuMask, SMP_MSG_INVALIDATE_PAGE_LIST,
1298 					(uint32)fArchData->pages_to_invalidate,
1299 					fArchData->num_invalidate_pages, 0, NULL,
1300 					SMP_MSG_FLAG_SYNC);
1301 			}
1302 		}
1303 	}
1304 	fArchData->num_invalidate_pages = 0;
1305 
1306 	thread_unpin_from_current_cpu(thread);
1307 }
1308 
1309 
1310 // #pragma mark - VM API
1311 
1312 
1313 status_t
1314 arch_vm_translation_map_create_map(bool kernel, VMTranslationMap** _map)
1315 {
1316 	X86VMTranslationMap* map = new(std::nothrow) X86VMTranslationMap;
1317 	if (map == NULL)
1318 		return B_NO_MEMORY;
1319 
1320 	status_t error = map->Init(kernel);
1321 	if (error != B_OK) {
1322 		delete map;
1323 		return error;
1324 	}
1325 
1326 	*_map = map;
1327 	return B_OK;
1328 }
1329 
1330 
1331 status_t
1332 arch_vm_translation_map_init(kernel_args *args,
1333 	VMPhysicalPageMapper** _physicalPageMapper)
1334 {
1335 	TRACE("vm_translation_map_init: entry\n");
1336 
1337 	// page hole set up in stage2
1338 	sPageHole = (page_table_entry *)args->arch_args.page_hole;
1339 	// calculate where the pgdir would be
1340 	sPageHolePageDir = (page_directory_entry*)
1341 		(((addr_t)args->arch_args.page_hole)
1342 			+ (B_PAGE_SIZE * 1024 - B_PAGE_SIZE));
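	// (With the recursive pgdir entry set up by the boot loader, the page
	// directory itself appears as the last 4 KB page of the 4 MB page hole,
	// hence page_hole + 4 MB - 4 KB.)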
1343 	// clear out the bottom 2 GB, unmap everything
1344 	memset(sPageHolePageDir + FIRST_USER_PGDIR_ENT, 0,
1345 		sizeof(page_directory_entry) * NUM_USER_PGDIR_ENTS);
1346 
1347 	sKernelPhysicalPageDirectory = (page_directory_entry*)
1348 		args->arch_args.phys_pgdir;
1349 	sKernelVirtualPageDirectory = (page_directory_entry*)
1350 		args->arch_args.vir_pgdir;
1351 
1352 #ifdef TRACE_VM_TMAP
1353 	TRACE("page hole: %p, page dir: %p\n", sPageHole, sPageHolePageDir);
1354 	TRACE("page dir: %p (physical: %p)\n", sKernelVirtualPageDirectory,
1355 		sKernelPhysicalPageDirectory);
1356 
1357 	TRACE("physical memory ranges:\n");
1358 	for (uint32 i = 0; i < args->num_physical_memory_ranges; i++) {
1359 		addr_t start = args->physical_memory_range[i].start;
1360 		addr_t end = start + args->physical_memory_range[i].size;
1361 		TRACE("  %#10" B_PRIxADDR " - %#10" B_PRIxADDR "\n", start, end);
1362 	}
1363 
1364 	TRACE("allocated physical ranges:\n");
1365 	for (uint32 i = 0; i < args->num_physical_allocated_ranges; i++) {
1366 		addr_t start = args->physical_allocated_range[i].start;
1367 		addr_t end = start + args->physical_allocated_range[i].size;
1368 		TRACE("  %#10" B_PRIxADDR " - %#10" B_PRIxADDR "\n", start, end);
1369 	}
1370 
1371 	TRACE("allocated virtual ranges:\n");
1372 	for (uint32 i = 0; i < args->num_virtual_allocated_ranges; i++) {
1373 		addr_t start = args->virtual_allocated_range[i].start;
1374 		addr_t end = start + args->virtual_allocated_range[i].size;
1375 		TRACE("  %#10" B_PRIxADDR " - %#10" B_PRIxADDR "\n", start, end);
1376 	}
1377 #endif
1378 
1379 	B_INITIALIZE_SPINLOCK(&sTMapListLock);
1380 	new (&sTMapList) ArchTMapList;
1381 
1382 	large_memory_physical_page_ops_init(args, sPhysicalPageMapper,
1383 		sKernelPhysicalPageMapper);
1384 		// TODO: Select the best page mapper!
1385 
1386 	// enable global page feature if available
1387 	if (x86_check_feature(IA32_FEATURE_PGE, FEATURE_COMMON)) {
1388 		// this prevents kernel pages from being flushed from TLB on
1389 		// context-switch
1390 		x86_write_cr4(x86_read_cr4() | IA32_CR4_GLOBAL_PAGES);
1391 	}
1392 
1393 	TRACE("vm_translation_map_init: done\n");
1394 
1395 	*_physicalPageMapper = sPhysicalPageMapper;
1396 	return B_OK;
1397 }
1398 
1399 
1400 status_t
1401 arch_vm_translation_map_init_post_sem(kernel_args *args)
1402 {
1403 	return B_OK;
1404 }
1405 
1406 
1407 status_t
1408 arch_vm_translation_map_init_post_area(kernel_args *args)
1409 {
1410 	// now that the vm is initialized, create a region that represents
1411 	// the page hole
1412 	void *temp;
1413 	status_t error;
1414 	area_id area;
1415 
1416 	TRACE("vm_translation_map_init_post_area: entry\n");
1417 
1418 	// unmap the page hole hack we were using before
1419 	sKernelVirtualPageDirectory[1023] = 0;
1420 	sPageHolePageDir = NULL;
1421 	sPageHole = NULL;
1422 
1423 	temp = (void *)sKernelVirtualPageDirectory;
1424 	area = create_area("kernel_pgdir", &temp, B_EXACT_ADDRESS, B_PAGE_SIZE,
1425 		B_ALREADY_WIRED, B_KERNEL_READ_AREA | B_KERNEL_WRITE_AREA);
1426 	if (area < B_OK)
1427 		return area;
1428 
1429 	error = sPhysicalPageMapper->InitPostArea(args);
1430 	if (error != B_OK)
1431 		return error;
1432 
1433 	TRACE("vm_translation_map_init_post_area: done\n");
1434 	return B_OK;
1435 }
1436 
1437 
1438 // XXX: Horrible back door to map a page quickly, regardless of translation
1439 // map object, etc. Used only during VM setup.
1440 // It uses a 'page hole' set up in the stage 2 bootloader: the page hole is
1441 // created by pointing one of the pgdir entries back at itself, effectively
1442 // mapping the contents of all 4 MB of page tables into a 4 MB virtual
1443 // region. It's only used here, and is unmapped again later (in
1444 // arch_vm_translation_map_init_post_area()).
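// With that recursive mapping in place the page table entry for a virtual
// address va is simply sPageHole[va / B_PAGE_SIZE], which is what the code
// below relies on.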
1445 
1446 status_t
1447 arch_vm_translation_map_early_map(kernel_args *args, addr_t va, addr_t pa,
1448 	uint8 attributes, addr_t (*get_free_page)(kernel_args *))
1449 {
1450 	int index;
1451 
1452 	TRACE("early_tmap: entry pa 0x%lx va 0x%lx\n", pa, va);
1453 
1454 	// check to see if a page table exists for this range
1455 	index = VADDR_TO_PDENT(va);
1456 	if ((sPageHolePageDir[index] & X86_PDE_PRESENT) == 0) {
1457 		addr_t pgtable;
1458 		page_directory_entry *e;
1459 		// we need to allocate a pgtable
1460 		pgtable = get_free_page(args);
1461 		// pgtable is in pages, convert to physical address
1462 		pgtable *= B_PAGE_SIZE;
1463 
1464 		TRACE("early_map: asked for free page for pgtable. 0x%lx\n", pgtable);
1465 
1466 		// put it in the pgdir
1467 		e = &sPageHolePageDir[index];
1468 		x86_put_pgtable_in_pgdir(e, pgtable, attributes);
1469 
1470 		// zero it out in its new mapping
1471 		memset((unsigned int*)((addr_t)sPageHole
1472 			+ (va / B_PAGE_SIZE / 1024) * B_PAGE_SIZE), 0, B_PAGE_SIZE);
1473 	}
1474 
1475 	ASSERT_PRINT((sPageHole[va / B_PAGE_SIZE] & X86_PTE_PRESENT) == 0,
1476 		"virtual address: %#" B_PRIxADDR ", pde: %#" B_PRIx32
1477 		", existing pte: %#" B_PRIx32, va, sPageHolePageDir[index],
1478 		sPageHole[va / B_PAGE_SIZE]);
1479 
1480 	// now, fill in the pentry
1481 	put_page_table_entry_in_pgtable(sPageHole + va / B_PAGE_SIZE, pa,
1482 		attributes, 0, IS_KERNEL_ADDRESS(va));
1483 
1484 	return B_OK;
1485 }
1486 
1487 
1488 /*!	Verifies that the page at the given virtual address can be accessed in the
1489 	current context.
1490 
1491 	This function is invoked in the kernel debugger. Paranoid checking is in
1492 	order.
1493 
1494 	\param virtualAddress The virtual address to be checked.
1495 	\param protection The area protection for which to check. Valid is a bitwise
1496 		or of one or more of \c B_KERNEL_READ_AREA or \c B_KERNEL_WRITE_AREA.
1497 	\return \c true, if the address can be accessed in all ways specified by
1498 		\a protection, \c false otherwise.
1499 */
1500 bool
1501 arch_vm_translation_map_is_kernel_page_accessible(addr_t virtualAddress,
1502 	uint32 protection)
1503 {
1504 	// We only trust the kernel team's page directory. So switch to it first.
1505 	// Always set it to make sure the TLBs don't contain obsolete data.
1506 	addr_t physicalPageDirectory;
1507 	read_cr3(physicalPageDirectory);
1508 	write_cr3(sKernelPhysicalPageDirectory);
1509 
1510 	// get the page directory entry for the address
1511 	page_directory_entry pageDirectoryEntry;
1512 	uint32 index = VADDR_TO_PDENT(virtualAddress);
1513 
1514 	if (physicalPageDirectory == (addr_t)sKernelPhysicalPageDirectory) {
1515 		pageDirectoryEntry = sKernelVirtualPageDirectory[index];
1516 	} else if (sPhysicalPageMapper != NULL) {
1517 		// map the original page directory and get the entry
1518 		void* handle;
1519 		addr_t virtualPageDirectory;
1520 		status_t error = sPhysicalPageMapper->GetPageDebug(
1521 			physicalPageDirectory, &virtualPageDirectory, &handle);
1522 		if (error == B_OK) {
1523 			pageDirectoryEntry
1524 				= ((page_directory_entry*)virtualPageDirectory)[index];
1525 			sPhysicalPageMapper->PutPageDebug(virtualPageDirectory,
1526 				handle);
1527 		} else
1528 			pageDirectoryEntry = 0;
1529 	} else
1530 		pageDirectoryEntry = 0;
1531 
1532 	// map the page table and get the entry
1533 	page_table_entry pageTableEntry;
1534 	index = VADDR_TO_PTENT(virtualAddress);
1535 
1536 	if ((pageDirectoryEntry & X86_PDE_PRESENT) != 0
1537 			&& sPhysicalPageMapper != NULL) {
1538 		void* handle;
1539 		addr_t virtualPageTable;
1540 		status_t error = sPhysicalPageMapper->GetPageDebug(
1541 			pageDirectoryEntry & X86_PDE_ADDRESS_MASK, &virtualPageTable,
1542 			&handle);
1543 		if (error == B_OK) {
1544 			pageTableEntry = ((page_table_entry*)virtualPageTable)[index];
1545 			sPhysicalPageMapper->PutPageDebug(virtualPageTable, handle);
1546 		} else
1547 			pageTableEntry = 0;
1548 	} else
1549 		pageTableEntry = 0;
1550 
1551 	// switch back to the original page directory
1552 	if (physicalPageDirectory != (addr_t)sKernelPhysicalPageDirectory)
1553 		write_cr3(physicalPageDirectory);
1554 
1555 	if ((pageTableEntry & X86_PTE_PRESENT) == 0)
1556 		return false;
1557 
1558 	// present means kernel-readable, so check for writable
1559 	return (protection & B_KERNEL_WRITE_AREA) == 0
1560 		|| (pageTableEntry & X86_PTE_WRITABLE) != 0;
1561 }
1562