diff --git a/arch/Kconfig b/arch/Kconfig
index ed7278b1167..5c6aee3ea4c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -61,6 +61,7 @@ config X86
 	select ARCH_HAS_GDBSTUB if !X86_64
 	select ARCH_HAS_TIMING_FUNCTIONS
 	select ARCH_HAS_THREAD_LOCAL_STORAGE
+	select ARCH_HAS_DEMAND_PAGING
 	help
 	  x86 architecture
diff --git a/arch/x86/core/fatal.c b/arch/x86/core/fatal.c
index fd37522381a..b8778c6122a 100644
--- a/arch/x86/core/fatal.c
+++ b/arch/x86/core/fatal.c
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include

 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);

 #if defined(CONFIG_BOARD_QEMU_X86) || defined(CONFIG_BOARD_QEMU_X86_64)
@@ -359,6 +360,44 @@ static const struct z_exc_handle exceptions[] = {
 void z_x86_page_fault_handler(z_arch_esf_t *esf)
 {
+#ifdef CONFIG_DEMAND_PAGING
+	if ((esf->errorCode & PF_P) == 0) {
+		/* Page was non-present at time exception happened.
+		 * Get faulting virtual address from CR2 register
+		 */
+		void *virt = z_x86_cr2_get();
+		bool was_valid_access;
+
+#ifdef CONFIG_X86_KPTI
+		/* Protection ring is lowest 2 bits in interrupted CS */
+		bool was_user = ((esf->cs & 0x3) != 0U);
+
+		/* Need to check if the interrupted context was a user thread
+		 * that hit a non-present page that was flipped due to KPTI in
+		 * the thread's page tables, in which case this is an access
+		 * violation and we should treat this as an error.
+		 *
+		 * We're probably not locked, but if there is a race, we will
+		 * be fine, the kernel page fault code will later detect that
+		 * the page is present in the kernel's page tables and the
+		 * instruction will just be re-tried, producing another fault.
+		 */
+		if (was_user &&
+		    !z_x86_kpti_is_access_ok(virt, get_ptables(esf))) {
+			was_valid_access = false;
+		} else
+#endif /* CONFIG_X86_KPTI */
+		{
+			was_valid_access = z_page_fault(virt);
+		}
+		if (was_valid_access) {
+			/* Page fault handled, re-try */
+			return;
+		}
+	}
+#endif /* CONFIG_DEMAND_PAGING */
+
 #if !defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_COREDUMP)
 	z_x86_exception_vector = IV_PAGE_FAULT;
 #endif
diff --git a/arch/x86/core/x86_mmu.c b/arch/x86/core/x86_mmu.c
index 4d1de69cf52..e99d4dec3f0 100644
--- a/arch/x86/core/x86_mmu.c
+++ b/arch/x86/core/x86_mmu.c
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include

 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
@@ -1720,3 +1721,153 @@ void arch_reserved_pages_update(void)
 	}
 }
 #endif /* CONFIG_ARCH_HAS_RESERVED_PAGE_FRAMES */
+
+#ifdef CONFIG_DEMAND_PAGING
+#define PTE_MASK (paging_levels[PTE_LEVEL].mask)
+
+void arch_mem_page_out(void *addr, uintptr_t location)
+{
+	pentry_t mask = PTE_MASK | MMU_P | MMU_A;
+
+	/* Accessed bit set to guarantee the entry is not completely 0 in
+	 * case of location value 0. A totally 0 PTE is un-mapped.
+	 */
+	range_map(addr, location, CONFIG_MMU_PAGE_SIZE, MMU_A, mask,
+		  OPTION_FLUSH);
+}
+
+void arch_mem_page_in(void *addr, uintptr_t phys)
+{
+	pentry_t mask = PTE_MASK | MMU_P | MMU_D | MMU_A;
+
+	range_map(addr, phys, CONFIG_MMU_PAGE_SIZE, MMU_P, mask,
+		  OPTION_FLUSH);
+}
+
+void arch_mem_scratch(uintptr_t phys)
+{
+	page_map_set(z_x86_page_tables_get(), Z_SCRATCH_PAGE,
+		     phys | MMU_P | MMU_RW | MMU_XD, NULL, MASK_ALL,
+		     OPTION_FLUSH);
+}
+
+uintptr_t arch_page_info_get(void *addr, uintptr_t *phys, bool clear_accessed)
+{
+	pentry_t all_pte, mask;
+	uint32_t options;
+
+	/* What to change, if anything, in the page_map_set() calls */
+	if (clear_accessed) {
+		mask = MMU_A;
+		options = OPTION_FLUSH;
+	} else {
+		/* In this configuration page_map_set() just queries the
+		 * page table and makes no changes
+		 */
+		mask = 0;
+		options = 0;
+	}
+
+	page_map_set(z_x86_kernel_ptables, addr, 0, &all_pte, mask, options);
+
+	/* Un-mapped PTEs are completely zeroed. No need to report anything
+	 * else in this case.
+	 */
+	if (all_pte == 0) {
+		return ARCH_DATA_PAGE_NOT_MAPPED;
+	}
+
+#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
+	/* Don't bother looking at other page tables if non-present as we
+	 * are not required to report accurate accessed/dirty in this case
+	 * and all mappings are otherwise the same.
+	 */
+	if ((all_pte & MMU_P) != 0) {
+		sys_snode_t *node;
+
+		/* IRQs are locked, safe to do this */
+		SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
+			pentry_t cur_pte;
+			struct arch_mem_domain *domain =
+				CONTAINER_OF(node, struct arch_mem_domain,
+					     node);
+
+			page_map_set(domain->ptables, addr, 0, &cur_pte,
+				     mask, options | OPTION_USER);
+
+			/* Logical OR of relevant PTE in all page tables.
+			 * addr/location and present state should be identical
+			 * among them.
+			 */
+			all_pte |= cur_pte;
+		}
+	}
+#endif /* USERSPACE && ~X86_COMMON_PAGE_TABLE */
+
+	/* NOTE: We are truncating the PTE on PAE systems, whose pentry_t
+	 * are larger than a uintptr_t.
+	 *
+	 * We currently aren't required to report back XD state (bit 63), and
+	 * Zephyr just doesn't support large physical memory on 32-bit
+	 * systems, PAE was only implemented for XD support.
+	 */
+	if (phys != NULL) {
+		*phys = (uintptr_t)get_entry_phys(all_pte, PTE_LEVEL);
+	}
+
+	/* We don't filter out any other bits in the PTE and the kernel
+	 * ignores them. For the case of ARCH_DATA_PAGE_NOT_MAPPED,
+	 * we use a bit which is never set in a real PTE (the PAT bit) in the
+	 * current system.
+	 *
+	 * The other ARCH_DATA_PAGE_* macros are defined to their corresponding
+	 * bits in the PTE.
+	 */
+	return (uintptr_t)all_pte;
+}
+
+enum arch_page_location arch_page_location_get(void *addr, uintptr_t *location)
+{
+	pentry_t pte;
+	int level;
+
+	/* TODO: since we only have to query the current set of page tables,
+	 * could optimize this with recursive page table mapping
+	 */
+	pentry_get(&level, &pte, z_x86_page_tables_get(), addr);
+
+	if (pte == 0) {
+		/* Not mapped */
+		return ARCH_PAGE_LOCATION_BAD;
+	}
+
+	__ASSERT(level == PTE_LEVEL, "bigpage found at %p", addr);
+	*location = (uintptr_t)get_entry_phys(pte, PTE_LEVEL);
+
+	if ((pte & MMU_P) != 0) {
+		return ARCH_PAGE_LOCATION_PAGED_IN;
+	} else {
+		return ARCH_PAGE_LOCATION_PAGED_OUT;
+	}
+}
+
+#ifdef CONFIG_X86_KPTI
+bool z_x86_kpti_is_access_ok(void *addr, pentry_t *ptables)
+{
+	pentry_t pte;
+	int level;
+
+	pentry_get(&level, &pte, ptables, addr);
+
+	/* Might as well also check if it's un-mapped, normally we don't
+	 * fetch the PTE from the page tables until we are inside
+	 * z_page_fault() and call arch_page_fault_status_get()
+	 */
+	if (level != PTE_LEVEL || pte == 0 || is_flipped_pte(pte)) {
+		return false;
+	}
+
+	return true;
+}
+#endif /* CONFIG_X86_KPTI */
+#endif /* CONFIG_DEMAND_PAGING */
diff --git a/arch/x86/include/x86_mmu.h b/arch/x86/include/x86_mmu.h
index 52084858ec8..02835aa6949 100644
--- a/arch/x86/include/x86_mmu.h
+++ b/arch/x86/include/x86_mmu.h
@@ -169,6 +169,13 @@ void z_x86_set_stack_guard(k_thread_stack_t *stack);
  * IDT, etc)
  */
 extern uint8_t z_shared_kernel_page_start;
+
+#ifdef CONFIG_DEMAND_PAGING
+/* Called from page fault handler. ptables here is the page tables for the
+ * faulting user thread and not the current set of page tables
+ */
+extern bool z_x86_kpti_is_access_ok(void *virt, pentry_t *ptables);
+#endif /* CONFIG_DEMAND_PAGING */
 #endif /* CONFIG_X86_KPTI */
 #endif /* CONFIG_USERSPACE */
diff --git a/include/arch/x86/mmustructs.h b/include/arch/x86/mmustructs.h
index 36334e394ed..511d6f61e34 100644
--- a/include/arch/x86/mmustructs.h
+++ b/include/arch/x86/mmustructs.h
@@ -24,6 +24,14 @@
 #define Z_X86_MMU_XD 0
 #endif
+
+/* For these we'll just use the same bits in the PTE */
+#define ARCH_DATA_PAGE_DIRTY		((uintptr_t)BIT(6))
+#define ARCH_DATA_PAGE_LOADED		((uintptr_t)BIT(0))
+#define ARCH_DATA_PAGE_ACCESSED		((uintptr_t)BIT(5))
+
+/* Use a PAT bit for this one since it's never set in a mapped PTE */
+#define ARCH_DATA_PAGE_NOT_MAPPED	((uintptr_t)BIT(7))
+
 /* Always true with 32-bit page tables, don't enable
  * CONFIG_EXECUTE_XOR_WRITE and expect it to work for you
  */
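For reviewers, a minimal sketch of how a caller on the kernel side might consume arch_page_info_get() and the ARCH_DATA_PAGE_* bits defined above, e.g. when scanning for an eviction candidate. This is illustrative only and not part of the patch: the enum, the helper name page_state_get(), and the header choices are assumptions; only the arch_page_info_get() interface and the ARCH_DATA_PAGE_* flags come from the change itself.

/* Illustrative sketch, not part of this patch. The enum and helper name
 * are hypothetical; the include paths are a guess at the usual headers.
 */
#include <zephyr.h>
#include <sys/arch_interface.h>		/* arch_page_info_get() */
#include <arch/x86/mmustructs.h>	/* ARCH_DATA_PAGE_* bits */

enum page_state {
	PAGE_STATE_UNMAPPED,	/* no mapping at this address */
	PAGE_STATE_PAGED_OUT,	/* mapped, data lives in the backing store */
	PAGE_STATE_CLEAN,	/* resident, not written since last page-in */
	PAGE_STATE_DIRTY	/* resident and modified */
};

/* Classify a virtual address, clearing the accessed bit as a side effect
 * so a later pass can tell whether the page was touched in the meantime
 * (the usual clock/second-chance trick).
 */
static enum page_state page_state_get(void *addr, uintptr_t *phys)
{
	uintptr_t flags = arch_page_info_get(addr, phys, true);

	if ((flags & ARCH_DATA_PAGE_NOT_MAPPED) != 0U) {
		return PAGE_STATE_UNMAPPED;
	}
	if ((flags & ARCH_DATA_PAGE_LOADED) == 0U) {
		/* Present bit clear: the PTE holds the backing store location */
		return PAGE_STATE_PAGED_OUT;
	}
	return ((flags & ARCH_DATA_PAGE_DIRTY) != 0U) ?
		PAGE_STATE_DIRTY : PAGE_STATE_CLEAN;
}

Since the x86 implementation returns the raw PTE, these checks map directly onto the P/A/D hardware bits that the ARCH_DATA_PAGE_* macros above are defined to.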