x86: paging code rewrite

The x86 paging code has been rewritten to support another paging mode
and non-identity virtual mappings.

 - Paging code now uses an array of paging level characteristics and
   walks tables using for loops, instead of having a different function
   for every paging level and lots of #ifdefs. The code is now more
   concise, and adding new paging modes should be trivial (see the
   sketch after this list).

 - We now support 32-bit, PAE, and IA-32e page tables.

 - The page tables created by gen_mmu.py are now installed at early
   boot. There are no longer separate "flat" page tables. These tables
   are mutable at any time.

 - The x86_mmu code now has a private header. Many definitions that did
   not need to be in public scope have been moved out of mmustructs.h
   and either placed in the C file or in the private header.

 - Improved page table dumping: the physical mapping and flags are now
   all shown.

 - Implemented arch_mem_map().

 - Ported the x86 userspace/memory domain code to the new
   infrastructure.

 - Added logic for the physical -> virtual instruction pointer
   transition, including cleaning up the identity mappings after this
   takes place.
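
As a rough sketch of the level-array walk described in the first bullet
(the struct, the levels[] table, and the walk() helper below are
illustrative inventions, not the actual x86_mmu.c code):

#include <stdint.h>
#include <stddef.h>

#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

/* Page table entry type; 64 bits wide for PAE and IA-32e */
typedef uint64_t pentry_t;

/* Characteristics of one paging level */
struct paging_level {
        unsigned int shift;   /* bit position of this level's index field */
        unsigned int entries; /* number of entries per table at this level */
};

/* Example: IA-32e four-level layout (PML4, PDPT, PD, PT) */
static const struct paging_level levels[] = {
        { .shift = 39U, .entries = 512U },
        { .shift = 30U, .entries = 512U },
        { .shift = 21U, .entries = 512U },
        { .shift = 12U, .entries = 512U },
};

/* Walk all levels with one loop instead of one function per level.
 * Assumes every level is present and that the table addresses stored
 * in non-leaf entries can be dereferenced directly.
 */
static pentry_t *walk(pentry_t *toplevel, uintptr_t virt)
{
        pentry_t *table = toplevel;

        for (size_t i = 0U; i < ARRAY_SIZE(levels) - 1U; i++) {
                size_t idx = (virt >> levels[i].shift) &
                             (levels[i].entries - 1U);

                /* Follow the address stored in this non-leaf entry */
                table = (pentry_t *)(uintptr_t)(table[idx] & ~0xFFFULL);
        }

        return &table[(virt >> levels[ARRAY_SIZE(levels) - 1U].shift) &
                      (levels[ARRAY_SIZE(levels) - 1U].entries - 1U)];
}

Supporting a different paging mode then mostly amounts to supplying a
different levels[] array (two levels for legacy 32-bit, three for PAE,
four for IA-32e) rather than writing new per-level walk functions.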

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
Andrew Boie authored on 2020-07-04 16:23:32 -07:00; committed by Anas Nashif
commit 38e17b68e3
26 changed files with 1574 additions and 1751 deletions

View file

@ -46,6 +46,7 @@ config X86_64
select USE_SWITCH
select USE_SWITCH_SUPPORTED
select SCHED_IPI_SUPPORTED
select X86_MMU
config X86_KERNEL_OFFSET
int "Kernel offset from beginning of RAM"

View file

@ -76,6 +76,15 @@ config X86_USERSPACE
supporting user-level threads that are protected from each other and
from crashing the kernel.
config X86_PAE
bool "Use PAE page tables"
default y
depends on X86_MMU
help
If enabled, use PAE-style page tables instead of 32-bit page tables.
The advantage is support for the Execute Disable bit, at a cost of
more memory for paging structures.
menu "Architecture Floating Point Options"
depends on CPU_HAS_FPU

View file

@ -9,6 +9,7 @@
#include <kernel_internal.h>
#include <exc_handle.h>
#include <logging/log.h>
#include <x86_mmu.h>
LOG_MODULE_DECLARE(os);
#if defined(CONFIG_BOARD_QEMU_X86) || defined(CONFIG_BOARD_QEMU_X86_64)
@ -71,18 +72,24 @@ bool z_x86_check_stack_bounds(uintptr_t addr, size_t size, uint16_t cs)
start = (uintptr_t)Z_KERNEL_STACK_BUFFER(
z_interrupt_stacks[cpu_id]);
end = start + CONFIG_ISR_STACK_SIZE;
} else if ((cs & 0x3U) != 0U ||
(_current->base.user_options & K_USER) == 0) {
/* Thread was in user mode, or is not a user mode thread.
* The normal stack buffer is what we will check.
#ifdef CONFIG_USERSPACE
} else if ((cs & 0x3U) == 0 &&
(_current->base.user_options & K_USER) != 0) {
/* The low two bits of the CS register are the privilege
* level. It will be 0 in supervisor mode and 3 in user mode
* corresponding to ring 0 / ring 3.
*
* If we get here, we must have been doing a syscall, check
* privilege elevation stack bounds
*/
start = _current->stack_info.start - CONFIG_MMU_PAGE_SIZE;
end = _current->stack_info.start;
#endif /* CONFIG_USERSPACE */
} else {
/* Normal thread operation, check its stack buffer */
start = _current->stack_info.start;
end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
_current->stack_info.size);
} else {
/* User thread was doing a syscall, check kernel stack bounds */
start = _current->stack_info.start - MMU_PAGE_SIZE;
end = _current->stack_info.start;
_current->stack_info.size);
}
return (addr <= start) || (addr + size > end);
@ -146,19 +153,27 @@ static void unwind_stack(uintptr_t base_ptr, uint16_t cs)
}
#endif /* CONFIG_X86_EXCEPTION_STACK_TRACE */
static inline struct x86_page_tables *get_ptables(const z_arch_esf_t *esf)
static inline uintptr_t get_cr3(const z_arch_esf_t *esf)
{
#if defined(CONFIG_USERSPACE) && defined(CONFIG_X86_KPTI)
/* If the interrupted thread was in user mode, we did a page table
* switch when we took the exception via z_x86_trampoline_to_kernel
*/
if ((esf->cs & 0x3) != 0) {
return z_x86_thread_page_tables_get(_current);
return _current->arch.ptables;
}
#else
ARG_UNUSED(esf);
#endif
return z_x86_page_tables_get();
/* Return the current CR3 value, it didn't change when we took
* the exception
*/
return z_x86_cr3_get();
}
static inline pentry_t *get_ptables(const z_arch_esf_t *esf)
{
return z_mem_virt_addr(get_cr3(esf));
}
#ifdef CONFIG_X86_64
@ -172,8 +187,8 @@ static void dump_regs(const z_arch_esf_t *esf)
esf->r8, esf->r9, esf->r10, esf->r11);
LOG_ERR("R12: 0x%016lx R13: 0x%016lx R14: 0x%016lx R15: 0x%016lx",
esf->r12, esf->r13, esf->r14, esf->r15);
LOG_ERR("RSP: 0x%016lx RFLAGS: 0x%016lx CS: 0x%04lx CR3: %p", esf->rsp,
esf->rflags, esf->cs & 0xFFFFU, get_ptables(esf));
LOG_ERR("RSP: 0x%016lx RFLAGS: 0x%016lx CS: 0x%04lx CR3: 0x%016lx",
esf->rsp, esf->rflags, esf->cs & 0xFFFFU, get_cr3(esf));
#ifdef CONFIG_X86_EXCEPTION_STACK_TRACE
LOG_ERR("call trace:");
@ -190,8 +205,8 @@ static void dump_regs(const z_arch_esf_t *esf)
esf->eax, esf->ebx, esf->ecx, esf->edx);
LOG_ERR("ESI: 0x%08x, EDI: 0x%08x, EBP: 0x%08x, ESP: 0x%08x",
esf->esi, esf->edi, esf->ebp, esf->esp);
LOG_ERR("EFLAGS: 0x%08x CS: 0x%04x CR3: %p", esf->eflags,
esf->cs & 0xFFFFU, get_ptables(esf));
LOG_ERR("EFLAGS: 0x%08x CS: 0x%04x CR3: 0x%08lx", esf->eflags,
esf->cs & 0xFFFFU, get_cr3(esf));
#ifdef CONFIG_X86_EXCEPTION_STACK_TRACE
LOG_ERR("call trace:");
@ -309,7 +324,7 @@ static void dump_page_fault(z_arch_esf_t *esf)
}
#ifdef CONFIG_X86_MMU
z_x86_dump_mmu_flags(get_ptables(esf), cr2);
z_x86_dump_mmu_flags(get_ptables(esf), (void *)cr2);
#endif /* CONFIG_X86_MMU */
}
#endif /* CONFIG_EXCEPTION_DEBUG */

View file

@ -16,6 +16,7 @@
#include <kernel_arch_data.h>
#include <arch/cpu.h>
#include <arch/x86/multiboot.h>
#include <sys/mem_manage.h>
/* exports (private APIs) */
@ -41,7 +42,55 @@
GTEXT(_sys_resume_from_deep_sleep)
#endif
.macro install_page_tables
#ifdef CONFIG_X86_MMU
/* Enable paging. If virtual memory is enabled, the instruction pointer
* is currently at a physical address. There is an identity mapping
* for all RAM, plus a virtual mapping of RAM starting at
* CONFIG_KERNEL_VM_BASE using the same paging structures.
*
* Until we enable these page tables, only physical memory addresses
* work.
*/
movl $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %eax
movl %eax, %cr3
#ifdef CONFIG_X86_PAE
/* Enable PAE */
movl %cr4, %eax
orl $CR4_PAE, %eax
movl %eax, %cr4
/* IA32_EFER NXE bit set */
movl $0xC0000080, %ecx
rdmsr
orl $0x800, %eax
wrmsr
#endif /* CONFIG_X86_PAE */
/* Enable paging (CR0.PG, bit 31) / write protect (CR0.WP, bit 16) */
movl %cr0, %eax
orl $(CR0_PG | CR0_WP), %eax
movl %eax, %cr0
#if CONFIG_KERNEL_VM_BASE != CONFIG_SRAM_BASE_ADDRESS
/* Jump to a virtual address, which works because the identity and
* virtual mappings both are to the same physical address.
*/
lea vm_enter, %eax
jmp *%eax
vm_enter:
/* We are now executing in virtual memory. We'll un-map the identity
* mappings later once we are in the C domain
*/
#endif /* CONFIG_KERNEL_VM_BASE != CONFIG_SRAM_BASE_ADDRESS */
#endif /* CONFIG_X86_MMU */
.endm
SECTION_FUNC(TEXT_START, __start)
#ifndef CONFIG_XIP
install_page_tables
#endif /* CONFIG_XIP */
#include "../common.S"
@ -64,11 +113,7 @@ SECTION_FUNC(TEXT_START, __start)
*/
#if CONFIG_SET_GDT
lgdt _gdt_rom /* load 32-bit operand size GDT */
#endif
#ifdef CONFIG_SET_GDT
/* If we set our own GDT, update the segment registers as well.
*/
movw $DATA_SEG, %ax /* data segment selector (entry = 3) */
@ -84,7 +129,6 @@ SECTION_FUNC(TEXT_START, __start)
__csSet:
#endif /* CONFIG_SET_GDT */
#if !defined(CONFIG_FPU)
/*
* Force an #NM exception for floating point instructions
@ -206,6 +250,10 @@ __csSet:
call _x86_data_copy
#endif /* CONFIG_USERSPACE */
/* Have to do this here, the page tables aren't loaded into RAM
* until after the data copy
*/
install_page_tables
#endif /* CONFIG_XIP */
/*
@ -308,30 +356,6 @@ dataWords:
ret
#endif /* CONFIG_XIP */
#ifdef CONFIG_X86_MMU
z_x86_enable_paging:
/* load the page directory address into the registers*/
movl $z_x86_kernel_ptables, %eax
movl %eax, %cr3
/* Enable PAE */
movl %cr4, %eax
orl $CR4_PAE, %eax
movl %eax, %cr4
/* IA32_EFER NXE bit set */
movl $0xC0000080, %ecx
rdmsr
orl $0x800, %eax
wrmsr
/* Enable paging (CR0.PG, bit 31) / write protect (CR0.WP, bit 16) */
movl %cr0, %eax
orl $(CR0_PG | CR0_WP), %eax
movl %eax, %cr0
ret
#endif /* CONFIG_X86_MMU */
#if defined(CONFIG_SSE)

View file

@ -18,6 +18,9 @@
#include <inttypes.h>
#include <exc_handle.h>
#include <logging/log.h>
#include <x86_mmu.h>
#include <sys/mem_manage.h>
LOG_MODULE_DECLARE(os);
#ifdef CONFIG_DEBUG_COREDUMP
@ -148,7 +151,7 @@ struct task_state_segment _df_tss = {
.es = DATA_SEG,
.ss = DATA_SEG,
.eip = (uint32_t)df_handler_top,
.cr3 = (uint32_t)&z_x86_kernel_ptables
.cr3 = Z_MEM_PHYS_ADDR((uint32_t)&z_x86_kernel_ptables)
};
static __used void df_handler_bottom(void)
@ -196,7 +199,7 @@ static FUNC_NORETURN __used void df_handler_top(void)
_main_tss.es = DATA_SEG;
_main_tss.ss = DATA_SEG;
_main_tss.eip = (uint32_t)df_handler_bottom;
_main_tss.cr3 = (uint32_t)&z_x86_kernel_ptables;
_main_tss.cr3 = z_mem_phys_addr(&z_x86_kernel_ptables);
_main_tss.eflags = 0U;
/* NT bit is set in EFLAGS so we will task switch back to _main_tss

View file

@ -16,6 +16,7 @@
#include <ksched.h>
#include <arch/x86/mmustructs.h>
#include <kswap.h>
#include <x86_mmu.h>
/* forward declaration */

View file

@ -8,6 +8,7 @@
#include <arch/cpu.h>
#include <offsets_short.h>
#include <syscall.h>
#include <sys/mem_manage.h>
/* Exports */
GTEXT(z_x86_syscall_entry_stub)
@ -49,7 +50,7 @@ SECTION_FUNC(TEXT, z_x86_trampoline_to_kernel)
pushl %edi
/* Switch to kernel page table */
movl $z_x86_kernel_ptables, %esi
movl $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %esi
movl %esi, %cr3
/* Save old trampoline stack pointer in %edi */
@ -154,7 +155,7 @@ SECTION_FUNC(TEXT, z_x86_syscall_entry_stub)
pushl %edi
/* Switch to kernel page table */
movl $z_x86_kernel_ptables, %esi
movl $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %esi
movl %esi, %cr3
/* Save old trampoline stack pointer in %edi */
@ -304,7 +305,10 @@ SECTION_FUNC(TEXT, z_x86_userspace_enter)
* want to leak any information.
*/
mov %edi, %esp
subl $Z_X86_PDPT_SIZE, %esp
#ifdef CONFIG_X86_PAE
/* Skip over the toplevel PDPT stored here */
subl $0x20, %esp
#endif /* CONFIG_X86_PAE */
/* Stash some registers we are going to need to erase the user
* stack.

View file

@ -9,7 +9,7 @@
#include <kernel_structs.h>
#include <kernel_internal.h>
#include <arch/x86/multiboot.h>
#include <arch/x86/mmustructs.h>
#include <x86_mmu.h>
#include <drivers/interrupt_controller/loapic.h>
/*
@ -80,8 +80,6 @@ struct x86_tss64 tss3 = {
};
#endif
extern struct x86_page_tables z_x86_flat_ptables;
struct x86_cpuboot x86_cpuboot[] = {
{
.tr = X86_KERNEL_CPU0_TR,
@ -89,9 +87,6 @@ struct x86_cpuboot x86_cpuboot[] = {
.sp = (uint64_t) z_interrupt_stacks[0] +
Z_KERNEL_STACK_SIZE_ADJUST(CONFIG_ISR_STACK_SIZE),
.fn = z_x86_prep_c,
#ifdef CONFIG_X86_MMU
.ptables = &z_x86_flat_ptables,
#endif
},
#if CONFIG_MP_NUM_CPUS > 1
{
@ -127,9 +122,6 @@ void arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz,
x86_cpuboot[cpu_num].sp = (uint64_t) Z_KERNEL_STACK_BUFFER(stack) + sz;
x86_cpuboot[cpu_num].fn = fn;
x86_cpuboot[cpu_num].arg = arg;
#ifdef CONFIG_X86_MMU
x86_cpuboot[cpu_num].ptables = &z_x86_kernel_ptables;
#endif /* CONFIG_X86_MMU */
z_loapic_ipi(apic_id, LOAPIC_ICR_IPI_INIT, 0);
k_busy_wait(10000);

View file

@ -10,6 +10,7 @@
#include <offsets_short.h>
#include <drivers/interrupt_controller/loapic.h>
#include <arch/cpu.h>
#include <sys/mem_manage.h>
.macro read_tsc var_name
push %rax
@ -21,8 +22,69 @@
pop %rax
.endm
/*
* Definitions/macros for enabling paging
*/
/* Long mode, no-execute, syscall */
#define EFER_BITS (X86_EFER_MSR_LME | X86_EFER_MSR_NXE | X86_EFER_MSR_SCE)
/* Paging, write-protect */
#define CR0_BITS (CR0_PG | CR0_WP)
/* PAE, SSE */
#define CR4_BITS (CR4_PAE | CR4_OSFXSR)
.macro set_efer
movl $X86_EFER_MSR, %ecx
rdmsr
orl $EFER_BITS, %eax
wrmsr
.endm
.macro install_pagetables_32
movl %cr4, %eax
orl $CR4_BITS, %eax
movl %eax, %cr4
clts
/* Page tables created at build time by gen_mmu.py */
movl $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %eax
movl %eax, %cr3
set_efer
movl %cr0, %eax
orl $CR0_BITS, %eax
movl %eax, %cr0
.endm
.macro install_pagetables_64
/* Here, we are already in long mode with paging enabled and
* just need to switch to our own page tables, but let's be
* paranoid and ensure CR4, CR0, and EFER_MSR are set up
* exactly how we expect. Logic is the same as install_pagetables_32
*/
movq %cr4, %rax
orq $CR4_BITS, %rax
movq %rax, %cr4
clts
movq $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %rax
movq %rax, %cr3
set_efer
movq %cr0, %rax
/* Use 32-bit instructions due to assembler fussiness with large
* immediate values with `orq`, CR0_PG is bit 31. We don't ever set any
* high bits in cr0 anyway.
*/
orl $CR0_BITS, %eax
movq %rax, %cr0
.endm
.section .locore,"ax"
.code32
#if CONFIG_MP_NUM_CPUS > 1
@ -79,7 +141,6 @@ unknown_loapic_id:
.code32
.globl __start
__start:
/*
* kernel execution begins here in 32-bit mode, with flat-mode
* descriptors in all segment registers, interrupts disabled.
@ -98,7 +159,6 @@ __start:
* next, clear the BSS. note we're still in 32-bit mode,
* so the BSS must fit entirely in the first 4GB of RAM.
*/
cld
xorl %eax, %eax
movl $__bss_start, %edi
@ -108,37 +168,17 @@ __start:
movl $x86_cpuboot, %ebp /* BSP is always logical CPU id 0 */
movl %ebx, __x86_cpuboot_t_arg_OFFSET(%ebp) /* multiboot info */
/*
* transition to long mode, reload the segment registers,
* and configure per-CPU stuff: GS, task register, stack.
*/
go64: movl %cr4, %eax /* enable PAE and SSE */
orl $(CR4_PAE | CR4_OSFXSR), %eax
movl %eax, %cr4
clts
#ifdef CONFIG_X86_MMU
movl __x86_cpuboot_t_ptables_OFFSET(%ebp), %eax
#else
movl $z_x86_flat_ptables, %eax
#endif
movl %eax, %cr3
movl $X86_EFER_MSR, %ecx /* enable long mode, no-execute, syscall */
rdmsr
orl $(X86_EFER_MSR_LME | X86_EFER_MSR_NXE | X86_EFER_MSR_SCE), %eax
wrmsr
movl %cr0, %eax /* enable paging */
orl $(CR0_PG | CR0_WP), %eax
movl %eax, %cr0
go64: /* Install page tables and transition to long mode */
install_pagetables_32
jmpl $X86_KERNEL_CS, $enter_code64
/* Long mode entry point. Arrive here from the code
* immediately above (shared between main CPU startup and AP
* startup), or from EFI entry in __start64
* startup), or from EFI entry in __start64.
*
* Here we reload the segment registers,
* and configure per-CPU stuff: GS, task register, stack.
*/
.code64
enter_code64:
@ -200,20 +240,7 @@ __start64:
lidt idt80
lgdt gdt80
/* These state and flag settings really should be done later,
* in the shared startup path, they aren't required for mode
* transition and having them in the 32 bit stub means they
* have to be duplicated here.
*/
movq %cr4, %rax
orq $(CR4_PAE | CR4_OSFXSR), %rax
movq %rax, %cr4
clts
movq $X86_EFER_MSR, %rcx
rdmsr
orq $(X86_EFER_MSR_NXE | X86_EFER_MSR_SCE), %rax
wrmsr
cld
install_pagetables_64
/* Disable 8259 PIT. Almost certainly not needed on modern
* UEFI platforms taking this code path, but...
@ -949,44 +976,6 @@ idt80: /* LIDT descriptor for 64 bit mode */
.word (idt_end - idt - 1)
.quad idt
/* Initial page tables for long mode entry. This generates a second
* level page full of 512 1G PTE entries of the form:
*
* 0x000000nnn0000083
*
* Where nnn is an identity-mapped 1G page index in the range
* 0x000-0x1ff, and 0x83 marks a present, 1G, read/write page
* entry. It's split up somewhat awkwardly to get around gas's
* recursion limits in macro expansion.
*
* This maps the first 512GB of memory space by default, which will
* hopefully be enough to reach everything we need before we can
* bootstrap the real page tables later.
*/
.macro populate_ptable base, count=64
.long 0x00000083
.long 64 - \count + \base
.long 0x40000083
.long 64 - \count + \base
.long 0x80000083
.long 64 - \count + \base
.long 0xC0000083
.long 64 - \count + \base
.if \count > 1
populate_ptable \base, (\count - 1)
.endif
.endm
.align 4096
.globl z_x86_flat_ptables
z_x86_flat_ptables:
.long pdp + 0x03 /* 0x03 = R/W, P */
.long 0
.fill 4088, 1, 0
pdp:
populate_ptable 0
populate_ptable 64
.section .gdt,"ad"
/*

View file

@ -8,6 +8,7 @@
#include <kernel_structs.h>
#include <kernel_internal.h>
#include <offsets_short.h>
#include <x86_mmu.h>
extern void x86_sse_init(struct k_thread *); /* in locore.S */

View file

@ -7,6 +7,7 @@
#include <arch/cpu.h>
#include <offsets_short.h>
#include <syscall.h>
#include <sys/mem_manage.h>
#ifdef CONFIG_X86_KPTI
/* Copy interrupt return stack context to the trampoline stack, switch back
@ -83,7 +84,7 @@ z_x86_syscall_entry_stub:
/* Load kernel's page table */
pushq %rax
movq $z_x86_kernel_ptables, %rax
movq $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %rax
movq %rax, %cr3
popq %rax
movq $0, -8(%rsp) /* Delete stashed RAX data */

View file

@ -36,7 +36,6 @@ GEN_OFFSET_SYM(_thread_arch_t, excNestCount);
#ifdef CONFIG_USERSPACE
GEN_OFFSET_SYM(_thread_arch_t, psp);
GEN_OFFSET_SYM(_thread_arch_t, ptables);
GEN_ABSOLUTE_SYM(Z_X86_PDPT_SIZE, sizeof(struct x86_mmu_pdpt));
#endif
GEN_OFFSET_SYM(_thread_arch_t, preempFloatReg);
@ -65,9 +64,4 @@ GEN_OFFSET_SYM(z_arch_esf_t, errorCode);
GEN_OFFSET_SYM(z_arch_esf_t, eip);
GEN_OFFSET_SYM(z_arch_esf_t, cs);
GEN_OFFSET_SYM(z_arch_esf_t, eflags);
/* size of the MMU_REGION structure. Used by linker scripts */
GEN_ABSOLUTE_SYM(__MMU_REGION_SIZEOF, sizeof(struct mmu_region));
#endif /* _X86_OFFSETS_INC_ */

View file

@ -49,9 +49,6 @@ GEN_OFFSET_SYM(x86_cpuboot_t, gs_base);
GEN_OFFSET_SYM(x86_cpuboot_t, sp);
GEN_OFFSET_SYM(x86_cpuboot_t, fn);
GEN_OFFSET_SYM(x86_cpuboot_t, arg);
#ifdef CONFIG_X86_MMU
GEN_OFFSET_SYM(x86_cpuboot_t, ptables);
#endif /* CONFIG_X86_MMU */
GEN_ABSOLUTE_SYM(__X86_CPUBOOT_SIZEOF, sizeof(x86_cpuboot_t));
#endif /* _X86_OFFSETS_INC_ */

View file

@ -8,6 +8,7 @@
#include <kernel_internal.h>
#include <arch/x86/acpi.h>
#include <arch/x86/multiboot.h>
#include <x86_mmu.h>
extern FUNC_NORETURN void z_cstart(void);
extern void x86_64_irq_init(void);
@ -25,6 +26,10 @@ FUNC_NORETURN void z_x86_prep_c(void *arg)
z_x86_early_serial_init();
#endif
#ifdef CONFIG_MMU
z_x86_mmu_init();
#endif
#ifdef CONFIG_X86_64
x86_64_irq_init();
#endif
@ -35,10 +40,6 @@ FUNC_NORETURN void z_x86_prep_c(void *arg)
ARG_UNUSED(info);
#endif
#ifdef CONFIG_X86_MMU
z_x86_paging_init();
#endif
#if CONFIG_X86_STACK_PROTECTION
for (int i = 0; i < CONFIG_MP_NUM_CPUS; i++) {
z_x86_set_stack_guard(z_interrupt_stacks[i]);

View file

@ -9,19 +9,20 @@
#include <syscall_handler.h>
#include <kernel_arch_func.h>
#include <ksched.h>
#include <x86_mmu.h>
#ifndef CONFIG_X86_KPTI
/* Change to new set of page tables. ONLY intended for use from
* z_x86_swap_update_page_tables(). This changes CR3, no memory access
* afterwards is legal unless it is known for sure that the relevant
* mappings are identical wrt supervisor mode until we iret out.
/* Set CR3 to a physical address. There must be a valid top-level paging
* structure here or the CPU will triple fault. The incoming page tables must
* have the same kernel mappings wrt supervisor mode. Don't use this function
* unless you know exactly what you are doing.
*/
static inline void page_tables_set(struct x86_page_tables *ptables)
static inline void cr3_set(uintptr_t phys)
{
#ifdef CONFIG_X86_64
__asm__ volatile("movq %0, %%cr3\n\t" : : "r" (ptables) : "memory");
__asm__ volatile("movq %0, %%cr3\n\t" : : "r" (phys) : "memory");
#else
__asm__ volatile("movl %0, %%cr3\n\t" : : "r" (ptables) : "memory");
__asm__ volatile("movl %0, %%cr3\n\t" : : "r" (phys) : "memory");
#endif
}
@ -43,7 +44,7 @@ static inline void page_tables_set(struct x86_page_tables *ptables)
*/
void z_x86_swap_update_page_tables(struct k_thread *incoming)
{
struct x86_page_tables *ptables;
uintptr_t ptables_phys;
#ifndef CONFIG_X86_64
/* 64-bit uses syscall/sysret which switches stacks manually,
@ -57,10 +58,10 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
/* Check first that we actually need to do this, since setting
* CR3 involves an expensive full TLB flush.
*/
ptables = z_x86_thread_page_tables_get(incoming);
ptables_phys = incoming->arch.ptables;
if (ptables != z_x86_page_tables_get()) {
page_tables_set(ptables);
if (ptables_phys != z_x86_cr3_get()) {
cr3_set(ptables_phys);
}
}
#endif /* CONFIG_X86_KPTI */
@ -99,7 +100,7 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
z_x86_thread_pt_init(thread);
initial_entry = drop_to_user;
} else {
thread->arch.ptables = &z_x86_kernel_ptables;
thread->arch.ptables = z_mem_phys_addr(&z_x86_kernel_ptables);
initial_entry = z_thread_entry;
}
@ -115,7 +116,7 @@ FUNC_NORETURN void arch_user_mode_enter(k_thread_entry_t user_entry,
* started in user mode already had this done via z_setup_new_thread()
*/
if (_current->mem_domain_info.mem_domain != NULL) {
z_x86_apply_mem_domain(_current->arch.ptables,
z_x86_apply_mem_domain(_current,
_current->mem_domain_info.mem_domain);
}

File diff suppressed because it is too large.

View file

@ -25,9 +25,6 @@ struct x86_cpuboot {
uint64_t sp; /* initial stack pointer */
arch_cpustart_t fn; /* kernel entry function */
void *arg; /* argument for above function */
#ifdef CONFIG_X86_MMU
struct x86_page_tables *ptables; /* Runtime page tables to install */
#endif /* CONFIG_X86_MMU */
};
typedef struct x86_cpuboot x86_cpuboot_t;

View file

@ -44,20 +44,6 @@ extern FUNC_NORETURN void z_x86_prep_c(void *arg);
void z_x86_early_serial_init(void);
#endif /* CONFIG_X86_VERY_EARLY_CONSOLE */
#ifdef CONFIG_X86_MMU
/* Create all page tables with boot configuration and enable paging */
void z_x86_paging_init(void);
static inline struct x86_page_tables *
z_x86_thread_page_tables_get(struct k_thread *thread)
{
#ifdef CONFIG_USERSPACE
return thread->arch.ptables;
#else
return &z_x86_kernel_ptables;
#endif
}
#endif /* CONFIG_X86_MMU */
/* Called upon CPU exception that is unhandled and hence fatal; dump
* interesting info and call z_x86_fatal_error()
@ -102,19 +88,10 @@ extern FUNC_NORETURN void z_x86_userspace_enter(k_thread_entry_t user_entry,
*/
void *z_x86_userspace_prepare_thread(struct k_thread *thread);
void z_x86_thread_pt_init(struct k_thread *thread);
void z_x86_apply_mem_domain(struct x86_page_tables *ptables,
struct k_mem_domain *mem_domain);
#endif /* CONFIG_USERSPACE */
void z_x86_do_kernel_oops(const z_arch_esf_t *esf);
#ifdef CONFIG_X86_STACK_PROTECTION
void z_x86_set_stack_guard(k_thread_stack_t *stack);
#endif
#endif /* !_ASMLANGUAGE */
#endif /* ZEPHYR_ARCH_X86_INCLUDE_KERNEL_ARCH_FUNC_H_ */

arch/x86/include/x86_mmu.h (new file, 164 lines)
View file

@ -0,0 +1,164 @@
/*
* Copyright (c) 2011-2014 Wind River Systems, Inc.
* Copyright (c) 2017-2020 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*
* Internal memory management interfaces implemented in x86_mmu.c.
* None of these are application-facing, use only if you know what you are
* doing!
*/
#ifndef ZEPHYR_ARCH_X86_INCLUDE_X86_MMU_H
#define ZEPHYR_ARCH_X86_INCLUDE_X86_MMU_H
#include <kernel.h>
#include <arch/x86/mmustructs.h>
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#define XD_SUPPORTED
#define BITL BIT64
#define PRI_ENTRY "0x%016llx"
#else
#define BITL BIT
#define PRI_ENTRY "0x%08x"
#endif
/*
* Common flags in the same bit position regardless of which structure level,
* although not every flag is supported at every level, and some may be
* ignored depending on the state of other bits (such as P or PS)
*
* These flags indicate bit position, and can be used for setting flags or
* masks as needed.
*/
#define MMU_P BITL(0) /** Present */
#define MMU_RW BITL(1) /** Read-Write */
#define MMU_US BITL(2) /** User-Supervisor */
#define MMU_PWT BITL(3) /** Page Write Through */
#define MMU_PCD BITL(4) /** Page Cache Disable */
#define MMU_A BITL(5) /** Accessed */
#define MMU_D BITL(6) /** Dirty */
#define MMU_PS BITL(7) /** Page Size */
#define MMU_G BITL(8) /** Global */
#ifdef XD_SUPPORTED
#define MMU_XD BITL(63) /** Execute Disable */
#else
#define MMU_XD 0
#endif
#ifdef CONFIG_EXCEPTION_DEBUG
/**
* Dump out page table entries for a particular virtual memory address
*
* For the provided memory address, dump out interesting information about
* its mapping to the error log
*
* @param ptables Page tables to walk
* @param virt Virtual address to inspect
*/
void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt);
/**
* Fetch the page table entry for a virtual memory address
*
* @param paging_level [out] what paging level the entry was found at.
* 0=toplevel
* @param val Value stored in page table entry, with address and flags
* @param ptables Toplevel pointer to page tables
* @param virt Virtual address to lookup
*/
void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables,
void *virt);
/**
* Debug function for dumping out page tables
*
* Iterates through the entire linked set of page table structures,
* dumping out codes for the configuration of each table entry.
*
* Entry codes:
*
* . - not present
* w - present, writable, not executable
* a - present, writable, executable
* r - present, read-only, not executable
* x - present, read-only, executable
*
* Entry codes in uppercase indicate that user mode may access.
*
* Color is used to indicate the physical mapping characteristics:
*
* yellow - Identity mapping (virt = phys)
* green - Fixed virtual memory mapping (virt = phys + constant)
* magenta - entry is child page table
* cyan - General mapped memory
*
* @param ptables Top-level pointer to the page tables, as programmed in CR3
*/
void z_x86_dump_page_tables(pentry_t *ptables);
#endif /* CONFIG_EXCEPTION_DEBUG */
#ifdef CONFIG_HW_STACK_PROTECTION
/* Legacy function - set identity-mapped MMU stack guard page to RO in the
* kernel's page tables to prevent writes and generate an exception
*/
void z_x86_set_stack_guard(k_thread_stack_t *stack);
#endif
#ifdef CONFIG_USERSPACE
#ifdef CONFIG_X86_KPTI
/* Defined in linker script. Contains all the data that must be mapped
* in a KPTI table even though US bit is not set (trampoline stack, GDT,
* IDT, etc)
*/
extern uint8_t z_shared_kernel_page_start;
#endif /* CONFIG_X86_KPTI */
/* Set up per-thread page tables just prior to entering user mode */
void z_x86_thread_pt_init(struct k_thread *thread);
/* Apply a memory domain policy to a set of thread page tables */
void z_x86_apply_mem_domain(struct k_thread *thread,
struct k_mem_domain *mem_domain);
#endif /* CONFIG_USERSPACE */
/* Return cr3 value, which is the physical (not virtual) address of the
* current set of page tables
*/
static inline uintptr_t z_x86_cr3_get(void)
{
uintptr_t cr3;
#ifdef CONFIG_X86_64
__asm__ volatile("movq %%cr3, %0\n\t" : "=r" (cr3));
#else
__asm__ volatile("movl %%cr3, %0\n\t" : "=r" (cr3));
#endif
return cr3;
}
/* Return the virtual address of the page tables installed in this CPU in CR3 */
static inline pentry_t *z_x86_page_tables_get(void)
{
return z_mem_virt_addr(z_x86_cr3_get());
}
/* Kernel's page table. This is in CR3 for all supervisor threads.
* If KPTI is enabled, we switch to this when handling exceptions or syscalls
*/
extern pentry_t z_x86_kernel_ptables;
/* Get the page tables used by this thread during normal execution */
static inline pentry_t *z_x86_thread_page_tables_get(struct k_thread *thread)
{
#ifdef CONFIG_USERSPACE
return z_mem_virt_addr(thread->arch.ptables);
#else
return &z_x86_kernel_ptables;
#endif
}
/* Early-boot paging setup tasks, called from prep_c */
void z_x86_mmu_init(void);
#endif /* ZEPHYR_ARCH_X86_INCLUDE_X86_MMU_H */
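
As a usage illustration for the debug helpers declared above (the
wrapper function below is hypothetical, and these symbols are only
compiled in when CONFIG_EXCEPTION_DEBUG is enabled):

#include <sys/printk.h>
#include <x86_mmu.h>

/* Dump the currently installed page tables, then report the paging level
 * and raw entry value that map one particular virtual address.
 */
static void debug_dump_mapping(void *virt)
{
        pentry_t *ptables = z_x86_page_tables_get();
        pentry_t entry;
        int level;

        z_x86_dump_page_tables(ptables);

        z_x86_pentry_get(&level, &entry, ptables, virt);
        printk("paging level %d, entry " PRI_ENTRY "\n", level, entry);
}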

View file

@ -51,9 +51,12 @@
#define RAMABLE_REGION RAM
#endif
#ifdef CONFIG_X86_MMU
#define MMU_PAGE_SIZE KB(4)
#define MMU_PAGE_ALIGN . = ALIGN(MMU_PAGE_SIZE);
/* Used to align areas with separate memory permission characteristics
* so that the page permissions can be set in the MMU. Without this,
* the kernel is just one blob with the same RWX permissions on all RAM
*/
#ifdef CONFIG_SRAM_REGION_PERMISSIONS
#define MMU_PAGE_ALIGN . = ALIGN(CONFIG_MMU_PAGE_SIZE);
#else
#define MMU_PAGE_ALIGN
#endif
@ -317,9 +320,11 @@ SECTIONS
__data_rom_start = LOADADDR(_DATA_SECTION_NAME);
#include <linker/common-ram.ld>
#include <linker/kobject.ld>
#include <linker/cplusplus-ram.ld>
#include <arch/x86/pagetables.ld>
/* Must be last in RAM */
#include <linker/kobject.ld>
MMU_PAGE_ALIGN
__data_ram_end = .;

View file

@ -215,11 +215,16 @@ struct _thread_arch {
uint8_t flags;
#ifdef CONFIG_USERSPACE
/* Pointer to page tables used by this thread. Supervisor threads
* always use the kernel's page table, user thread use per-thread
* tables stored in the stack object
/* Physical address of the page tables used by this thread. Supervisor
* threads always use the kernel's page table; user threads use
* per-thread tables stored in the stack object.
*/
struct x86_page_tables *ptables;
uintptr_t ptables;
/* Track available unused space in the stack object used for building
* thread-specific page tables.
*/
uint8_t *mmu_pos;
/* Initial privilege mode stack pointer when doing a system call.
* Un-set for supervisor threads.

View file

@ -9,9 +9,12 @@
#define ROMABLE_REGION RAM
#define RAMABLE_REGION RAM
#ifdef CONFIG_X86_MMU
#define MMU_PAGE_SIZE KB(4)
#define MMU_PAGE_ALIGN . = ALIGN(MMU_PAGE_SIZE);
/* Used to align areas with separate memory permission characteristics
* so that the page permissions can be set in the MMU. Without this,
* the kernel is just one blob with the same RWX permissions on all RAM
*/
#ifdef CONFIG_SRAM_REGION_PERMISSIONS
#define MMU_PAGE_ALIGN . = ALIGN(CONFIG_MMU_PAGE_SIZE);
#else
#define MMU_PAGE_ALIGN
#endif
@ -167,10 +170,11 @@ SECTIONS
#include <snippets-ram-sections.ld>
#include <linker/common-ram.ld>
#include <linker/cplusplus-ram.ld>
#include <arch/x86/pagetables.ld>
/* Must be last in RAM */
#include <linker/kobject.ld>
. = ALIGN(8);
MMU_PAGE_ALIGN
_image_ram_end = .;
_end = .;

View file

@ -41,6 +41,7 @@
#ifndef _ASMLANGUAGE
#include <zephyr/types.h>
#include <arch/x86/mmustructs.h>
/*
* 64-bit Task State Segment. One defined per CPU.
@ -114,11 +115,16 @@ struct _thread_arch {
uint8_t flags;
#ifdef CONFIG_USERSPACE
/* Pointer to page tables used by this thread. Supervisor threads
* always use the kernel's page table, user thread use per-thread
* tables stored in the stack object
/* Physical address of the page tables used by this thread. Supervisor
* threads always use the kernel's page table; user threads use
* per-thread tables stored in the stack object
*/
struct x86_page_tables *ptables;
uintptr_t ptables;
/* Track available unused space in the stack object used for building
* thread-specific page tables.
*/
uint8_t *mmu_pos;
/* Initial privilege mode stack pointer when doing a system call.
* Un-set for supervisor threads.

View file

@ -1,429 +1,161 @@
/*
* Copyright (c) 2011-2014 Wind River Systems, Inc.
* Copyright (c) 2017 Intel Corporation
* Copyright (c) 2020 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef ZEPHYR_INCLUDE_ARCH_X86_MMUSTRUCTS_H_
#define ZEPHYR_INCLUDE_ARCH_X86_MMUSTRUCTS_H_
#ifndef ZEPHYR_INCLUDE_ARCH_X86_MMU_H
#define ZEPHYR_INCLUDE_ARCH_X86_MMU_H
#include <sys/util.h>
#define MMU_PAGE_SIZE 4096UL
#define MMU_PAGE_MASK 0xfffU
#define MMU_PAGE_SHIFT 12U
#define PAGES(x) ((x) << (MMU_PAGE_SHIFT))
#define MMU_ARE_IN_SAME_PAGE(a, b) \
(((uint32_t)(a) & ~MMU_PAGE_MASK) == ((uint32_t)(b) & ~MMU_PAGE_MASK))
#define MMU_IS_ON_PAGE_BOUNDARY(a) (!((uint32_t)(a) & MMU_PAGE_MASK))
/* Macros for reserving space for page tables
*
* Z_X86_NUM_TABLE_PAGES. In order to produce a set of page tables which has
* virtual mappings for all system RAM, Z_X86_NUM_TABLE_PAGES is the number of
* memory pages required. If CONFIG_X86_PAE is enabled, an additional 0x20
* bytes are required for the toplevel 4-entry PDPT.
*
* Z_X86_INITIAL_PAGETABLE_SIZE is the total amount of memory in bytes
* required, for any paging mode.
*
* These macros are currently used for two purposes:
* - Reserving memory in the stack for thread-level page tables (slated
* for eventual removal when USERSPACE is reworked to fully utilize
* virtual memory and page tables are maintained at the process level)
* - Reserving room for dummy pagetable memory for the first link, so that
* memory addresses are not disturbed by the insertion of the real page
* tables created by gen_mmu.py in the second link phase.
*/
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
#ifdef CONFIG_X86_64
#define Z_X86_NUM_PML4_ENTRIES 512U
#define Z_X86_NUM_PDPT_ENTRIES 512U
#else
#define Z_X86_NUM_PDPT_ENTRIES 4U
#endif /* CONFIG_X86_64 */
#define Z_X86_NUM_PD_ENTRIES 512U
#define Z_X86_NUM_PT_ENTRIES 512U
#else
#define Z_X86_NUM_PD_ENTRIES 1024U
#define Z_X86_NUM_PT_ENTRIES 1024U
#endif /* !CONFIG_X86_64 && !CONFIG_X86_PAE */
/* Memory range covered by an instance of various table types */
#define Z_X86_PT_AREA ((uintptr_t)(CONFIG_MMU_PAGE_SIZE * \
Z_X86_NUM_PT_ENTRIES))
#define Z_X86_PD_AREA (Z_X86_PT_AREA * Z_X86_NUM_PD_ENTRIES)
#ifdef CONFIG_X86_64
#define Z_X86_PDPT_AREA (Z_X86_PD_AREA * Z_X86_NUM_PDPT_ENTRIES)
#endif
#define PHYS_RAM_ADDR DT_REG_ADDR(DT_CHOSEN(zephyr_sram))
#define PHYS_RAM_SIZE DT_REG_SIZE(DT_CHOSEN(zephyr_sram))
/* Define a range [Z_X86_PT_START, Z_X86_PT_END) which is the memory range
* covered by all the page tables needed for system RAM
*/
#define Z_X86_PT_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, Z_X86_PT_AREA))
#define Z_X86_PT_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \
Z_X86_PT_AREA))
/* Number of page tables needed to cover system RAM. Depends on the specific
* bounds of system RAM, but roughly 1 page table per 2MB of RAM
*/
#define Z_X86_NUM_PT ((Z_X86_PT_END - Z_X86_PT_START) / Z_X86_PT_AREA)
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
/* Same semantics as above, but for the page directories needed to cover
* system RAM.
*/
#define Z_X86_PD_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, Z_X86_PD_AREA))
#define Z_X86_PD_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \
Z_X86_PD_AREA))
/* Number of page directories needed to cover system RAM. Depends on the
* specific bounds of system RAM, but roughly 1 page directory per 1GB of RAM
*/
#define Z_X86_NUM_PD ((Z_X86_PD_END - Z_X86_PD_START) / Z_X86_PD_AREA)
#else
/* 32-bit page tables just have one toplevel page directory */
#define Z_X86_NUM_PD 1
#endif
#ifdef CONFIG_X86_64
/* Same semantics as above, but for the page directory pointer tables needed
* to cover system RAM. On 32-bit there is just one 4-entry PDPT.
*/
#define Z_X86_PDPT_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, \
Z_X86_PDPT_AREA))
#define Z_X86_PDPT_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \
Z_X86_PDPT_AREA))
/* Number of PDPTs needed to cover system RAM. Depends on the
* specific bounds of system RAM, but roughly 1 PDPT per 512GB of RAM
*/
#define Z_X86_NUM_PDPT ((Z_X86_PDPT_END - Z_X86_PDPT_START) / Z_X86_PDPT_AREA)
/* All pages needed for page tables, using computed values plus one more for
* the top-level PML4
*/
#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD + \
Z_X86_NUM_PDPT + 1)
#else /* !CONFIG_X86_64 */
/* Number of pages we need to reserve in the stack for per-thread page tables */
#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD)
#endif /* CONFIG_X86_64 */
#ifdef CONFIG_X86_PAE
/* Toplevel PDPT wasn't included as it is not a page in size */
#define Z_X86_INITIAL_PAGETABLE_SIZE ((Z_X86_NUM_TABLE_PAGES * \
CONFIG_MMU_PAGE_SIZE) + 0x20)
#else
#define Z_X86_INITIAL_PAGETABLE_SIZE (Z_X86_NUM_TABLE_PAGES * \
CONFIG_MMU_PAGE_SIZE)
#endif
/*
* Common flags in the same bit position regardless of which structure level,
* although not every flag is supported at every level, and some may be
* ignored depending on the state of other bits (such as P or PS)
* K_MEM_PARTITION_* defines
*
* These flags indicate bit position, and can be used for setting flags or
* masks as needed.
* Slated for removal when virtual memory is implemented, memory
* mapping APIs will replace memory domains.
*/
#define Z_X86_MMU_P BIT64(0) /** Present */
#define Z_X86_MMU_RW BIT64(1) /** Read-Write */
#define Z_X86_MMU_US BIT64(2) /** User-Supervisor */
#define Z_X86_MMU_PWT BIT64(3) /** Page Write Through */
#define Z_X86_MMU_PCD BIT64(4) /** Page Cache Disable */
#define Z_X86_MMU_A BIT64(5) /** Accessed */
#define Z_X86_MMU_D BIT64(6) /** Dirty */
#define Z_X86_MMU_PS BIT64(7) /** Page Size */
#define Z_X86_MMU_G BIT64(8) /** Global */
#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
#define Z_X86_MMU_XD BIT64(63) /** Execute Disable */
#ifdef CONFIG_X86_64
#define Z_X86_MMU_PROT_KEY_MASK 0x7800000000000000ULL
#else
#define Z_X86_MMU_XD 0
#endif
/*
* Structure-specific flags / masks
/* Always true with 32-bit page tables, don't enable
* CONFIG_EXECUTE_XOR_WRITE and expect it to work for you
*/
#define Z_X86_MMU_PDPTE_PAT BIT64(12)
#define Z_X86_MMU_PDE_PAT BIT64(12)
#define Z_X86_MMU_PTE_PAT BIT64(7) /** Page Attribute Table */
/* The true size of the mask depends on MAXADDR, which is found at run-time.
* As a simplification, roll the area for the memory address, and the
* reserved or ignored regions immediately above it, into a single area.
* This will work as expected if valid memory addresses are written.
*/
#ifdef CONFIG_X86_64
#define Z_X86_MMU_PML4E_PDPT_MASK 0x7FFFFFFFFFFFF000ULL
#endif
#define Z_X86_MMU_PDPTE_PD_MASK 0x7FFFFFFFFFFFF000ULL
#ifdef CONFIG_X86_64
#define Z_X86_MMU_PDPTE_1G_MASK 0x07FFFFFFC0000000ULL
#endif
#define Z_X86_MMU_PDE_PT_MASK 0x7FFFFFFFFFFFF000ULL
#define Z_X86_MMU_PDE_2MB_MASK 0x07FFFFFFFFC00000ULL
#define Z_X86_MMU_PTE_ADDR_MASK 0x07FFFFFFFFFFF000ULL
/*
* These flags indicate intention when setting access properties.
*/
#define MMU_ENTRY_NOT_PRESENT 0ULL
#define MMU_ENTRY_PRESENT Z_X86_MMU_P
#define MMU_ENTRY_READ 0ULL
#define MMU_ENTRY_WRITE Z_X86_MMU_RW
#define MMU_ENTRY_SUPERVISOR 0ULL
#define MMU_ENTRY_USER Z_X86_MMU_US
#define MMU_ENTRY_WRITE_BACK 0ULL
#define MMU_ENTRY_WRITE_THROUGH Z_X86_MMU_PWT
#define MMU_ENTRY_CACHING_ENABLE 0ULL
#define MMU_ENTRY_CACHING_DISABLE Z_X86_MMU_PCD
#define MMU_ENTRY_NOT_ACCESSED 0ULL
#define MMU_ENTRY_ACCESSED Z_X86_MMU_A
#define MMU_ENTRY_NOT_DIRTY 0ULL
#define MMU_ENTRY_DIRTY Z_X86_MMU_D
#define MMU_ENTRY_NOT_GLOBAL 0ULL
#define MMU_ENTRY_GLOBAL Z_X86_MMU_G
#define MMU_ENTRY_EXECUTE_DISABLE Z_X86_MMU_XD
#define MMU_ENTRY_EXECUTE_ENABLE 0ULL
/* memory partition arch/soc independent attribute */
#define K_MEM_PARTITION_P_RW_U_RW (MMU_ENTRY_WRITE | \
MMU_ENTRY_USER | \
MMU_ENTRY_EXECUTE_DISABLE)
#define K_MEM_PARTITION_P_RW_U_NA (MMU_ENTRY_WRITE | \
MMU_ENTRY_SUPERVISOR | \
MMU_ENTRY_EXECUTE_DISABLE)
#define K_MEM_PARTITION_P_RO_U_RO (MMU_ENTRY_READ | \
MMU_ENTRY_USER | \
MMU_ENTRY_EXECUTE_DISABLE)
#define K_MEM_PARTITION_P_RO_U_NA (MMU_ENTRY_READ | \
MMU_ENTRY_SUPERVISOR | \
MMU_ENTRY_EXECUTE_DISABLE)
/* Execution-allowed attributes */
#define K_MEM_PARTITION_P_RWX_U_RWX (MMU_ENTRY_WRITE | MMU_ENTRY_USER)
#define K_MEM_PARTITION_P_RWX_U_NA (MMU_ENTRY_WRITE | MMU_ENTRY_SUPERVISOR)
#define K_MEM_PARTITION_P_RX_U_RX (MMU_ENTRY_READ | MMU_ENTRY_USER)
#define K_MEM_PARTITION_P_RX_U_NA (MMU_ENTRY_READ | MMU_ENTRY_SUPERVISOR)
#define K_MEM_PARTITION_IS_EXECUTABLE(attr) (((attr) & Z_X86_MMU_XD) == 0)
#define K_MEM_PARTITION_IS_WRITABLE(attr) (((attr) & Z_X86_MMU_RW) != 0)
/* memory partition arch/soc independent attribute */
#define K_MEM_PARTITION_P_RW_U_RW (Z_X86_MMU_RW | Z_X86_MMU_US | \
Z_X86_MMU_XD)
#define K_MEM_PARTITION_P_RW_U_NA (Z_X86_MMU_RW | Z_X86_MMU_XD)
#define K_MEM_PARTITION_P_RO_U_RO (Z_X86_MMU_US | Z_X86_MMU_XD)
#define K_MEM_PARTITION_P_RO_U_NA Z_X86_MMU_XD
/* Execution-allowed attributes */
#define K_MEM_PARTITION_P_RWX_U_RWX (Z_X86_MMU_RW | Z_X86_MMU_US)
#define K_MEM_PARTITION_P_RWX_U_NA Z_X86_MMU_RW
#define K_MEM_PARTITION_P_RX_U_RX Z_X86_MMU_US
#define K_MEM_PARTITION_P_RX_U_NA (0)
/* memory partition access permission mask */
#define K_MEM_PARTITION_PERM_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | \
Z_X86_MMU_XD)
#define K_MEM_PARTITION_PERM_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | \
Z_X86_MMU_XD)
#ifndef _ASMLANGUAGE
#include <sys/__assert.h>
#include <zephyr/types.h>
/* Structure used by gen_mmu.py to create page directories and page tables.
* In order to populate this structure use macro MMU_BOOT_REGION.
/* Page table entry data type at all levels. Defined here due to
* k_mem_partition_attr_t, eventually move to private x86_mmu.h
*/
struct mmu_region {
uintptr_t address; /*Start address of the memory region */
size_t size; /* Size of the memory region*/
uint64_t flags; /* Permissions needed for this region*/
};
/* permission_flags are calculated using the macros
* region_size has to be provided in bytes
* for read write access = MMU_ENTRY_READ/MMU_ENTRY_WRITE
* for supervisor/user mode access = MMU_ENTRY_SUPERVISOR/MMU_ENTRY_USER
*
* Preprocessor indirection layers used to ensure __COUNTER__ is expanded
* properly.
*/
#define __MMU_BOOT_REGION(id, addr, region_size, permission_flags) \
static const Z_STRUCT_SECTION_ITERABLE(mmu_region, region_##id) = \
{ \
.address = (uintptr_t)(addr), \
.size = (size_t)(region_size), \
.flags = (permission_flags), \
}
#define Z_MMU_BOOT_REGION(id, addr, region_size, permission_flags) \
__MMU_BOOT_REGION(id, addr, region_size, permission_flags)
#define MMU_BOOT_REGION(addr, region_size, permission_flags) \
Z_MMU_BOOT_REGION(__COUNTER__, addr, region_size, permission_flags)
#ifdef CONFIG_X86_64
#define Z_X86_NUM_PML4_ENTRIES 512U
#define Z_X86_NUM_PDPT_ENTRIES 512U
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
typedef uint64_t pentry_t;
#else
#define Z_X86_NUM_PDPT_ENTRIES 4U
typedef uint32_t pentry_t;
#endif
#define Z_X86_NUM_PD_ENTRIES 512U
#define Z_X86_NUM_PT_ENTRIES 512U
/* Memory range covered by an instance of various table types */
#define Z_X86_PT_AREA (MMU_PAGE_SIZE * Z_X86_NUM_PT_ENTRIES)
#define Z_X86_PD_AREA (Z_X86_PT_AREA * Z_X86_NUM_PD_ENTRIES)
#define Z_X86_PDPT_AREA (Z_X86_PD_AREA * Z_X86_NUM_PDPT_ENTRIES)
typedef uint64_t k_mem_partition_attr_t;
#ifdef CONFIG_X86_64
struct x86_mmu_pml4 {
uint64_t entry[Z_X86_NUM_PML4_ENTRIES];
};
#endif
struct x86_mmu_pdpt {
uint64_t entry[Z_X86_NUM_PDPT_ENTRIES];
};
struct x86_mmu_pd {
uint64_t entry[Z_X86_NUM_PD_ENTRIES];
};
struct x86_mmu_pt {
uint64_t entry[Z_X86_NUM_PT_ENTRIES];
};
struct x86_page_tables {
#ifdef CONFIG_X86_64
struct x86_mmu_pml4 pml4;
#else
struct x86_mmu_pdpt pdpt;
#endif
};
/*
* Inline functions for getting the next linked structure
*/
#ifdef CONFIG_X86_64
static inline uint64_t *z_x86_pml4_get_pml4e(struct x86_mmu_pml4 *pml4,
uintptr_t addr)
{
int index = (addr >> 39U) & (Z_X86_NUM_PML4_ENTRIES - 1);
return &pml4->entry[index];
}
static inline struct x86_mmu_pdpt *z_x86_pml4e_get_pdpt(uint64_t pml4e)
{
uintptr_t addr = pml4e & Z_X86_MMU_PML4E_PDPT_MASK;
return (struct x86_mmu_pdpt *)addr;
}
#endif
static inline uint64_t *z_x86_pdpt_get_pdpte(struct x86_mmu_pdpt *pdpt,
uintptr_t addr)
{
int index = (addr >> 30U) & (Z_X86_NUM_PDPT_ENTRIES - 1);
return &pdpt->entry[index];
}
static inline struct x86_mmu_pd *z_x86_pdpte_get_pd(uint64_t pdpte)
{
uintptr_t addr = pdpte & Z_X86_MMU_PDPTE_PD_MASK;
#ifdef CONFIG_X86_64
__ASSERT((pdpte & Z_X86_MMU_PS) == 0, "PDPT is for 1GB page");
#endif
return (struct x86_mmu_pd *)addr;
}
static inline uint64_t *z_x86_pd_get_pde(struct x86_mmu_pd *pd, uintptr_t addr)
{
int index = (addr >> 21U) & (Z_X86_NUM_PD_ENTRIES - 1);
return &pd->entry[index];
}
static inline struct x86_mmu_pt *z_x86_pde_get_pt(uint64_t pde)
{
uintptr_t addr = pde & Z_X86_MMU_PDE_PT_MASK;
__ASSERT((pde & Z_X86_MMU_PS) == 0, "pde is for 2MB page");
return (struct x86_mmu_pt *)addr;
}
static inline uint64_t *z_x86_pt_get_pte(struct x86_mmu_pt *pt, uintptr_t addr)
{
int index = (addr >> 12U) & (Z_X86_NUM_PT_ENTRIES - 1);
return &pt->entry[index];
}
/*
* Inline functions for obtaining page table structures from the top-level
*/
#ifdef CONFIG_X86_64
static inline struct x86_mmu_pml4 *
z_x86_get_pml4(struct x86_page_tables *ptables)
{
return &ptables->pml4;
}
static inline uint64_t *z_x86_get_pml4e(struct x86_page_tables *ptables,
uintptr_t addr)
{
return z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr);
}
static inline struct x86_mmu_pdpt *
z_x86_get_pdpt(struct x86_page_tables *ptables, uintptr_t addr)
{
return z_x86_pml4e_get_pdpt(*z_x86_get_pml4e(ptables, addr));
}
#else
static inline struct x86_mmu_pdpt *
z_x86_get_pdpt(struct x86_page_tables *ptables, uintptr_t addr)
{
ARG_UNUSED(addr);
return &ptables->pdpt;
}
#endif /* CONFIG_X86_64 */
static inline uint64_t *z_x86_get_pdpte(struct x86_page_tables *ptables,
uintptr_t addr)
{
return z_x86_pdpt_get_pdpte(z_x86_get_pdpt(ptables, addr), addr);
}
static inline struct x86_mmu_pd *
z_x86_get_pd(struct x86_page_tables *ptables, uintptr_t addr)
{
return z_x86_pdpte_get_pd(*z_x86_get_pdpte(ptables, addr));
}
static inline uint64_t *z_x86_get_pde(struct x86_page_tables *ptables,
uintptr_t addr)
{
return z_x86_pd_get_pde(z_x86_get_pd(ptables, addr), addr);
}
static inline struct x86_mmu_pt *
z_x86_get_pt(struct x86_page_tables *ptables, uintptr_t addr)
{
return z_x86_pde_get_pt(*z_x86_get_pde(ptables, addr));
}
static inline uint64_t *z_x86_get_pte(struct x86_page_tables *ptables,
uintptr_t addr)
{
return z_x86_pt_get_pte(z_x86_get_pt(ptables, addr), addr);
}
/**
* Dump out page table entries for a particular memory address
*
* For the provided memory address, dump out the P, W, XD, US flags
* at each paging level to the error log.
*/
void z_x86_dump_mmu_flags(struct x86_page_tables *ptables, uintptr_t addr);
/**
* Debug function for dumping out page tables
*
* Iterates through the entire linked set of page table structures,
* dumping out codes for the configuration of each table entry.
*
* Entry codes:
*
* . - not present
* w - present, writable, not executable
* a - present, writable, executable
* r - present, read-only, not executable
* x - present, read-only, executable
*
* Entry codes in uppercase indicate that user mode may access.
*
* @param ptables Top-level pointer to the page tables, as programmed in CR3
*/
void z_x86_dump_page_tables(struct x86_page_tables *ptables);
static inline struct x86_page_tables *z_x86_page_tables_get(void)
{
struct x86_page_tables *ret;
#ifdef CONFIG_X86_64
__asm__ volatile("movq %%cr3, %0\n\t" : "=r" (ret));
#else
__asm__ volatile("movl %%cr3, %0\n\t" : "=r" (ret));
#endif
return ret;
}
/* Kernel's page table. Always active when threads are running in supervisor
* mode, or handling an interrupt.
*
* If KPTI is not enabled, this is used as a template to create per-thread
* page tables for when threads run in user mode.
*/
extern struct x86_page_tables z_x86_kernel_ptables;
#ifdef CONFIG_X86_KPTI
/* Separate page tables for user mode threads. This is never installed into the
* CPU; instead it is used as a template for creating per-thread page tables.
*/
extern struct x86_page_tables z_x86_user_ptables;
#define USER_PTABLES z_x86_user_ptables
#else
#define USER_PTABLES z_x86_kernel_ptables
#endif
/**
* @brief Fetch page table flags for a particular page
*
* Given a memory address, return the flags for the containing page's
* PDE and PTE entries. Intended for debugging.
*
* @param ptables Which set of page tables to use
* @param addr Memory address to example
* @param pde_flags Output parameter for page directory entry flags
* @param pte_flags Output parameter for page table entry flags
*/
void z_x86_mmu_get_flags(struct x86_page_tables *ptables, void *addr,
uint64_t *pde_flags, uint64_t *pte_flags);
/**
* @brief set flags in the MMU page tables
*
* Modify bits in the existing page tables for a particular memory
* range, which must be page-aligned
*
* @param ptables Which set of page tables to use
* @param ptr Starting memory address which must be page-aligned
* @param size Size of the region, must be page size multiple
* @param flags Value of bits to set in the page table entries
* @param mask Mask indicating which particular bits in the page table entries
* to modify
* @param flush Whether to flush the TLB for the modified pages, only needed
* when modifying the active page tables
*/
void z_x86_mmu_set_flags(struct x86_page_tables *ptables, void *ptr,
size_t size, uint64_t flags, uint64_t mask, bool flush);
int z_x86_mmu_validate(struct x86_page_tables *ptables, void *addr, size_t size,
bool write);
void z_x86_add_mmu_region(uintptr_t addr, size_t size, uint64_t flags);
typedef pentry_t k_mem_partition_attr_t;
#endif /* _ASMLANGUAGE */
#endif /* ZEPHYR_INCLUDE_ARCH_X86_MMUSTRUCTS_H_ */
#endif /* ZEPHYR_INCLUDE_ARCH_X86_MMU_H */
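
To make the reservation arithmetic above concrete, here is a worked
example; the RAM base and size used are made-up values, not from any
real board:

/* Assume PHYS_RAM_ADDR = 0x00100000 (1 MB), PHYS_RAM_SIZE = 64 MB,
 * CONFIG_MMU_PAGE_SIZE = 4096, legacy 32-bit (non-PAE) paging:
 *
 *   Z_X86_PT_AREA  = 4096 * 1024 entries           = 4 MB per page table
 *   Z_X86_PT_START = ROUND_DOWN(1 MB, 4 MB)        = 0
 *   Z_X86_PT_END   = ROUND_UP(1 MB + 64 MB, 4 MB)  = 68 MB
 *   Z_X86_NUM_PT   = (68 MB - 0) / 4 MB            = 17 page tables
 *   Z_X86_NUM_PD   = 1 (single toplevel page directory in this mode)
 *
 *   Z_X86_NUM_TABLE_PAGES        = 17 + 1          = 18 pages
 *   Z_X86_INITIAL_PAGETABLE_SIZE = 18 * 4096       = 73728 bytes
 *
 * With CONFIG_X86_PAE on the same system, each page table covers 2 MB,
 * giving 33 PTs + 1 PD = 34 pages, plus 0x20 bytes for the PDPT.
 */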

View file

@ -0,0 +1,39 @@
/*
* Copyright (c) 2020 Intel Corp.
* SPDX-License-Identifier: Apache-2.0
*/
/* Pagetables. These are produced by arch/x86/gen_mmu.py based on
* data in zephyr_prebuilt.elf (the result of linker pass 1).
* For the pass 1 build, an equal-sized dummy area is provided so as
* not to shift memory addresses that occur after this.
*/
#ifdef CONFIG_MMU
SECTION_DATA_PROLOGUE(pagetables,,)
{
. = ALIGN(4096);
z_x86_pagetables_start = .;
#ifdef LINKER_PASS2
KEEP(*(pagetables)) /* gen_mmu.py */
#else
KEEP(*(.dummy_pagetables)) /* from x86_mmu.c, just an empty array */
#endif /* LINKER_PASS2 */
/* Top-level paging structure is the last thing in this section */
#if CONFIG_X86_PAE
/* 4-entry PDPT */
z_x86_kernel_ptables = . - 32;
#else
/* Page directory or PML4 */
z_x86_kernel_ptables = . - 4096;
#endif /* CONFIG_X86_PAE */
} GROUP_DATA_LINK_IN(RAMABLE_REGION, ROMABLE_REGION)
#ifdef LINKER_PASS2
/DISCARD/ :
{
/* We have the real ones in this build */
*(.dummy_pagetables)
}
#endif /* LINKER_PASS2 */
#endif /* CONFIG_MMU */
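
On the C side, the pass-1 placeholder that KEEP(*(.dummy_pagetables))
pulls in might look roughly like this; the real declaration lives in
x86_mmu.c (whose diff is suppressed above), so the name and attributes
here are assumptions only:

#include <kernel.h>
#include <arch/x86/mmustructs.h>

/* Zero-filled reservation equal in size to the real page tables, so that
 * the pass-1 link produces the same memory layout as pass 2, where the
 * gen_mmu.py output replaces this array.
 */
static uint8_t dummy_pagetables[Z_X86_INITIAL_PAGETABLE_SIZE]
        __attribute__((used, aligned(4096), section(".dummy_pagetables")));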

View file

@ -19,13 +19,16 @@
* user mode. For each thread, we have:
*
* - On 32-bit
* - a toplevel PD
* - On 32-bit (PAE)
* - a toplevel PDPT
* - a set of PDs for the memory range covered by system RAM
* - On 64-bit
* - a toplevel PML4
* - a set of PDPTs for the memory range covered by system RAM
* - a set of PDs for the memory range covered by system RAM
* - On all modes:
* - a set of page directories for the memory range covered by system RAM
* - a set of page tables for the memory range covered by system RAM
* - a set of PTs for the memory range covered by system RAM
*
* Directories and tables for memory ranges outside of system RAM will be
* shared and not thread-specific.
@ -44,73 +47,21 @@
*
* The PDPT is a fairly small singleton on x86 PAE (32 bytes) and also must
* be aligned to 32 bytes, so we place it at the highest addresses of the
* page reserved for the privilege elevation stack. On 64-bit all table
* entities up to and including the PML4 are page-sized.
* page reserved for the privilege elevation stack. On 64-bit or legacy 32-bit
* all table entities up to and including the toplevel structure are page-sized.
*
* The page directories and tables require page alignment so we put them as
* additional fields in the stack object, using the below macros to compute how
* many pages we need.
*/
#define PHYS_RAM_ADDR DT_REG_ADDR(DT_CHOSEN(zephyr_sram))
#define PHYS_RAM_SIZE DT_REG_SIZE(DT_CHOSEN(zephyr_sram))
/* Define a range [Z_X86_PT_START, Z_X86_PT_END) which is the memory range
* covered by all the page tables needed for system RAM
*/
#define Z_X86_PT_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, Z_X86_PT_AREA))
#define Z_X86_PT_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \
Z_X86_PT_AREA))
/* Number of page tables needed to cover system RAM. Depends on the specific
* bounds of system RAM, but roughly 1 page table per 2MB of RAM
*/
#define Z_X86_NUM_PT ((Z_X86_PT_END - Z_X86_PT_START) / Z_X86_PT_AREA)
/* Same semantics as above, but for the page directories needed to cover
* system RAM.
*/
#define Z_X86_PD_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, Z_X86_PD_AREA))
#define Z_X86_PD_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \
Z_X86_PD_AREA))
/* Number of page directories needed to cover system RAM. Depends on the
* specific bounds of system RAM, but roughly 1 page directory per 1GB of RAM
*/
#define Z_X86_NUM_PD ((Z_X86_PD_END - Z_X86_PD_START) / Z_X86_PD_AREA)
#ifdef CONFIG_X86_64
/* Same semantics as above, but for the page directory pointer tables needed
* to cover system RAM. On 32-bit there is just one 4-entry PDPT.
*/
#define Z_X86_PDPT_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, \
Z_X86_PDPT_AREA))
#define Z_X86_PDPT_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \
Z_X86_PDPT_AREA))
/* Number of PDPTs needed to cover system RAM. Depends on the
* specific bounds of system RAM, but roughly 1 PDPT per 512GB of RAM
*/
#define Z_X86_NUM_PDPT ((Z_X86_PDPT_END - Z_X86_PDPT_START) / Z_X86_PDPT_AREA)
/* All pages needed for page tables, using computed values plus one more for
* the top-level PML4
*/
#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD + \
Z_X86_NUM_PDPT + 1)
#else /* !CONFIG_X86_64 */
/* Number of pages we need to reserve in the stack for per-thread page tables */
#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD)
#endif /* CONFIG_X86_64 */
#else /* !CONFIG_USERSPACE */
/* If we're not implementing user mode, then the MMU tables don't get changed
* on context switch and we don't need any per-thread page tables
*/
#define Z_X86_NUM_TABLE_PAGES 0UL
#endif /* CONFIG_USERSPACE */
#define Z_X86_THREAD_PT_AREA (Z_X86_NUM_TABLE_PAGES * MMU_PAGE_SIZE)
#define Z_X86_THREAD_PT_AREA (Z_X86_NUM_TABLE_PAGES * \
(uintptr_t)CONFIG_MMU_PAGE_SIZE)
#else
#define Z_X86_THREAD_PT_AREA 0UL
#endif
#if defined(CONFIG_HW_STACK_PROTECTION) || defined(CONFIG_USERSPACE)
#define Z_X86_STACK_BASE_ALIGN MMU_PAGE_SIZE
#define Z_X86_STACK_BASE_ALIGN CONFIG_MMU_PAGE_SIZE
#else
#define Z_X86_STACK_BASE_ALIGN ARCH_STACK_PTR_ALIGN
#endif
@ -120,7 +71,7 @@
* the access control granularity and we don't want other kernel data to
* unintentionally fall in the latter part of the page
*/
#define Z_X86_STACK_SIZE_ALIGN MMU_PAGE_SIZE
#define Z_X86_STACK_SIZE_ALIGN CONFIG_MMU_PAGE_SIZE
#else
#define Z_X86_STACK_SIZE_ALIGN ARCH_STACK_PTR_ALIGN
#endif
@ -136,7 +87,7 @@ struct z_x86_kernel_stack_data {
* are page-aligned and we just reserve room for them in
* Z_X86_THREAD_PT_AREA.
*/
struct x86_page_tables ptables;
uint8_t ptables[0x20];
} __aligned(0x20);
#endif /* !CONFIG_X86_64 */
@ -180,14 +131,14 @@ struct z_x86_thread_stack_header {
#endif
#ifdef CONFIG_HW_STACK_PROTECTION
char guard_page[MMU_PAGE_SIZE];
char guard_page[CONFIG_MMU_PAGE_SIZE];
#endif
#ifdef CONFIG_USERSPACE
#ifdef CONFIG_X86_64
char privilege_stack[MMU_PAGE_SIZE];
char privilege_stack[CONFIG_MMU_PAGE_SIZE];
#else
char privilege_stack[MMU_PAGE_SIZE -
char privilege_stack[CONFIG_MMU_PAGE_SIZE -
sizeof(struct z_x86_kernel_stack_data)];
struct z_x86_kernel_stack_data kernel_data;
@ -204,8 +155,8 @@ struct z_x86_thread_stack_header {
sizeof(struct z_x86_thread_stack_header)
#ifdef CONFIG_HW_STACK_PROTECTION
#define ARCH_KERNEL_STACK_RESERVED MMU_PAGE_SIZE
#define ARCH_KERNEL_STACK_OBJ_ALIGN MMU_PAGE_SIZE
#define ARCH_KERNEL_STACK_RESERVED CONFIG_MMU_PAGE_SIZE
#define ARCH_KERNEL_STACK_OBJ_ALIGN CONFIG_MMU_PAGE_SIZE
#else
#define ARCH_KERNEL_STACK_RESERVED 0
#define ARCH_KERNEL_STACK_OBJ_ALIGN ARCH_STACK_PTR_ALIGN