diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d015253f572..e38255f77a3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -46,6 +46,7 @@ config X86_64 select USE_SWITCH select USE_SWITCH_SUPPORTED select SCHED_IPI_SUPPORTED + select X86_MMU config X86_KERNEL_OFFSET int "Kernel offset from beginning of RAM" diff --git a/arch/x86/core/Kconfig.ia32 b/arch/x86/core/Kconfig.ia32 index c17106b6814..32b8cd94e79 100644 --- a/arch/x86/core/Kconfig.ia32 +++ b/arch/x86/core/Kconfig.ia32 @@ -76,6 +76,15 @@ config X86_USERSPACE supporting user-level threads that are protected from each other and from crashing the kernel. +config X86_PAE + bool "Use PAE page tables" + default y + depends on X86_MMU + help + If enabled, use PAE-style page tables instead of 32-bit page tables. + The advantage is support for the Execute Disable bit, at a cost of + more memory for paging structures. + menu "Architecture Floating Point Options" depends on CPU_HAS_FPU diff --git a/arch/x86/core/fatal.c b/arch/x86/core/fatal.c index 7eb7c5801c7..b146c0ed413 100644 --- a/arch/x86/core/fatal.c +++ b/arch/x86/core/fatal.c @@ -9,6 +9,7 @@ #include #include #include +#include LOG_MODULE_DECLARE(os); #if defined(CONFIG_BOARD_QEMU_X86) || defined(CONFIG_BOARD_QEMU_X86_64) @@ -71,18 +72,24 @@ bool z_x86_check_stack_bounds(uintptr_t addr, size_t size, uint16_t cs) start = (uintptr_t)Z_KERNEL_STACK_BUFFER( z_interrupt_stacks[cpu_id]); end = start + CONFIG_ISR_STACK_SIZE; - } else if ((cs & 0x3U) != 0U || - (_current->base.user_options & K_USER) == 0) { - /* Thread was in user mode, or is not a user mode thread. - * The normal stack buffer is what we will check. +#ifdef CONFIG_USERSPACE + } else if ((cs & 0x3U) == 0 && + (_current->base.user_options & K_USER) != 0) { + /* The low two bits of the CS register is the privilege + * level. It will be 0 in supervisor mode and 3 in user mode + * corresponding to ring 0 / ring 3. 
+ * + * If we get here, we must have been doing a syscall, check + * privilege elevation stack bounds */ + start = _current->stack_info.start - CONFIG_MMU_PAGE_SIZE; + end = _current->stack_info.start; +#endif /* CONFIG_USERSPACE */ + } else { + /* Normal thread operation, check its stack buffer */ start = _current->stack_info.start; end = Z_STACK_PTR_ALIGN(_current->stack_info.start + - _current->stack_info.size); - } else { - /* User thread was doing a syscall, check kernel stack bounds */ - start = _current->stack_info.start - MMU_PAGE_SIZE; - end = _current->stack_info.start; + _current->stack_info.size); } return (addr <= start) || (addr + size > end); @@ -146,19 +153,27 @@ static void unwind_stack(uintptr_t base_ptr, uint16_t cs) } #endif /* CONFIG_X86_EXCEPTION_STACK_TRACE */ -static inline struct x86_page_tables *get_ptables(const z_arch_esf_t *esf) +static inline uintptr_t get_cr3(const z_arch_esf_t *esf) { #if defined(CONFIG_USERSPACE) && defined(CONFIG_X86_KPTI) /* If the interrupted thread was in user mode, we did a page table * switch when we took the exception via z_x86_trampoline_to_kernel */ if ((esf->cs & 0x3) != 0) { - return z_x86_thread_page_tables_get(_current); + return _current->arch.ptables; } #else ARG_UNUSED(esf); #endif - return z_x86_page_tables_get(); + /* Return the current CR3 value, it didn't change when we took + * the exception + */ + return z_x86_cr3_get(); +} + +static inline pentry_t *get_ptables(const z_arch_esf_t *esf) +{ + return z_mem_virt_addr(get_cr3(esf)); } #ifdef CONFIG_X86_64 @@ -172,8 +187,8 @@ static void dump_regs(const z_arch_esf_t *esf) esf->r8, esf->r9, esf->r10, esf->r11); LOG_ERR("R12: 0x%016lx R13: 0x%016lx R14: 0x%016lx R15: 0x%016lx", esf->r12, esf->r13, esf->r14, esf->r15); - LOG_ERR("RSP: 0x%016lx RFLAGS: 0x%016lx CS: 0x%04lx CR3: %p", esf->rsp, - esf->rflags, esf->cs & 0xFFFFU, get_ptables(esf)); + LOG_ERR("RSP: 0x%016lx RFLAGS: 0x%016lx CS: 0x%04lx CR3: 0x%016lx", + esf->rsp, esf->rflags, esf->cs & 0xFFFFU, get_cr3(esf)); #ifdef CONFIG_X86_EXCEPTION_STACK_TRACE LOG_ERR("call trace:"); @@ -190,8 +205,8 @@ static void dump_regs(const z_arch_esf_t *esf) esf->eax, esf->ebx, esf->ecx, esf->edx); LOG_ERR("ESI: 0x%08x, EDI: 0x%08x, EBP: 0x%08x, ESP: 0x%08x", esf->esi, esf->edi, esf->ebp, esf->esp); - LOG_ERR("EFLAGS: 0x%08x CS: 0x%04x CR3: %p", esf->eflags, - esf->cs & 0xFFFFU, get_ptables(esf)); + LOG_ERR("EFLAGS: 0x%08x CS: 0x%04x CR3: 0x%08lx", esf->eflags, + esf->cs & 0xFFFFU, get_cr3(esf)); #ifdef CONFIG_X86_EXCEPTION_STACK_TRACE LOG_ERR("call trace:"); @@ -309,7 +324,7 @@ static void dump_page_fault(z_arch_esf_t *esf) } #ifdef CONFIG_X86_MMU - z_x86_dump_mmu_flags(get_ptables(esf), cr2); + z_x86_dump_mmu_flags(get_ptables(esf), (void *)cr2); #endif /* CONFIG_X86_MMU */ } #endif /* CONFIG_EXCEPTION_DEBUG */ diff --git a/arch/x86/core/ia32/crt0.S b/arch/x86/core/ia32/crt0.S index ac1b43126fe..63e6b93ea49 100644 --- a/arch/x86/core/ia32/crt0.S +++ b/arch/x86/core/ia32/crt0.S @@ -16,6 +16,7 @@ #include #include #include +#include /* exports (private APIs) */ @@ -41,7 +42,55 @@ GTEXT(_sys_resume_from_deep_sleep) #endif +.macro install_page_tables +#ifdef CONFIG_X86_MMU + /* Enable paging. If virtual memory is enabled, the instruction pointer + * is currently at a physical address. There is an identity mapping + * for all RAM, plus a virtual mapping of RAM starting at + * CONFIG_KERNEL_VM_BASE using the same paging structures. + * + * Until we enable these page tables, only physical memory addresses + * work. 
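Aside, a minimal sketch (an assumption, not part of this patch) of the fixed-offset translation that Z_MEM_PHYS_ADDR()/z_mem_virt_addr() are taken to perform here: since all of RAM is mapped both at its physical address and again starting at CONFIG_KERNEL_VM_BASE, converting between the two views is plain pointer arithmetic rather than a table walk. The real macros are not shown in this excerpt.

#include <stdint.h>

/* Assumed helpers, for illustration only */
#define ASSUMED_VM_OFFSET \
	((uintptr_t)CONFIG_KERNEL_VM_BASE - (uintptr_t)CONFIG_SRAM_BASE_ADDRESS)

static inline uintptr_t sketch_virt_to_phys(void *virt)
{
	return (uintptr_t)virt - ASSUMED_VM_OFFSET;
}

static inline void *sketch_phys_to_virt(uintptr_t phys)
{
	return (void *)(phys + ASSUMED_VM_OFFSET);
}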
+ */ + movl $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %eax + movl %eax, %cr3 + +#ifdef CONFIG_X86_PAE + /* Enable PAE */ + movl %cr4, %eax + orl $CR4_PAE, %eax + movl %eax, %cr4 + + /* IA32_EFER NXE bit set */ + movl $0xC0000080, %ecx + rdmsr + orl $0x800, %eax + wrmsr +#endif /* CONFIG_X86_PAE */ + + /* Enable paging (CR0.PG, bit 31) / write protect (CR0.WP, bit 16) */ + movl %cr0, %eax + orl $(CR0_PG | CR0_WP), %eax + movl %eax, %cr0 + +#if CONFIG_KERNEL_VM_BASE != CONFIG_SRAM_BASE_ADDRESS + /* Jump to a virtual address, which works because the identity and + * virtual mappings both are to the same physical address. + */ + lea vm_enter, %eax + jmp *%eax +vm_enter: + /* We are now executing in virtual memory. We'll un-map the identity + * mappings later once we are in the C domain + */ +#endif /* CONFIG_KERNEL_VM_BASE != CONFIG_SRAM_BASE_ADDRESS */ +#endif /* CONFIG_X86_MMU */ +.endm + SECTION_FUNC(TEXT_START, __start) +#ifndef CONFIG_XIP + install_page_tables +#endif /* CONFIG_XIP */ #include "../common.S" @@ -64,11 +113,7 @@ SECTION_FUNC(TEXT_START, __start) */ #if CONFIG_SET_GDT lgdt _gdt_rom /* load 32-bit operand size GDT */ -#endif - - -#ifdef CONFIG_SET_GDT /* If we set our own GDT, update the segment registers as well. */ movw $DATA_SEG, %ax /* data segment selector (entry = 3) */ @@ -84,7 +129,6 @@ SECTION_FUNC(TEXT_START, __start) __csSet: #endif /* CONFIG_SET_GDT */ - #if !defined(CONFIG_FPU) /* * Force an #NM exception for floating point instructions @@ -206,6 +250,10 @@ __csSet: call _x86_data_copy #endif /* CONFIG_USERSPACE */ + /* Have to do this here, the page tables aren't loaded into RAM + * until after the data copy + */ + install_page_tables #endif /* CONFIG_XIP */ /* @@ -308,30 +356,6 @@ dataWords: ret #endif /* CONFIG_XIP */ -#ifdef CONFIG_X86_MMU -z_x86_enable_paging: - /* load the page directory address into the registers*/ - movl $z_x86_kernel_ptables, %eax - movl %eax, %cr3 - - /* Enable PAE */ - movl %cr4, %eax - orl $CR4_PAE, %eax - movl %eax, %cr4 - - /* IA32_EFER NXE bit set */ - movl $0xC0000080, %ecx - rdmsr - orl $0x800, %eax - wrmsr - - /* Enable paging (CR0.PG, bit 31) / write protect (CR0.WP, bit 16) */ - movl %cr0, %eax - orl $(CR0_PG | CR0_WP), %eax - movl %eax, %cr0 - - ret -#endif /* CONFIG_X86_MMU */ #if defined(CONFIG_SSE) diff --git a/arch/x86/core/ia32/fatal.c b/arch/x86/core/ia32/fatal.c index 298ec9b562f..5256bccdfed 100644 --- a/arch/x86/core/ia32/fatal.c +++ b/arch/x86/core/ia32/fatal.c @@ -18,6 +18,9 @@ #include #include #include +#include +#include + LOG_MODULE_DECLARE(os); #ifdef CONFIG_DEBUG_COREDUMP @@ -148,7 +151,7 @@ struct task_state_segment _df_tss = { .es = DATA_SEG, .ss = DATA_SEG, .eip = (uint32_t)df_handler_top, - .cr3 = (uint32_t)&z_x86_kernel_ptables + .cr3 = Z_MEM_PHYS_ADDR((uint32_t)&z_x86_kernel_ptables) }; static __used void df_handler_bottom(void) @@ -196,7 +199,7 @@ static FUNC_NORETURN __used void df_handler_top(void) _main_tss.es = DATA_SEG; _main_tss.ss = DATA_SEG; _main_tss.eip = (uint32_t)df_handler_bottom; - _main_tss.cr3 = (uint32_t)&z_x86_kernel_ptables; + _main_tss.cr3 = z_mem_phys_addr(&z_x86_kernel_ptables); _main_tss.eflags = 0U; /* NT bit is set in EFLAGS so we will task switch back to _main_tss diff --git a/arch/x86/core/ia32/thread.c b/arch/x86/core/ia32/thread.c index 57253b11ad0..7d579f527ab 100644 --- a/arch/x86/core/ia32/thread.c +++ b/arch/x86/core/ia32/thread.c @@ -16,6 +16,7 @@ #include #include #include +#include /* forward declaration */ diff --git a/arch/x86/core/ia32/userspace.S 
b/arch/x86/core/ia32/userspace.S index 3ac93a3e89b..bf98bb266a6 100644 --- a/arch/x86/core/ia32/userspace.S +++ b/arch/x86/core/ia32/userspace.S @@ -8,6 +8,7 @@ #include #include #include +#include /* Exports */ GTEXT(z_x86_syscall_entry_stub) @@ -49,7 +50,7 @@ SECTION_FUNC(TEXT, z_x86_trampoline_to_kernel) pushl %edi /* Switch to kernel page table */ - movl $z_x86_kernel_ptables, %esi + movl $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %esi movl %esi, %cr3 /* Save old trampoline stack pointer in %edi */ @@ -154,7 +155,7 @@ SECTION_FUNC(TEXT, z_x86_syscall_entry_stub) pushl %edi /* Switch to kernel page table */ - movl $z_x86_kernel_ptables, %esi + movl $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %esi movl %esi, %cr3 /* Save old trampoline stack pointer in %edi */ @@ -304,7 +305,10 @@ SECTION_FUNC(TEXT, z_x86_userspace_enter) * want to leak any information. */ mov %edi, %esp - subl $Z_X86_PDPT_SIZE, %esp +#ifdef CONFIG_X86_PAE + /* Skip over the toplevel PDPT stored here */ + subl $0x20, %esp +#endif /* CONFIG_X86_PAE */ /* Stash some registers we are going to need to erase the user * stack. diff --git a/arch/x86/core/intel64/cpu.c b/arch/x86/core/intel64/cpu.c index c0b0f06a577..4bf38136b14 100644 --- a/arch/x86/core/intel64/cpu.c +++ b/arch/x86/core/intel64/cpu.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include /* @@ -80,8 +80,6 @@ struct x86_tss64 tss3 = { }; #endif -extern struct x86_page_tables z_x86_flat_ptables; - struct x86_cpuboot x86_cpuboot[] = { { .tr = X86_KERNEL_CPU0_TR, @@ -89,9 +87,6 @@ struct x86_cpuboot x86_cpuboot[] = { .sp = (uint64_t) z_interrupt_stacks[0] + Z_KERNEL_STACK_SIZE_ADJUST(CONFIG_ISR_STACK_SIZE), .fn = z_x86_prep_c, -#ifdef CONFIG_X86_MMU - .ptables = &z_x86_flat_ptables, -#endif }, #if CONFIG_MP_NUM_CPUS > 1 { @@ -127,9 +122,6 @@ void arch_start_cpu(int cpu_num, k_thread_stack_t *stack, int sz, x86_cpuboot[cpu_num].sp = (uint64_t) Z_KERNEL_STACK_BUFFER(stack) + sz; x86_cpuboot[cpu_num].fn = fn; x86_cpuboot[cpu_num].arg = arg; -#ifdef CONFIG_X86_MMU - x86_cpuboot[cpu_num].ptables = &z_x86_kernel_ptables; -#endif /* CONFIG_X86_MMU */ z_loapic_ipi(apic_id, LOAPIC_ICR_IPI_INIT, 0); k_busy_wait(10000); diff --git a/arch/x86/core/intel64/locore.S b/arch/x86/core/intel64/locore.S index 6787c541280..0eb7b493410 100644 --- a/arch/x86/core/intel64/locore.S +++ b/arch/x86/core/intel64/locore.S @@ -10,6 +10,7 @@ #include #include #include +#include .macro read_tsc var_name push %rax @@ -21,8 +22,69 @@ pop %rax .endm +/* + * Definitions/macros for enabling paging + */ + +/* Long mode, no-execute, syscall */ +#define EFER_BITS (X86_EFER_MSR_LME | X86_EFER_MSR_NXE | X86_EFER_MSR_SCE) + +/* Paging, write-protect */ +#define CR0_BITS (CR0_PG | CR0_WP) + +/* PAE, SSE */ +#define CR4_BITS (CR4_PAE | CR4_OSFXSR) + +.macro set_efer + movl $X86_EFER_MSR, %ecx + rdmsr + orl $EFER_BITS, %eax + wrmsr +.endm + +.macro install_pagetables_32 + movl %cr4, %eax + orl $CR4_BITS, %eax + movl %eax, %cr4 + clts + + /* Page tables created at build time by gen_mmu.py */ + movl $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %eax + movl %eax, %cr3 + + set_efer + + movl %cr0, %eax + orl $CR0_BITS, %eax + movl %eax, %cr0 +.endm + +.macro install_pagetables_64 + /* Here, we are already in long mode with paging enabled and + * just need to switch to our own page tables, but let's be + * paranoid and ensure CR4, CR0, and EFER_MSR are set up + * exactly how we expect. 
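For reference, the architectural bit positions assumed to sit behind the EFER_BITS/CR0_BITS/CR4_BITS masks used in these macros (taken from the Intel SDM as a review aid; the real Zephyr definitions live in the arch headers and are not shown in this excerpt):

/* Review aid only; names are placeholders, not Zephyr's */
#define SDM_EFER_MSR   0xC0000080U  /* IA32_EFER MSR address        */
#define SDM_EFER_SCE   (1U << 0)    /* syscall/sysret enable        */
#define SDM_EFER_LME   (1U << 8)    /* long mode enable             */
#define SDM_EFER_NXE   (1U << 11)   /* no-execute enable            */
#define SDM_CR0_WP     (1U << 16)   /* supervisor write protection  */
#define SDM_CR0_PG     (1U << 31)   /* enable paging                */
#define SDM_CR4_PAE    (1U << 5)    /* physical address extension   */
#define SDM_CR4_OSFXSR (1U << 9)    /* FXSAVE/FXRSTOR and SSE       */

/* So EFER_BITS == 0x901, CR0_BITS == 0x80010000, CR4_BITS == 0x220 */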
Logic is the same as install_pagetables_32 + */ + movq %cr4, %rax + orq $CR4_BITS, %rax + movq %rax, %cr4 + clts + + movq $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %rax + movq %rax, %cr3 + + set_efer + + movq %cr0, %rax + /* Use 32-bit instructions due to assembler fussiness with large + * immediate values with `orq`, CR0_PG is bit 31. We don't ever set any + * high bits in cr0 anyway. + */ + orl $CR0_BITS, %eax + movq %rax, %cr0 +.endm + .section .locore,"ax" -.code32 #if CONFIG_MP_NUM_CPUS > 1 @@ -79,7 +141,6 @@ unknown_loapic_id: .code32 .globl __start __start: - /* * kernel execution begins here in 32-bit mode, with flat-mode * descriptors in all segment registers, interrupts disabled. @@ -98,7 +159,6 @@ __start: * next, clear the BSS. note we're still in 32-bit mode, * so the BSS must fit entirely in the first 4GB of RAM. */ - cld xorl %eax, %eax movl $__bss_start, %edi @@ -108,37 +168,17 @@ __start: movl $x86_cpuboot, %ebp /* BSP is always logical CPU id 0 */ movl %ebx, __x86_cpuboot_t_arg_OFFSET(%ebp) /* multiboot info */ - /* - * transition to long mode, reload the segment registers, - * and configure per-CPU stuff: GS, task register, stack. - */ - -go64: movl %cr4, %eax /* enable PAE and SSE */ - orl $(CR4_PAE | CR4_OSFXSR), %eax - movl %eax, %cr4 - clts - -#ifdef CONFIG_X86_MMU - movl __x86_cpuboot_t_ptables_OFFSET(%ebp), %eax -#else - movl $z_x86_flat_ptables, %eax -#endif - movl %eax, %cr3 - - movl $X86_EFER_MSR, %ecx /* enable long mode, no-execute, syscall */ - rdmsr - orl $(X86_EFER_MSR_LME | X86_EFER_MSR_NXE | X86_EFER_MSR_SCE), %eax - wrmsr - - movl %cr0, %eax /* enable paging */ - orl $(CR0_PG | CR0_WP), %eax - movl %eax, %cr0 +go64: /* Install page tables and transition to long mode */ + install_pagetables_32 jmpl $X86_KERNEL_CS, $enter_code64 /* Long mode entry point. Arrive here from the code * immediately above (shared between main CPU startup and AP - * startup), or from EFI entry in __start64 + * startup), or from EFI entry in __start64. + * + * Here we reload the segment registers, + * and configure per-CPU stuff: GS, task register, stack. */ .code64 enter_code64: @@ -200,20 +240,7 @@ __start64: lidt idt80 lgdt gdt80 - /* These state and flag settings really should be done later, - * in the shared startup path, they aren't required for mode - * transition and having them in the 32 bit stub means they - * have to be duplicated here. - */ - movq %cr4, %rax - orq $(CR4_PAE | CR4_OSFXSR), %rax - movq %rax, %cr4 - clts - movq $X86_EFER_MSR, %rcx - rdmsr - orq $(X86_EFER_MSR_NXE | X86_EFER_MSR_SCE), %rax - wrmsr - cld + install_pagetables_64 /* Disable 8259 PIT. Almost certainly not needed on modern * UEFI platforms taking this code path, but... @@ -949,44 +976,6 @@ idt80: /* LIDT descriptor for 64 bit mode */ .word (idt_end - idt - 1) .quad idt -/* Initial page tables for long mode entry. This generates a second - * level page full of 512 1G PTE entries of the form: - * - * 0x000000nnn0000083 - * - * Where nnn is an identity-mapped 1G page index in the range - * 0x000-0x1ff, and 0x83 marks a present, 1G, read/write page - * entry. It's split up somewhat awkwardly to get around gas's - * recursion limits in macro expansion. - * - * This maps the first 512GB of memory space by default, which will - * hopefully be enough to reach everything we need before we can - * bootstrap the real page tables later. 
- */ -.macro populate_ptable base, count=64 - .long 0x00000083 - .long 64 - \count + \base - .long 0x40000083 - .long 64 - \count + \base - .long 0x80000083 - .long 64 - \count + \base - .long 0xC0000083 - .long 64 - \count + \base -.if \count > 1 - populate_ptable \base, (\count - 1) -.endif -.endm - -.align 4096 -.globl z_x86_flat_ptables -z_x86_flat_ptables: - .long pdp + 0x03 /* 0x03 = R/W, P */ - .long 0 - .fill 4088, 1, 0 -pdp: - populate_ptable 0 - populate_ptable 64 - .section .gdt,"ad" /* diff --git a/arch/x86/core/intel64/thread.c b/arch/x86/core/intel64/thread.c index d9a3182b18c..451f34b50ca 100644 --- a/arch/x86/core/intel64/thread.c +++ b/arch/x86/core/intel64/thread.c @@ -8,6 +8,7 @@ #include #include #include +#include extern void x86_sse_init(struct k_thread *); /* in locore.S */ diff --git a/arch/x86/core/intel64/userspace.S b/arch/x86/core/intel64/userspace.S index d6aaa7512a9..00cb52249b0 100644 --- a/arch/x86/core/intel64/userspace.S +++ b/arch/x86/core/intel64/userspace.S @@ -7,6 +7,7 @@ #include #include #include +#include #ifdef CONFIG_X86_KPTI /* Copy interrupt return stack context to the trampoline stack, switch back @@ -83,7 +84,7 @@ z_x86_syscall_entry_stub: /* Load kernel's page table */ pushq %rax - movq $z_x86_kernel_ptables, %rax + movq $Z_MEM_PHYS_ADDR(z_x86_kernel_ptables), %rax movq %rax, %cr3 popq %rax movq $0, -8(%rsp) /* Delete stashed RAX data */ diff --git a/arch/x86/core/offsets/ia32_offsets.c b/arch/x86/core/offsets/ia32_offsets.c index 2c10938bedc..3cd68729748 100644 --- a/arch/x86/core/offsets/ia32_offsets.c +++ b/arch/x86/core/offsets/ia32_offsets.c @@ -36,7 +36,6 @@ GEN_OFFSET_SYM(_thread_arch_t, excNestCount); #ifdef CONFIG_USERSPACE GEN_OFFSET_SYM(_thread_arch_t, psp); GEN_OFFSET_SYM(_thread_arch_t, ptables); -GEN_ABSOLUTE_SYM(Z_X86_PDPT_SIZE, sizeof(struct x86_mmu_pdpt)); #endif GEN_OFFSET_SYM(_thread_arch_t, preempFloatReg); @@ -65,9 +64,4 @@ GEN_OFFSET_SYM(z_arch_esf_t, errorCode); GEN_OFFSET_SYM(z_arch_esf_t, eip); GEN_OFFSET_SYM(z_arch_esf_t, cs); GEN_OFFSET_SYM(z_arch_esf_t, eflags); - -/* size of the MMU_REGION structure. 
Used by linker scripts */ - -GEN_ABSOLUTE_SYM(__MMU_REGION_SIZEOF, sizeof(struct mmu_region)); - #endif /* _X86_OFFSETS_INC_ */ diff --git a/arch/x86/core/offsets/intel64_offsets.c b/arch/x86/core/offsets/intel64_offsets.c index f71c81f4d8b..452d6da2098 100644 --- a/arch/x86/core/offsets/intel64_offsets.c +++ b/arch/x86/core/offsets/intel64_offsets.c @@ -49,9 +49,6 @@ GEN_OFFSET_SYM(x86_cpuboot_t, gs_base); GEN_OFFSET_SYM(x86_cpuboot_t, sp); GEN_OFFSET_SYM(x86_cpuboot_t, fn); GEN_OFFSET_SYM(x86_cpuboot_t, arg); -#ifdef CONFIG_X86_MMU -GEN_OFFSET_SYM(x86_cpuboot_t, ptables); -#endif /* CONFIG_X86_MMU */ GEN_ABSOLUTE_SYM(__X86_CPUBOOT_SIZEOF, sizeof(x86_cpuboot_t)); #endif /* _X86_OFFSETS_INC_ */ diff --git a/arch/x86/core/prep_c.c b/arch/x86/core/prep_c.c index 7a73e1cf8d6..cc6f908486f 100644 --- a/arch/x86/core/prep_c.c +++ b/arch/x86/core/prep_c.c @@ -8,6 +8,7 @@ #include #include #include +#include extern FUNC_NORETURN void z_cstart(void); extern void x86_64_irq_init(void); @@ -25,6 +26,10 @@ FUNC_NORETURN void z_x86_prep_c(void *arg) z_x86_early_serial_init(); #endif +#ifdef CONFIG_MMU + z_x86_mmu_init(); +#endif + #ifdef CONFIG_X86_64 x86_64_irq_init(); #endif @@ -35,10 +40,6 @@ FUNC_NORETURN void z_x86_prep_c(void *arg) ARG_UNUSED(info); #endif -#ifdef CONFIG_X86_MMU - z_x86_paging_init(); -#endif - #if CONFIG_X86_STACK_PROTECTION for (int i = 0; i < CONFIG_MP_NUM_CPUS; i++) { z_x86_set_stack_guard(z_interrupt_stacks[i]); diff --git a/arch/x86/core/userspace.c b/arch/x86/core/userspace.c index b2f54c3cd9c..e38dfe4caa8 100644 --- a/arch/x86/core/userspace.c +++ b/arch/x86/core/userspace.c @@ -9,19 +9,20 @@ #include #include #include +#include #ifndef CONFIG_X86_KPTI -/* Change to new set of page tables. ONLY intended for use from - * z_x88_swap_update_page_tables(). This changes CR3, no memory access - * afterwards is legal unless it is known for sure that the relevant - * mappings are identical wrt supervisor mode until we iret out. +/* Set CR3 to a physical address. There must be a valid top-level paging + * structure here or the CPU will triple fault. The incoming page tables must + * have the same kernel mappings wrt supervisor mode. Don't use this function + * unless you know exactly what you are doing. */ -static inline void page_tables_set(struct x86_page_tables *ptables) +static inline void cr3_set(uintptr_t phys) { #ifdef CONFIG_X86_64 - __asm__ volatile("movq %0, %%cr3\n\t" : : "r" (ptables) : "memory"); + __asm__ volatile("movq %0, %%cr3\n\t" : : "r" (phys) : "memory"); #else - __asm__ volatile("movl %0, %%cr3\n\t" : : "r" (ptables) : "memory"); + __asm__ volatile("movl %0, %%cr3\n\t" : : "r" (phys) : "memory"); #endif } @@ -43,7 +44,7 @@ static inline void page_tables_set(struct x86_page_tables *ptables) */ void z_x86_swap_update_page_tables(struct k_thread *incoming) { - struct x86_page_tables *ptables; + uintptr_t ptables_phys; #ifndef CONFIG_X86_64 /* 64-bit uses syscall/sysret which switches stacks manually, @@ -57,10 +58,10 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming) /* Check first that we actually need to do this, since setting * CR3 involves an expensive full TLB flush. 
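A hedged sketch of what z_x86_cr3_get() is assumed to reduce to (the helper itself is not shown in this excerpt); it mirrors cr3_set() above and is why the value can be compared directly against the physical address now stored in arch.ptables:

static inline uintptr_t sketch_cr3_get(void)
{
	uintptr_t phys;

#ifdef CONFIG_X86_64
	__asm__ volatile("movq %%cr3, %0" : "=r" (phys));
#else
	__asm__ volatile("movl %%cr3, %0" : "=r" (phys));
#endif
	/* CR3 holds the physical address of the top-level paging structure */
	return phys;
}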
*/ - ptables = z_x86_thread_page_tables_get(incoming); + ptables_phys = incoming->arch.ptables; - if (ptables != z_x86_page_tables_get()) { - page_tables_set(ptables); + if (ptables_phys != z_x86_cr3_get()) { + cr3_set(ptables_phys); } } #endif /* CONFIG_X86_KPTI */ @@ -99,7 +100,7 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread) z_x86_thread_pt_init(thread); initial_entry = drop_to_user; } else { - thread->arch.ptables = &z_x86_kernel_ptables; + thread->arch.ptables = z_mem_phys_addr(&z_x86_kernel_ptables); initial_entry = z_thread_entry; } @@ -115,7 +116,7 @@ FUNC_NORETURN void arch_user_mode_enter(k_thread_entry_t user_entry, * started in user mode already had this done via z_setup_new_thread() */ if (_current->mem_domain_info.mem_domain != NULL) { - z_x86_apply_mem_domain(_current->arch.ptables, + z_x86_apply_mem_domain(_current, _current->mem_domain_info.mem_domain); } diff --git a/arch/x86/core/x86_mmu.c b/arch/x86/core/x86_mmu.c index 5a5bd6d265f..46ccf10858f 100644 --- a/arch/x86/core/x86_mmu.c +++ b/arch/x86/core/x86_mmu.c @@ -1,530 +1,218 @@ /* * Copyright (c) 2011-2014 Wind River Systems, Inc. - * Copyright (c) 2017 Intel Corporation + * Copyright (c) 2017-2020 Intel Corporation * * SPDX-License-Identifier: Apache-2.0 */ + #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include + LOG_MODULE_DECLARE(os); -void z_pcie_add_mmu_regions(void); - -#define PHYS_RAM_ADDR DT_REG_ADDR(DT_CHOSEN(zephyr_sram)) -#define PHYS_RAM_SIZE DT_REG_SIZE(DT_CHOSEN(zephyr_sram)) - -/* Despite our use of PAE page tables, we do not (and will never) actually - * support PAE. Use a 64-bit x86 target if you have that much RAM. +/* "dummy" pagetables for the first-phase build. The real page tables + * are produced by gen-mmu.py based on data read in zephyr-prebuilt.elf, + * and this dummy array is discarded. */ -BUILD_ASSERT(PHYS_RAM_ADDR + PHYS_RAM_SIZE - 1ULL <= - (unsigned long long)UINTPTR_MAX); - -/* Common regions for all x86 processors. - * Peripheral I/O ranges configured at the SOC level - */ - -/* Mark text and rodata as read-only. - * Userspace may read all text and rodata. - */ -MMU_BOOT_REGION(&_image_text_start, &_image_text_size, - Z_X86_MMU_US); - -MMU_BOOT_REGION(&_image_rodata_start, &_image_rodata_size, - Z_X86_MMU_US | Z_X86_MMU_XD); - -#ifdef CONFIG_USERSPACE -MMU_BOOT_REGION(&_app_smem_start, &_app_smem_size, - Z_X86_MMU_RW | Z_X86_MMU_XD); -#endif - -#ifdef CONFIG_COVERAGE_GCOV -MMU_BOOT_REGION(&__gcov_bss_start, &__gcov_bss_size, - Z_X86_MMU_RW | Z_X86_MMU_US | Z_X86_MMU_XD); -#endif - -#ifdef CONFIG_X86_64 -extern char _locore_start[]; -extern char _locore_size[]; -extern char _lorodata_start[]; -extern char _lorodata_size[]; -extern char _lodata_start[]; -extern char _lodata_size[]; - -/* Early boot regions that need to be in low memory to be comprehensible - * by the CPU in 16-bit mode - */ - -MMU_BOOT_REGION(&_locore_start, &_locore_size, 0); -MMU_BOOT_REGION(&_lorodata_start, &_lorodata_size, Z_X86_MMU_XD); -MMU_BOOT_REGION(&_lodata_start, &_lodata_size, Z_X86_MMU_RW | Z_X86_MMU_XD); -#endif - -/* __kernel_ram_size includes all unused memory, which is used for heaps. - * User threads cannot access this unless granted at runtime. This is done - * automatically for stacks. 
- */ -MMU_BOOT_REGION(&__kernel_ram_start, &__kernel_ram_size, - Z_X86_MMU_RW | Z_X86_MMU_XD); +Z_GENERIC_SECTION(.dummy_pagetables) +char z_x86_dummy_pagetables[Z_X86_INITIAL_PAGETABLE_SIZE]; /* - * Inline functions for setting memory addresses in page table structures + * Definitions for building an ontology of paging levels and capabilities + * at each level */ -#ifdef CONFIG_X86_64 -static inline void pml4e_update_pdpt(uint64_t *pml4e, struct x86_mmu_pdpt *pdpt) -{ - uintptr_t pdpt_addr = (uintptr_t)pdpt; - - *pml4e = ((*pml4e & ~Z_X86_MMU_PML4E_PDPT_MASK) | - (pdpt_addr & Z_X86_MMU_PML4E_PDPT_MASK)); -} -#endif /* CONFIG_X86_64 */ - -static inline void pdpte_update_pd(uint64_t *pdpte, struct x86_mmu_pd *pd) -{ - uintptr_t pd_addr = (uintptr_t)pd; - -#ifdef CONFIG_X86_64 - __ASSERT((*pdpte & Z_X86_MMU_PS) == 0, "PDPT is for 1GB page"); -#endif - *pdpte = ((*pdpte & ~Z_X86_MMU_PDPTE_PD_MASK) | - (pd_addr & Z_X86_MMU_PDPTE_PD_MASK)); -} - -static inline void pde_update_pt(uint64_t *pde, struct x86_mmu_pt *pt) -{ - uintptr_t pt_addr = (uintptr_t)pt; - - __ASSERT((*pde & Z_X86_MMU_PS) == 0, "pde is for 2MB page"); - - *pde = ((*pde & ~Z_X86_MMU_PDE_PT_MASK) | - (pt_addr & Z_X86_MMU_PDE_PT_MASK)); -} - -static inline void pte_update_addr(uint64_t *pte, uintptr_t addr) -{ - *pte = ((*pte & ~Z_X86_MMU_PTE_ADDR_MASK) | - (addr & Z_X86_MMU_PTE_ADDR_MASK)); -} - -/* - * Functions for dumping page tables to console +/* Data structure describing the characteristics of a particular paging + * level */ +struct paging_level { + /* What bits are used to store physical address */ + pentry_t mask; -/* Works for PDPT, PD, PT entries, the bits we check here are all the same. - * - * Not trying to capture every flag, just the most interesting stuff, - * Present, write, XD, user, in typically encountered combinations. - */ -static bool dump_entry_flags(const char *name, uint64_t flags) -{ - if ((flags & Z_X86_MMU_P) == 0) { - LOG_ERR("%s: Non-present", name); - return false; - } + /* Number of entries in this paging structure */ + size_t entries; - LOG_ERR("%s: 0x%016llx %s, %s, %s", name, flags, - flags & MMU_ENTRY_WRITE ? - "Writable" : "Read-only", - flags & MMU_ENTRY_USER ? - "User" : "Supervisor", - flags & MMU_ENTRY_EXECUTE_DISABLE ? - "Execute Disable" : "Execute Enabled"); - return true; -} - -void z_x86_dump_mmu_flags(struct x86_page_tables *ptables, uintptr_t addr) -{ - uint64_t entry; - -#ifdef CONFIG_X86_64 - entry = *z_x86_get_pml4e(ptables, addr); - if (!dump_entry_flags("PML4E", entry)) { - return; - } - - entry = *z_x86_pdpt_get_pdpte(z_x86_pml4e_get_pdpt(entry), addr); - if (!dump_entry_flags("PDPTE", entry)) { - return; - } -#else - /* 32-bit doesn't have anything interesting in the PDPTE except - * the present bit + /* How many bits to right-shift a virtual address to obtain the + * appropriate entry within this table. + * + * The memory scope of each entry in this table is 1 << shift. 
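A worked example (illustrative standard C, address chosen arbitrarily) of the decomposition these shift/entries fields encode for 4-level paging; the get_index() helper further down performs the same (virt >> shift) % entries computation per level:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uintptr_t virt = 0x123456789ABCUL;

	/* 9-bit indices at bits 39/30/21/12, plus a 12-bit page offset */
	printf("PML4 %#lx PDPT %#lx PD %#lx PT %#lx offset %#lx\n",
	       (unsigned long)((virt >> 39) % 512),	/* 0x24  */
	       (unsigned long)((virt >> 30) % 512),	/* 0xd1  */
	       (unsigned long)((virt >> 21) % 512),	/* 0xb3  */
	       (unsigned long)((virt >> 12) % 512),	/* 0x189 */
	       (unsigned long)(virt % 4096));		/* 0xabc */
	return 0;
}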
*/ - entry = *z_x86_get_pdpte(ptables, addr); - if ((entry & Z_X86_MMU_P) == 0) { - LOG_ERR("PDPTE: Non-present"); - return; - } + unsigned int shift; +#ifdef CONFIG_EXCEPTION_DEBUG + /* Name of this level, for debug purposes */ + const char *name; #endif +}; - entry = *z_x86_pd_get_pde(z_x86_pdpte_get_pd(entry), addr); - if (!dump_entry_flags(" PDE", entry)) { - return; - } - - entry = *z_x86_pt_get_pte(z_x86_pde_get_pt(entry), addr); - if (!dump_entry_flags(" PTE", entry)) { - return; - } -} - -static char get_entry_code(uint64_t value) -{ - char ret; - - if ((value & Z_X86_MMU_P) == 0) { - ret = '.'; - } else { - if ((value & Z_X86_MMU_RW) != 0) { - /* Writable page */ - if ((value & Z_X86_MMU_XD) != 0) { - /* RW */ - ret = 'w'; - } else { - /* RWX */ - ret = 'a'; - } - } else { - if ((value & Z_X86_MMU_XD) != 0) { - /* R */ - ret = 'r'; - } else { - /* RX */ - ret = 'x'; - } - } - - if ((value & Z_X86_MMU_US) != 0) { - /* Uppercase indicates user mode access */ - ret = toupper(ret); - } - } - - return ret; -} - -static void print_entries(uint64_t entries_array[], size_t count) -{ - int column = 0; - - for (int i = 0; i < count; i++) { - printk("%c", get_entry_code(entries_array[i])); - - column++; - if (column == 64) { - column = 0; - printk("\n"); - } - } - - if (column != 0) { - printk("\n"); - } -} - -static void z_x86_dump_pt(struct x86_mmu_pt *pt, uintptr_t base, int index) -{ - printk("Page table %d for 0x%016lX - 0x%016lX at %p\n", - index, base, base + Z_X86_PT_AREA - 1, pt); - - print_entries(pt->entry, Z_X86_NUM_PT_ENTRIES); -} - -static void z_x86_dump_pd(struct x86_mmu_pd *pd, uintptr_t base, int index) -{ - printk("Page directory %d for 0x%016lX - 0x%016lX at %p\n", - index, base, base + Z_X86_PD_AREA - 1, pd); - - print_entries(pd->entry, Z_X86_NUM_PD_ENTRIES); - - for (int i = 0; i < Z_X86_NUM_PD_ENTRIES; i++) { - struct x86_mmu_pt *pt; - uint64_t pde = pd->entry[i]; - - if (((pde & Z_X86_MMU_P) == 0) || ((pde & Z_X86_MMU_PS) != 0)) { - /* Skip non-present, or 2MB directory entries, there's - * no page table to examine */ - continue; - } - pt = z_x86_pde_get_pt(pde); - - z_x86_dump_pt(pt, base + (i * Z_X86_PT_AREA), i); - } -} - -static void z_x86_dump_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t base, - int index) -{ - printk("Page directory pointer table %d for 0x%0816lX - 0x%016lX at %p\n", - index, base, base + Z_X86_PDPT_AREA - 1, pdpt); - - print_entries(pdpt->entry, Z_X86_NUM_PDPT_ENTRIES); - - for (int i = 0; i < Z_X86_NUM_PDPT_ENTRIES; i++) { - struct x86_mmu_pd *pd; - uint64_t pdpte = pdpt->entry[i]; - - if ((pdpte & Z_X86_MMU_P) == 0) { - continue; - } -#ifdef CONFIG_X86_64 - if ((pdpte & Z_X86_MMU_PS) != 0) { - continue; - } -#endif - pd = z_x86_pdpte_get_pd(pdpte); - z_x86_dump_pd(pd, base + (i * Z_X86_PD_AREA), i); - } -} - -#ifdef CONFIG_X86_64 -static void z_x86_dump_pml4(struct x86_mmu_pml4 *pml4) -{ - printk("Page mapping level 4 at %p for all memory addresses\n", pml4); - - print_entries(pml4->entry, Z_X86_NUM_PML4_ENTRIES); - - for (int i = 0; i < Z_X86_NUM_PML4_ENTRIES; i++) { - struct x86_mmu_pdpt *pdpt; - uint64_t pml4e = pml4->entry[i]; - - if ((pml4e & Z_X86_MMU_P) == 0) { - continue; - } - - pdpt = z_x86_pml4e_get_pdpt(pml4e); - z_x86_dump_pdpt(pdpt, i * Z_X86_PDPT_AREA, i); - } -} - -void z_x86_dump_page_tables(struct x86_page_tables *ptables) -{ - z_x86_dump_pml4(z_x86_get_pml4(ptables)); -} - -#else -void z_x86_dump_page_tables(struct x86_page_tables *ptables) -{ - z_x86_dump_pdpt(z_x86_get_pdpt(ptables, 0), 0, 0); -} -#endif - -void 
z_x86_mmu_get_flags(struct x86_page_tables *ptables, void *addr, - uint64_t *pde_flags, uint64_t *pte_flags) -{ - *pde_flags = *z_x86_get_pde(ptables, (uintptr_t)addr) & - ~Z_X86_MMU_PDE_PT_MASK; - - if ((*pde_flags & Z_X86_MMU_P) != 0) { - *pte_flags = *z_x86_get_pte(ptables, (uintptr_t)addr) & - ~Z_X86_MMU_PTE_ADDR_MASK; - } else { - *pte_flags = 0; - } -} - -/* Given an address/size pair, which corresponds to some memory address - * within a table of table_size, return the maximum number of bytes to - * examine so we look just to the end of the table and no further. +/* Flags for all entries in intermediate paging levels. + * Fortunately, the same bits are set for all intermediate levels for all + * three paging modes. * - * If size fits entirely within the table, just return size. + * Obviously P is set. + * + * We want RW and US bit always set; actual access control will be + * done at the leaf level. + * + * XD (if supported) always 0. Disabling execution done at leaf level. + * + * PCD/PWT always 0. Caching properties again done at leaf level. */ -static size_t get_table_max(uintptr_t addr, size_t size, size_t table_size) -{ - size_t table_remaining; - - addr &= (table_size - 1); - table_remaining = table_size - addr; - - if (size < table_remaining) { - return size; - } else { - return table_remaining; - } -} - -/* Range [addr, addr + size) must fall within the bounds of the pt */ -static int x86_mmu_validate_pt(struct x86_mmu_pt *pt, uintptr_t addr, - size_t size, bool write) -{ - uintptr_t pos = addr; - size_t remaining = size; - int ret = 0; - - while (true) { - uint64_t pte = *z_x86_pt_get_pte(pt, pos); - - if ((pte & Z_X86_MMU_P) == 0 || (pte & Z_X86_MMU_US) == 0 || - (write && (pte & Z_X86_MMU_RW) == 0)) { - ret = -1; - break; - } - - if (remaining <= MMU_PAGE_SIZE) { - break; - } - - remaining -= MMU_PAGE_SIZE; - pos += MMU_PAGE_SIZE; - } - - return ret; -} - -/* Range [addr, addr + size) must fall within the bounds of the pd */ -static int x86_mmu_validate_pd(struct x86_mmu_pd *pd, uintptr_t addr, - size_t size, bool write) -{ - uintptr_t pos = addr; - size_t remaining = size; - int ret = 0; - size_t to_examine; - - while (remaining) { - uint64_t pde = *z_x86_pd_get_pde(pd, pos); - - if ((pde & Z_X86_MMU_P) == 0 || (pde & Z_X86_MMU_US) == 0 || - (write && (pde & Z_X86_MMU_RW) == 0)) { - ret = -1; - break; - } - - to_examine = get_table_max(pos, remaining, Z_X86_PT_AREA); - - if ((pde & Z_X86_MMU_PS) == 0) { - /* Not a 2MB PDE. Need to check all the linked - * tables for this entry - */ - struct x86_mmu_pt *pt; - - pt = z_x86_pde_get_pt(pde); - ret = x86_mmu_validate_pt(pt, pos, to_examine, write); - if (ret != 0) { - break; - } - } else { - ret = 0; - } - - remaining -= to_examine; - pos += to_examine; - } - - return ret; -} - -/* Range [addr, addr + size) must fall within the bounds of the pdpt */ -static int x86_mmu_validate_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t addr, - size_t size, bool write) -{ - uintptr_t pos = addr; - size_t remaining = size; - int ret = 0; - size_t to_examine; - - while (remaining) { - uint64_t pdpte = *z_x86_pdpt_get_pdpte(pdpt, pos); - - if ((pdpte & Z_X86_MMU_P) == 0) { - /* Non-present */ - ret = -1; - break; - } +#define INT_FLAGS (MMU_P | MMU_RW | MMU_US) +/* Paging level ontology for the selected paging mode. 
+ * + * See Figures 4-4, 4-7, 4-11 in the Intel SDM, vol 3A + */ +static const struct paging_level paging_levels[] = { #ifdef CONFIG_X86_64 - if ((pdpte & Z_X86_MMU_US) == 0 || - (write && (pdpte & Z_X86_MMU_RW) == 0)) { - ret = -1; - break; - } + /* Page Map Level 4 */ + { + .mask = 0x7FFFFFFFFFFFF000ULL, + .entries = 512U, + .shift = 39U, +#ifdef CONFIG_EXCEPTION_DEBUG + .name = "PML4" #endif - to_examine = get_table_max(pos, remaining, Z_X86_PD_AREA); - -#ifdef CONFIG_X86_64 - /* Check if 1GB page, if not, examine linked page directory */ - if ((pdpte & Z_X86_MMU_PS) == 0) { -#endif - struct x86_mmu_pd *pd = z_x86_pdpte_get_pd(pdpte); - - ret = x86_mmu_validate_pd(pd, pos, to_examine, write); - if (ret != 0) { - break; - } -#ifdef CONFIG_X86_64 - } else { - ret = 0; - } -#endif - remaining -= to_examine; - pos += to_examine; - } - - return ret; -} - -#ifdef CONFIG_X86_64 -static int x86_mmu_validate_pml4(struct x86_mmu_pml4 *pml4, uintptr_t addr, - size_t size, bool write) -{ - uintptr_t pos = addr; - size_t remaining = size; - int ret = 0; - size_t to_examine; - - while (remaining) { - uint64_t pml4e = *z_x86_pml4_get_pml4e(pml4, pos); - struct x86_mmu_pdpt *pdpt; - - if ((pml4e & Z_X86_MMU_P) == 0 || (pml4e & Z_X86_MMU_US) == 0 || - (write && (pml4e & Z_X86_MMU_RW) == 0)) { - ret = -1; - break; - } - - to_examine = get_table_max(pos, remaining, Z_X86_PDPT_AREA); - pdpt = z_x86_pml4e_get_pdpt(pml4e); - - ret = x86_mmu_validate_pdpt(pdpt, pos, to_examine, write); - if (ret != 0) { - break; - } - - remaining -= to_examine; - pos += to_examine; - } - - return ret; -} + }, #endif /* CONFIG_X86_64 */ - -int z_x86_mmu_validate(struct x86_page_tables *ptables, void *addr, size_t size, - bool write) -{ - int ret; - +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + /* Page Directory Pointer Table */ + { + .mask = 0x7FFFFFFFFFFFF000ULL, #ifdef CONFIG_X86_64 - struct x86_mmu_pml4 *pml4 = z_x86_get_pml4(ptables); - - ret = x86_mmu_validate_pml4(pml4, (uintptr_t)addr, size, write); + .entries = 512U, #else - struct x86_mmu_pdpt *pdpt = z_x86_get_pdpt(ptables, (uintptr_t)addr); - - ret = x86_mmu_validate_pdpt(pdpt, (uintptr_t)addr, size, write); + /* PAE version */ + .entries = 4U, #endif - -#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION - __asm__ volatile ("lfence" : : : "memory"); + .shift = 30U, +#ifdef CONFIG_EXCEPTION_DEBUG + .name = "PDPT" #endif + }, +#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */ + /* Page Directory */ + { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + .mask = 0x7FFFFFFFFFFFF000ULL, + .entries = 512U, + .shift = 21U, +#else + /* 32-bit */ + .mask = 0xFFFFF000U, + .entries = 1024U, + .shift = 22U, +#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */ +#ifdef CONFIG_EXCEPTION_DEBUG + .name = "PD" +#endif + }, + /* Page Table */ + { +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + .mask = 0x07FFFFFFFFFFF000ULL, + .entries = 512U, + .shift = 12U, +#else + /* 32-bit */ + .mask = 0xFFFFF000U, + .entries = 1024U, + .shift = 12U, +#endif /* CONFIG_X86_64 || CONFIG_X86_PAE */ +#ifdef CONFIG_EXCEPTION_DEBUG + .name = "PT" +#endif + } +}; - return ret; +#define NUM_LEVELS ARRAY_SIZE(paging_levels) + +/* + * Utility functions + */ + +/* For a physical address, return its permanent virtual mapping in the kernel's + * address space + */ +static inline void *ram_phys_to_virt(uintptr_t phys) +{ + return (void *)(phys + Z_MEM_VM_OFFSET); +} + +/* For a table at a particular level, get the entry index that corresponds to + * the provided virtual address + */ +static inline 
int get_index(void *virt, int level) +{ + return (((uintptr_t)virt >> paging_levels[level].shift) % + paging_levels[level].entries); +} + +static inline pentry_t *get_entry_ptr(pentry_t *ptables, void *virt, int level) +{ + return &ptables[get_index(virt, level)]; +} + +static inline pentry_t get_entry(pentry_t *ptables, void *virt, int level) +{ + return ptables[get_index(virt, level)]; +} + +/* Get the physical memory address associated with this table entry */ +static inline uintptr_t get_entry_phys(pentry_t entry, int level) +{ + return entry & paging_levels[level].mask; +} + +/* Return the virtual address of a linked table stored in the provided entry */ +static inline pentry_t *next_table(pentry_t entry, int level) +{ + return ram_phys_to_virt(get_entry_phys(entry, level)); +} + +/* 4K for everything except PAE PDPTs */ +static inline size_t table_size(int level) +{ + return paging_levels[level].entries * sizeof(pentry_t); +} + +/* For a table at a particular level, size of the amount of virtual memory + * that an entry within the table covers + */ +static inline size_t get_entry_scope(int level) +{ + return (1UL << paging_levels[level].shift); +} + +/* For a table at a particular level, size of the amount of virtual memory + * that this entire table covers + */ +static inline size_t get_table_scope(int level) +{ + return get_entry_scope(level) * paging_levels[level].entries; +} + +/* Must have checked Present bit first! Non-present entries may have OS data + * stored in any other bits + */ +static inline bool is_leaf(int level, pentry_t entry) +{ + if (level == NUM_LEVELS - 1) { + /* Always true for PTE */ + return true; + } + + return ((entry & MMU_PS) != 0U); } static inline void tlb_flush_page(void *addr) @@ -535,366 +223,535 @@ static inline void tlb_flush_page(void *addr) char *page = (char *)addr; __asm__ ("invlpg %0" :: "m" (*page)); + + /* TODO: Need to implement TLB shootdown for SMP */ } -#ifdef CONFIG_X86_64 -#define PML4E_FLAGS_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | Z_X86_MMU_P) +static inline void assert_addr_aligned(uintptr_t addr) +{ +#if __ASSERT_ON + __ASSERT((addr & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U, + "unaligned address 0x%" PRIxPTR, addr); +#endif +} -#define PDPTE_FLAGS_MASK PML4E_FLAGS_MASK +static inline void assert_virt_addr_aligned(void *addr) +{ + assert_addr_aligned((uintptr_t)addr); +} -#define PDE_FLAGS_MASK PDPTE_FLAGS_MASK +static inline void assert_region_page_aligned(void *addr, size_t size) +{ + assert_virt_addr_aligned(addr); +#if __ASSERT_ON + __ASSERT((size & (CONFIG_MMU_PAGE_SIZE - 1)) == 0U, + "unaligned size %zu", size); +#endif +} + +/* + * Debug functions. All conditionally compiled with CONFIG_EXCEPTION_DEBUG. 
+ */ +#ifdef CONFIG_EXCEPTION_DEBUG + +/* Add colors to page table dumps to indicate mapping type */ +#define COLOR_PAGE_TABLES 1 + +#if COLOR_PAGE_TABLES +#define ANSI_DEFAULT "\x1B[0m" +#define ANSI_RED "\x1B[1;31m" +#define ANSI_GREEN "\x1B[1;32m" +#define ANSI_YELLOW "\x1B[1;33m" +#define ANSI_BLUE "\x1B[1;34m" +#define ANSI_MAGENTA "\x1B[1;35m" +#define ANSI_CYAN "\x1B[1;36m" +#define ANSI_GREY "\x1B[1;90m" + +#define COLOR(x) printk(_CONCAT(ANSI_, x)) #else -#define PDPTE_FLAGS_MASK Z_X86_MMU_P - -#define PDE_FLAGS_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | \ - PDPTE_FLAGS_MASK) +#define COLOR(x) do { } while (0) #endif -#define PTE_FLAGS_MASK (PDE_FLAGS_MASK | Z_X86_MMU_XD | \ - Z_X86_MMU_PWT | \ - Z_X86_MMU_PCD) - -void z_x86_mmu_set_flags(struct x86_page_tables *ptables, void *ptr, - size_t size, uint64_t flags, uint64_t mask, bool flush) +static char get_entry_code(pentry_t value) { - uintptr_t addr = (uintptr_t)ptr; + char ret; - __ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided"); - __ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided"); + if ((value & MMU_P) == 0U) { + ret = '.'; + } else { + if ((value & MMU_RW) != 0U) { + /* Writable page */ + if ((value & MMU_XD) != 0U) { + /* RW */ + ret = 'w'; + } else { + /* RWX */ + ret = 'a'; + } + } else { + if ((value & MMU_XD) != 0U) { + /* R */ + ret = 'r'; + } else { + /* RX */ + ret = 'x'; + } + } - /* L1TF mitigation: non-present PTEs will have address fields - * zeroed. Expand the mask to include address bits if we are changing - * the present bit. - */ - if ((mask & Z_X86_MMU_P) != 0) { - mask |= Z_X86_MMU_PTE_ADDR_MASK; + if ((value & MMU_US) != 0U) { + /* Uppercase indicates user mode access */ + ret = toupper(ret); + } } - /* NOTE: All of this code assumes that 2MB or 1GB pages are not being - * modified. 
- */ - while (size != 0) { - uint64_t *pte; - uint64_t *pde; - uint64_t *pdpte; -#ifdef CONFIG_X86_64 - uint64_t *pml4e; -#endif - uint64_t cur_flags = flags; - bool exec = (flags & Z_X86_MMU_XD) == 0; + return ret; +} -#ifdef CONFIG_X86_64 - pml4e = z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr); - __ASSERT((*pml4e & Z_X86_MMU_P) != 0, - "set flags on non-present PML4e"); - *pml4e |= (flags & PML4E_FLAGS_MASK); +static void print_entries(pentry_t entries_array[], uint8_t *base, int level, + size_t count) +{ + int column = 0; - if (exec) { - *pml4e &= ~Z_X86_MMU_XD; + for (int i = 0; i < count; i++) { + pentry_t entry = entries_array[i]; + + uintptr_t phys = get_entry_phys(entry, level); + uintptr_t virt = + (uintptr_t)base + (get_entry_scope(level) * i); + + if (entry & MMU_P) { + if (is_leaf(level, entry)) { + if (phys == virt) { + /* Identity mappings */ + COLOR(YELLOW); + } else if (phys + Z_MEM_VM_OFFSET == virt) { + /* Permanent ram mappings */ + COLOR(GREEN); + } else { + /* general mapped pages */ + COLOR(CYAN); + } + } else { + COLOR(MAGENTA); + } + } else { + COLOR(GREY); } - pdpte = z_x86_pdpt_get_pdpte(z_x86_pml4e_get_pdpt(*pml4e), - addr); -#else - pdpte = z_x86_pdpt_get_pdpte(z_x86_get_pdpt(ptables, addr), - addr); -#endif - __ASSERT((*pdpte & Z_X86_MMU_P) != 0, - "set flags on non-present PDPTE"); - *pdpte |= (flags & PDPTE_FLAGS_MASK); -#ifdef CONFIG_X86_64 - if (exec) { - *pdpte &= ~Z_X86_MMU_XD; + printk("%c", get_entry_code(entry)); + + column++; + if (column == 64) { + column = 0; + printk("\n"); } -#endif - pde = z_x86_pd_get_pde(z_x86_pdpte_get_pd(*pdpte), addr); - __ASSERT((*pde & Z_X86_MMU_P) != 0, - "set flags on non-present PDE"); - *pde |= (flags & PDE_FLAGS_MASK); + } + COLOR(DEFAULT); - /* If any flags enable execution, clear execute disable at the - * page directory level - */ - if (exec) { - *pde &= ~Z_X86_MMU_XD; - } - - pte = z_x86_pt_get_pte(z_x86_pde_get_pt(*pde), addr); - - /* If we're setting the present bit, restore the address - * field. If we're clearing it, then the address field - * will be zeroed instead, mapping the PTE to the NULL page. 
- */ - if ((mask & Z_X86_MMU_P) != 0 && ((flags & Z_X86_MMU_P) != 0)) { - cur_flags |= addr; - } - - *pte = (*pte & ~mask) | cur_flags; - if (flush) { - tlb_flush_page((void *)addr); - } - - size -= MMU_PAGE_SIZE; - addr += MMU_PAGE_SIZE; + if (column != 0) { + printk("\n"); } } -static char __aligned(MMU_PAGE_SIZE) - page_pool[MMU_PAGE_SIZE * CONFIG_X86_MMU_PAGE_POOL_PAGES]; - -static char *page_pos = page_pool + sizeof(page_pool); - -static void *get_page(void) +static void dump_ptables(pentry_t *table, uint8_t *base, int level) { - page_pos -= MMU_PAGE_SIZE; - - __ASSERT(page_pos >= page_pool, "out of MMU pages\n"); - - return page_pos; -} + const struct paging_level *info = &paging_levels[level]; #ifdef CONFIG_X86_64 -#define PTABLES_ALIGN 4096 -#else -#define PTABLES_ALIGN 32 + /* Account for the virtual memory "hole" with sign-extension */ + if (((uintptr_t)base & BITL(47)) != 0) { + base = (uint8_t *)((uintptr_t)base | (0xFFFFULL << 48)); + } #endif -__aligned(PTABLES_ALIGN) struct x86_page_tables z_x86_kernel_ptables; -#ifdef CONFIG_X86_KPTI -__aligned(PTABLES_ALIGN) struct x86_page_tables z_x86_user_ptables; -#endif + printk("%s at %p (0x%" PRIxPTR ") ", info->name, table, + z_mem_phys_addr(table)); + if (level == 0) { + printk("entire address space\n"); + } else { + printk("for %p - %p\n", base, + base + get_table_scope(level) - 1); + } -extern char z_shared_kernel_page_start[]; + print_entries(table, base, level, info->entries); -#ifdef CONFIG_X86_KPTI -static inline bool is_within_system_ram(uintptr_t addr) -{ -#ifdef CONFIG_X86_64 - /* FIXME: locore not included in CONFIG_SRAM_BASE_ADDRESS */ - return addr < (PHYS_RAM_ADDR + PHYS_RAM_SIZE); -#else - return (addr >= PHYS_RAM_ADDR) && - (addr < (PHYS_RAM_ADDR + PHYS_RAM_SIZE)); -#endif + /* Check if we're a page table */ + if (level == (NUM_LEVELS - 1)) { + return; + } + + /* Dump all linked child tables */ + for (int j = 0; j < info->entries; j++) { + pentry_t entry = table[j]; + pentry_t *next; + + if ((entry & MMU_P) == 0U || + (entry & MMU_PS) != 0U) { + /* Not present or big page, skip */ + continue; + } + + next = next_table(entry, level); + dump_ptables(next, base + (j * get_entry_scope(level)), + level + 1); + } } + +void z_x86_dump_page_tables(pentry_t *ptables) +{ + dump_ptables(ptables, NULL, 0); +} + +/* Enable to dump out the kernel's page table right before main() starts, + * sometimes useful for deep debugging. May overwhelm sanitycheck. + */ +#define DUMP_PAGE_TABLES 0 + +#if DUMP_PAGE_TABLES +static int dump_kernel_tables(struct device *unused) +{ + z_x86_dump_page_tables(&z_x86_kernel_ptables); + + return 0; +} + +SYS_INIT(dump_kernel_tables, APPLICATION, CONFIG_KERNEL_INIT_PRIORITY_DEFAULT); #endif -/* Ignored bit posiition at all levels */ -#define IGNORED BIT64(11) - -static void maybe_clear_xd(uint64_t *entry, bool exec) +static void str_append(char **buf, size_t *size, const char *str) { - /* Execute disable bit needs special handling, we should only set it at - * intermediate levels if ALL containing pages have XD set (instead of - * just one). 
+ int ret = snprintk(*buf, *size, "%s", str); + + if (ret >= *size) { + /* Truncated */ + *size = 0U; + } else { + *size -= ret; + *buf += ret; + } + +} + +static void dump_entry(int level, void *virt, pentry_t entry) +{ + const struct paging_level *info = &paging_levels[level]; + char buf[24] = { 0 }; + char *pos = buf; + size_t sz = sizeof(buf); + uint8_t *virtmap = (uint8_t *)ROUND_DOWN(virt, get_entry_scope(level)); + + #define DUMP_BIT(bit) do { \ + if ((entry & MMU_##bit) != 0U) { \ + str_append(&pos, &sz, #bit " "); \ + } \ + } while (0) + + DUMP_BIT(RW); + DUMP_BIT(US); + DUMP_BIT(PWT); + DUMP_BIT(PCD); + DUMP_BIT(A); + DUMP_BIT(D); + DUMP_BIT(G); + DUMP_BIT(XD); + + LOG_ERR("%sE: %p -> " PRI_ENTRY ": %s", info->name, + virtmap, entry & info->mask, log_strdup(buf)); + + #undef DUMP_BIT +} + +void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables, + void *virt) +{ + pentry_t *table = ptables; + + for (int level = 0; level < NUM_LEVELS; level++) { + pentry_t entry = get_entry(table, virt, level); + + if ((entry & MMU_P) == 0 || is_leaf(level, entry)) { + *val = entry; + *paging_level = level; + break; + } else { + table = next_table(entry, level); + } + } +} + +/* + * Debug function for dumping out MMU table information to the LOG for a + * specific virtual address, such as when we get an unexpected page fault. + */ +void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt) +{ + pentry_t entry; + int level; + + z_x86_pentry_get(&level, &entry, ptables, virt); + + if ((entry & MMU_P) == 0) { + LOG_ERR("%sE: not present", paging_levels[level].name); + } else { + dump_entry(level, virt, entry); + } +} +#endif /* CONFIG_EXCEPTION_DEBUG */ + +/* Page allocation function prototype, passed to map_page() */ +typedef void * (*page_get_func_t)(void *); + +/* + * Pool of free memory pages for creating new page tables, as needed. + * + * This is very crude, once obtained, pages may not be returned. Fine for + * permanent kernel mappings. + */ +static uint8_t __noinit + page_pool[CONFIG_MMU_PAGE_SIZE * CONFIG_X86_MMU_PAGE_POOL_PAGES] + __aligned(CONFIG_MMU_PAGE_SIZE); + +static uint8_t *page_pos = page_pool + sizeof(page_pool); + +static struct k_spinlock pool_lock; + +/* Return a zeroed and suitably aligned memory page for page table data + * from the global page pool + */ +static void *page_pool_get(void *context) +{ + void *ret; + k_spinlock_key_t key; + + ARG_UNUSED(context); + + key = k_spin_lock(&pool_lock); + if (page_pos == page_pool) { + ret = NULL; + } else { + page_pos -= CONFIG_MMU_PAGE_SIZE; + ret = page_pos; + } + k_spin_unlock(&pool_lock, key); + + if (ret != NULL) { + memset(ret, 0, CONFIG_MMU_PAGE_SIZE); + } + + return ret; +} + +/** + * Low-level mapping function + * + * Walk the provided page tables until we get to the PTE for the provided + * virtual address, and set that to whatever is in 'entry_val'. + * + * If memory must be drawn to instantiate page table memory, it will be + * obtained from the provided get_page() function. The function must + * return a page-aligned pointer to a page-sized block of zeroed memory. + * All intermediate tables have hard-coded flags of INT_FLAGS. + * + * Presumes we want to map a minimally sized page of CONFIG_MMU_PAGE_SIZE. + * No support for mapping big pages yet; unclear if we will ever need it given + * Zephyr's typical use-cases. 
+ * + * TODO: There may be opportunities to optimize page table walks such as this + * by using recusrive page table mappings, see for example + * https://os.phil-opp.com/paging-implementation/#recursive-page-tables + * May also help if we need fast virtual-to-physical translation outside of + * the permanent memory mapping area. + * + * @param ptables Top-level page tables pointer + * @param virt Virtual address to set mapping + * @param entry_val Value to set PTE to + * @param get_page Function to draw memory pages from + * @param ctx Context pointer to pass to get_page() + * @retval 0 success + * @retval -ENOMEM get_page() failed + */ +static int page_map_set(pentry_t *ptables, void *virt, pentry_t entry_val, + page_get_func_t get_page, void *ctx) +{ + pentry_t *table = ptables; + + for (int level = 0; level < NUM_LEVELS; level++) { + int index; + pentry_t *entryp; + + index = get_index(virt, level); + entryp = &table[index]; + + /* Check if we're a PTE */ + if (level == (NUM_LEVELS - 1)) { + *entryp = entry_val; + break; + } + + /* This is a non-leaf entry */ + if ((*entryp & MMU_P) == 0U) { + /* Not present. Never done a mapping here yet, need + * some RAM for linked tables + */ + void *new_table = get_page(ctx); + + if (new_table == NULL) { + return -ENOMEM; + } + *entryp = z_mem_phys_addr(new_table) | INT_FLAGS; + table = new_table; + } else { + /* We fail an assertion here due to no support for + * splitting existing bigpage mappings. + * If the PS bit is not supported at some level (like + * in a PML4 entry) it is always reserved and must be 0 + */ + __ASSERT((*entryp & MMU_PS) == 0U, + "large page encountered"); + table = next_table(*entryp, level); + } + } + + return 0; +} + +/* map region virt..virt+size to phys with provided arch-neutral flags */ +int arch_mem_map(void *virt, uintptr_t phys, size_t size, uint32_t flags) +{ + pentry_t entry_flags = MMU_P; + pentry_t *ptables; + + LOG_DBG("%s: %p -> %p (%zu) flags 0x%x", + __func__, (void *)phys, virt, size, flags); + +#ifdef CONFIG_X86_64 + /* There's a gap in the "64-bit" address space, as 4-level paging + * requires bits 48 to 63 to be copies of bit 47. Test this + * by treating as a signed value and shifting. + */ + __ASSERT(((((intptr_t)virt) << 16) >> 16) == (intptr_t)virt, + "non-canonical virtual address mapping %p (size %zu)", + virt, size); +#endif /* CONFIG_X86_64 */ + + /* For now, always map in the kernel's page tables, we're just using + * this for driver mappings. User mode mappings + * (and interactions with KPTI) not implemented yet. + */ + ptables = (pentry_t *)&z_x86_kernel_ptables; + + /* Translate flags argument into HW-recognized entry flags. * - * Use an ignored bit position in the PDE to store a marker on whether - * any configured region allows execution. + * Support for PAT is not implemented yet. Many systems may have + * BIOS-populated MTRR values such that these cache settings are + * redundant. 
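A hypothetical usage sketch of arch_mem_map() as defined above; the device physical address and the virtual destination are placeholders invented for illustration, not values from this patch:

void sketch_map_device_page(void)
{
	void *virt = (void *)0xA0000000UL;	/* assumed free virtual page  */
	uintptr_t phys = 0xFED00000UL;		/* assumed MMIO physical base */
	int ret;

	/* One uncached, writable, non-executable page in the kernel tables */
	ret = arch_mem_map(virt, phys, CONFIG_MMU_PAGE_SIZE,
			   K_MEM_PERM_RW | K_MEM_CACHE_NONE);
	if (ret != 0) {
		/* -ENOTSUP: unsupported flag combination
		 * -ENOMEM: page_map_set() could not draw a fresh table page
		 *          from the page pool
		 */
	}
}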
*/ - if (exec) { - *entry |= IGNORED; - *entry &= ~Z_X86_MMU_XD; - } else if ((*entry & IGNORED) == 0) { - *entry |= Z_X86_MMU_XD; - } -} - -static void add_mmu_region_page(struct x86_page_tables *ptables, - uintptr_t addr, uint64_t flags, bool user_table) -{ -#ifdef CONFIG_X86_64 - uint64_t *pml4e; -#endif - struct x86_mmu_pdpt *pdpt; - uint64_t *pdpte; - struct x86_mmu_pd *pd; - uint64_t *pde; - struct x86_mmu_pt *pt; - uint64_t *pte; - bool exec = (flags & Z_X86_MMU_XD) == 0; - -#ifdef CONFIG_X86_KPTI - /* If we are generating a page table for user mode, and this address - * does not have the user flag set, and this address falls outside - * of system RAM, then don't bother generating any tables for it, - * we will never need them later as memory domains are limited to - * regions within system RAM. - */ - if (user_table && (flags & Z_X86_MMU_US) == 0 && - !is_within_system_ram(addr)) { - return; - } -#endif - -#ifdef CONFIG_X86_64 - pml4e = z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr); - if ((*pml4e & Z_X86_MMU_P) == 0) { - pdpt = get_page(); - pml4e_update_pdpt(pml4e, pdpt); - } else { - pdpt = z_x86_pml4e_get_pdpt(*pml4e); - } - *pml4e |= (flags & PML4E_FLAGS_MASK); - maybe_clear_xd(pml4e, exec); -#else - pdpt = z_x86_get_pdpt(ptables, addr); -#endif - - /* Setup the PDPTE entry for the address, creating a page directory - * if one didn't exist - */ - pdpte = z_x86_pdpt_get_pdpte(pdpt, addr); - if ((*pdpte & Z_X86_MMU_P) == 0) { - pd = get_page(); - pdpte_update_pd(pdpte, pd); - } else { - pd = z_x86_pdpte_get_pd(*pdpte); - } - *pdpte |= (flags & PDPTE_FLAGS_MASK); -#ifdef CONFIG_X86_64 - maybe_clear_xd(pdpte, exec); -#endif - - /* Setup the PDE entry for the address, creating a page table - * if necessary - */ - pde = z_x86_pd_get_pde(pd, addr); - if ((*pde & Z_X86_MMU_P) == 0) { - pt = get_page(); - pde_update_pt(pde, pt); - } else { - pt = z_x86_pde_get_pt(*pde); - } - *pde |= (flags & PDE_FLAGS_MASK); - maybe_clear_xd(pde, exec); - -#ifdef CONFIG_X86_KPTI - if (user_table && (flags & Z_X86_MMU_US) == 0 && -#ifdef CONFIG_X86_64 - addr >= (uintptr_t)&_lodata_start && -#endif - addr != (uintptr_t)(&z_shared_kernel_page_start)) { - /* All non-user accessible pages except the shared page - * are marked non-present in the page table. - * - * For x86_64 we also make the locore text/rodata areas - * present even though they don't have user mode access, - * they contain necessary tables and program text for - * successfully handling exceptions and interrupts. - */ - return; - } -#else - ARG_UNUSED(user_table); -#endif - - /* Finally set up the page table entry */ - pte = z_x86_pt_get_pte(pt, addr); - pte_update_addr(pte, addr); - *pte |= (flags & PTE_FLAGS_MASK); -} - -static void add_mmu_region(struct x86_page_tables *ptables, - struct mmu_region *rgn, - bool user_table) -{ - size_t size; - uint64_t flags; - uintptr_t addr; - - __ASSERT((rgn->address & MMU_PAGE_MASK) == 0U, - "unaligned address provided"); - __ASSERT((rgn->size & MMU_PAGE_MASK) == 0U, - "unaligned size provided"); - addr = rgn->address; - flags = rgn->flags | Z_X86_MMU_P; - - /* Iterate through the region a page at a time, creating entries as - * necessary. 
- */
-	size = rgn->size;
-	while (size > 0) {
-		add_mmu_region_page(ptables, addr, flags, user_table);
-
-		size -= MMU_PAGE_SIZE;
-		addr += MMU_PAGE_SIZE;
-	}
-}
-
-
-void z_x86_add_mmu_region(uintptr_t addr, size_t size, uint64_t flags)
-{
-	struct mmu_region rgn = {
-		.address = addr,
-		.size = size,
-		.flags = flags,
-	};
-
-	add_mmu_region(&z_x86_kernel_ptables, &rgn, false);
-#ifdef CONFIG_X86_KPTI
-	add_mmu_region(&z_x86_user_ptables, &rgn, true);
-#endif
-}
-
-int arch_mem_map(void *dest, uintptr_t addr, size_t size, uint32_t flags)
-{
-	uint64_t entry_flags = Z_X86_MMU_P;
-
-	__ASSERT((uintptr_t)dest == addr,
-		 "only identity mapping supported");
-
 	switch (flags & K_MEM_CACHE_MASK) {
 	case K_MEM_CACHE_NONE:
-		entry_flags |= Z_X86_MMU_PCD;
+		entry_flags |= MMU_PCD;
 		break;
 	case K_MEM_CACHE_WT:
-		entry_flags |= Z_X86_MMU_PWT;
+		entry_flags |= MMU_PWT;
 		break;
 	case K_MEM_CACHE_WB:
 		break;
 	default:
 		return -ENOTSUP;
 	}

-	if ((flags & K_MEM_PERM_RW) != 0) {
-		entry_flags |= Z_X86_MMU_RW;
+	if ((flags & K_MEM_PERM_RW) != 0U) {
+		entry_flags |= MMU_RW;
 	}

-	if ((flags & K_MEM_PERM_USER) != 0) {
+	if ((flags & K_MEM_PERM_USER) != 0U) {
 		/* TODO: user mode support
 		 * entry_flags |= MMU_US;
 		 */
 		return -ENOTSUP;
 	}

-	if ((flags & K_MEM_PERM_EXEC) == 0) {
-		entry_flags |= Z_X86_MMU_XD;
+	if ((flags & K_MEM_PERM_EXEC) == 0U) {
+		entry_flags |= MMU_XD;
 	}

-	z_x86_add_mmu_region(addr, size, entry_flags);
+	for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) {
+		int ret;
+		pentry_t entry_val = (phys + offset) | entry_flags;
+		uint8_t *dest_virt = (uint8_t *)virt + offset;
+
+		ret = page_map_set(ptables, dest_virt, entry_val,
+				   page_pool_get, NULL);
+
+		/* Currently used for new mappings, no TLB flush. Re-visit
+		 * as capabilities increase
+		 */
+
+		if (ret != 0) {
+			/* NOTE: Currently we do not un-map a partially
+			 * completed mapping.
+			 */
+			return ret;
+		}
+	}

 	return 0;
 }

-/* Called from x86's arch_kernel_init() */
-void z_x86_paging_init(void)
+static void identity_map_remove(void)
 {
-	Z_STRUCT_SECTION_FOREACH(mmu_region, rgn) {
-		add_mmu_region(&z_x86_kernel_ptables, rgn, false);
-#ifdef CONFIG_X86_KPTI
-		add_mmu_region(&z_x86_user_ptables, rgn, true);
-#endif
-	}
+#if CONFIG_SRAM_BASE_ADDRESS != CONFIG_KERNEL_VM_BASE
+	size_t size, scope = get_entry_scope(0);
+	uint8_t *pos;

-#ifdef CONFIG_X86_64
-	/* MMU already enabled at boot for long mode, we just need to
-	 * program CR3 with our newly generated page tables.
+	k_mem_region_align((uintptr_t *)&pos, &size,
+			   (uintptr_t)CONFIG_SRAM_BASE_ADDRESS,
+			   (size_t)CONFIG_SRAM_SIZE * 1024U, scope);
+
+	/* We booted with RAM mapped both to its identity and virtual
+	 * mapping starting at CONFIG_KERNEL_VM_BASE. This was done by
+	 * double-linking the relevant tables in the top-level table.
+	 * At this point we don't need the identity mapping(s) any more,
+	 * zero the top-level table entries corresponding to the
+	 * physical mapping.
 	 */
-	__asm__ volatile("movq %0, %%cr3\n\t"
-			 : : "r" (&z_x86_kernel_ptables) : "memory");
-#else
-	z_x86_enable_paging();
+	while (size) {
+		pentry_t *entry = get_entry_ptr(&z_x86_kernel_ptables, pos, 0);
+
+		/* set_pte */
+		*entry = 0;
+		pos += scope;
+		size -= scope;
+	}
 #endif
 }

-#ifdef CONFIG_X86_STACK_PROTECTION
+/* Invoked to remove the identity mappings in the page tables; they were only
+ * needed to transition the instruction pointer at early boot
+ */
+void z_x86_mmu_init(void)
+{
+	identity_map_remove();
+}
+
+#if CONFIG_X86_STACK_PROTECTION
+/* Legacy stack guard function.
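+ * It maps the given guard page as present but read-only (no MMU_RW) and
+ * execute-disabled in the kernel's page tables, so that any write into the
+ * guard area faults.
+ *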
+ * This will eventually be replaced in favor of memory-mapping stacks (with a
+ * non-present mapping immediately below each one to catch overflows) instead
+ * of modifying the in-place mapping as is done here.
+ */
+static void stack_guard_set(void *guard_page)
+{
+	pentry_t pte = z_mem_phys_addr(guard_page) | MMU_P | MMU_XD;
+	int ret;
+
+	assert_virt_addr_aligned(guard_page);
+
+	/* Always modify the kernel's page tables since this is for
+	 * supervisor threads or handling syscalls
+	 */
+	ret = page_map_set(&z_x86_kernel_ptables, guard_page, pte,
+			   page_pool_get, NULL);
+	/* Literally should never happen */
+	__ASSERT(ret == 0, "stack guard mapping failed for %p", guard_page);
+	(void)ret;
+}
+
 void z_x86_set_stack_guard(k_thread_stack_t *stack)
 {
 #ifdef CONFIG_USERSPACE
@@ -902,262 +759,358 @@ void z_x86_set_stack_guard(k_thread_stack_t *stack)
 	struct z_x86_thread_stack_header *header =
 		(struct z_x86_thread_stack_header *)stack;

-	/* Set guard area to read-only to catch stack overflows */
-	z_x86_mmu_set_flags(&z_x86_kernel_ptables, &header->guard_page,
-			    MMU_PAGE_SIZE, MMU_ENTRY_READ, Z_X86_MMU_RW,
-			    true);
-
+	stack_guard_set(&header->guard_page);
 	} else
 #endif /* CONFIG_USERSPACE */
 	{
-	/* Kernel-only stacks have the guard be the first page */
-	z_x86_mmu_set_flags(&z_x86_kernel_ptables, stack,
-			    MMU_PAGE_SIZE, MMU_ENTRY_READ, Z_X86_MMU_RW,
-			    true);
+	stack_guard_set(stack);
 	}
 }
 #endif /* CONFIG_X86_STACK_PROTECTION */

-#ifdef CONFIG_X86_USERSPACE
+#ifdef CONFIG_USERSPACE
+/*
+ * All of the code below will eventually be removed/replaced with a virtual
+ * address space aware userspace that doesn't do a physical memory map with
+ * memory domains.
+ */
+static bool page_validate(pentry_t *ptables, uint8_t *addr, bool write)
+{
+	pentry_t *table = (pentry_t *)ptables;
+
+	for (int level = 0; level < NUM_LEVELS; level++) {
+		pentry_t entry = get_entry(table, addr, level);
+
+		if ((entry & MMU_P) == 0U) {
+			/* Non-present, no access.
+ * TODO: will need re-visiting with demand paging + * implemented, could just be paged out + */ + return false; + } + + if (is_leaf(level, entry)) { + if (((entry & MMU_US) == 0U) || + (write && ((entry & MMU_RW) == 0U))) { + return false; + } + } else { + table = next_table(entry, level); + } + } + + return true; +} + +static inline void bcb_fence(void) +{ +#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION + __asm__ volatile ("lfence" : : : "memory"); +#endif +} + int arch_buffer_validate(void *addr, size_t size, int write) { - return z_x86_mmu_validate(z_x86_thread_page_tables_get(_current), addr, - size, write != 0); -} + pentry_t *ptables = z_x86_thread_page_tables_get(_current); + uint8_t *virt; + size_t aligned_size; + int ret = 0; -#ifdef CONFIG_X86_64 -static uintptr_t thread_pdpt_create(uintptr_t pages, - struct x86_page_tables *thread_ptables, - struct x86_page_tables *master_ptables) -{ - uintptr_t pos = pages, phys_addr = Z_X86_PDPT_START; + /* addr/size arbitrary, fix this up into an aligned region */ + k_mem_region_align((uintptr_t *)&virt, &aligned_size, + (uintptr_t)addr, size, CONFIG_MMU_PAGE_SIZE); - for (int i = 0; i < Z_X86_NUM_PDPT; i++, phys_addr += Z_X86_PDPT_AREA) { - uint64_t *pml4e; - struct x86_mmu_pdpt *master_pdpt, *dest_pdpt; - - /* obtain master PDPT tables for the address range and copy - * into per-thread PDPT for this range - */ - master_pdpt = z_x86_get_pdpt(master_ptables, phys_addr); - dest_pdpt = (struct x86_mmu_pdpt *)pos; - (void)memcpy(dest_pdpt, master_pdpt, - sizeof(struct x86_mmu_pdpt)); - - /* And then wire this up to the relevant per-thread PML4E */ - pml4e = z_x86_get_pml4e(thread_ptables, phys_addr); - pml4e_update_pdpt(pml4e, dest_pdpt); - pos += MMU_PAGE_SIZE; + for (size_t offset = 0; offset < aligned_size; + offset += CONFIG_MMU_PAGE_SIZE) { + if (!page_validate(ptables, virt + offset, write)) { + ret = -1; + break; + } } - return pos; -} -#endif /* CONFIG_X86_64 */ + bcb_fence(); -static uintptr_t thread_pd_create(uintptr_t pages, - struct x86_page_tables *thread_ptables, - struct x86_page_tables *master_ptables) -{ - uintptr_t pos = pages, phys_addr = Z_X86_PD_START; - - for (int i = 0; i < Z_X86_NUM_PD; i++, phys_addr += Z_X86_PD_AREA) { - uint64_t *pdpte; - struct x86_mmu_pd *master_pd, *dest_pd; - - /* Obtain PD in master tables for the address range and copy - * into the per-thread PD for this range - */ - master_pd = z_x86_get_pd(master_ptables, phys_addr); - dest_pd = (struct x86_mmu_pd *)pos; - - (void)memcpy(dest_pd, master_pd, sizeof(struct x86_mmu_pd)); - - /* Update pointer in per-thread pdpt to point to the per-thread - * directory we just copied - */ - pdpte = z_x86_get_pdpte(thread_ptables, phys_addr); - pdpte_update_pd(pdpte, dest_pd); - pos += MMU_PAGE_SIZE; - } - - return pos; + return ret; } -/* thread_ptables must be initialized, as well as all the page directories */ -static uintptr_t thread_pt_create(uintptr_t pages, - struct x86_page_tables *thread_ptables, - struct x86_page_tables *master_ptables) -{ - uintptr_t pos = pages, phys_addr = Z_X86_PT_START; - - for (int i = 0; i < Z_X86_NUM_PT; i++, phys_addr += Z_X86_PT_AREA) { - uint64_t *pde; - struct x86_mmu_pt *master_pt, *dest_pt; - - /* Same as we did with the directories, obtain PT in master - * tables for the address range and copy into per-thread PT - * for this range - */ - master_pt = z_x86_get_pt(master_ptables, phys_addr); - dest_pt = (struct x86_mmu_pt *)pos; - (void)memcpy(dest_pt, master_pt, sizeof(struct x86_mmu_pt)); - - /* And then wire this up to 
the relevant per-thread - * page directory entry - */ - pde = z_x86_get_pde(thread_ptables, phys_addr); - pde_update_pt(pde, dest_pt); - pos += MMU_PAGE_SIZE; - } - - return pos; -} - -/* Initialize the page tables for a thread. This will contain, once done, - * the boot-time configuration for a user thread page tables. There are - * no pre-conditions on the existing state of the per-thread tables. +/* Fetch pages for per-thread page tables from reserved space within the + * thread stack object * - * pos represents the page we are working with in the reserved area - * in the stack buffer for per-thread tables. As we create tables in - * this area, pos is incremented to the next free page. - * - * The layout of the stack object, when this is done: - * - * For 32-bit: - * - * +---------------------------+ <- thread->stack_obj - * | PDE(0) | - * +---------------------------+ - * | ... | - * +---------------------------+ - * | PDE(Z_X86_NUM_PD - 1) | - * +---------------------------+ - * | PTE(0) | - * +---------------------------+ - * | ... | - * +---------------------------+ - * | PTE(Z_X86_NUM_PT - 1) | - * +---------------------------+ <- pos once this logic completes - * | Stack guard | - * +---------------------------+ - * | Privilege elevation stack | - * | PDPT | - * +---------------------------+ <- thread->stack_info.start - * | Thread stack | - * | ... | - * - * For 64-bit: - * - * +---------------------------+ <- thread->stack_obj - * | PML4 | - * +---------------------------| - * | PDPT(0) | - * +---------------------------| - * | ... | - * +---------------------------| - * | PDPT(Z_X86_NUM_PDPT - 1) | - * +---------------------------+ - * | PDE(0) | - * +---------------------------+ - * | ... | - * +---------------------------+ - * | PDE(Z_X86_NUM_PD - 1) | - * +---------------------------+ - * | PTE(0) | - * +---------------------------+ - * | ... | - * +---------------------------+ - * | PTE(Z_X86_NUM_PT - 1) | - * +---------------------------+ <- pos once this logic completes - * | Stack guard | - * +---------------------------+ - * | Privilege elevation stack | - * +---------------------------+ <- thread->stack_info.start - * | Thread stack | - * | ... 
| + * For the moment, re-use pool_lock for synchronization */ -static void copy_page_tables(struct k_thread *thread, - struct x86_page_tables *thread_ptables, - struct x86_page_tables *master_ptables) +static void *thread_page_pool_get(void *context) { - uintptr_t pos, start; + struct k_thread *thread = context; + uint8_t *stack_object = (uint8_t *)thread->stack_obj; + void *ret; + k_spinlock_key_t key; + + key = k_spin_lock(&pool_lock); + ret = thread->arch.mmu_pos; + + if (thread->arch.mmu_pos >= stack_object + Z_X86_THREAD_PT_AREA) { + ret = NULL; + } else { + thread->arch.mmu_pos += CONFIG_MMU_PAGE_SIZE; + memset(ret, 0, CONFIG_MMU_PAGE_SIZE); + } + k_spin_unlock(&pool_lock, key); + + return ret; +} + +#define RAM_BASE ((uintptr_t)CONFIG_KERNEL_VM_BASE) +#define RAM_END (RAM_BASE + (CONFIG_SRAM_SIZE * 1024UL)) + +/* Establish a mapping in the thread's page tables */ +static void thread_map(struct k_thread *thread, void *ptr, size_t size, + pentry_t flags, bool flush) +{ + pentry_t *ptables = z_x86_thread_page_tables_get(thread); + + assert_region_page_aligned(ptr, size); + + /* Only mapping system RAM addresses is supported in thread page tables, + * as the thread does not have its own copies of tables outside of it + */ + __ASSERT((uintptr_t)ptr >= RAM_BASE, + "%p below system RAM", ptr); + __ASSERT((uintptr_t)ptr < RAM_END, + "%p above system ram", ptr); + + for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) { + pentry_t pte; + uint8_t *pos; + int ret; + + pos = (uint8_t *)ptr + offset; + + if ((flags & MMU_P) == 0U) { + /* L1TF */ + pte = 0U; + } else { + pte = z_mem_phys_addr(pos) | flags; + } + + ret = page_map_set(ptables, pos, pte, thread_page_pool_get, + thread); + __ASSERT(ret == 0, "mapping failed for %p", pos); + (void)ret; + + if (flush) { + tlb_flush_page(pos); + } + } +} + +/* Get the kernel's PTE value for a particular virtual address */ +static pentry_t kernel_page_map_get(void *virt) +{ + pentry_t *table = &z_x86_kernel_ptables; + + for (int level = 0; level < NUM_LEVELS; level++) { + pentry_t entry = get_entry(table, virt, level); + + if ((entry & MMU_P) == 0U) { + break; + } + + if (is_leaf(level, entry)) { + __ASSERT((entry & MMU_PS) == 0, "bigpage found"); + return entry; + } + + table = next_table(entry, level); + } + + return 0; +} + +/* In thread page tables, set mapping for a particular address to whatever + * mapping is set up for that address in the kernel's page tables. + */ +static void page_reset(struct k_thread *thread, void *virt) +{ + pentry_t kern_pte = kernel_page_map_get(virt); + pentry_t *thread_ptables = z_x86_thread_page_tables_get(thread); + int ret; + +#ifdef CONFIG_X86_KPTI + /* Shared kernel page needs to be mapped in page tables as it contains + * trampoline stack and important data structures. Otherwise, non-User + * pages aren't present. + */ + if ((kern_pte & MMU_US) == 0U && virt != &z_shared_kernel_page_start) { + kern_pte = 0; + } +#endif /* CONFIG_X86_KPTI */ + + ret = page_map_set(thread_ptables, virt, + kern_pte, thread_page_pool_get, thread); + __ASSERT(ret == 0, "mapping failed for %p", virt); + (void)ret; +} + +#ifdef CONFIG_X86_KPTI +/* KPTI version. The thread-level page tables are ONLY used by user mode + * and very briefly when changing privileges. + * + * We leave any memory addresses outside of system RAM unmapped. + * Any addresses within system RAM are also unmapped unless they have the US + * bit set, or are the trampoline page. 
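+ *
+ * In practice a user thread's tables therefore end up containing only the
+ * kernel mappings that already have MMU_US set, the shared kernel page
+ * (trampoline stack, GDT, IDT), the thread's own stack buffer once
+ * z_x86_thread_pt_init() maps it, and whatever memory domain partitions are
+ * applied later; every other page of system RAM reads as non-present.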
+ */
+static void setup_thread_tables(struct k_thread *thread,
+				pentry_t *thread_ptables)
+{
+	ARG_UNUSED(thread_ptables);
+
+	for (uint8_t *pos = (uint8_t *)RAM_BASE; pos < (uint8_t *)RAM_END;
+	     pos += CONFIG_MMU_PAGE_SIZE) {
+		page_reset(thread, pos);
+	}
+}
+#else
+/* get the Nth level paging structure for a particular virtual address */
+static pentry_t *page_table_get(pentry_t *toplevel, void *virt, int level)
+{
+	pentry_t *table = toplevel;
+
+	__ASSERT(level < NUM_LEVELS, "bad level argument %d", level);
+
+	for (int i = 0; i < level; i++) {
+		pentry_t entry = get_entry(table, virt, i);
+
+		if ((entry & MMU_P) == 0U) {
+			return NULL;
+		}
+		__ASSERT((entry & MMU_PS) == 0, "bigpage found");
+		table = next_table(entry, i);
+	}
+
+	return table;
+}
+
+/* Get a pointer to the N-th level entry for a particular virtual address */
+static pentry_t *page_entry_ptr_get(pentry_t *toplevel, void *virt, int level)
+{
+	pentry_t *table = page_table_get(toplevel, virt, level);
+
+	__ASSERT(table != NULL, "no table mapping for %p at level %d",
+		 virt, level);
+	return get_entry_ptr(table, virt, level);
+}
+
+/* Non-KPTI version. The thread-level page tables are used even during
+ * interrupts, exceptions, and syscalls, so we need all mappings.
+ * Copies will be made of all tables that provide mappings for system RAM,
+ * otherwise the kernel table will just be linked instead.
+ */
+static void setup_thread_tables(struct k_thread *thread,
+				pentry_t *thread_ptables)
+{
+	/* Copy top-level structure verbatim */
+	(void)memcpy(thread_ptables, &z_x86_kernel_ptables, table_size(0));
+
+	/* Proceed through linked structure levels, and for all system RAM
+	 * virtual addresses, create copies of all relevant tables.
+	 */
+	for (int level = 1; level < NUM_LEVELS; level++) {
+		uint8_t *start, *end;
+		size_t increment;
+
+		increment = get_entry_scope(level);
+		start = (uint8_t *)ROUND_DOWN(RAM_BASE, increment);
+		end = (uint8_t *)ROUND_UP(RAM_END, increment);
+
+		for (uint8_t *virt = start; virt < end; virt += increment) {
+			pentry_t *link, *master_table, *user_table;
+
+			/* We're creating a new thread page table, so get the
+			 * pointer to the entry in the previous table to have
+			 * it point to the new location
+			 */
+			link = page_entry_ptr_get(thread_ptables, virt,
+						  level - 1);
+
+			/* Master table contents, which we make a copy of */
+			master_table = page_table_get(&z_x86_kernel_ptables,
+						      virt, level);
+
+			/* Pulled out of reserved memory in the stack object */
+			user_table = thread_page_pool_get(thread);
+			__ASSERT(user_table != NULL, "out of memory");
+
+			(void)memcpy(user_table, master_table,
+				     table_size(level));
+
+			*link = z_mem_phys_addr(user_table) | INT_FLAGS;
+		}
+	}
+}
+#endif /* CONFIG_X86_KPTI */
+
+/* Called on creation of a user thread or when a supervisor thread drops to
+ * user mode.
+ *
+ * Sets up the per-thread page tables, such that when they are activated on
+ * context switch, everything is ready to go. thread->arch.ptables is updated
+ * to the thread-level tables instead of the kernel's page tables.
+ *
+ * Memory for the per-thread page table structures is drawn from the stack
+ * object, a buffer of size Z_X86_THREAD_PT_AREA starting from the beginning
+ * of the stack object.
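+ *
+ * Note that with PAE the 4-entry top-level PDPT is only 32 bytes and must be
+ * 32-byte aligned, so it is not drawn from this pool; it lives in the stack
+ * object's kernel_data area instead (see the #else branch below).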
+ */ +void z_x86_thread_pt_init(struct k_thread *thread) +{ + pentry_t *ptables; + + /* thread_page_pool_get() memory starts at the beginning of the + * stack object + */ + assert_virt_addr_aligned(thread->stack_obj); + thread->arch.mmu_pos = (uint8_t *)thread->stack_obj; + + /* Get memory for the top-level structure */ +#ifndef CONFIG_X86_PAE + ptables = thread_page_pool_get(thread); + __ASSERT(ptables != NULL, "out of memory"); +#else struct z_x86_thread_stack_header *header = (struct z_x86_thread_stack_header *)thread->stack_obj; - __ASSERT(thread->stack_obj != NULL, "no stack object assigned"); - __ASSERT(z_x86_page_tables_get() != thread_ptables, - "tables are active"); - __ASSERT(((uintptr_t)thread_ptables & 0x1f) == 0, - "unaligned page tables at %p", thread_ptables); - - (void)memcpy(thread_ptables, master_ptables, - sizeof(struct x86_page_tables)); - - start = (uintptr_t)(&header->page_tables); -#ifdef CONFIG_X86_64 - pos = start + sizeof(struct x86_mmu_pml4); - pos = thread_pdpt_create(pos, thread_ptables, master_ptables); -#else - pos = start; + ptables = (pentry_t *)&header->kernel_data.ptables; #endif - pos = thread_pd_create(pos, thread_ptables, master_ptables); - pos = thread_pt_create(pos, thread_ptables, master_ptables); + thread->arch.ptables = z_mem_phys_addr(ptables); - __ASSERT(pos == (start + Z_X86_THREAD_PT_AREA), - "wrong amount of stack object memory used"); + setup_thread_tables(thread, ptables); + + /* Enable access to the thread's own stack buffer */ + thread_map(thread, (void *)thread->stack_info.start, + ROUND_UP(thread->stack_info.size, + CONFIG_MMU_PAGE_SIZE), + MMU_P | MMU_RW | MMU_US | MMU_XD, false); } -static void reset_mem_partition(struct x86_page_tables *thread_ptables, +static inline void apply_mem_partition(struct k_thread *thread, + struct k_mem_partition *partition) +{ + thread_map(thread, (void *)partition->start, partition->size, + partition->attr | MMU_P, false); +} + +static void reset_mem_partition(struct k_thread *thread, struct k_mem_partition *partition) { - uintptr_t addr = partition->start; + uint8_t *addr = (uint8_t *)partition->start; size_t size = partition->size; - __ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided"); - __ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided"); - - while (size != 0) { - uint64_t *thread_pte, *master_pte; - - thread_pte = z_x86_get_pte(thread_ptables, addr); - master_pte = z_x86_get_pte(&USER_PTABLES, addr); - - *thread_pte = *master_pte; - - size -= MMU_PAGE_SIZE; - addr += MMU_PAGE_SIZE; + assert_region_page_aligned(addr, size); + for (size_t offset = 0; offset < size; offset += CONFIG_MMU_PAGE_SIZE) { + page_reset(thread, addr + offset); } } -static void apply_mem_partition(struct x86_page_tables *ptables, - struct k_mem_partition *partition) -{ - uint64_t x86_attr; - uint64_t mask; - - if (IS_ENABLED(CONFIG_X86_KPTI)) { - x86_attr = partition->attr | Z_X86_MMU_P; - mask = K_MEM_PARTITION_PERM_MASK | Z_X86_MMU_P; - } else { - x86_attr = partition->attr; - mask = K_MEM_PARTITION_PERM_MASK; - } - - __ASSERT(partition->start >= PHYS_RAM_ADDR, - "region at %08lx[%zu] extends below system ram start 0x%08lx", - partition->start, partition->size, (uintptr_t)PHYS_RAM_ADDR); - __ASSERT(((partition->start + partition->size) <= - (PHYS_RAM_ADDR + PHYS_RAM_SIZE)), - "region at %08lx[%zu] end at %08lx extends beyond system ram end 0x%08lx", - partition->start, partition->size, - partition->start + partition->size, - ((uintptr_t)PHYS_RAM_ADDR) + (size_t)PHYS_RAM_SIZE); - - 
z_x86_mmu_set_flags(ptables, (void *)partition->start, partition->size, - x86_attr, mask, false); -} - -void z_x86_apply_mem_domain(struct x86_page_tables *ptables, +void z_x86_apply_mem_domain(struct k_thread *thread, struct k_mem_domain *mem_domain) { for (int i = 0, pcount = 0; pcount < mem_domain->num_partitions; i++) { @@ -1169,56 +1122,17 @@ void z_x86_apply_mem_domain(struct x86_page_tables *ptables, } pcount++; - apply_mem_partition(ptables, partition); + apply_mem_partition(thread, partition); } } -/* Called on creation of a user thread or when a supervisor thread drops to - * user mode. - * - * Sets up the per-thread page tables, such that when they are activated on - * context switch, everything is ready to go. thread->arch.ptables is updated - * to the thread-level tables instead of the kernel's page tbales. - */ -void z_x86_thread_pt_init(struct k_thread *thread) -{ - struct x86_page_tables *ptables; - struct z_x86_thread_stack_header *header = - (struct z_x86_thread_stack_header *)thread->stack_obj; - -#ifdef CONFIG_X86_64 - ptables = (struct x86_page_tables *)(&header->page_tables); -#else - ptables = &header->kernel_data.ptables; -#endif - thread->arch.ptables = ptables; - - /* USER_PDPT contains the page tables with the boot time memory - * policy. We use it as a template to set up the per-thread page - * tables. - * - * With KPTI, this is a distinct set of tables z_x86_user_pdpt from the - * kernel page tables in z_x86_kernel_pdpt; it has all non user - * accessible pages except the trampoline page marked as non-present. - * Without KPTI, they are the same object. - */ - copy_page_tables(thread, ptables, &USER_PTABLES); - - /* Enable access to the thread's own stack buffer */ - z_x86_mmu_set_flags(ptables, (void *)thread->stack_info.start, - ROUND_UP(thread->stack_info.size, MMU_PAGE_SIZE), - Z_X86_MMU_P | K_MEM_PARTITION_P_RW_U_RW, - Z_X86_MMU_P | K_MEM_PARTITION_PERM_MASK, - false); -} - /* - * Memory domain interface + * Arch interface implementations for memory domains * - * In all cases, if one of these APIs is called on a supervisor thread, - * we don't need to do anything. If the thread later drops into supervisor - * mode the per-thread page tables will be generated and the memory domain - * configuration applied. + * In all cases, if one of these arch_mem_domain_* APIs is called on a + * supervisor thread, we don't need to do anything. If the thread later drops + * into user mode the per-thread page tables will be generated and the memory + * domain configuration applied. 
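+ *
+ * Partition removal works by resetting each affected page in the thread's
+ * tables back to the kernel's own mapping for that address (which, under
+ * KPTI, is non-present unless the US bit is set), while adding a partition
+ * simply re-applies its attributes on top via apply_mem_partition().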
*/ void arch_mem_domain_partition_remove(struct k_mem_domain *domain, uint32_t partition_id) @@ -1236,8 +1150,7 @@ void arch_mem_domain_partition_remove(struct k_mem_domain *domain, continue; } - reset_mem_partition(z_x86_thread_page_tables_get(thread), - &domain->partitions[partition_id]); + reset_mem_partition(thread, &domain->partitions[partition_id]); } } @@ -1274,8 +1187,7 @@ void arch_mem_domain_thread_remove(struct k_thread *thread) } pcount++; - reset_mem_partition(z_x86_thread_page_tables_get(thread), - partition); + reset_mem_partition(thread, partition); } } @@ -1292,8 +1204,7 @@ void arch_mem_domain_partition_add(struct k_mem_domain *domain, continue; } - apply_mem_partition(z_x86_thread_page_tables_get(thread), - &domain->partitions[partition_id]); + apply_mem_partition(thread, &domain->partitions[partition_id]); } } @@ -1303,12 +1214,11 @@ void arch_mem_domain_thread_add(struct k_thread *thread) return; } - z_x86_apply_mem_domain(z_x86_thread_page_tables_get(thread), - thread->mem_domain_info.mem_domain); + z_x86_apply_mem_domain(thread, thread->mem_domain_info.mem_domain); } int arch_mem_domain_max_partitions_get(void) { return CONFIG_MAX_DOMAIN_PARTITIONS; } -#endif /* CONFIG_X86_USERSPACE*/ +#endif /* CONFIG_USERSPACE */ diff --git a/arch/x86/include/intel64/kernel_arch_data.h b/arch/x86/include/intel64/kernel_arch_data.h index 06212a5d26d..312feaea041 100644 --- a/arch/x86/include/intel64/kernel_arch_data.h +++ b/arch/x86/include/intel64/kernel_arch_data.h @@ -25,9 +25,6 @@ struct x86_cpuboot { uint64_t sp; /* initial stack pointer */ arch_cpustart_t fn; /* kernel entry function */ void *arg; /* argument for above function */ -#ifdef CONFIG_X86_MMU - struct x86_page_tables *ptables; /* Runtime page tables to install */ -#endif /* CONFIG_X86_MMU */ }; typedef struct x86_cpuboot x86_cpuboot_t; diff --git a/arch/x86/include/kernel_arch_func.h b/arch/x86/include/kernel_arch_func.h index adca7281fae..8fb8538bc24 100644 --- a/arch/x86/include/kernel_arch_func.h +++ b/arch/x86/include/kernel_arch_func.h @@ -44,20 +44,6 @@ extern FUNC_NORETURN void z_x86_prep_c(void *arg); void z_x86_early_serial_init(void); #endif /* CONFIG_X86_VERY_EARLY_CONSOLE */ -#ifdef CONFIG_X86_MMU -/* Create all page tables with boot configuration and enable paging */ -void z_x86_paging_init(void); - -static inline struct x86_page_tables * -z_x86_thread_page_tables_get(struct k_thread *thread) -{ -#ifdef CONFIG_USERSPACE - return thread->arch.ptables; -#else - return &z_x86_kernel_ptables; -#endif -} -#endif /* CONFIG_X86_MMU */ /* Called upon CPU exception that is unhandled and hence fatal; dump * interesting info and call z_x86_fatal_error() @@ -102,19 +88,10 @@ extern FUNC_NORETURN void z_x86_userspace_enter(k_thread_entry_t user_entry, */ void *z_x86_userspace_prepare_thread(struct k_thread *thread); -void z_x86_thread_pt_init(struct k_thread *thread); - -void z_x86_apply_mem_domain(struct x86_page_tables *ptables, - struct k_mem_domain *mem_domain); - #endif /* CONFIG_USERSPACE */ void z_x86_do_kernel_oops(const z_arch_esf_t *esf); -#ifdef CONFIG_X86_STACK_PROTECTION -void z_x86_set_stack_guard(k_thread_stack_t *stack); -#endif - #endif /* !_ASMLANGUAGE */ #endif /* ZEPHYR_ARCH_X86_INCLUDE_KERNEL_ARCH_FUNC_H_ */ diff --git a/arch/x86/include/x86_mmu.h b/arch/x86/include/x86_mmu.h new file mode 100644 index 00000000000..343a5b26116 --- /dev/null +++ b/arch/x86/include/x86_mmu.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2011-2014 Wind River Systems, Inc. 
+ * Copyright (c) 2017-2020 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + * + * Internal memory management interfaces implemented in x86_mmu.c. + * None of these are application-facing, use only if you know what you are + * doing! + */ + +#ifndef ZEPHYR_ARCH_X86_INCLUDE_X86_MMU_H +#define ZEPHYR_ARCH_X86_INCLUDE_X86_MMU_H + +#include +#include + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define XD_SUPPORTED +#define BITL BIT64 +#define PRI_ENTRY "0x%016llx" +#else +#define BITL BIT +#define PRI_ENTRY "0x%08x" +#endif + +/* + * Common flags in the same bit position regardless of which structure level, + * although not every flag is supported at every level, and some may be + * ignored depending on the state of other bits (such as P or PS) + * + * These flags indicate bit position, and can be used for setting flags or + * masks as needed. + */ + +#define MMU_P BITL(0) /** Present */ +#define MMU_RW BITL(1) /** Read-Write */ +#define MMU_US BITL(2) /** User-Supervisor */ +#define MMU_PWT BITL(3) /** Page Write Through */ +#define MMU_PCD BITL(4) /** Page Cache Disable */ +#define MMU_A BITL(5) /** Accessed */ +#define MMU_D BITL(6) /** Dirty */ +#define MMU_PS BITL(7) /** Page Size */ +#define MMU_G BITL(8) /** Global */ +#ifdef XD_SUPPORTED +#define MMU_XD BITL(63) /** Execute Disable */ +#else +#define MMU_XD 0 +#endif + +#ifdef CONFIG_EXCEPTION_DEBUG +/** + * Dump out page table entries for a particular virtual memory address + * + * For the provided memory address, dump out interesting information about + * its mapping to the error log + * + * @param ptables Page tables to walk + * @param virt Virtual address to inspect + */ +void z_x86_dump_mmu_flags(pentry_t *ptables, void *virt); + +/** + * Fetch the page table entry for a virtual memory address + * + * @param paging_level [out] what paging level the entry was found at. + * 0=toplevel + * @param val Value stored in page table entry, with address and flags + * @param ptables Toplevel pointer to page tables + * @param virt Virtual address to lookup + */ +void z_x86_pentry_get(int *paging_level, pentry_t *val, pentry_t *ptables, + void *virt); + +/** + * Debug function for dumping out page tables + * + * Iterates through the entire linked set of page table structures, + * dumping out codes for the configuration of each table entry. + * + * Entry codes: + * + * . - not present + * w - present, writable, not executable + * a - present, writable, executable + * r - present, read-only, not executable + * x - present, read-only, executable + * + * Entry codes in uppercase indicate that user mode may access. + * + * Color is used to indicate the physical mapping characteristics: + * + * yellow - Identity mapping (virt = phys) + * green - Fixed virtual memory mapping (virt = phys + constant) + * magenta - entry is child page table + * cyan - General mapped memory + * + * @param ptables Top-level pointer to the page tables, as programmed in CR3 + */ +void z_x86_dump_page_tables(pentry_t *ptables); +#endif /* CONFIG_EXCEPTION_DEBUG */ + +#ifdef CONFIG_HW_STACK_PROTECTION +/* Legacy function - set identity-mapped MMU stack guard page to RO in the + * kernel's page tables to prevent writes and generate an exception + */ +void z_x86_set_stack_guard(k_thread_stack_t *stack); +#endif + +#ifdef CONFIG_USERSPACE +#ifdef CONFIG_X86_KPTI +/* Defined in linker script. 
Contains all the data that must be mapped + * in a KPTI table even though US bit is not set (trampoline stack, GDT, + * IDT, etc) + */ +extern uint8_t z_shared_kernel_page_start; +#endif /* CONFIG_X86_KPTI */ + +/* Set up per-thread page tables just prior to entering user mode */ +void z_x86_thread_pt_init(struct k_thread *thread); + +/* Apply a memory domain policy to a set of thread page tables */ +void z_x86_apply_mem_domain(struct k_thread *thread, + struct k_mem_domain *mem_domain); +#endif /* CONFIG_USERSPACE */ + +/* Return cr3 value, which is the physical (not virtual) address of the + * current set of page tables + */ +static inline uintptr_t z_x86_cr3_get(void) +{ + uintptr_t cr3; +#ifdef CONFIG_X86_64 + __asm__ volatile("movq %%cr3, %0\n\t" : "=r" (cr3)); +#else + __asm__ volatile("movl %%cr3, %0\n\t" : "=r" (cr3)); +#endif + return cr3; +} + +/* Return the virtual address of the page tables installed in this CPU in CR3 */ +static inline pentry_t *z_x86_page_tables_get(void) +{ + return z_mem_virt_addr(z_x86_cr3_get()); +} + +/* Kernel's page table. This is in CR3 for all supervisor threads. + * if KPTI is enabled, we switch to this when handling exceptions or syscalls + */ +extern pentry_t z_x86_kernel_ptables; + +/* Get the page tables used by this thread during normal execution */ +static inline pentry_t *z_x86_thread_page_tables_get(struct k_thread *thread) +{ +#ifdef CONFIG_USERSPACE + return z_mem_virt_addr(thread->arch.ptables); +#else + return &z_x86_kernel_ptables; +#endif +} + +/* Early-boot paging setup tasks, called from prep_c */ +void z_x86_mmu_init(void); +#endif /* ZEPHYR_ARCH_X86_INCLUDE_X86_MMU_H */ diff --git a/include/arch/x86/ia32/linker.ld b/include/arch/x86/ia32/linker.ld index 5974db7097b..408a223704c 100644 --- a/include/arch/x86/ia32/linker.ld +++ b/include/arch/x86/ia32/linker.ld @@ -51,9 +51,12 @@ #define RAMABLE_REGION RAM #endif -#ifdef CONFIG_X86_MMU - #define MMU_PAGE_SIZE KB(4) - #define MMU_PAGE_ALIGN . = ALIGN(MMU_PAGE_SIZE); +/* Used to align areas with separate memory permission characteristics + * so that the page permissions can be set in the MMU. Without this, + * the kernel is just one blob with the same RWX permissions on all RAM + */ +#ifdef CONFIG_SRAM_REGION_PERMISSIONS + #define MMU_PAGE_ALIGN . = ALIGN(CONFIG_MMU_PAGE_SIZE); #else #define MMU_PAGE_ALIGN #endif @@ -317,9 +320,11 @@ SECTIONS __data_rom_start = LOADADDR(_DATA_SECTION_NAME); #include - -#include #include +#include + +/* Must be last in RAM */ +#include MMU_PAGE_ALIGN __data_ram_end = .; diff --git a/include/arch/x86/ia32/thread.h b/include/arch/x86/ia32/thread.h index 8a847c87c4e..9f69db286c2 100644 --- a/include/arch/x86/ia32/thread.h +++ b/include/arch/x86/ia32/thread.h @@ -215,11 +215,16 @@ struct _thread_arch { uint8_t flags; #ifdef CONFIG_USERSPACE - /* Pointer to page tables used by this thread. Supervisor threads - * always use the kernel's page table, user thread use per-thread - * tables stored in the stack object + /* Physical address of the page tables used by this thread. Supervisor + * threads always use the kernel's page table, user thread use + * per-thread tables stored in the stack object. */ - struct x86_page_tables *ptables; + uintptr_t ptables; + + /* Track available unused space in the stack object used for building + * thread-specific page tables. + */ + uint8_t *mmu_pos; /* Initial privilege mode stack pointer when doing a system call. * Un-set for supervisor threads. 
diff --git a/include/arch/x86/intel64/linker.ld b/include/arch/x86/intel64/linker.ld index dec0ff9099b..58f9981e203 100644 --- a/include/arch/x86/intel64/linker.ld +++ b/include/arch/x86/intel64/linker.ld @@ -9,9 +9,12 @@ #define ROMABLE_REGION RAM #define RAMABLE_REGION RAM -#ifdef CONFIG_X86_MMU - #define MMU_PAGE_SIZE KB(4) - #define MMU_PAGE_ALIGN . = ALIGN(MMU_PAGE_SIZE); +/* Used to align areas with separate memory permission characteristics + * so that the page permissions can be set in the MMU. Without this, + * the kernel is just one blob with the same RWX permissions on all RAM + */ +#ifdef CONFIG_SRAM_REGION_PERMISSIONS + #define MMU_PAGE_ALIGN . = ALIGN(CONFIG_MMU_PAGE_SIZE); #else #define MMU_PAGE_ALIGN #endif @@ -167,10 +170,11 @@ SECTIONS #include #include #include +#include + +/* Must be last in RAM */ #include - . = ALIGN(8); - MMU_PAGE_ALIGN _image_ram_end = .; _end = .; diff --git a/include/arch/x86/intel64/thread.h b/include/arch/x86/intel64/thread.h index f79e7fb1585..3263b3612d8 100644 --- a/include/arch/x86/intel64/thread.h +++ b/include/arch/x86/intel64/thread.h @@ -41,6 +41,7 @@ #ifndef _ASMLANGUAGE #include +#include /* * 64-bit Task State Segment. One defined per CPU. @@ -114,11 +115,16 @@ struct _thread_arch { uint8_t flags; #ifdef CONFIG_USERSPACE - /* Pointer to page tables used by this thread. Supervisor threads - * always use the kernel's page table, user thread use per-thread - * tables stored in the stack object + /* Physical address to page tables used by this thread. Supervisor + * threads always use the kernel's page table, user thread use + * per-thread tables stored in the stack object */ - struct x86_page_tables *ptables; + uintptr_t ptables; + + /* Track available unused space in the stack object used for building + * thread-specific page tables. + */ + uint8_t *mmu_pos; /* Initial privilege mode stack pointer when doing a system call. * Un-set for supervisor threads. diff --git a/include/arch/x86/mmustructs.h b/include/arch/x86/mmustructs.h index 01acb1bea50..797b6a45e5b 100644 --- a/include/arch/x86/mmustructs.h +++ b/include/arch/x86/mmustructs.h @@ -1,429 +1,161 @@ /* * Copyright (c) 2011-2014 Wind River Systems, Inc. - * Copyright (c) 2017 Intel Corporation + * Copyright (c) 2020 Intel Corporation * * SPDX-License-Identifier: Apache-2.0 */ -#ifndef ZEPHYR_INCLUDE_ARCH_X86_MMUSTRUCTS_H_ -#define ZEPHYR_INCLUDE_ARCH_X86_MMUSTRUCTS_H_ +#ifndef ZEPHYR_INCLUDE_ARCH_X86_MMU_H +#define ZEPHYR_INCLUDE_ARCH_X86_MMU_H #include -#define MMU_PAGE_SIZE 4096UL -#define MMU_PAGE_MASK 0xfffU -#define MMU_PAGE_SHIFT 12U -#define PAGES(x) ((x) << (MMU_PAGE_SHIFT)) -#define MMU_ARE_IN_SAME_PAGE(a, b) \ - (((uint32_t)(a) & ~MMU_PAGE_MASK) == ((uint32_t)(b) & ~MMU_PAGE_MASK)) -#define MMU_IS_ON_PAGE_BOUNDARY(a) (!((uint32_t)(a) & MMU_PAGE_MASK)) +/* Macros for reserving space for page tables + * + * Z_X86_NUM_TABLE_PAGES. In order to produce a set of page tables which has + * virtual mappings for all system RAM, Z_X86_NUM_TABLE_PAGES is the number of + * memory pages required. If CONFIG_X86_PAE is enabled, an additional 0x20 + * bytes are required for the toplevel 4-entry PDPT. + * + * Z_X86_INITIAL_PAGETABLE_SIZE is the total amount of memory in bytes + * required, for any paging mode. 
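+ *
+ * As a rough worked example (assuming 64 MB of suitably aligned system RAM):
+ * 32-bit non-PAE paging needs 16 page tables plus one page directory, i.e.
+ * 17 pages (68 kB); PAE needs 32 page tables, one page directory and the
+ * 32-byte PDPT; x86_64 needs 32 page tables, one page directory, one PDPT
+ * and the PML4, i.e. 35 pages.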
+ * + * These macros are currently used for two purposes: + * - Reserving memory in the stack for thread-level page tables (slated + * for eventual removal when USERSPACE is reworked to fully utilize + * virtual memory and page tables are maintained at the process level) + * - Reserving room for dummy pagetable memory for the first link, so that + * memory addresses are not disturbed by the insertion of the real page + * tables created by gen_mmu.py in the second link phase. + */ +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#ifdef CONFIG_X86_64 +#define Z_X86_NUM_PML4_ENTRIES 512U +#define Z_X86_NUM_PDPT_ENTRIES 512U +#else +#define Z_X86_NUM_PDPT_ENTRIES 4U +#endif /* CONFIG_X86_64 */ +#define Z_X86_NUM_PD_ENTRIES 512U +#define Z_X86_NUM_PT_ENTRIES 512U +#else +#define Z_X86_NUM_PD_ENTRIES 1024U +#define Z_X86_NUM_PT_ENTRIES 1024U +#endif /* !CONFIG_X86_64 && !CONFIG_X86_PAE */ +/* Memory range covered by an instance of various table types */ +#define Z_X86_PT_AREA ((uintptr_t)(CONFIG_MMU_PAGE_SIZE * \ + Z_X86_NUM_PT_ENTRIES)) +#define Z_X86_PD_AREA (Z_X86_PT_AREA * Z_X86_NUM_PD_ENTRIES) +#ifdef CONFIG_X86_64 +#define Z_X86_PDPT_AREA (Z_X86_PD_AREA * Z_X86_NUM_PDPT_ENTRIES) +#endif + +#define PHYS_RAM_ADDR DT_REG_ADDR(DT_CHOSEN(zephyr_sram)) +#define PHYS_RAM_SIZE DT_REG_SIZE(DT_CHOSEN(zephyr_sram)) + +/* Define a range [Z_X86_PT_START, Z_X86_PT_END) which is the memory range + * covered by all the page tables needed for system RAM + */ +#define Z_X86_PT_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, Z_X86_PT_AREA)) +#define Z_X86_PT_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \ + Z_X86_PT_AREA)) + +/* Number of page tables needed to cover system RAM. Depends on the specific + * bounds of system RAM, but roughly 1 page table per 2MB of RAM + */ +#define Z_X86_NUM_PT ((Z_X86_PT_END - Z_X86_PT_START) / Z_X86_PT_AREA) + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +/* Same semantics as above, but for the page directories needed to cover + * system RAM. + */ +#define Z_X86_PD_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, Z_X86_PD_AREA)) +#define Z_X86_PD_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \ + Z_X86_PD_AREA)) +/* Number of page directories needed to cover system RAM. Depends on the + * specific bounds of system RAM, but roughly 1 page directory per 1GB of RAM + */ +#define Z_X86_NUM_PD ((Z_X86_PD_END - Z_X86_PD_START) / Z_X86_PD_AREA) +#else +/* 32-bit page tables just have one toplevel page directory */ +#define Z_X86_NUM_PD 1 +#endif + +#ifdef CONFIG_X86_64 +/* Same semantics as above, but for the page directory pointer tables needed + * to cover system RAM. On 32-bit there is just one 4-entry PDPT. + */ +#define Z_X86_PDPT_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, \ + Z_X86_PDPT_AREA)) +#define Z_X86_PDPT_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \ + Z_X86_PDPT_AREA)) +/* Number of PDPTs needed to cover system RAM. 
Depends on the + * specific bounds of system RAM, but roughly 1 PDPT per 512GB of RAM + */ +#define Z_X86_NUM_PDPT ((Z_X86_PDPT_END - Z_X86_PDPT_START) / Z_X86_PDPT_AREA) + +/* All pages needed for page tables, using computed values plus one more for + * the top-level PML4 + */ +#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD + \ + Z_X86_NUM_PDPT + 1) +#else /* !CONFIG_X86_64 */ +/* Number of pages we need to reserve in the stack for per-thread page tables */ +#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD) +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_X86_PAE +/* Toplevel PDPT wasn't included as it is not a page in size */ +#define Z_X86_INITIAL_PAGETABLE_SIZE ((Z_X86_NUM_TABLE_PAGES * \ + CONFIG_MMU_PAGE_SIZE) + 0x20) +#else +#define Z_X86_INITIAL_PAGETABLE_SIZE (Z_X86_NUM_TABLE_PAGES * \ + CONFIG_MMU_PAGE_SIZE) +#endif /* - * Common flags in the same bit position regardless of which structure level, - * although not every flag is supported at every level, and some may be - * ignored depending on the state of other bits (such as P or PS) + * K_MEM_PARTITION_* defines * - * These flags indicate bit position, and can be used for setting flags or - * masks as needed. + * Slated for removal when virtual memory is implemented, memory + * mapping APIs will replace memory domains. */ - -#define Z_X86_MMU_P BIT64(0) /** Present */ #define Z_X86_MMU_RW BIT64(1) /** Read-Write */ #define Z_X86_MMU_US BIT64(2) /** User-Supervisor */ -#define Z_X86_MMU_PWT BIT64(3) /** Page Write Through */ -#define Z_X86_MMU_PCD BIT64(4) /** Page Cache Disable */ -#define Z_X86_MMU_A BIT64(5) /** Accessed */ -#define Z_X86_MMU_D BIT64(6) /** Dirty */ -#define Z_X86_MMU_PS BIT64(7) /** Page Size */ -#define Z_X86_MMU_G BIT64(8) /** Global */ +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) #define Z_X86_MMU_XD BIT64(63) /** Execute Disable */ - -#ifdef CONFIG_X86_64 -#define Z_X86_MMU_PROT_KEY_MASK 0x7800000000000000ULL +#else +#define Z_X86_MMU_XD 0 #endif -/* - * Structure-specific flags / masks +/* Always true with 32-bit page tables, don't enable + * CONFIG_EXECUTE_XOR_WRITE and expect it to work for you */ -#define Z_X86_MMU_PDPTE_PAT BIT64(12) -#define Z_X86_MMU_PDE_PAT BIT64(12) -#define Z_X86_MMU_PTE_PAT BIT64(7) /** Page Attribute Table */ - -/* The true size of the mask depends on MAXADDR, which is found at run-time. - * As a simplification, roll the area for the memory address, and the - * reserved or ignored regions immediately above it, into a single area. - * This will work as expected if valid memory addresses are written. - */ -#ifdef CONFIG_X86_64 -#define Z_X86_MMU_PML4E_PDPT_MASK 0x7FFFFFFFFFFFF000ULL -#endif -#define Z_X86_MMU_PDPTE_PD_MASK 0x7FFFFFFFFFFFF000ULL -#ifdef CONFIG_X86_64 -#define Z_X86_MMU_PDPTE_1G_MASK 0x07FFFFFFC0000000ULL -#endif -#define Z_X86_MMU_PDE_PT_MASK 0x7FFFFFFFFFFFF000ULL -#define Z_X86_MMU_PDE_2MB_MASK 0x07FFFFFFFFC00000ULL -#define Z_X86_MMU_PTE_ADDR_MASK 0x07FFFFFFFFFFF000ULL - -/* - * These flags indicate intention when setting access properties. 
- */
-
-#define MMU_ENTRY_NOT_PRESENT 0ULL
-#define MMU_ENTRY_PRESENT Z_X86_MMU_P
-
-#define MMU_ENTRY_READ 0ULL
-#define MMU_ENTRY_WRITE Z_X86_MMU_RW
-
-#define MMU_ENTRY_SUPERVISOR 0ULL
-#define MMU_ENTRY_USER Z_X86_MMU_US
-
-#define MMU_ENTRY_WRITE_BACK 0ULL
-#define MMU_ENTRY_WRITE_THROUGH Z_X86_MMU_PWT
-
-#define MMU_ENTRY_CACHING_ENABLE 0ULL
-#define MMU_ENTRY_CACHING_DISABLE Z_X86_MMU_PCD
-
-#define MMU_ENTRY_NOT_ACCESSED 0ULL
-#define MMU_ENTRY_ACCESSED Z_X86_MMU_A
-
-#define MMU_ENTRY_NOT_DIRTY 0ULL
-#define MMU_ENTRY_DIRTY Z_X86_MMU_D
-
-#define MMU_ENTRY_NOT_GLOBAL 0ULL
-#define MMU_ENTRY_GLOBAL Z_X86_MMU_G
-
-#define MMU_ENTRY_EXECUTE_DISABLE Z_X86_MMU_XD
-#define MMU_ENTRY_EXECUTE_ENABLE 0ULL
-
-/* memory partition arch/soc independent attribute */
-#define K_MEM_PARTITION_P_RW_U_RW (MMU_ENTRY_WRITE | \
-				   MMU_ENTRY_USER | \
-				   MMU_ENTRY_EXECUTE_DISABLE)
-
-#define K_MEM_PARTITION_P_RW_U_NA (MMU_ENTRY_WRITE | \
-				   MMU_ENTRY_SUPERVISOR | \
-				   MMU_ENTRY_EXECUTE_DISABLE)
-
-#define K_MEM_PARTITION_P_RO_U_RO (MMU_ENTRY_READ | \
-				   MMU_ENTRY_USER | \
-				   MMU_ENTRY_EXECUTE_DISABLE)
-
-#define K_MEM_PARTITION_P_RO_U_NA (MMU_ENTRY_READ | \
-				   MMU_ENTRY_SUPERVISOR | \
-				   MMU_ENTRY_EXECUTE_DISABLE)
-
-/* Execution-allowed attributes */
-#define K_MEM_PARTITION_P_RWX_U_RWX (MMU_ENTRY_WRITE | MMU_ENTRY_USER)
-
-#define K_MEM_PARTITION_P_RWX_U_NA (MMU_ENTRY_WRITE | MMU_ENTRY_SUPERVISOR)
-
-#define K_MEM_PARTITION_P_RX_U_RX (MMU_ENTRY_READ | MMU_ENTRY_USER)
-
-#define K_MEM_PARTITION_P_RX_U_NA (MMU_ENTRY_READ | MMU_ENTRY_SUPERVISOR)
-
 #define K_MEM_PARTITION_IS_EXECUTABLE(attr) (((attr) & Z_X86_MMU_XD) == 0)
 #define K_MEM_PARTITION_IS_WRITABLE(attr) (((attr) & Z_X86_MMU_RW) != 0)

+/* memory partition arch/soc independent attribute */
+#define K_MEM_PARTITION_P_RW_U_RW (Z_X86_MMU_RW | Z_X86_MMU_US | \
+				   Z_X86_MMU_XD)
+#define K_MEM_PARTITION_P_RW_U_NA (Z_X86_MMU_RW | Z_X86_MMU_XD)
+#define K_MEM_PARTITION_P_RO_U_RO (Z_X86_MMU_US | Z_X86_MMU_XD)
+#define K_MEM_PARTITION_P_RO_U_NA Z_X86_MMU_XD
+/* Execution-allowed attributes */
+#define K_MEM_PARTITION_P_RWX_U_RWX (Z_X86_MMU_RW | Z_X86_MMU_US)
+#define K_MEM_PARTITION_P_RWX_U_NA Z_X86_MMU_RW
+#define K_MEM_PARTITION_P_RX_U_RX Z_X86_MMU_US
+#define K_MEM_PARTITION_P_RX_U_NA (0)
 /* memory partition access permission mask */
-#define K_MEM_PARTITION_PERM_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | \
-				   Z_X86_MMU_XD)
+#define K_MEM_PARTITION_PERM_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | \
+				   Z_X86_MMU_XD)

 #ifndef _ASMLANGUAGE
-#include
-#include
-
-/* Structure used by gen_mmu.py to create page directories and page tables.
- * In order to populate this structure use macro MMU_BOOT_REGION.
+/* Page table entry data type at all levels. Defined here due to
+ * k_mem_partition_attr_t, eventually move to private x86_mmu.h
  */
-struct mmu_region {
-	uintptr_t address; /*Start address of the memory region */
-	size_t size; /* Size of the memory region*/
-	uint64_t flags; /* Permissions needed for this region*/
-};
-
-/* permission_flags are calculated using the macros
- * region_size has to be provided in bytes
- * for read write access = MMU_ENTRY_READ/MMU_ENTRY_WRITE
- * for supervisor/user mode access = MMU_ENTRY_SUPERVISOR/MMU_ENTRY_USER
- *
- * Preprocessor indirection layers used to ensure __COUNTER__ is expanded
- * properly.
- */ - -#define __MMU_BOOT_REGION(id, addr, region_size, permission_flags) \ - static const Z_STRUCT_SECTION_ITERABLE(mmu_region, region_##id) = \ - { \ - .address = (uintptr_t)(addr), \ - .size = (size_t)(region_size), \ - .flags = (permission_flags), \ - } - -#define Z_MMU_BOOT_REGION(id, addr, region_size, permission_flags) \ - __MMU_BOOT_REGION(id, addr, region_size, permission_flags) - -#define MMU_BOOT_REGION(addr, region_size, permission_flags) \ - Z_MMU_BOOT_REGION(__COUNTER__, addr, region_size, permission_flags) - -#ifdef CONFIG_X86_64 -#define Z_X86_NUM_PML4_ENTRIES 512U -#define Z_X86_NUM_PDPT_ENTRIES 512U +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +typedef uint64_t pentry_t; #else -#define Z_X86_NUM_PDPT_ENTRIES 4U +typedef uint32_t pentry_t; #endif -#define Z_X86_NUM_PD_ENTRIES 512U -#define Z_X86_NUM_PT_ENTRIES 512U - -/* Memory range covered by an instance of various table types */ -#define Z_X86_PT_AREA (MMU_PAGE_SIZE * Z_X86_NUM_PT_ENTRIES) -#define Z_X86_PD_AREA (Z_X86_PT_AREA * Z_X86_NUM_PD_ENTRIES) -#define Z_X86_PDPT_AREA (Z_X86_PD_AREA * Z_X86_NUM_PDPT_ENTRIES) - -typedef uint64_t k_mem_partition_attr_t; - -#ifdef CONFIG_X86_64 -struct x86_mmu_pml4 { - uint64_t entry[Z_X86_NUM_PML4_ENTRIES]; -}; -#endif - -struct x86_mmu_pdpt { - uint64_t entry[Z_X86_NUM_PDPT_ENTRIES]; -}; - -struct x86_mmu_pd { - uint64_t entry[Z_X86_NUM_PD_ENTRIES]; -}; - -struct x86_mmu_pt { - uint64_t entry[Z_X86_NUM_PT_ENTRIES]; -}; - -struct x86_page_tables { -#ifdef CONFIG_X86_64 - struct x86_mmu_pml4 pml4; -#else - struct x86_mmu_pdpt pdpt; -#endif -}; - -/* - * Inline functions for getting the next linked structure - */ -#ifdef CONFIG_X86_64 -static inline uint64_t *z_x86_pml4_get_pml4e(struct x86_mmu_pml4 *pml4, - uintptr_t addr) -{ - int index = (addr >> 39U) & (Z_X86_NUM_PML4_ENTRIES - 1); - - return &pml4->entry[index]; -} - -static inline struct x86_mmu_pdpt *z_x86_pml4e_get_pdpt(uint64_t pml4e) -{ - uintptr_t addr = pml4e & Z_X86_MMU_PML4E_PDPT_MASK; - - return (struct x86_mmu_pdpt *)addr; -} -#endif - -static inline uint64_t *z_x86_pdpt_get_pdpte(struct x86_mmu_pdpt *pdpt, - uintptr_t addr) -{ - int index = (addr >> 30U) & (Z_X86_NUM_PDPT_ENTRIES - 1); - - return &pdpt->entry[index]; -} - -static inline struct x86_mmu_pd *z_x86_pdpte_get_pd(uint64_t pdpte) -{ - uintptr_t addr = pdpte & Z_X86_MMU_PDPTE_PD_MASK; - -#ifdef CONFIG_X86_64 - __ASSERT((pdpte & Z_X86_MMU_PS) == 0, "PDPT is for 1GB page"); -#endif - return (struct x86_mmu_pd *)addr; -} - -static inline uint64_t *z_x86_pd_get_pde(struct x86_mmu_pd *pd, uintptr_t addr) -{ - int index = (addr >> 21U) & (Z_X86_NUM_PD_ENTRIES - 1); - - return &pd->entry[index]; -} - -static inline struct x86_mmu_pt *z_x86_pde_get_pt(uint64_t pde) -{ - uintptr_t addr = pde & Z_X86_MMU_PDE_PT_MASK; - - __ASSERT((pde & Z_X86_MMU_PS) == 0, "pde is for 2MB page"); - - return (struct x86_mmu_pt *)addr; -} - -static inline uint64_t *z_x86_pt_get_pte(struct x86_mmu_pt *pt, uintptr_t addr) -{ - int index = (addr >> 12U) & (Z_X86_NUM_PT_ENTRIES - 1); - - return &pt->entry[index]; -} - -/* - * Inline functions for obtaining page table structures from the top-level - */ - -#ifdef CONFIG_X86_64 -static inline struct x86_mmu_pml4 * -z_x86_get_pml4(struct x86_page_tables *ptables) -{ - return &ptables->pml4; -} - -static inline uint64_t *z_x86_get_pml4e(struct x86_page_tables *ptables, - uintptr_t addr) -{ - return z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr); -} - -static inline struct x86_mmu_pdpt * -z_x86_get_pdpt(struct x86_page_tables 
*ptables, uintptr_t addr) -{ - return z_x86_pml4e_get_pdpt(*z_x86_get_pml4e(ptables, addr)); -} -#else -static inline struct x86_mmu_pdpt * -z_x86_get_pdpt(struct x86_page_tables *ptables, uintptr_t addr) -{ - ARG_UNUSED(addr); - - return &ptables->pdpt; -} -#endif /* CONFIG_X86_64 */ - -static inline uint64_t *z_x86_get_pdpte(struct x86_page_tables *ptables, - uintptr_t addr) -{ - return z_x86_pdpt_get_pdpte(z_x86_get_pdpt(ptables, addr), addr); -} - -static inline struct x86_mmu_pd * -z_x86_get_pd(struct x86_page_tables *ptables, uintptr_t addr) -{ - return z_x86_pdpte_get_pd(*z_x86_get_pdpte(ptables, addr)); -} - -static inline uint64_t *z_x86_get_pde(struct x86_page_tables *ptables, - uintptr_t addr) -{ - return z_x86_pd_get_pde(z_x86_get_pd(ptables, addr), addr); -} - -static inline struct x86_mmu_pt * -z_x86_get_pt(struct x86_page_tables *ptables, uintptr_t addr) -{ - return z_x86_pde_get_pt(*z_x86_get_pde(ptables, addr)); -} - -static inline uint64_t *z_x86_get_pte(struct x86_page_tables *ptables, - uintptr_t addr) -{ - return z_x86_pt_get_pte(z_x86_get_pt(ptables, addr), addr); -} - -/** - * Dump out page table entries for a particular memory address - * - * For the provided memory address, dump out the P, W, XD, US flags - * at each paging level to the error log. - */ -void z_x86_dump_mmu_flags(struct x86_page_tables *ptables, uintptr_t addr); - -/** - * Debug function for dumping out page tables - * - * Iterates through the entire linked set of page table structures, - * dumping out codes for the configuration of each table entry. - * - * Entry codes: - * - * . - not present - * w - present, writable, not executable - * a - present, writable, executable - * r - present, read-only, not executable - * x - present, read-only, executable - * - * Entry codes in uppercase indicate that user mode may access. - * - * @param ptables Top-level pointer to the page tables, as programmed in CR3 - */ -void z_x86_dump_page_tables(struct x86_page_tables *ptables); - -static inline struct x86_page_tables *z_x86_page_tables_get(void) -{ - struct x86_page_tables *ret; - -#ifdef CONFIG_X86_64 - __asm__ volatile("movq %%cr3, %0\n\t" : "=r" (ret)); -#else - __asm__ volatile("movl %%cr3, %0\n\t" : "=r" (ret)); -#endif - - return ret; -} - -/* Kernel's page table. Always active when threads are running in supervisor - * mode, or handling an interrupt. - * - * If KPTI is not enabled, this is used as a template to create per-thread - * page tables for when threads run in user mode. - */ -extern struct x86_page_tables z_x86_kernel_ptables; -#ifdef CONFIG_X86_KPTI -/* Separate page tables for user mode threads. This is never installed into the - * CPU; instead it is used as a template for creating per-thread page tables. - */ -extern struct x86_page_tables z_x86_user_ptables; -#define USER_PTABLES z_x86_user_ptables -#else -#define USER_PTABLES z_x86_kernel_ptables -#endif -/** - * @brief Fetch page table flags for a particular page - * - * Given a memory address, return the flags for the containing page's - * PDE and PTE entries. Intended for debugging. 
- * - * @param ptables Which set of page tables to use - * @param addr Memory address to example - * @param pde_flags Output parameter for page directory entry flags - * @param pte_flags Output parameter for page table entry flags - */ -void z_x86_mmu_get_flags(struct x86_page_tables *ptables, void *addr, - uint64_t *pde_flags, uint64_t *pte_flags); - -/** - * @brief set flags in the MMU page tables - * - * Modify bits in the existing page tables for a particular memory - * range, which must be page-aligned - * - * @param ptables Which set of page tables to use - * @param ptr Starting memory address which must be page-aligned - * @param size Size of the region, must be page size multiple - * @param flags Value of bits to set in the page table entries - * @param mask Mask indicating which particular bits in the page table entries - * to modify - * @param flush Whether to flush the TLB for the modified pages, only needed - * when modifying the active page tables - */ -void z_x86_mmu_set_flags(struct x86_page_tables *ptables, void *ptr, - size_t size, uint64_t flags, uint64_t mask, bool flush); - -int z_x86_mmu_validate(struct x86_page_tables *ptables, void *addr, size_t size, - bool write); - -void z_x86_add_mmu_region(uintptr_t addr, size_t size, uint64_t flags); - +typedef pentry_t k_mem_partition_attr_t; #endif /* _ASMLANGUAGE */ - -#endif /* ZEPHYR_INCLUDE_ARCH_X86_MMUSTRUCTS_H_ */ +#endif /* ZEPHYR_INCLUDE_ARCH_X86_MMU_H */ diff --git a/include/arch/x86/pagetables.ld b/include/arch/x86/pagetables.ld new file mode 100644 index 00000000000..905a0486621 --- /dev/null +++ b/include/arch/x86/pagetables.ld @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2020 Intel Corp. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* Pagetables. These are produced by arch/x86/gen-mmu.py based on + * data in zephyr_prebuilt.elf (the result of linker pass 1). + * For the pass 1 build, an equal-sized dummy area is provided as + * to not shift memory addresses that occur after this. + */ +#ifdef CONFIG_MMU + SECTION_DATA_PROLOGUE(pagetables,,) + { + . = ALIGN(4096); + z_x86_pagetables_start = .; +#ifdef LINKER_PASS2 + KEEP(*(pagetables)) /* gen_mmu.py */ +#else + KEEP(*(.dummy_pagetables)) /* from x86_mmu.c, just an empty array */ +#endif /* LINKER_PASS2 */ + + /* Top-level paging structure is the last thing in this section */ +#if CONFIG_X86_PAE + /* 4-entry PDPT */ + z_x86_kernel_ptables = . - 32; +#else + /* Page directory or PML4 */ + z_x86_kernel_ptables = . - 4096; +#endif /* CONFIG_X86_PAE */ + } GROUP_DATA_LINK_IN(RAMABLE_REGION, ROMABLE_REGION) + +#ifdef LINKER_PASS2 + /DISCARD/ : + { + /* We have the real ones in this build */ + *(.dummy_pagetables) + } +#endif /* LINKER_PASS2 */ +#endif /* CONFIG_MMU */ diff --git a/include/arch/x86/thread_stack.h b/include/arch/x86/thread_stack.h index 26bdec82d4f..efc00118960 100644 --- a/include/arch/x86/thread_stack.h +++ b/include/arch/x86/thread_stack.h @@ -19,13 +19,16 @@ * user mode. 
For each thread, we have: * * - On 32-bit + * - a toplevel PD + * - On 32-bit (PAE) * - a toplevel PDPT + * - a set of PDs for the memory range covered by system RAM * - On 64-bit * - a toplevel PML4 * - a set of PDPTs for the memory range covered by system RAM + * - a set of PDs for the memory range covered by system RAM * - On all modes: - * - a set of page directories for the memory range covered by system RAM - * - a set of page tbales for the memory range covered by system RAM + * - a set of PTs for the memory range covered by system RAM * * Directories and tables for memory ranges outside of system RAM will be * shared and not thread-specific. @@ -44,73 +47,21 @@ * * The PDPT is fairly small singleton on x86 PAE (32 bytes) and also must * be aligned to 32 bytes, so we place it at the highest addresses of the - * page reserved for the privilege elevation stack. On 64-bit all table - * entities up to and including the PML4 are page-sized. + * page reserved for the privilege elevation stack. On 64-bit or legacy 32-bit + * all table entities up to and including the PML4 are page-sized. * * The page directories and tables require page alignment so we put them as * additional fields in the stack object, using the below macros to compute how * many pages we need. */ - -#define PHYS_RAM_ADDR DT_REG_ADDR(DT_CHOSEN(zephyr_sram)) -#define PHYS_RAM_SIZE DT_REG_SIZE(DT_CHOSEN(zephyr_sram)) - -/* Define a range [Z_X86_PT_START, Z_X86_PT_END) which is the memory range - * covered by all the page tables needed for system RAM - */ -#define Z_X86_PT_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, Z_X86_PT_AREA)) -#define Z_X86_PT_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \ - Z_X86_PT_AREA)) - -/* Number of page tables needed to cover system RAM. Depends on the specific - * bounds of system RAM, but roughly 1 page table per 2MB of RAM - */ -#define Z_X86_NUM_PT ((Z_X86_PT_END - Z_X86_PT_START) / Z_X86_PT_AREA) - -/* Same semantics as above, but for the page directories needed to cover - * system RAM. - */ -#define Z_X86_PD_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, Z_X86_PD_AREA)) -#define Z_X86_PD_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \ - Z_X86_PD_AREA)) -/* Number of page directories needed to cover system RAM. Depends on the - * specific bounds of system RAM, but roughly 1 page directory per 1GB of RAM - */ -#define Z_X86_NUM_PD ((Z_X86_PD_END - Z_X86_PD_START) / Z_X86_PD_AREA) - -#ifdef CONFIG_X86_64 -/* Same semantics as above, but for the page directory pointer tables needed - * to cover system RAM. On 32-bit there is just one 4-entry PDPT. - */ -#define Z_X86_PDPT_START ((uintptr_t)ROUND_DOWN(PHYS_RAM_ADDR, \ - Z_X86_PDPT_AREA)) -#define Z_X86_PDPT_END ((uintptr_t)ROUND_UP(PHYS_RAM_ADDR + PHYS_RAM_SIZE, \ - Z_X86_PDPT_AREA)) -/* Number of PDPTs needed to cover system RAM. 
Depends on the - * specific bounds of system RAM, but roughly 1 PDPT per 512GB of RAM - */ -#define Z_X86_NUM_PDPT ((Z_X86_PDPT_END - Z_X86_PDPT_START) / Z_X86_PDPT_AREA) - -/* All pages needed for page tables, using computed values plus one more for - * the top-level PML4 - */ -#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD + \ - Z_X86_NUM_PDPT + 1) -#else /* !CONFIG_X86_64 */ -/* Number of pages we need to reserve in the stack for per-thread page tables */ -#define Z_X86_NUM_TABLE_PAGES (Z_X86_NUM_PT + Z_X86_NUM_PD) -#endif /* CONFIG_X86_64 */ -#else /* !CONFIG_USERSPACE */ -/* If we're not implementing user mode, then the MMU tables don't get changed - * on context switch and we don't need any per-thread page tables - */ -#define Z_X86_NUM_TABLE_PAGES 0UL -#endif /* CONFIG_USERSPACE */ - -#define Z_X86_THREAD_PT_AREA (Z_X86_NUM_TABLE_PAGES * MMU_PAGE_SIZE) +#define Z_X86_THREAD_PT_AREA (Z_X86_NUM_TABLE_PAGES * \ + (uintptr_t)CONFIG_MMU_PAGE_SIZE) +#else +#define Z_X86_THREAD_PT_AREA 0UL +#endif #if defined(CONFIG_HW_STACK_PROTECTION) || defined(CONFIG_USERSPACE) -#define Z_X86_STACK_BASE_ALIGN MMU_PAGE_SIZE +#define Z_X86_STACK_BASE_ALIGN CONFIG_MMU_PAGE_SIZE #else #define Z_X86_STACK_BASE_ALIGN ARCH_STACK_PTR_ALIGN #endif @@ -120,7 +71,7 @@ * the access control granularity and we don't want other kernel data to * unintentionally fall in the latter part of the page */ -#define Z_X86_STACK_SIZE_ALIGN MMU_PAGE_SIZE +#define Z_X86_STACK_SIZE_ALIGN CONFIG_MMU_PAGE_SIZE #else #define Z_X86_STACK_SIZE_ALIGN ARCH_STACK_PTR_ALIGN #endif @@ -136,7 +87,7 @@ struct z_x86_kernel_stack_data { * are page-aligned and we just reserve room for them in * Z_X86_THREAD_PT_AREA. */ - struct x86_page_tables ptables; + uint8_t ptables[0x20]; } __aligned(0x20); #endif /* !CONFIG_X86_64 */ @@ -180,14 +131,14 @@ struct z_x86_thread_stack_header { #endif #ifdef CONFIG_HW_STACK_PROTECTION - char guard_page[MMU_PAGE_SIZE]; + char guard_page[CONFIG_MMU_PAGE_SIZE]; #endif #ifdef CONFIG_USERSPACE #ifdef CONFIG_X86_64 - char privilege_stack[MMU_PAGE_SIZE]; + char privilege_stack[CONFIG_MMU_PAGE_SIZE]; #else - char privilege_stack[MMU_PAGE_SIZE - + char privilege_stack[CONFIG_MMU_PAGE_SIZE - sizeof(struct z_x86_kernel_stack_data)]; struct z_x86_kernel_stack_data kernel_data; @@ -204,8 +155,8 @@ struct z_x86_thread_stack_header { sizeof(struct z_x86_thread_stack_header) #ifdef CONFIG_HW_STACK_PROTECTION -#define ARCH_KERNEL_STACK_RESERVED MMU_PAGE_SIZE -#define ARCH_KERNEL_STACK_OBJ_ALIGN MMU_PAGE_SIZE +#define ARCH_KERNEL_STACK_RESERVED CONFIG_MMU_PAGE_SIZE +#define ARCH_KERNEL_STACK_OBJ_ALIGN CONFIG_MMU_PAGE_SIZE #else #define ARCH_KERNEL_STACK_RESERVED 0 #define ARCH_KERNEL_STACK_OBJ_ALIGN ARCH_STACK_PTR_ALIGN