From c3b3aafaec02256cb00a7fff7b7ece819ccbe111 Mon Sep 17 00:00:00 2001
From: Andrew Boie
Date: Wed, 31 Jul 2019 14:21:14 -0700
Subject: [PATCH] x86: generate page tables at runtime

Replaces the very complex boot-time generation of page tables
with much simpler runtime generation of them at bootup.

For those x86 boards that enable the MMU in their defconfig, the
number of page pool pages is set appropriately.

The MMU_ENTRY_RUNTIME_* flags have been removed. They were an
artifact of the old page table generation and did not correspond
to any hardware state.

Signed-off-by: Andrew Boie
---
 CODEOWNERS                                    |   1 -
 arch/x86/core/Kconfig.ia32                    |  11 +-
 arch/x86/core/ia32/crt0.S                     |  50 +-
 arch/x86/core/ia32/x86_mmu.c                  | 203 ++++++-
 arch/x86/gen_mmu_x86.py                       | 558 ------------------
 arch/x86/ia32.cmake                           |  32 -
 arch/x86/include/ia32/kernel_arch_func.h      |  10 +
 arch/x86/include/ia32/mmustructs.h            |  27 +-
 .../x86/qemu_x86/qemu_x86_coverage_defconfig  |   2 +
 boards/x86/qemu_x86/qemu_x86_defconfig        |   1 +
 boards/x86/qemu_x86/qemu_x86_iamcu_defconfig  |   1 +
 include/arch/x86/ia32/linker.ld               |  48 +-
 soc/x86/apollo_lake/linker.ld                 |   3 -
 soc/x86/atom/linker.ld                        |   3 -
 soc/x86/ia32/linker.ld                        |   1 -
 15 files changed, 250 insertions(+), 701 deletions(-)
 delete mode 100755 arch/x86/gen_mmu_x86.py

diff --git a/CODEOWNERS b/CODEOWNERS
index 430873ef195..d080041dbaa 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -315,7 +315,6 @@
 /arch/x86/gen_gdt.py                      @andrewboie
 /arch/x86/gen_idt.py                      @andrewboie
 /scripts/gen_kobject_list.py              @andrewboie
-/arch/x86/gen_mmu_x86.py                  @andrewboie
 /scripts/gen_priv_stacks.py               @andrewboie @agross-oss @ioannisg
 /scripts/gen_syscall_header.py            @andrewboie
 /scripts/gen_syscalls.py                  @andrewboie
diff --git a/arch/x86/core/Kconfig.ia32 b/arch/x86/core/Kconfig.ia32
index 19ba2cf57ef..70635221e74 100644
--- a/arch/x86/core/Kconfig.ia32
+++ b/arch/x86/core/Kconfig.ia32
@@ -129,9 +129,18 @@ config X86_MMU
 	select MEMORY_PROTECTION
 	help
 	  This options enables the memory management unit present in x86
-	  and creates a set of page tables at build time. Requires an MMU
+	  and creates a set of page tables at boot time. Requires an MMU
 	  which supports PAE page tables.
 
+config X86_MMU_PAGE_POOL_PAGES
+	int "Number of pages to reserve for building page tables"
+	default 16
+	depends on X86_MMU
+	help
+	  Building page tables at boot requires a pool of free memory pages
+	  to construct them. This can't be derived at build time; tune this
+	  to your SoC's specific memory map.
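As a sizing aside (a minimal sketch, not part of the patch): each page handed out
by the pool introduced later in x86_mmu.c becomes either a PAE page directory
(covering 1 GB) or a page table (covering 2 MB) for the boot-time regions, and with
CONFIG_X86_KPTI the kernel and user page tables are both carved from the same pool.
A hedged upper-bound estimate for a single region could look like the helper below;
the function name and exact accounting are illustrative only.

	#include <stddef.h>
	#include <stdint.h>

	/*
	 * Illustrative upper bound on pool pages consumed by ONE boot region:
	 * one page directory per 1 GB span touched, plus one page table per
	 * 2 MB span touched (PAE geometry).  Regions that share a 1 GB or
	 * 2 MB span reuse the same pool page, so summing this over all
	 * regions over-estimates; with CONFIG_X86_KPTI, roughly double the
	 * total for the user copy of the tables.
	 */
	static size_t pool_pages_upper_bound(uintptr_t addr, size_t size)
	{
		uintptr_t last = addr + size - 1;
		size_t pds = (size_t)((last >> 30) - (addr >> 30)) + 1; /* 1 GB per PD */
		size_t pts = (size_t)((last >> 21) - (addr >> 21)) + 1; /* 2 MB per PT */

		return pds + pts;
	}

For example, a 256 KB region that does not cross a 2 MB boundary costs at most two
pool pages (one page directory plus one page table); z_x86_paging_init(), added
later in this patch, prints the optimal CONFIG_X86_MMU_PAGE_POOL_PAGES value at
boot if the pool turns out to be larger than needed.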
+ config X86_NO_MELTDOWN bool help diff --git a/arch/x86/core/ia32/crt0.S b/arch/x86/core/ia32/crt0.S index d426c06c757..8535587f5f8 100644 --- a/arch/x86/core/ia32/crt0.S +++ b/arch/x86/core/ia32/crt0.S @@ -20,6 +20,7 @@ /* exports (private APIs) */ GTEXT(__start) + GTEXT(z_x86_enable_paging) /* externs */ GTEXT(z_cstart) @@ -273,30 +274,6 @@ __csSet: lgdt %ds:_gdt #endif -#ifdef CONFIG_X86_MMU - - /* load the page directory address into the registers*/ - movl $z_x86_kernel_pdpt, %eax - movl %eax, %cr3 - - /* Enable PAE */ - movl %cr4, %eax - orl $CR4_PAE_ENABLE, %eax - movl %eax, %cr4 - - /* IA32_EFER NXE bit set */ - movl $0xC0000080, %ecx - rdmsr - orl $0x800, %eax - wrmsr - - /* Enable paging (CR0.PG, bit 31) / write protect (CR0.WP, bit 16) */ - movl %cr0, %eax - orl $CR0_PG_WP_ENABLE, %eax - movl %eax, %cr0 - -#endif /* CONFIG_X86_MMU */ - #if defined(CONFIG_X86_ENABLE_TSS) mov $MAIN_TSS, %ax ltr %ax @@ -399,6 +376,31 @@ dataWords: ret #endif /* CONFIG_XIP */ +#ifdef CONFIG_X86_MMU +z_x86_enable_paging: + /* load the page directory address into the registers*/ + movl $z_x86_kernel_pdpt, %eax + movl %eax, %cr3 + + /* Enable PAE */ + movl %cr4, %eax + orl $CR4_PAE_ENABLE, %eax + movl %eax, %cr4 + + /* IA32_EFER NXE bit set */ + movl $0xC0000080, %ecx + rdmsr + orl $0x800, %eax + wrmsr + + /* Enable paging (CR0.PG, bit 31) / write protect (CR0.WP, bit 16) */ + movl %cr0, %eax + orl $CR0_PG_WP_ENABLE, %eax + movl %eax, %cr0 + + ret +#endif /* CONFIG_X86_MMU */ + #if defined(CONFIG_SSE) /* SSE control & status register initial value */ diff --git a/arch/x86/core/ia32/x86_mmu.c b/arch/x86/core/ia32/x86_mmu.c index c64cd6401f8..74ad22e0bce 100644 --- a/arch/x86/core/ia32/x86_mmu.c +++ b/arch/x86/core/ia32/x86_mmu.c @@ -24,13 +24,11 @@ MMU_BOOT_REGION((u32_t)&_image_text_start, (u32_t)&_image_text_size, MMU_ENTRY_READ | MMU_ENTRY_USER); MMU_BOOT_REGION((u32_t)&_image_rodata_start, (u32_t)&_image_rodata_size, - MMU_ENTRY_READ | MMU_ENTRY_USER | - MMU_ENTRY_EXECUTE_DISABLE); + MMU_ENTRY_READ | MMU_ENTRY_USER | MMU_ENTRY_EXECUTE_DISABLE); #ifdef CONFIG_USERSPACE MMU_BOOT_REGION((u32_t)&_app_smem_start, (u32_t)&_app_smem_size, - MMU_ENTRY_WRITE | MMU_ENTRY_RUNTIME_USER | - MMU_ENTRY_EXECUTE_DISABLE); + MMU_ENTRY_WRITE | MMU_ENTRY_EXECUTE_DISABLE); #endif #ifdef CONFIG_COVERAGE_GCOV @@ -43,9 +41,7 @@ MMU_BOOT_REGION((u32_t)&__gcov_bss_start, (u32_t)&__gcov_bss_size, * automatically for stacks. */ MMU_BOOT_REGION((u32_t)&__kernel_ram_start, (u32_t)&__kernel_ram_size, - MMU_ENTRY_WRITE | - MMU_ENTRY_RUNTIME_USER | - MMU_ENTRY_EXECUTE_DISABLE); + MMU_ENTRY_WRITE | MMU_ENTRY_EXECUTE_DISABLE); /* Works for PDPT, PD, PT entries, the bits we check here are all the same. 
* @@ -298,13 +294,19 @@ static inline void tlb_flush_page(void *addr) __asm__ ("invlpg %0" :: "m" (*page)); } +#define PDPTE_FLAGS_MASK MMU_ENTRY_PRESENT + +#define PDE_FLAGS_MASK (MMU_ENTRY_WRITE | MMU_ENTRY_USER | \ + PDPTE_FLAGS_MASK) + +#define PTE_FLAGS_MASK (PDE_FLAGS_MASK | MMU_ENTRY_EXECUTE_DISABLE | \ + MMU_ENTRY_WRITE_THROUGH | \ + MMU_ENTRY_CACHING_DISABLE) void z_x86_mmu_set_flags(struct x86_mmu_pdpt *pdpt, void *ptr, size_t size, x86_page_entry_data_t flags, x86_page_entry_data_t mask, bool flush) { - union x86_mmu_pte *pte; - u32_t addr = (u32_t)ptr; __ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided"); @@ -319,12 +321,26 @@ void z_x86_mmu_set_flags(struct x86_mmu_pdpt *pdpt, void *ptr, size_t size, } while (size != 0) { + union x86_mmu_pte *pte; + union x86_mmu_pde_pt *pde; + union x86_mmu_pdpte *pdpte; x86_page_entry_data_t cur_flags = flags; - /* TODO we're not generating 2MB entries at the moment */ - __ASSERT(X86_MMU_GET_PDE(pdpt, addr)->ps != 1, "2MB PDE found"); - pte = X86_MMU_GET_PTE(pdpt, addr); + pdpte = X86_MMU_GET_PDPTE(pdpt, addr); + __ASSERT(pdpte->p == 1, "set flags on non-present PDPTE"); + pdpte->value |= (flags & PDPTE_FLAGS_MASK); + pde = X86_MMU_GET_PDE(pdpt, addr); + __ASSERT(pde->p == 1, "set flags on non-present PDE"); + pde->value |= (flags & PDE_FLAGS_MASK); + /* If any flags enable execution, clear execute disable at the + * page directory level + */ + if ((flags & MMU_ENTRY_EXECUTE_DISABLE) == 0) { + pde->value &= ~MMU_ENTRY_EXECUTE_DISABLE; + } + + pte = X86_MMU_GET_PTE(pdpt, addr); /* If we're setting the present bit, restore the address * field. If we're clearing it, then the address field * will be zeroed instead, mapping the PTE to the NULL page. @@ -344,6 +360,169 @@ void z_x86_mmu_set_flags(struct x86_mmu_pdpt *pdpt, void *ptr, size_t size, } } +static char __aligned(MMU_PAGE_SIZE) + page_pool[MMU_PAGE_SIZE * CONFIG_X86_MMU_PAGE_POOL_PAGES]; + +static char *page_pos = page_pool + sizeof(page_pool); + +static void *get_page(void) +{ + page_pos -= MMU_PAGE_SIZE; + + __ASSERT(page_pos >= page_pool, "out of MMU pages\n"); + + return page_pos; +} + +__aligned(0x20) struct x86_mmu_pdpt z_x86_kernel_pdpt; +#ifdef CONFIG_X86_KPTI +__aligned(0x20) struct x86_mmu_pdpt z_x86_user_pdpt; +#endif + +extern char z_shared_kernel_page_start[]; + +static inline bool is_within_system_ram(uintptr_t addr) +{ + return (addr >= DT_PHYS_RAM_ADDR) && + (addr < (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U))); +} + +static void add_mmu_region_page(struct x86_mmu_pdpt *pdpt, uintptr_t addr, + u64_t flags, bool user_table) +{ + union x86_mmu_pdpte *pdpte; + struct x86_mmu_pd *pd; + union x86_mmu_pde_pt *pde; + struct x86_mmu_pt *pt; + union x86_mmu_pte *pte; + +#ifdef CONFIG_X86_KPTI + /* If we are generating a page table for user mode, and this address + * does not have the user flag set, and this address falls outside + * of system RAM, then don't bother generating any tables for it, + * we will never need them later as memory domains are limited to + * regions within system RAM. 
+ */ + if (user_table && (flags & MMU_ENTRY_USER) == 0 && + !is_within_system_ram(addr)) { + return; + } +#endif + + /* Setup the PDPTE entry for the address, creating a page directory + * if one didn't exist + */ + pdpte = &pdpt->entry[MMU_PDPTE_NUM(addr)]; + if (pdpte->p == 0) { + pd = get_page(); + pdpte->pd = ((uintptr_t)pd) >> MMU_PAGE_SHIFT; + } else { + pd = (struct x86_mmu_pd *)(pdpte->pd << MMU_PAGE_SHIFT); + } + pdpte->value |= (flags & PDPTE_FLAGS_MASK); + + /* Setup the PDE entry for the address, creating a page table + * if necessary + */ + pde = &pd->entry[MMU_PDE_NUM(addr)].pt; + if (pde->p == 0) { + pt = get_page(); + pde->pt = ((uintptr_t)pt) >> MMU_PAGE_SHIFT; + } else { + pt = (struct x86_mmu_pt *)(pde->pt << MMU_PAGE_SHIFT); + } + pde->value |= (flags & PDE_FLAGS_MASK); + + /* Execute disable bit needs special handling, we should only set it + * at the page directory level if ALL pages have XD set (instead of + * just one). + * + * Use the 'ignored2' field to store a marker on whether any + * configured region allows execution, the CPU never looks at + * or modifies it. + */ + if ((flags & MMU_ENTRY_EXECUTE_DISABLE) == 0) { + pde->ignored2 = 1; + pde->value &= ~MMU_ENTRY_EXECUTE_DISABLE; + } else if (pde->ignored2 == 0) { + pde->value |= MMU_ENTRY_EXECUTE_DISABLE; + } + +#ifdef CONFIG_X86_KPTI + if (user_table && (flags & MMU_ENTRY_USER) == 0 && + addr != (uintptr_t)(&z_shared_kernel_page_start)) { + /* All non-user accessible pages except the shared page + * are marked non-present in the page table. + */ + return; + } +#else + ARG_UNUSED(user_table); +#endif + + /* Finally set up the page table entry */ + pte = &pt->entry[MMU_PAGE_NUM(addr)]; + pte->page = addr >> MMU_PAGE_SHIFT; + pte->value |= (flags & PTE_FLAGS_MASK); +} + +static void add_mmu_region(struct x86_mmu_pdpt *pdpt, struct mmu_region *rgn, + bool user_table) +{ + size_t size; + u64_t flags; + uintptr_t addr; + + __ASSERT((rgn->address & MMU_PAGE_MASK) == 0U, + "unaligned address provided"); + __ASSERT((rgn->size & MMU_PAGE_MASK) == 0U, + "unaligned size provided"); + + addr = rgn->address; + + /* Add the present flag, and filter out 'runtime user' since this + * has no meaning to the actual MMU + */ + flags = rgn->flags | MMU_ENTRY_PRESENT; + + /* Iterate through the region a page at a time, creating entries as + * necessary. 
+ */ + size = rgn->size; + while (size > 0) { + add_mmu_region_page(pdpt, addr, flags, user_table); + + size -= MMU_PAGE_SIZE; + addr += MMU_PAGE_SIZE; + } +} + +extern struct mmu_region z_x86_mmulist_start[]; +extern struct mmu_region z_x86_mmulist_end[]; + +/* Called from x86's kernel_arch_init() */ +void z_x86_paging_init(void) +{ + size_t pages_free; + + for (struct mmu_region *rgn = z_x86_mmulist_start; + rgn < z_x86_mmulist_end; rgn++) { + add_mmu_region(&z_x86_kernel_pdpt, rgn, false); +#ifdef CONFIG_X86_KPTI + add_mmu_region(&z_x86_user_pdpt, rgn, true); +#endif + } + + pages_free = (page_pos - page_pool) / MMU_PAGE_SIZE; + + if (pages_free != 0) { + printk("Optimal CONFIG_X86_MMU_PAGE_POOL_PAGES %zu\n", + CONFIG_X86_MMU_PAGE_POOL_PAGES - pages_free); + } + + z_x86_enable_paging(); +} + #ifdef CONFIG_X86_USERSPACE int z_arch_buffer_validate(void *addr, size_t size, int write) { diff --git a/arch/x86/gen_mmu_x86.py b/arch/x86/gen_mmu_x86.py deleted file mode 100755 index 60e21abedbf..00000000000 --- a/arch/x86/gen_mmu_x86.py +++ /dev/null @@ -1,558 +0,0 @@ -#!/usr/bin/env python3 -# -# Copyright (c) 2019 Intel Corporation -# -# SPDX-License-Identifier: Apache-2.0 - -"""Generate MMU page tables for x86 CPUs. - -This script generates 64-bit PAE style MMU page tables for x86. -Even though x86 is a 32-bit target, we use this type of page table -to support the No-Execute (NX) bit. Please consult the IA -Architecture SW Developer Manual, volume 3, chapter 4 for more -details on this data structure. - -The script takes as input the zephyr_prebuilt.elf kernel binary, -which is a link of the Zephyr kernel without various build-time -generated data structures (such as the MMU tables) inserted into it. -The build cannot easily predict how large these tables will be, -so it is important that these MMU tables be inserted at the very -end of memory. - -Of particular interest is the "mmulist" section, which is a -table of memory region access policies set in code by instances -of MMU_BOOT_REGION() macros. The set of regions defined here -specifies the boot-time configuration of the page tables. - -The output of this script is a linked set of page tables, page -directories, and a page directory pointer table, which gets linked -into the final Zephyr binary, reflecting the access policies -read in the "mmulist" section. Any memory ranges not specified -in "mmulist" are marked non-present. - -If Kernel Page Table Isolation (CONFIG_X86_KPTI) is enabled, this -script additionally outputs a second set of page tables intended -for use by user threads running in Ring 3. These tables have the -same policy as the kernel's set of page tables with one crucial -difference: any pages not accessible to user mode threads are not -marked 'present', preventing Meltdown-style side channel attacks -from reading their contents. 
-""" - -import os -import sys -import struct -from collections import namedtuple -import ctypes -import argparse -from elftools.elf.elffile import ELFFile -from elftools.elf.sections import SymbolTableSection - -mmu_region_details = namedtuple("mmu_region_details", - "pde_index page_entries_info") - -valid_pages_inside_pde = namedtuple("valid_pages_inside_pde", "start_addr size \ - pte_valid_addr_start \ - pte_valid_addr_end \ - permissions") - -mmu_region_details_pdpt = namedtuple("mmu_region_details_pdpt", - "pdpte_index pd_entries") - -# Constants -PAGE_ENTRY_PRESENT = 1 -PAGE_ENTRY_READ_WRITE = 1 << 1 -PAGE_ENTRY_USER_SUPERVISOR = 1 << 2 -PAGE_ENTRY_XD = 1 << 63 - -# Struct formatters -struct_mmu_regions_format = "> 30) & 0x3 - - # return the page directory number for the give address - def get_pde_number(self, value): - return (value >> 21) & 0x1FF - - # return the page table number for the given address - def get_pte_number(self, value): - return (value >> 12) & 0x1FF - - def get_number_of_pd(self): - return len(self.get_pdpte_list()) - - def get_pdpte_list(self): - return list({temp[0] for temp in self.pd_tables_list}) - - # the return value will have the page address and it is assumed to be a 4096 - # boundary.hence the output of this API will be a 20bit address of the page - # table - def address_of_page_table(self, pdpte, page_table_number): - # first page given to page directory pointer - # and 2nd page till 5th page are used for storing the page directories. - - # set the max pdpte used. this tells how many pd are needed after - # that we start keeping the pt - PT_start_addr = self.get_number_of_pd() * 4096 +\ - self.pd_start_addr + 4096 - return (PT_start_addr + - (self.pd_tables_list.index([pdpte, page_table_number]) * - 4096) >> 12) - - def get_binary_pde_value(self, pdpte, value): - perms = value.page_entries_info[0].permissions - - present = PAGE_ENTRY_PRESENT - - read_write = check_bits(perms, [1, 29]) << 1 - user_mode = check_bits(perms, [2, 28]) << 2 - - page_table = self.address_of_page_table(pdpte, value.pde_index) << 12 - return present | read_write | user_mode | page_table - - def get_binary_pte_value(self, value, pde, pte, perm_for_pte): - read_write = perm_for_pte & PAGE_ENTRY_READ_WRITE - user_mode = perm_for_pte & PAGE_ENTRY_USER_SUPERVISOR - xd = perm_for_pte & PAGE_ENTRY_XD - - # This points to the actual memory in the HW - # totally 20 bits to rep the phy address - # first 2bits is from pdpte then 9bits is the number got from pde and - # next 9bits is pte - page_table = ((value.pdpte_index << 18) | (pde << 9) | pte) << 12 - - if self.kpti: - if user_mode: - present = PAGE_ENTRY_PRESENT - else: - if page_table == self.syms['z_shared_kernel_page_start']: - present = PAGE_ENTRY_PRESENT - else: - present = 0 - else: - present = PAGE_ENTRY_PRESENT - - binary_value = (present | read_write | user_mode | xd) - - # L1TF mitigation: map non-present pages to the NULL page - if present: - binary_value |= page_table - - return binary_value - - def clean_up_unused_pdpte(self): - self.list_of_pdpte = {key: value for key, value in - self.list_of_pdpte.items() - if value.pd_entries != {}} - - # update the tuple values for the memory regions needed - def set_pde_pte_values(self, pdpte, pde_index, address, mem_size, - pte_valid_addr_start, pte_valid_addr_end, perm): - - pages_tuple = valid_pages_inside_pde( - start_addr=address, - size=mem_size, - pte_valid_addr_start=pte_valid_addr_start, - pte_valid_addr_end=pte_valid_addr_end, - permissions=perm) - - mem_region_values = 
mmu_region_details(pde_index=pde_index, - page_entries_info=[]) - - mem_region_values.page_entries_info.append(pages_tuple) - - if pde_index in self.list_of_pdpte[pdpte].pd_entries.keys(): - # this step adds the new page info to the exsisting pages info - self.list_of_pdpte[pdpte].pd_entries[pde_index].\ - page_entries_info.append(pages_tuple) - else: - self.list_of_pdpte[pdpte].pd_entries[pde_index] = mem_region_values - - def populate_required_structs(self): - for start, size, flags in self.mem_regions: - pdpte_index = self.get_pdpte_number(start) - pde_index = self.get_pde_number(start) - pte_valid_addr_start = self.get_pte_number(start) - - # Get the end of the page table entries - # Since a memory region can take up only a few entries in the Page - # table, this helps us get the last valid PTE. - pte_valid_addr_end = self.get_pte_number(start + - size - 1) - - mem_size = size - - # In-case the start address aligns with a page table entry other - # than zero and the mem_size is greater than (1024*4096) i.e 4MB - # in case where it overflows the current PDE's range then limit the - # PTE to 1024 and so make the mem_size reflect the actual size - # taken up in the current PDE - if (size + (pte_valid_addr_start * 4096)) >= \ - (self.size_addressed_per_pde): - - pte_valid_addr_end = self.total_pages - mem_size = (((self.total_pages + 1) - - pte_valid_addr_start) * 4096) - - self.set_pde_pte_values(pdpte_index, - pde_index, - start, - mem_size, - pte_valid_addr_start, - pte_valid_addr_end, - flags) - - if [pdpte_index, pde_index] not in self.pd_tables_list: - self.pd_tables_list.append([pdpte_index, pde_index]) - - # IF the current pde couldn't fit the entire requested region - # size then there is a need to create new PDEs to match the size. - # Here the overflow_size represents the size that couldn't be fit - # inside the current PDE, this is will now to used to create a new - # PDE/PDEs so the size remaining will be - # requested size - allocated size(in the current PDE) - - overflow_size = size - mem_size - - # create all the extra PDEs needed to fit the requested size - # this loop starts from the current pde till the last pde that is - # needed the last pde is calculated as the (start_addr + size) >> - # 22 - if overflow_size != 0: - for extra_pdpte in range(pdpte_index, - self.get_pdpte_number(start + - size) + 1): - for extra_pde in range(pde_index + 1, self.get_pde_number( - start + size) + 1): - - # new pde's start address - # each page directory entry has a addr range of - # (1024 *4096) thus the new PDE start address is a - # multiple of that number - extra_pde_start_address = ( - extra_pde * (self.size_addressed_per_pde)) - - # the start address of and extra pde will always be 0 - # and the end address is calculated with the new - # pde's start address and the overflow_size - extra_pte_valid_addr_end = ( - self.get_pte_number(extra_pde_start_address + - overflow_size - 1)) - - # if the overflow_size couldn't be fit inside this new - # pde then need another pde and so we now need to limit - # the end of the PTE to 1024 and set the size of this - # new region to the max possible - extra_region_size = overflow_size - if overflow_size >= (self.size_addressed_per_pde): - extra_region_size = self.size_addressed_per_pde - extra_pte_valid_addr_end = self.total_pages - - # load the new PDE's details - - self.set_pde_pte_values(extra_pdpte, - extra_pde, - extra_pde_start_address, - extra_region_size, - 0, - extra_pte_valid_addr_end, - flags) - - # for the next iteration of the loop the 
size needs - # to decreased - overflow_size -= extra_region_size - - if [extra_pdpte, extra_pde] not in self.pd_tables_list: - self.pd_tables_list.append([extra_pdpte, extra_pde]) - - if overflow_size == 0: - break - - self.pd_tables_list.sort() - self.clean_up_unused_pdpte() - - - pages_for_pdpte = 1 - pages_for_pd = self.get_number_of_pd() - pages_for_pt = len(self.pd_tables_list) - self.output_buffer = ctypes.create_string_buffer((pages_for_pdpte + - pages_for_pd + - pages_for_pt) * 4096) - - def pdpte_create_binary_file(self): - # pae needs a pdpte at 32byte aligned address - - # Even though we have only 4 entries in the pdpte we need to move - # the self.output_offset variable to the next page to start pushing - # the pd contents - # - # FIXME: This wastes a ton of RAM!! - if args.verbose: - print("PDPTE at 0x%x" % self.pd_start_addr) - - for pdpte in range(self.total_pages + 1): - if pdpte in self.get_pdpte_list(): - present = 1 << 0 - addr_of_pd = (((self.pd_start_addr + 4096) + - self.get_pdpte_list().index(pdpte) * - 4096) >> 12) << 12 - binary_value = (present | addr_of_pd) - else: - binary_value = 0 - - struct.pack_into(page_entry_format, - self.output_buffer, - self.output_offset, - binary_value) - - self.output_offset += struct.calcsize(page_entry_format) - - - def page_directory_create_binary_file(self): - for pdpte, pde_info in self.list_of_pdpte.items(): - if args.verbose: - print("Page directory %d at 0x%x" % (pde_info.pdpte_index, - self.pd_start_addr + self.output_offset)) - for pde in range(self.total_pages + 1): - binary_value = 0 # the page directory entry is not valid - - # if i have a valid entry to populate - if pde in pde_info.pd_entries.keys(): - value = pde_info.pd_entries[pde] - binary_value = self.get_binary_pde_value(pdpte, value) - - struct.pack_into(page_entry_format, - self.output_buffer, - self.output_offset, - binary_value) - if args.verbose: - print_code(binary_value) - - self.output_offset += struct.calcsize(page_entry_format) - - def page_table_create_binary_file(self): - for _, pde_info in sorted(self.list_of_pdpte.items()): - for pde, pte_info in sorted(pde_info.pd_entries.items()): - pe_info = pte_info.page_entries_info[0] - start_addr = pe_info.start_addr & ~0x1FFFFF - end_addr = start_addr + 0x1FFFFF - if args.verbose: - print("Page table for 0x%08x - 0x%08x at 0x%08x" % - (start_addr, end_addr, - self.pd_start_addr + self.output_offset)) - for pte in range(self.total_pages + 1): - binary_value = 0 # the page directory entry is not valid - - valid_pte = 0 - # go through all the valid pages inside the pde to - # figure out if we need to populate this pte - for i in pte_info.page_entries_info: - temp_value = ((pte >= i.pte_valid_addr_start) and - (pte <= i.pte_valid_addr_end)) - if temp_value: - perm_for_pte = i.permissions - valid_pte |= temp_value - - # if i have a valid entry to populate - if valid_pte: - binary_value = self.get_binary_pte_value(pde_info, - pde, - pte, - perm_for_pte) - - if args.verbose: - print_code(binary_value) - struct.pack_into(page_entry_format, - self.output_buffer, - self.output_offset, - binary_value) - self.output_offset += struct.calcsize(page_entry_format) - - - -#*****************************************************************************# - -def read_mmu_list(mmu_list_data): - regions = [] - - # Read mmu_list header data - num_of_regions, pd_start_addr = struct.unpack_from( - header_values_format, mmu_list_data, 0) - - # a offset used to remember next location to read in the binary - size_read_from_binary = 
struct.calcsize(header_values_format) - - if args.verbose: - print("Start address of page tables: 0x%08x" % pd_start_addr) - print("Build-time memory regions:") - - # Read all the region definitions - for region_index in range(num_of_regions): - addr, size, flags = struct.unpack_from(struct_mmu_regions_format, - mmu_list_data, - size_read_from_binary) - size_read_from_binary += struct.calcsize(struct_mmu_regions_format) - - if args.verbose: - print(" Region %03d: 0x%08x - 0x%08x (0x%016x)" % - (region_index, addr, addr + size - 1, flags)) - - # ignore zero sized memory regions - if size == 0: - continue - - if (addr & 0xFFF) != 0: - print("Memory region %d start address %x is not page-aligned" % - (region_index, addr)) - sys.exit(2) - - if (size & 0xFFF) != 0: - print("Memory region %d size %d is not page-aligned" % - (region_index, size)) - sys.exit(2) - - # validate for memory overlap here - for other_region_index in range(len(regions)): - other_addr, other_size, _ = regions[other_region_index] - - end_addr = addr + size - other_end_addr = other_addr + other_size - - overlap = ((addr <= other_addr and end_addr > other_addr) or - (other_addr <= addr and other_end_addr > addr)) - - if overlap: - print("Memory region %d (%x:%x) overlaps memory region %d (%x:%x)" % - (region_index, addr, end_addr, other_region_index, - other_addr, other_end_addr)) - sys.exit(2) - - # add the retrieved info another list - regions.append((addr, size, flags)) - - return (pd_start_addr, regions) - - -def check_bits(val, bits): - for b in bits: - if val & (1 << b): - return 1 - return 0 - -def get_symbols(obj): - for section in obj.iter_sections(): - if isinstance(section, SymbolTableSection): - return {sym.name: sym.entry.st_value - for sym in section.iter_symbols()} - - raise LookupError("Could not find symbol table") - -# Read the parameters passed to the file -def parse_args(): - global args - - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument("-k", "--kernel", - help="Zephyr kernel image") - parser.add_argument("-o", "--output", - help="Output file into which the page tables are " - "written.") - parser.add_argument("-u", "--user-output", - help="User mode page tables for KPTI") - parser.add_argument("-v", "--verbose", action="count", default=0, - help="Print debugging information. 
Multiple " - "invocations increase verbosity") - args = parser.parse_args() - if "VERBOSE" in os.environ: - args.verbose = 1 - -def main(): - parse_args() - - with open(args.kernel, "rb") as fp: - kernel = ELFFile(fp) - syms = get_symbols(kernel) - irq_data = kernel.get_section_by_name("mmulist").data() - - pd_start_addr, regions = read_mmu_list(irq_data) - - # select the page table needed - page_table = PageMode_PAE(pd_start_addr, regions, syms, False) - - # write the binary data into the file - with open(args.output, 'wb') as fp: - fp.write(page_table.output_buffer) - - if "CONFIG_X86_KPTI" in syms: - pd_start_addr += page_table.output_offset - - user_page_table = PageMode_PAE(pd_start_addr, regions, syms, True) - with open(args.user_output, 'wb') as fp: - fp.write(user_page_table.output_buffer) - -if __name__ == "__main__": - main() diff --git a/arch/x86/ia32.cmake b/arch/x86/ia32.cmake index be15724c673..1a76fcec46e 100644 --- a/arch/x86/ia32.cmake +++ b/arch/x86/ia32.cmake @@ -105,38 +105,6 @@ add_bin_file_to_the_next_link(gen_idt_output staticIdt) add_bin_file_to_the_next_link(gen_idt_output irq_int_vector_map) add_bin_file_to_the_next_link(gen_idt_output irq_vectors_alloc) -if(CONFIG_X86_MMU) - if(CONFIG_X86_KPTI) - set(user_mmu_tables_bin user_mmu_tables.bin) - endif() - - add_custom_target( - mmu_tables_bin_target - DEPENDS - mmu_tables.bin - ${user_mmu_tables_bin} - ) - add_custom_command( - OUTPUT - mmu_tables.bin - ${user_mmu_tables_bin} - COMMAND - ${PYTHON_EXECUTABLE} - ${ZEPHYR_BASE}/arch/x86/gen_mmu_x86.py - -k $ - -o mmu_tables.bin - -u user_mmu_tables.bin - $<$:-v> - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS ${ZEPHYR_PREBUILT_EXECUTABLE} - ) - - add_bin_file_to_the_next_link( mmu_tables_bin_target mmu_tables) - if(CONFIG_X86_KPTI) - add_bin_file_to_the_next_link(mmu_tables_bin_target user_mmu_tables) - endif() -endif() - if(CONFIG_GDT_DYNAMIC) # Use gen_gdt.py and objcopy to generate gdt.o from from the elf # file ${ZEPHYR_PREBUILT_EXECUTABLE}, creating the temp file gdt.bin along the diff --git a/arch/x86/include/ia32/kernel_arch_func.h b/arch/x86/include/ia32/kernel_arch_func.h index 1ac9c6df996..b49d2adff74 100644 --- a/arch/x86/include/ia32/kernel_arch_func.h +++ b/arch/x86/include/ia32/kernel_arch_func.h @@ -28,6 +28,9 @@ extern K_THREAD_STACK_DEFINE(_interrupt_stack, CONFIG_ISR_STACK_SIZE); void z_x86_early_serial_init(void); #endif +/* Create all page tables with boot configuration and enable paging */ +void z_x86_paging_init(void); + /** * * @brief Performs architecture-specific initialization @@ -47,6 +50,9 @@ static inline void kernel_arch_init(void) #ifdef CONFIG_X86_VERY_EARLY_CONSOLE z_x86_early_serial_init(); #endif +#ifdef CONFIG_X86_MMU + z_x86_paging_init(); +#endif #if CONFIG_X86_STACK_PROTECTION z_x86_mmu_set_flags(&z_x86_kernel_pdpt, _interrupt_stack, MMU_PAGE_SIZE, MMU_ENTRY_READ, MMU_PTE_RW_MASK, true); @@ -95,6 +101,10 @@ static inline struct x86_mmu_pdpt *z_x86_pdpt_get(struct k_thread *thread) return &header->kernel_data.pdpt; } #endif /* CONFIG_USERSPACE */ + +/* ASM code to fiddle with registers to enable the MMU with PAE paging */ +void z_x86_enable_paging(void); + #include /* For size_t */ #ifdef __cplusplus diff --git a/arch/x86/include/ia32/mmustructs.h b/arch/x86/include/ia32/mmustructs.h index 16e14aeadf1..e303a3e685d 100644 --- a/arch/x86/include/ia32/mmustructs.h +++ b/arch/x86/include/ia32/mmustructs.h @@ -116,29 +116,6 @@ #define MMU_ENTRY_EXECUTE_DISABLE 0x8000000000000000ULL -/* Special flag argument for MMU_BOOT region 
invocations */ - -/* Indicates that pages within this region may have their user/supervisor - * permissions adjusted at runtime. Unnecessary if MMU_ENTRY_USER is already - * set. - * - * The result of this is a guarantee that the 'user' bit for all PDEs referring - * to the region will be set, even if the boot configuration has no user pages - * in it. - */ -#define MMU_ENTRY_RUNTIME_USER 0x10000000ULL - -/* Indicates that pages within this region may have their read/write - * permissions adjusted at runtime. Unnecessary if MMU_ENTRY_WRITE is already - * set. - * - * The result of this is a guarantee that the 'write' bit for all PDEs - * referring to the region will be set, even if the boot configuration has no - * writable pages in it. - */ -#define MMU_ENTRY_RUNTIME_WRITE 0x20000000ULL - - /* Helper macros to ease the usage of the MMU page table structures. */ @@ -228,8 +205,8 @@ * In order to populate this structure use macro MMU_BOOT_REGION. */ struct mmu_region { - u32_t address; /*Start address of the memory region */ - u32_t size; /* Size of the memory region*/ + uintptr_t address; /*Start address of the memory region */ + size_t size; /* Size of the memory region*/ u64_t flags; /* Permissions needed for this region*/ }; diff --git a/boards/x86/qemu_x86/qemu_x86_coverage_defconfig b/boards/x86/qemu_x86/qemu_x86_coverage_defconfig index 1049d1004d4..8b91763da45 100644 --- a/boards/x86/qemu_x86/qemu_x86_coverage_defconfig +++ b/boards/x86/qemu_x86/qemu_x86_coverage_defconfig @@ -16,7 +16,9 @@ CONFIG_SYS_CLOCK_HW_CYCLES_PER_SEC=25000000 CONFIG_TEST_RANDOM_GENERATOR=y CONFIG_XIP=y CONFIG_X86_MMU=y +CONFIG_X86_MMU_PAGE_POOL_PAGES=15 CONFIG_DEBUG_INFO=y CONFIG_SCHED_SCALABLE=y CONFIG_WAITQ_SCALABLE=y CONFIG_COVERAGE=y +CONFIG_X86_VERY_EARLY_CONSOLE=y diff --git a/boards/x86/qemu_x86/qemu_x86_defconfig b/boards/x86/qemu_x86/qemu_x86_defconfig index f90157126c0..441efc1f4d4 100644 --- a/boards/x86/qemu_x86/qemu_x86_defconfig +++ b/boards/x86/qemu_x86/qemu_x86_defconfig @@ -15,6 +15,7 @@ CONFIG_UART_CONSOLE=y CONFIG_SYS_CLOCK_HW_CYCLES_PER_SEC=25000000 CONFIG_TEST_RANDOM_GENERATOR=y CONFIG_X86_MMU=y +CONFIG_X86_MMU_PAGE_POOL_PAGES=15 CONFIG_DEBUG_INFO=y CONFIG_SCHED_SCALABLE=y CONFIG_WAITQ_SCALABLE=y diff --git a/boards/x86/qemu_x86/qemu_x86_iamcu_defconfig b/boards/x86/qemu_x86/qemu_x86_iamcu_defconfig index 0c09112a5e9..ced7cbd4b29 100644 --- a/boards/x86/qemu_x86/qemu_x86_iamcu_defconfig +++ b/boards/x86/qemu_x86/qemu_x86_iamcu_defconfig @@ -15,4 +15,5 @@ CONFIG_UART_CONSOLE=y CONFIG_SYS_CLOCK_HW_CYCLES_PER_SEC=25000000 CONFIG_X86_IAMCU=y CONFIG_X86_MMU=y +CONFIG_X86_MMU_PAGE_POOL_PAGES=15 CONFIG_DEBUG_INFO=y diff --git a/include/arch/x86/ia32/linker.ld b/include/arch/x86/ia32/linker.ld index 4d4bdf155ad..56205f802d9 100644 --- a/include/arch/x86/ia32/linker.ld +++ b/include/arch/x86/ia32/linker.ld @@ -119,6 +119,13 @@ SECTIONS *(".rodata.*") *(.gnu.linkonce.r.*) +#ifdef CONFIG_X86_MMU + . = ALIGN(4); + z_x86_mmulist_start = .; + KEEP(*(.mmulist)) + z_x86_mmulist_end = .; +#endif /* CONFIG_X86_MMU */ + #ifndef CONFIG_DYNAMIC_INTERRUPTS . = ALIGN(8); _idt_base_address = .; @@ -349,29 +356,6 @@ SECTIONS #include -#ifdef CONFIG_X86_MMU - /* Can't really predict the size of this section. Anything after this - * should not be affected if addresses change between builds (currently - * just the gperf tables which is fine). - * - * However, __mmu_tables_start *must* remain stable between builds, - * we can't have anything shifting the memory map beforehand. 
- */ - SECTION_DATA_PROLOGUE(mmu_tables,,) - { - /* Page Tables are located here if MMU is enabled.*/ - MMU_PAGE_ALIGN - __mmu_tables_start = .; - z_x86_kernel_pdpt = .; - KEEP(*(mmu_tables)); -#ifdef CONFIG_X86_KPTI - z_x86_user_pdpt = .; - KEEP(*(user_mmu_tables)); -#endif /* CONFIG_X86_KPTI */ - __mmu_tables_end = .; - } GROUP_DATA_LINK_IN(RAMABLE_REGION, ROMABLE_REGION) -#endif - #include MMU_PAGE_ALIGN @@ -397,23 +381,6 @@ SECTIONS KEEP(*(.intList)) KEEP(*(.gnu.linkonce.intList.*)) } > IDT_LIST - -#ifdef CONFIG_X86_MMU - /* Memory management unit*/ - SECTION_PROLOGUE(mmulist,,) - { - /* get size of the mmu lists needed for gen_mmu_x86.py*/ - LONG((__MMU_LIST_END__ - __MMU_LIST_START__) / __MMU_REGION_SIZEOF) - /* Get the start of mmu tables in data section so that the address - * of the page tables can be calculated. - */ - LONG(__mmu_tables_start) - __MMU_LIST_START__ = .; - KEEP(*(.mmulist)) - __MMU_LIST_END__ = .; - } > MMU_LIST -#endif /* CONFIG_X86_MMU */ - #else /DISCARD/ : { @@ -421,7 +388,6 @@ SECTIONS KEEP(*(.spurNoErrIsr)) KEEP(*(.intList)) KEEP(*(.gnu.linkonce.intList.*)) - KEEP(*(.mmulist)) } #endif diff --git a/soc/x86/apollo_lake/linker.ld b/soc/x86/apollo_lake/linker.ld index 5afcb726435..a5db250de47 100644 --- a/soc/x86/apollo_lake/linker.ld +++ b/soc/x86/apollo_lake/linker.ld @@ -32,9 +32,6 @@ MEMORY */ IDT_LIST : ORIGIN = 2K, LENGTH = 2K -#ifdef CONFIG_X86_MMU - MMU_LIST : ORIGIN = 4k, LENGTH = 1K -#endif } #include diff --git a/soc/x86/atom/linker.ld b/soc/x86/atom/linker.ld index c71deac3478..75064e72205 100644 --- a/soc/x86/atom/linker.ld +++ b/soc/x86/atom/linker.ld @@ -39,9 +39,6 @@ MEMORY */ IDT_LIST : ORIGIN = 2K, LENGTH = 2K -#ifdef CONFIG_X86_MMU - MMU_LIST : ORIGIN = 4k, LENGTH = 1K -#endif } #include diff --git a/soc/x86/ia32/linker.ld b/soc/x86/ia32/linker.ld index b7c9b42ec58..218f61fd588 100644 --- a/soc/x86/ia32/linker.ld +++ b/soc/x86/ia32/linker.ld @@ -40,7 +40,6 @@ MEMORY */ IDT_LIST : ORIGIN = 0xFFFF1000, LENGTH = 2K - MMU_LIST : ORIGIN = 0xFFFF2000, LENGTH = 1K } #include