zephyr/arch/x86/core/x86_mmu.c

/*
* Copyright (c) 2011-2014 Wind River Systems, Inc.
* Copyright (c) 2017 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <kernel.h>
#include <arch/x86/mmustructs.h>
#include <linker/linker-defs.h>
#include <kernel_internal.h>
#include <kernel_structs.h>
#include <init.h>
#include <ctype.h>
#include <string.h>
/* Despite our use of PAE page tables, we do not (and will never) actually
* support PAE. Use a 64-bit x86 target if you have that much RAM.
*/
BUILD_ASSERT(DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024ULL) - 1ULL <=
(unsigned long long)UINTPTR_MAX);
/* Common regions for all x86 processors.
* Peripheral I/O ranges are configured at the SOC level.
*/
/* Mark text and rodata as read-only.
* Userspace may read all text and rodata.
*/
MMU_BOOT_REGION(&_image_text_start, &_image_text_size,
Z_X86_MMU_US);
MMU_BOOT_REGION(&_image_rodata_start, &_image_rodata_size,
Z_X86_MMU_US | Z_X86_MMU_XD);
#ifdef CONFIG_USERSPACE
MMU_BOOT_REGION(&_app_smem_start, &_app_smem_size,
Z_X86_MMU_RW | Z_X86_MMU_XD);
#endif
#ifdef CONFIG_COVERAGE_GCOV
MMU_BOOT_REGION(&__gcov_bss_start, &__gcov_bss_size,
Z_X86_MMU_RW | Z_X86_MMU_US | Z_X86_MMU_XD);
#endif
#ifdef CONFIG_X86_LONGMODE
extern char _locore_start[];
extern char _locore_size[];
extern char _lorodata_start[];
extern char _lorodata_size[];
extern char _lodata_start[];
extern char _lodata_size[];
/* Early boot regions that need to be in low memory to be comprehensible
* by the CPU in 16-bit mode
*/
MMU_BOOT_REGION(&_locore_start, &_locore_size, 0);
MMU_BOOT_REGION(&_lorodata_start, &_lorodata_size, Z_X86_MMU_XD);
MMU_BOOT_REGION(&_lodata_start, &_lodata_size, Z_X86_MMU_RW | Z_X86_MMU_XD);
#endif
/* __kernel_ram_size includes all unused memory, which is used for heaps.
* User threads cannot access this unless granted at runtime. This is done
* automatically for stacks.
*/
MMU_BOOT_REGION(&__kernel_ram_start, &__kernel_ram_size,
Z_X86_MMU_RW | Z_X86_MMU_XD);
/*
* Inline functions for setting memory addresses in page table structures
*/
#ifdef CONFIG_X86_LONGMODE
static inline void pml4e_update_pdpt(u64_t *pml4e, struct x86_mmu_pdpt *pdpt)
{
uintptr_t pdpt_addr = (uintptr_t)pdpt;
*pml4e = ((*pml4e & ~Z_X86_MMU_PML4E_PDPT_MASK) |
(pdpt_addr & Z_X86_MMU_PML4E_PDPT_MASK));
}
#endif /* CONFIG_X86_LONGMODE */
static inline void pdpte_update_pd(u64_t *pdpte, struct x86_mmu_pd *pd)
{
uintptr_t pd_addr = (uintptr_t)pd;
#ifdef CONFIG_X86_LONGMODE
__ASSERT((*pdpte & Z_X86_MMU_PS) == 0, "PDPT is for 1GB page");
#endif
*pdpte = ((*pdpte & ~Z_X86_MMU_PDPTE_PD_MASK) |
(pd_addr & Z_X86_MMU_PDPTE_PD_MASK));
}
static inline void pde_update_pt(u64_t *pde, struct x86_mmu_pt *pt)
{
uintptr_t pt_addr = (uintptr_t)pt;
__ASSERT((*pde & Z_X86_MMU_PS) == 0, "pde is for 2MB page");
*pde = ((*pde & ~Z_X86_MMU_PDE_PT_MASK) |
(pt_addr & Z_X86_MMU_PDE_PT_MASK));
}
static inline void pte_update_addr(u64_t *pte, uintptr_t addr)
{
*pte = ((*pte & ~Z_X86_MMU_PTE_ADDR_MASK) |
(addr & Z_X86_MMU_PTE_ADDR_MASK));
}
/*
* Functions for dumping page tables to console
*/
/* Works for PDPT, PD, and PT entries; the bits we check here are all the
* same.
*
* Not trying to capture every flag, just the most interesting stuff:
* Present, Write, XD, and User, in typically encountered combinations.
*/
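/* Legend produced by get_entry_code(): '.' = non-present, 'r' = read-only,
* 'w' = read/write, 'x' = read/execute, 'a' = read/write/execute; the
* uppercase form of each letter means the mapping is also accessible to
* user mode.
*/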
static char get_entry_code(u64_t value)
{
char ret;
if ((value & Z_X86_MMU_P) == 0) {
ret = '.';
} else {
if ((value & Z_X86_MMU_RW) != 0) {
/* Writable page */
if ((value & Z_X86_MMU_XD) != 0) {
/* RW */
ret = 'w';
} else {
/* RWX */
ret = 'a';
}
} else {
if ((value & Z_X86_MMU_XD) != 0) {
/* R */
ret = 'r';
} else {
/* RX */
ret = 'x';
}
}
if ((value & Z_X86_MMU_US) != 0) {
/* Uppercase indicates user mode access */
ret = toupper(ret);
}
}
return ret;
}
static void print_entries(u64_t entries_array[], size_t count)
{
int column = 0;
for (int i = 0; i < count; i++) {
printk("%c", get_entry_code(entries_array[i]));
column++;
if (column == 64) {
column = 0;
printk("\n");
}
}
if (column != 0) {
printk("\n");
}
}
static void z_x86_dump_pt(struct x86_mmu_pt *pt, uintptr_t base, int index)
{
printk("Page table %d for 0x%016lX - 0x%016lX at %p\n",
index, base, base + Z_X86_PT_AREA - 1, pt);
print_entries(pt->entry, Z_X86_NUM_PT_ENTRIES);
}
static void z_x86_dump_pd(struct x86_mmu_pd *pd, uintptr_t base, int index)
{
printk("Page directory %d for 0x%016lX - 0x%016lX at %p\n",
index, base, base + Z_X86_PD_AREA - 1, pd);
print_entries(pd->entry, Z_X86_NUM_PD_ENTRIES);
for (int i = 0; i < Z_X86_NUM_PD_ENTRIES; i++) {
struct x86_mmu_pt *pt;
u64_t pde = pd->entry[i];
if (((pde & Z_X86_MMU_P) == 0) || ((pde & Z_X86_MMU_PS) != 0)) {
/* Skip non-present, or 2MB directory entries, there's
* no page table to examine */
continue;
}
pt = z_x86_pde_get_pt(pde);
z_x86_dump_pt(pt, base + (i * Z_X86_PT_AREA), i);
}
}
static void z_x86_dump_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t base,
int index)
{
printk("Page directory pointer table %d for 0x%0816lX - 0x%016lX at %p\n",
index, base, base + Z_X86_PDPT_AREA - 1, pdpt);
print_entries(pdpt->entry, Z_X86_NUM_PDPT_ENTRIES);
for (int i = 0; i < Z_X86_NUM_PDPT_ENTRIES; i++) {
struct x86_mmu_pd *pd;
u64_t pdpte = pdpt->entry[i];
if ((pdpte & Z_X86_MMU_P) == 0) {
continue;
}
#ifdef CONFIG_X86_LONGMODE
if ((pdpte & Z_X86_MMU_PS) != 0) {
continue;
}
#endif
pd = z_x86_pdpte_get_pd(pdpte);
z_x86_dump_pd(pd, base + (i * Z_X86_PD_AREA), i);
}
}
#ifdef CONFIG_X86_LONGMODE
static void z_x86_dump_pml4(struct x86_mmu_pml4 *pml4)
{
printk("Page mapping level 4 at %p for all memory addresses\n", pml4);
print_entries(pml4->entry, Z_X86_NUM_PML4_ENTRIES);
for (int i = 0; i < Z_X86_NUM_PML4_ENTRIES; i++) {
struct x86_mmu_pdpt *pdpt;
u64_t pml4e = pml4->entry[i];
if ((pml4e & Z_X86_MMU_P) == 0) {
continue;
}
pdpt = z_x86_pml4e_get_pdpt(pml4e);
z_x86_dump_pdpt(pdpt, i * Z_X86_PDPT_AREA, i);
}
}
void z_x86_dump_page_tables(struct x86_page_tables *ptables)
{
z_x86_dump_pml4(z_x86_get_pml4(ptables));
}
#else
void z_x86_dump_page_tables(struct x86_page_tables *ptables)
{
z_x86_dump_pdpt(z_x86_get_pdpt(ptables, 0), 0, 0);
}
#endif
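/* Illustrative usage (a sketch, not invoked by this file): dumping the
* boot-time kernel mappings from a debug path looks like
*
*     z_x86_dump_page_tables(&z_x86_kernel_ptables);
*
* where z_x86_kernel_ptables is the set of kernel tables defined later in
* this file.
*/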
void z_x86_mmu_get_flags(struct x86_page_tables *ptables, void *addr,
u64_t *pde_flags, u64_t *pte_flags)
{
*pde_flags = *z_x86_get_pde(ptables, (uintptr_t)addr) &
~Z_X86_MMU_PDE_PT_MASK;
if ((*pde_flags & Z_X86_MMU_P) != 0) {
*pte_flags = *z_x86_get_pte(ptables, (uintptr_t)addr) &
~Z_X86_MMU_PTE_ADDR_MASK;
} else {
*pte_flags = 0;
}
}
/* Given an address/size pair, which corresponds to some memory address
* within a table of table_size, return the maximum number of bytes to
* examine so we look just to the end of the table and no further.
*
* If size fits entirely within the table, just return size.
*/
static size_t get_table_max(uintptr_t addr, size_t size, size_t table_size)
{
size_t table_remaining;
addr &= (table_size - 1);
table_remaining = table_size - addr;
if (size < table_remaining) {
return size;
} else {
return table_remaining;
}
}
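/* Worked example (illustrative): with table_size = Z_X86_PT_AREA (2MB,
* 0x200000), addr = 0x001FF000 and size = 0x3000, the masked offset is
* 0x1FF000, so table_remaining = 0x1000; since size exceeds that, only
* 0x1000 bytes are examined before moving on to the next page table.
*/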
/* Range [addr, addr + size) must fall within the bounds of the pt */
static int x86_mmu_validate_pt(struct x86_mmu_pt *pt, uintptr_t addr,
size_t size, bool write)
{
uintptr_t pos = addr;
size_t remaining = size;
int ret = 0;
while (true) {
u64_t pte = *z_x86_pt_get_pte(pt, pos);
if ((pte & Z_X86_MMU_P) == 0 || (pte & Z_X86_MMU_US) == 0 ||
(write && (pte & Z_X86_MMU_RW) == 0)) {
ret = -1;
break;
}
if (remaining <= MMU_PAGE_SIZE) {
break;
}
remaining -= MMU_PAGE_SIZE;
pos += MMU_PAGE_SIZE;
}
return ret;
}
/* Range [addr, addr + size) must fall within the bounds of the pd */
static int x86_mmu_validate_pd(struct x86_mmu_pd *pd, uintptr_t addr,
size_t size, bool write)
{
uintptr_t pos = addr;
size_t remaining = size;
int ret = 0;
size_t to_examine;
while (remaining) {
u64_t pde = *z_x86_pd_get_pde(pd, pos);
if ((pde & Z_X86_MMU_P) == 0 || (pde & Z_X86_MMU_US) == 0 ||
(write && (pde & Z_X86_MMU_RW) == 0)) {
ret = -1;
break;
}
to_examine = get_table_max(pos, remaining, Z_X86_PT_AREA);
if ((pde & Z_X86_MMU_PS) == 0) {
/* Not a 2MB PDE. Need to check all the linked
* tables for this entry
*/
struct x86_mmu_pt *pt;
pt = z_x86_pde_get_pt(pde);
ret = x86_mmu_validate_pt(pt, pos, to_examine, write);
if (ret != 0) {
break;
}
} else {
ret = 0;
}
remaining -= to_examine;
pos += to_examine;
}
return ret;
}
/* Range [addr, addr + size) must fall within the bounds of the pdpt */
static int x86_mmu_validate_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t addr,
size_t size, bool write)
{
uintptr_t pos = addr;
size_t remaining = size;
int ret = 0;
size_t to_examine;
while (remaining) {
u64_t pdpte = *z_x86_pdpt_get_pdpte(pdpt, pos);
if ((pdpte & Z_X86_MMU_P) == 0) {
/* Non-present */
ret = -1;
break;
}
#ifdef CONFIG_X86_LONGMODE
if ((pdpte & Z_X86_MMU_US) == 0 ||
(write && (pdpte & Z_X86_MMU_RW) == 0)) {
ret = -1;
break;
}
#endif
to_examine = get_table_max(pos, remaining, Z_X86_PD_AREA);
#ifdef CONFIG_X86_LONGMODE
/* Check if 1GB page, if not, examine linked page directory */
if ((pdpte & Z_X86_MMU_PS) == 0) {
#endif
struct x86_mmu_pd *pd = z_x86_pdpte_get_pd(pdpte);
ret = x86_mmu_validate_pd(pd, pos, to_examine, write);
if (ret != 0) {
break;
}
#ifdef CONFIG_X86_LONGMODE
} else {
ret = 0;
}
#endif
remaining -= to_examine;
pos += to_examine;
}
return ret;
}
#ifdef CONFIG_X86_LONGMODE
static int x86_mmu_validate_pml4(struct x86_mmu_pml4 *pml4, uintptr_t addr,
size_t size, bool write)
{
uintptr_t pos = addr;
size_t remaining = size;
int ret = 0;
size_t to_examine;
while (remaining) {
u64_t pml4e = *z_x86_pml4_get_pml4e(pml4, pos);
struct x86_mmu_pdpt *pdpt;
if ((pml4e & Z_X86_MMU_P) == 0 || (pml4e & Z_X86_MMU_US) == 0 ||
(write && (pml4e & Z_X86_MMU_RW) == 0)) {
ret = -1;
break;
}
to_examine = get_table_max(pos, remaining, Z_X86_PDPT_AREA);
pdpt = z_x86_pml4e_get_pdpt(pml4e);
ret = x86_mmu_validate_pdpt(pdpt, pos, to_examine, write);
if (ret != 0) {
break;
}
remaining -= to_examine;
pos += to_examine;
}
return ret;
}
#endif /* CONFIG_X86_LONGMODE */
int z_x86_mmu_validate(struct x86_page_tables *ptables, void *addr, size_t size,
bool write)
{
int ret;
#ifdef CONFIG_X86_LONGMODE
struct x86_mmu_pml4 *pml4 = z_x86_get_pml4(ptables);
ret = x86_mmu_validate_pml4(pml4, (uintptr_t)addr, size, write);
#else
struct x86_mmu_pdpt *pdpt = z_x86_get_pdpt(ptables, (uintptr_t)addr);
ret = x86_mmu_validate_pdpt(pdpt, (uintptr_t)addr, size, write);
#endif
#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
__asm__ volatile ("lfence" : : : "memory");
#endif
return ret;
}
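/* Illustrative sketch (hypothetical helper, not part of this file): how a
* caller might check that a user-supplied buffer is writable from user mode
* before touching it, using the current thread's page tables.
*/
#if 0
static int example_validate_user_buffer(void *buf, size_t size)
{
	struct x86_page_tables *ptables =
		z_x86_thread_page_tables_get(_current);

	/* Returns 0 if every page in [buf, buf + size) is present,
	 * user-accessible, and writable; -1 otherwise.
	 */
	return z_x86_mmu_validate(ptables, buf, size, true);
}
#endif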
static inline void tlb_flush_page(void *addr)
{
/* Invalidate TLB entries corresponding to the page containing the
* specified address
*/
char *page = (char *)addr;
__asm__ ("invlpg %0" :: "m" (*page));
}
#ifdef CONFIG_X86_LONGMODE
#define PML4E_FLAGS_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | Z_X86_MMU_P)
#define PDPTE_FLAGS_MASK PML4E_FLAGS_MASK
#define PDE_FLAGS_MASK PDPTE_FLAGS_MASK
#else
#define PDPTE_FLAGS_MASK Z_X86_MMU_P
#define PDE_FLAGS_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | \
PDPTE_FLAGS_MASK)
#endif
#define PTE_FLAGS_MASK (PDE_FLAGS_MASK | Z_X86_MMU_XD | \
Z_X86_MMU_PWT | \
Z_X86_MMU_PCD)
void z_x86_mmu_set_flags(struct x86_page_tables *ptables, void *ptr,
size_t size, u64_t flags, u64_t mask, bool flush)
{
uintptr_t addr = (uintptr_t)ptr;
__ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided");
__ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided");
/* L1TF mitigation: non-present PTEs will have address fields
* zeroed. Expand the mask to include address bits if we are changing
* the present bit.
*/
if ((mask & Z_X86_MMU_P) != 0) {
mask |= Z_X86_MMU_PTE_ADDR_MASK;
}
/* NOTE: All of this code assumes that 2MB or 1GB pages are not being
* modified.
*/
while (size != 0) {
u64_t *pte;
u64_t *pde;
u64_t *pdpte;
#ifdef CONFIG_X86_LONGMODE
u64_t *pml4e;
#endif
u64_t cur_flags = flags;
bool exec = (flags & Z_X86_MMU_XD) == 0;
#ifdef CONFIG_X86_LONGMODE
pml4e = z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr);
__ASSERT((*pml4e & Z_X86_MMU_P) != 0,
"set flags on non-present PML4e");
*pml4e |= (flags & PML4E_FLAGS_MASK);
if (exec) {
*pml4e &= ~Z_X86_MMU_XD;
}
pdpte = z_x86_pdpt_get_pdpte(z_x86_pml4e_get_pdpt(*pml4e),
addr);
#else
pdpte = z_x86_pdpt_get_pdpte(z_x86_get_pdpt(ptables, addr),
addr);
#endif
__ASSERT((*pdpte & Z_X86_MMU_P) != 0,
"set flags on non-present PDPTE");
*pdpte |= (flags & PDPTE_FLAGS_MASK);
#ifdef CONFIG_X86_LONGMODE
if (exec) {
*pdpte &= ~Z_X86_MMU_XD;
}
#endif
pde = z_x86_pd_get_pde(z_x86_pdpte_get_pd(*pdpte), addr);
__ASSERT((*pde & Z_X86_MMU_P) != 0,
"set flags on non-present PDE");
*pde |= (flags & PDE_FLAGS_MASK);
/* If any flags enable execution, clear execute disable at the
* page directory level
*/
if (exec) {
*pde &= ~Z_X86_MMU_XD;
}
pte = z_x86_pt_get_pte(z_x86_pde_get_pt(*pde), addr);
/* If we're setting the present bit, restore the address
* field. If we're clearing it, then the address field
* will be zeroed instead, mapping the PTE to the NULL page.
*/
if ((mask & Z_X86_MMU_P) != 0 && ((flags & Z_X86_MMU_P) != 0)) {
cur_flags |= addr;
}
*pte = (*pte & ~mask) | cur_flags;
if (flush) {
tlb_flush_page((void *)addr);
}
size -= MMU_PAGE_SIZE;
addr += MMU_PAGE_SIZE;
}
}
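/* Illustrative sketch (hypothetical, not part of this file): revoke write
* access to a single page-aligned page in the kernel tables and flush its
* TLB entry. Passing flags = 0 with mask = Z_X86_MMU_RW clears the writable
* bit while leaving the other attributes intact.
*/
#if 0
static void example_make_page_readonly(void *page)
{
	/* page must be MMU_PAGE_SIZE aligned, per the asserts above */
	z_x86_mmu_set_flags(&z_x86_kernel_ptables, page, MMU_PAGE_SIZE,
			    0, Z_X86_MMU_RW, true);
}
#endif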
static char __aligned(MMU_PAGE_SIZE)
page_pool[MMU_PAGE_SIZE * CONFIG_X86_MMU_PAGE_POOL_PAGES];
static char *page_pos = page_pool + sizeof(page_pool);
static void *get_page(void)
{
page_pos -= MMU_PAGE_SIZE;
__ASSERT(page_pos >= page_pool, "out of MMU pages\n");
return page_pos;
}
#ifdef CONFIG_X86_LONGMODE
#define PTABLES_ALIGN 4096
#else
#define PTABLES_ALIGN 32
#endif
__aligned(PTABLES_ALIGN) struct x86_page_tables z_x86_kernel_ptables;
#ifdef CONFIG_X86_KPTI
__aligned(PTABLES_ALIGN) struct x86_page_tables z_x86_user_ptables;
#endif
extern char z_shared_kernel_page_start[];
static inline bool is_within_system_ram(uintptr_t addr)
{
return (addr >= DT_PHYS_RAM_ADDR) &&
(addr < (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U)));
}
/* Ignored bit position at all levels */
#define IGNORED BIT64(11)
static void maybe_clear_xd(u64_t *entry, bool exec)
{
/* Execute disable bit needs special handling, we should only set it at
* intermediate levels if ALL containing pages have XD set (instead of
* just one).
*
* Use an ignored bit position in the PDE to store a marker on whether
* any configured region allows execution.
*/
if (exec) {
*entry |= IGNORED;
*entry &= ~Z_X86_MMU_XD;
} else if ((*entry & IGNORED) == 0) {
*entry |= Z_X86_MMU_XD;
}
}
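/* Example of the resulting behavior: if the first region mapped under a PDE
* is execute-disabled, the PDE gets XD set; as soon as any later region under
* that same PDE allows execution, the IGNORED marker is set and XD is cleared
* and stays cleared for that PDE.
*/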
static void add_mmu_region_page(struct x86_page_tables *ptables,
uintptr_t addr, u64_t flags, bool user_table)
{
#ifdef CONFIG_X86_LONGMODE
u64_t *pml4e;
#endif
struct x86_mmu_pdpt *pdpt;
u64_t *pdpte;
struct x86_mmu_pd *pd;
u64_t *pde;
struct x86_mmu_pt *pt;
u64_t *pte;
bool exec = (flags & Z_X86_MMU_XD) == 0;
#ifdef CONFIG_X86_KPTI
/* If we are generating a page table for user mode, and this address
* does not have the user flag set, and this address falls outside
* of system RAM, then don't bother generating any tables for it,
* we will never need them later as memory domains are limited to
* regions within system RAM.
*/
if (user_table && (flags & Z_X86_MMU_US) == 0 &&
!is_within_system_ram(addr)) {
return;
}
#endif
#ifdef CONFIG_X86_LONGMODE
pml4e = z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr);
if ((*pml4e & Z_X86_MMU_P) == 0) {
pdpt = get_page();
pml4e_update_pdpt(pml4e, pdpt);
} else {
pdpt = z_x86_pml4e_get_pdpt(*pml4e);
}
*pml4e |= (flags & PML4E_FLAGS_MASK);
maybe_clear_xd(pml4e, exec);
#else
pdpt = z_x86_get_pdpt(ptables, addr);
#endif
/* Setup the PDPTE entry for the address, creating a page directory
* if one didn't exist
*/
pdpte = z_x86_pdpt_get_pdpte(pdpt, addr);
if ((*pdpte & Z_X86_MMU_P) == 0) {
pd = get_page();
pdpte_update_pd(pdpte, pd);
} else {
pd = z_x86_pdpte_get_pd(*pdpte);
}
*pdpte |= (flags & PDPTE_FLAGS_MASK);
#ifdef CONFIG_X86_LONGMODE
maybe_clear_xd(pdpte, exec);
#endif
/* Setup the PDE entry for the address, creating a page table
* if necessary
*/
pde = z_x86_pd_get_pde(pd, addr);
if ((*pde & Z_X86_MMU_P) == 0) {
pt = get_page();
pde_update_pt(pde, pt);
} else {
pt = z_x86_pde_get_pt(*pde);
}
*pde |= (flags & PDE_FLAGS_MASK);
maybe_clear_xd(pde, exec);
#ifdef CONFIG_X86_KPTI
if (user_table && (flags & Z_X86_MMU_US) == 0 &&
addr != (uintptr_t)(&z_shared_kernel_page_start)) {
/* All non-user accessible pages except the shared page
* are marked non-present in the page table.
*/
return;
}
#else
ARG_UNUSED(user_table);
#endif
/* Finally set up the page table entry */
pte = z_x86_pt_get_pte(pt, addr);
pte_update_addr(pte, addr);
*pte |= (flags & PTE_FLAGS_MASK);
}
static void add_mmu_region(struct x86_page_tables *ptables,
struct mmu_region *rgn,
bool user_table)
{
size_t size;
u64_t flags;
uintptr_t addr;
__ASSERT((rgn->address & MMU_PAGE_MASK) == 0U,
"unaligned address provided");
__ASSERT((rgn->size & MMU_PAGE_MASK) == 0U,
"unaligned size provided");
addr = rgn->address;
flags = rgn->flags | Z_X86_MMU_P;
/* Iterate through the region a page at a time, creating entries as
* necessary.
*/
size = rgn->size;
while (size > 0) {
add_mmu_region_page(ptables, addr, flags, user_table);
size -= MMU_PAGE_SIZE;
addr += MMU_PAGE_SIZE;
}
}
/* Called from x86's z_arch_kernel_init() */
void z_x86_paging_init(void)
{
size_t pages_free;
Z_STRUCT_SECTION_FOREACH(mmu_region, rgn) {
add_mmu_region(&z_x86_kernel_ptables, rgn, false);
#ifdef CONFIG_X86_KPTI
add_mmu_region(&z_x86_user_ptables, rgn, true);
#endif
}
pages_free = (page_pos - page_pool) / MMU_PAGE_SIZE;
if (pages_free != 0) {
printk("Optimal CONFIG_X86_MMU_PAGE_POOL_PAGES %zu\n",
CONFIG_X86_MMU_PAGE_POOL_PAGES - pages_free);
}
#ifdef CONFIG_X86_LONGMODE
/* MMU already enabled at boot for long mode, we just need to
* program CR3 with our newly generated page tables.
*/
__asm__ volatile("movq %0, %%cr3\n\t"
: : "r" (&z_x86_kernel_ptables) : "memory");
#else
z_x86_enable_paging();
#endif
}
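/* Worked example (illustrative): if CONFIG_X86_MMU_PAGE_POOL_PAGES is 20 and
* 3 pool pages remain unused after the boot tables are built, the message
* above suggests lowering the option to 17 so the spare pages are not wasted.
*/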
#ifdef CONFIG_X86_USERSPACE
int z_arch_buffer_validate(void *addr, size_t size, int write)
{
return z_x86_mmu_validate(z_x86_thread_page_tables_get(_current), addr,
size, write != 0);
}
static uintptr_t thread_pd_create(uintptr_t pages,
struct x86_page_tables *thread_ptables,
struct x86_page_tables *master_ptables)
{
uintptr_t pos = pages, phys_addr = Z_X86_PD_START;
for (int i = 0; i < Z_X86_NUM_PD; i++, phys_addr += Z_X86_PD_AREA) {
u64_t *pdpte;
struct x86_mmu_pd *master_pd, *dest_pd;
/* Obtain PD in master tables for the address range and copy
* into the per-thread PD for this range
*/
master_pd = z_x86_get_pd(master_ptables, phys_addr);
dest_pd = (struct x86_mmu_pd *)pos;
(void)memcpy(dest_pd, master_pd, sizeof(struct x86_mmu_pd));
/* Update pointer in per-thread pdpt to point to the per-thread
* directory we just copied
*/
pdpte = z_x86_get_pdpte(thread_ptables, phys_addr);
pdpte_update_pd(pdpte, dest_pd);
pos += MMU_PAGE_SIZE;
}
return pos;
}
/* thread_ptables must be initialized, as well as all the page directories */
static uintptr_t thread_pt_create(uintptr_t pages,
struct x86_page_tables *thread_ptables,
struct x86_page_tables *master_ptables)
{
uintptr_t pos = pages, phys_addr = Z_X86_PT_START;
for (int i = 0; i < Z_X86_NUM_PT; i++, phys_addr += Z_X86_PT_AREA) {
u64_t *pde;
struct x86_mmu_pt *master_pt, *dest_pt;
/* Same as we did with the directories, obtain PT in master
* tables for the address range and copy into per-thread PT
* for this range
*/
master_pt = z_x86_get_pt(master_ptables, phys_addr);
dest_pt = (struct x86_mmu_pt *)pos;
(void)memcpy(dest_pt, master_pt, sizeof(struct x86_mmu_pt));
/* And then wire this up to the relevant per-thread
* page directory entry
*/
pde = z_x86_get_pde(thread_ptables, phys_addr);
pde_update_pt(pde, dest_pt);
pos += MMU_PAGE_SIZE;
}
return pos;
}
/* Initialize the page tables for a thread. Once this is done, they will
* contain the boot-time page table configuration for a user thread. There
* are no pre-conditions on the existing state of the per-thread tables.
*/
static void copy_page_tables(struct k_thread *thread,
struct x86_page_tables *master_ptables)
{
uintptr_t pos, start;
struct x86_page_tables *thread_ptables =
z_x86_thread_page_tables_get(thread);
struct z_x86_thread_stack_header *header =
(struct z_x86_thread_stack_header *)thread->stack_obj;
__ASSERT(thread->stack_obj != NULL, "no stack object assigned");
__ASSERT(z_x86_page_tables_get() != thread_ptables,
"tables are active");
__ASSERT(((uintptr_t)thread_ptables & 0x1f) == 0,
"unaligned page tables at %p", thread_ptables);
(void)memcpy(thread_ptables, master_ptables,
sizeof(struct x86_page_tables));
/* pos represents the page we are working with in the reserved area
* in the stack buffer for per-thread tables. As we create tables in
* this area, pos is incremented to the next free page.
*
* The layout of the stack object, when this is done:
*
* +---------------------------+ <- thread->stack_obj
* | PDE(0) |
* +---------------------------+
* | ... |
* +---------------------------+
* | PDE(Z_X86_NUM_PD - 1) |
* +---------------------------+
* | PTE(0) |
* +---------------------------+
* | ... |
* +---------------------------+
* | PTE(Z_X86_NUM_PT - 1) |
* +---------------------------+ <- pos once this logic completes
* | Stack guard |
* +---------------------------+
* | Privilege elevation stack |
* | PDPT |
* +---------------------------+ <- thread->stack_info.start
* | Thread stack |
* | ... |
*
*/
start = (uintptr_t)(&header->page_tables);
pos = thread_pd_create(start, thread_ptables, master_ptables);
pos = thread_pt_create(pos, thread_ptables, master_ptables);
__ASSERT(pos == (start + Z_X86_THREAD_PT_AREA),
"wrong amount of stack object memory used");
}
static void reset_mem_partition(struct x86_page_tables *thread_ptables,
struct k_mem_partition *partition)
{
uintptr_t addr = partition->start;
size_t size = partition->size;
__ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided");
__ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided");
while (size != 0) {
u64_t *thread_pte, *master_pte;
thread_pte = z_x86_get_pte(thread_ptables, addr);
master_pte = z_x86_get_pte(&USER_PTABLES, addr);
*thread_pte = *master_pte;
size -= MMU_PAGE_SIZE;
addr += MMU_PAGE_SIZE;
}
}
static void apply_mem_partition(struct x86_page_tables *ptables,
struct k_mem_partition *partition)
{
u64_t x86_attr;
u64_t mask;
if (IS_ENABLED(CONFIG_X86_KPTI)) {
x86_attr = partition->attr | Z_X86_MMU_P;
mask = K_MEM_PARTITION_PERM_MASK | Z_X86_MMU_P;
} else {
x86_attr = partition->attr;
mask = K_MEM_PARTITION_PERM_MASK;
}
__ASSERT(partition->start >= DT_PHYS_RAM_ADDR,
"region at %08lx[%u] extends below system ram start 0x%08x",
partition->start, partition->size, DT_PHYS_RAM_ADDR);
__ASSERT(((partition->start + partition->size) <=
(DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U))),
"region at %08lx[%u] end at %08lx extends beyond system ram end 0x%08x",
partition->start, partition->size,
partition->start + partition->size,
(DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U)));
z_x86_mmu_set_flags(ptables, (void *)partition->start, partition->size,
x86_attr, mask, false);
}
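/* Illustrative sketch (hypothetical names, not part of this file): a
* page-aligned partition that apply_mem_partition() could consume, granting
* user read/write access to a page-sized buffer within system RAM.
*/
#if 0
static char __aligned(MMU_PAGE_SIZE) example_buf[MMU_PAGE_SIZE];

K_MEM_PARTITION_DEFINE(example_part, example_buf, sizeof(example_buf),
		       K_MEM_PARTITION_P_RW_U_RW);
#endif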
void z_x86_apply_mem_domain(struct x86_page_tables *ptables,
struct k_mem_domain *mem_domain)
{
for (int i = 0, pcount = 0; pcount < mem_domain->num_partitions; i++) {
struct k_mem_partition *partition;
partition = &mem_domain->partitions[i];
if (partition->size == 0) {
continue;
}
pcount++;
apply_mem_partition(ptables, partition);
}
}
/* Called on creation of a user thread or when a supervisor thread drops to
* user mode.
*
* Sets up the per-thread page tables, such that when they are activated on
* context switch, everything is ready to go.
*/
void z_x86_thread_pt_init(struct k_thread *thread)
{
struct x86_page_tables *ptables = z_x86_thread_page_tables_get(thread);
/* USER_PTABLES contains the page tables with the boot-time memory
* policy. We use it as a template to set up the per-thread page
* tables.
*
* With KPTI, this is a distinct set of tables, z_x86_user_ptables, from
* the kernel page tables in z_x86_kernel_ptables; it has all non-user
* accessible pages except the trampoline page marked as non-present.
* Without KPTI, they are the same object.
*/
copy_page_tables(thread, &USER_PTABLES);
/* Enable access to the thread's own stack buffer */
z_x86_mmu_set_flags(ptables, (void *)thread->stack_info.start,
ROUND_UP(thread->stack_info.size, MMU_PAGE_SIZE),
Z_X86_MMU_P | K_MEM_PARTITION_P_RW_U_RW,
Z_X86_MMU_P | K_MEM_PARTITION_PERM_MASK,
false);
}
/*
* Memory domain interface
*
* In all cases, if one of these APIs is called on a supervisor thread,
* we don't need to do anything. If the thread later drops to user mode,
* the per-thread page tables will be generated and the memory domain
* configuration applied.
*/
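/* Illustrative sketch (hypothetical, not part of this file): the kernel-level
* memory domain API that ultimately drives the arch hooks below. Adding a
* user thread to a domain ends up in z_arch_mem_domain_thread_add().
*/
#if 0
static struct k_mem_domain example_domain;

static void example_setup_domain(struct k_mem_partition *part)
{
	/* One-partition domain containing the current thread */
	k_mem_domain_init(&example_domain, 1, &part);
	k_mem_domain_add_thread(&example_domain, k_current_get());
}
#endif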
void z_arch_mem_domain_partition_remove(struct k_mem_domain *domain,
u32_t partition_id)
{
sys_dnode_t *node, *next_node;
/* Removing a partition. Need to reset the relevant memory range
* to the defaults in USER_PTABLES for each thread.
*/
SYS_DLIST_FOR_EACH_NODE_SAFE(&domain->mem_domain_q, node, next_node) {
struct k_thread *thread =
CONTAINER_OF(node, struct k_thread, mem_domain_info);
if ((thread->base.user_options & K_USER) == 0) {
continue;
}
reset_mem_partition(z_x86_thread_page_tables_get(thread),
&domain->partitions[partition_id]);
}
}
void z_arch_mem_domain_destroy(struct k_mem_domain *domain)
{
for (int i = 0, pcount = 0; pcount < domain->num_partitions; i++) {
struct k_mem_partition *partition;
partition = &domain->partitions[i];
if (partition->size == 0) {
continue;
}
pcount++;
z_arch_mem_domain_partition_remove(domain, i);
}
}
void z_arch_mem_domain_thread_remove(struct k_thread *thread)
{
struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;
/* Non-user threads don't have per-thread page tables set up */
if ((thread->base.user_options & K_USER) == 0) {
return;
}
for (int i = 0, pcount = 0; pcount < domain->num_partitions; i++) {
struct k_mem_partition *partition;
partition = &domain->partitions[i];
if (partition->size == 0) {
continue;
}
pcount++;
reset_mem_partition(z_x86_thread_page_tables_get(thread),
partition);
}
}
void z_arch_mem_domain_partition_add(struct k_mem_domain *domain,
u32_t partition_id)
{
sys_dnode_t *node, *next_node;
SYS_DLIST_FOR_EACH_NODE_SAFE(&domain->mem_domain_q, node, next_node) {
struct k_thread *thread =
CONTAINER_OF(node, struct k_thread, mem_domain_info);
if ((thread->base.user_options & K_USER) == 0) {
continue;
}
apply_mem_partition(z_x86_thread_page_tables_get(thread),
&domain->partitions[partition_id]);
}
}
void z_arch_mem_domain_thread_add(struct k_thread *thread)
{
if ((thread->base.user_options & K_USER) == 0) {
return;
}
z_x86_apply_mem_domain(z_x86_thread_page_tables_get(thread),
thread->mem_domain_info.mem_domain);
}
int z_arch_mem_domain_max_partitions_get(void)
{
return CONFIG_MAX_DOMAIN_PARTITIONS;
}
#endif /* CONFIG_X86_USERSPACE */