zephyr/arch/x86/core/x86_mmu.c

/*
* Copyright (c) 2011-2014 Wind River Systems, Inc.
* Copyright (c) 2017 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <kernel.h>
#include <arch/x86/mmustructs.h>
#include <linker/linker-defs.h>
#include <kernel_internal.h>
#include <kernel_structs.h>
#include <init.h>
#include <ctype.h>
#include <string.h>
/* Despite our use of PAE page tables, we do not (and will never) actually
* support PAE. Use a 64-bit x86 target if you have that much RAM.
*/
BUILD_ASSERT(DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024ULL) - 1ULL <=
(unsigned long long)UINTPTR_MAX);
/* Common regions for all x86 processors.
* Peripheral I/O ranges are configured at the SOC level.
*/
/* Mark text and rodata as read-only.
* Userspace may read all text and rodata.
*/
MMU_BOOT_REGION(&_image_text_start, &_image_text_size,
Z_X86_MMU_US);
MMU_BOOT_REGION(&_image_rodata_start, &_image_rodata_size,
Z_X86_MMU_US | Z_X86_MMU_XD);
#ifdef CONFIG_USERSPACE
MMU_BOOT_REGION(&_app_smem_start, &_app_smem_size,
Z_X86_MMU_RW | Z_X86_MMU_XD);
#endif
#ifdef CONFIG_COVERAGE_GCOV
MMU_BOOT_REGION(&__gcov_bss_start, &__gcov_bss_size,
Z_X86_MMU_RW | Z_X86_MMU_US | Z_X86_MMU_XD);
#endif
#ifdef CONFIG_X86_LONGMODE
extern char _locore_start[];
extern char _locore_size[];
extern char _lorodata_start[];
extern char _lorodata_size[];
extern char _lodata_start[];
extern char _lodata_size[];
/* Early boot regions that need to be in low memory to be comprehensible
* by the CPU in 16-bit mode
*/
MMU_BOOT_REGION(&_locore_start, &_locore_size, 0);
MMU_BOOT_REGION(&_lorodata_start, &_lorodata_size, Z_X86_MMU_XD);
MMU_BOOT_REGION(&_lodata_start, &_lodata_size, Z_X86_MMU_RW | Z_X86_MMU_XD);
#endif
/* __kernel_ram_size includes all unused memory, which is used for heaps.
* User threads cannot access this unless granted at runtime. This is done
* automatically for stacks.
*/
MMU_BOOT_REGION(&__kernel_ram_start, &__kernel_ram_size,
Z_X86_MMU_RW | Z_X86_MMU_XD);
/*
* Inline functions for setting memory addresses in page table structures
*/
#ifdef CONFIG_X86_LONGMODE
static inline void pml4e_update_pdpt(u64_t *pml4e, struct x86_mmu_pdpt *pdpt)
{
uintptr_t pdpt_addr = (uintptr_t)pdpt;
*pml4e = ((*pml4e & ~Z_X86_MMU_PML4E_PDPT_MASK) |
(pdpt_addr & Z_X86_MMU_PML4E_PDPT_MASK));
}
#endif /* CONFIG_X86_LONGMODE */
static inline void pdpte_update_pd(u64_t *pdpte, struct x86_mmu_pd *pd)
{
uintptr_t pd_addr = (uintptr_t)pd;
#ifdef CONFIG_X86_LONGMODE
__ASSERT((*pdpte & Z_X86_MMU_PS) == 0, "PDPT is for 1GB page");
#endif
*pdpte = ((*pdpte & ~Z_X86_MMU_PDPTE_PD_MASK) |
(pd_addr & Z_X86_MMU_PDPTE_PD_MASK));
}
static inline void pde_update_pt(u64_t *pde, struct x86_mmu_pt *pt)
{
uintptr_t pt_addr = (uintptr_t)pt;
__ASSERT((*pde & Z_X86_MMU_PS) == 0, "pde is for 2MB page");
*pde = ((*pde & ~Z_X86_MMU_PDE_PT_MASK) |
(pt_addr & Z_X86_MMU_PDE_PT_MASK));
}
static inline void pte_update_addr(u64_t *pte, uintptr_t addr)
{
*pte = ((*pte & ~Z_X86_MMU_PTE_ADDR_MASK) |
(addr & Z_X86_MMU_PTE_ADDR_MASK));
}
/*
* Functions for dumping page tables to console
*/
/* Works for PDPT, PD, and PT entries; the bits we check here are all the
* same.
*
* Not trying to capture every flag, just the most interesting stuff:
* Present, Write, XD, and User, in typically encountered combinations.
*/
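/* Legend produced by get_entry_code(): '.' = non-present, 'r' = read-only,
* 'w' = read/write, 'x' = read/execute, 'a' = read/write/execute; the
* uppercase form of each letter means the mapping is also accessible to
* user mode.
*/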
static char get_entry_code(u64_t value)
{
char ret;
if ((value & Z_X86_MMU_P) == 0) {
ret = '.';
} else {
if ((value & Z_X86_MMU_RW) != 0) {
/* Writable page */
if ((value & Z_X86_MMU_XD) != 0) {
/* RW */
ret = 'w';
} else {
/* RWX */
ret = 'a';
}
} else {
if ((value & Z_X86_MMU_XD) != 0) {
/* R */
ret = 'r';
} else {
/* RX */
ret = 'x';
}
}
if ((value & Z_X86_MMU_US) != 0) {
/* Uppercase indicates user mode access */
ret = toupper(ret);
}
}
return ret;
}
static void print_entries(u64_t entries_array[], size_t count)
{
int column = 0;
for (int i = 0; i < count; i++) {
printk("%c", get_entry_code(entries_array[i]));
column++;
if (column == 64) {
column = 0;
printk("\n");
}
}
if (column != 0) {
printk("\n");
}
}
static void z_x86_dump_pt(struct x86_mmu_pt *pt, uintptr_t base, int index)
{
printk("Page table %d for 0x%016lX - 0x%016lX at %p\n",
index, base, base + Z_X86_PT_AREA - 1, pt);
print_entries(pt->entry, Z_X86_NUM_PT_ENTRIES);
}
static void z_x86_dump_pd(struct x86_mmu_pd *pd, uintptr_t base, int index)
{
printk("Page directory %d for 0x%016lX - 0x%016lX at %p\n",
index, base, base + Z_X86_PD_AREA - 1, pd);
print_entries(pd->entry, Z_X86_NUM_PD_ENTRIES);
for (int i = 0; i < Z_X86_NUM_PD_ENTRIES; i++) {
struct x86_mmu_pt *pt;
u64_t pde = pd->entry[i];
if (((pde & Z_X86_MMU_P) == 0) || ((pde & Z_X86_MMU_PS) != 0)) {
/* Skip non-present, or 2MB directory entries, there's
* no page table to examine */
continue;
}
pt = z_x86_pde_get_pt(pde);
z_x86_dump_pt(pt, base + (i * Z_X86_PT_AREA), i);
}
}
static void z_x86_dump_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t base,
int index)
{
printk("Page directory pointer table %d for 0x%0816lX - 0x%016lX at %p\n",
index, base, base + Z_X86_PDPT_AREA - 1, pdpt);
print_entries(pdpt->entry, Z_X86_NUM_PDPT_ENTRIES);
for (int i = 0; i < Z_X86_NUM_PDPT_ENTRIES; i++) {
struct x86_mmu_pd *pd;
u64_t pdpte = pdpt->entry[i];
if ((pdpte & Z_X86_MMU_P) == 0) {
continue;
}
#ifdef CONFIG_X86_LONGMODE
if ((pdpte & Z_X86_MMU_PS) != 0) {
continue;
}
#endif
pd = z_x86_pdpte_get_pd(pdpte);
z_x86_dump_pd(pd, base + (i * Z_X86_PD_AREA), i);
}
}
#ifdef CONFIG_X86_LONGMODE
static void z_x86_dump_pml4(struct x86_mmu_pml4 *pml4)
{
printk("Page mapping level 4 at %p for all memory addresses\n", pml4);
print_entries(pml4->entry, Z_X86_NUM_PML4_ENTRIES);
for (int i = 0; i < Z_X86_NUM_PML4_ENTRIES; i++) {
struct x86_mmu_pdpt *pdpt;
u64_t pml4e = pml4->entry[i];
if ((pml4e & Z_X86_MMU_P) == 0) {
continue;
}
pdpt = z_x86_pml4e_get_pdpt(pml4e);
z_x86_dump_pdpt(pdpt, i * Z_X86_PDPT_AREA, i);
}
}
void z_x86_dump_page_tables(struct x86_page_tables *ptables)
{
z_x86_dump_pml4(z_x86_get_pml4(ptables));
}
#else
void z_x86_dump_page_tables(struct x86_page_tables *ptables)
{
z_x86_dump_pdpt(z_x86_get_pdpt(ptables, 0), 0, 0);
}
#endif
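/* Illustrative usage (a sketch, not invoked by this file): dumping the
* boot-time kernel mappings from a debug path looks like
*
*     z_x86_dump_page_tables(&z_x86_kernel_ptables);
*
* where z_x86_kernel_ptables is the set of kernel tables defined later in
* this file.
*/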
void z_x86_mmu_get_flags(struct x86_page_tables *ptables, void *addr,
u64_t *pde_flags, u64_t *pte_flags)
{
*pde_flags = *z_x86_get_pde(ptables, (uintptr_t)addr) &
~Z_X86_MMU_PDE_PT_MASK;
if ((*pde_flags & Z_X86_MMU_P) != 0) {
*pte_flags = *z_x86_get_pte(ptables, (uintptr_t)addr) &
~Z_X86_MMU_PTE_ADDR_MASK;
} else {
*pte_flags = 0;
}
}
/* Given an address/size pair, which corresponds to some memory address
* within a table of table_size, return the maximum number of bytes to
* examine so we look just to the end of the table and no further.
*
* If size fits entirely within the table, just return size.
*/
static size_t get_table_max(uintptr_t addr, size_t size, size_t table_size)
{
size_t table_remaining;
addr &= (table_size - 1);
table_remaining = table_size - addr;
if (size < table_remaining) {
return size;
} else {
return table_remaining;
}
}
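/* Worked example (illustrative): with table_size = Z_X86_PT_AREA (2MB,
* 0x200000), addr = 0x001FF000 and size = 0x3000, the masked offset is
* 0x1FF000, so table_remaining = 0x1000; since size exceeds that, only
* 0x1000 bytes are examined before moving on to the next page table.
*/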
/* Range [addr, addr + size) must fall within the bounds of the pt */
static int x86_mmu_validate_pt(struct x86_mmu_pt *pt, uintptr_t addr,
size_t size, bool write)
{
uintptr_t pos = addr;
size_t remaining = size;
int ret = 0;
while (true) {
u64_t pte = *z_x86_pt_get_pte(pt, pos);
if ((pte & Z_X86_MMU_P) == 0 || (pte & Z_X86_MMU_US) == 0 ||
(write && (pte & Z_X86_MMU_RW) == 0)) {
ret = -1;
break;
}
if (remaining <= MMU_PAGE_SIZE) {
break;
}
remaining -= MMU_PAGE_SIZE;
pos += MMU_PAGE_SIZE;
}
return ret;
}
/* Range [addr, addr + size) must fall within the bounds of the pd */
static int x86_mmu_validate_pd(struct x86_mmu_pd *pd, uintptr_t addr,
size_t size, bool write)
{
uintptr_t pos = addr;
size_t remaining = size;
int ret = 0;
size_t to_examine;
while (remaining) {
u64_t pde = *z_x86_pd_get_pde(pd, pos);
if ((pde & Z_X86_MMU_P) == 0 || (pde & Z_X86_MMU_US) == 0 ||
(write && (pde & Z_X86_MMU_RW) == 0)) {
ret = -1;
break;
}
to_examine = get_table_max(pos, remaining, Z_X86_PT_AREA);
if ((pde & Z_X86_MMU_PS) == 0) {
/* Not a 2MB PDE. Need to check all the linked
* tables for this entry
*/
struct x86_mmu_pt *pt;
pt = z_x86_pde_get_pt(pde);
ret = x86_mmu_validate_pt(pt, pos, to_examine, write);
if (ret != 0) {
break;
}
} else {
ret = 0;
}
remaining -= to_examine;
pos += to_examine;
}
return ret;
}
/* Range [addr, addr + size) must fall within the bounds of the pdpt */
static int x86_mmu_validate_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t addr,
size_t size, bool write)
{
uintptr_t pos = addr;
size_t remaining = size;
int ret = 0;
size_t to_examine;
while (remaining) {
u64_t pdpte = *z_x86_pdpt_get_pdpte(pdpt, pos);
if ((pdpte & Z_X86_MMU_P) == 0) {
/* Non-present */
ret = -1;
break;
}
#ifdef CONFIG_X86_LONGMODE
if ((pdpte & Z_X86_MMU_US) == 0 ||
(write && (pdpte & Z_X86_MMU_RW) == 0)) {
ret = -1;
break;
}
#endif
to_examine = get_table_max(pos, remaining, Z_X86_PD_AREA);
#ifdef CONFIG_X86_LONGMODE
/* Check if 1GB page, if not, examine linked page directory */
if ((pdpte & Z_X86_MMU_PS) == 0) {
#endif
struct x86_mmu_pd *pd = z_x86_pdpte_get_pd(pdpte);
ret = x86_mmu_validate_pd(pd, pos, to_examine, write);
if (ret != 0) {
break;
}
#ifdef CONFIG_X86_LONGMODE
} else {
ret = 0;
}
#endif
remaining -= to_examine;
pos += to_examine;
}
return ret;
}
#ifdef CONFIG_X86_LONGMODE
static int x86_mmu_validate_pml4(struct x86_mmu_pml4 *pml4, uintptr_t addr,
size_t size, bool write)
{
uintptr_t pos = addr;
size_t remaining = size;
int ret = 0;
size_t to_examine;
while (remaining) {
u64_t pml4e = *z_x86_pml4_get_pml4e(pml4, pos);
struct x86_mmu_pdpt *pdpt;
if ((pml4e & Z_X86_MMU_P) == 0 || (pml4e & Z_X86_MMU_US) == 0 ||
(write && (pml4e & Z_X86_MMU_RW) == 0)) {
ret = -1;
break;
}
to_examine = get_table_max(pos, remaining, Z_X86_PDPT_AREA);
pdpt = z_x86_pml4e_get_pdpt(pml4e);
ret = x86_mmu_validate_pdpt(pdpt, pos, to_examine, write);
if (ret != 0) {
break;
}
remaining -= to_examine;
pos += to_examine;
}
return ret;
}
#endif /* CONFIG_X86_LONGMODE */
int z_x86_mmu_validate(struct x86_page_tables *ptables, void *addr, size_t size,
bool write)
{
int ret;
#ifdef CONFIG_X86_LONGMODE
struct x86_mmu_pml4 *pml4 = z_x86_get_pml4(ptables);
ret = x86_mmu_validate_pml4(pml4, (uintptr_t)addr, size, write);
#else
struct x86_mmu_pdpt *pdpt = z_x86_get_pdpt(ptables, (uintptr_t)addr);
ret = x86_mmu_validate_pdpt(pdpt, (uintptr_t)addr, size, write);
#endif
#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
__asm__ volatile ("lfence" : : : "memory");
#endif
return ret;
}
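/* Illustrative sketch (hypothetical helper, not part of this file): how a
* caller might check that a user-supplied buffer is writable from user mode
* before touching it, using the current thread's page tables.
*/
#if 0
static int example_validate_user_buffer(void *buf, size_t size)
{
	struct x86_page_tables *ptables =
		z_x86_thread_page_tables_get(_current);

	/* Returns 0 if every page in [buf, buf + size) is present,
	 * user-accessible, and writable; -1 otherwise.
	 */
	return z_x86_mmu_validate(ptables, buf, size, true);
}
#endif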
static inline void tlb_flush_page(void *addr)
{
/* Invalidate TLB entries corresponding to the page containing the
* specified address
*/
char *page = (char *)addr;
__asm__ ("invlpg %0" :: "m" (*page));
}
#ifdef CONFIG_X86_LONGMODE
#define PML4E_FLAGS_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | Z_X86_MMU_P)
#define PDPTE_FLAGS_MASK PML4E_FLAGS_MASK
#define PDE_FLAGS_MASK PDPTE_FLAGS_MASK
#else
#define PDPTE_FLAGS_MASK Z_X86_MMU_P
#define PDE_FLAGS_MASK (Z_X86_MMU_RW | Z_X86_MMU_US | \
PDPTE_FLAGS_MASK)
#endif
#define PTE_FLAGS_MASK (PDE_FLAGS_MASK | Z_X86_MMU_XD | \
Z_X86_MMU_PWT | \
Z_X86_MMU_PCD)
void z_x86_mmu_set_flags(struct x86_page_tables *ptables, void *ptr,
size_t size, u64_t flags, u64_t mask, bool flush)
{
uintptr_t addr = (uintptr_t)ptr;
__ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided");
__ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided");
/* L1TF mitigation: non-present PTEs will have address fields
* zeroed. Expand the mask to include address bits if we are changing
* the present bit.
*/
if ((mask & Z_X86_MMU_P) != 0) {
mask |= Z_X86_MMU_PTE_ADDR_MASK;
}
/* NOTE: All of this code assumes that 2MB or 1GB pages are not being
* modified.
*/
while (size != 0) {
u64_t *pte;
u64_t *pde;
u64_t *pdpte;
#ifdef CONFIG_X86_LONGMODE
u64_t *pml4e;
#endif
u64_t cur_flags = flags;
bool exec = (flags & Z_X86_MMU_XD) == 0;
#ifdef CONFIG_X86_LONGMODE
pml4e = z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr);
__ASSERT((*pml4e & Z_X86_MMU_P) != 0,
"set flags on non-present PML4e");
*pml4e |= (flags & PML4E_FLAGS_MASK);
if (exec) {
*pml4e &= ~Z_X86_MMU_XD;
}
pdpte = z_x86_pdpt_get_pdpte(z_x86_pml4e_get_pdpt(*pml4e),
addr);
#else
pdpte = z_x86_pdpt_get_pdpte(z_x86_get_pdpt(ptables, addr),
addr);
#endif
__ASSERT((*pdpte & Z_X86_MMU_P) != 0,
"set flags on non-present PDPTE");
*pdpte |= (flags & PDPTE_FLAGS_MASK);
#ifdef CONFIG_X86_LONGMODE
if (exec) {
*pdpte &= ~Z_X86_MMU_XD;
}
#endif
pde = z_x86_pd_get_pde(z_x86_pdpte_get_pd(*pdpte), addr);
__ASSERT((*pde & Z_X86_MMU_P) != 0,
"set flags on non-present PDE");
*pde |= (flags & PDE_FLAGS_MASK);
/* If any flags enable execution, clear execute disable at the
* page directory level
*/
if (exec) {
*pde &= ~Z_X86_MMU_XD;
}
pte = z_x86_pt_get_pte(z_x86_pde_get_pt(*pde), addr);
/* If we're setting the present bit, restore the address
* field. If we're clearing it, then the address field
* will be zeroed instead, mapping the PTE to the NULL page.
*/
if ((mask & Z_X86_MMU_P) != 0 && ((flags & Z_X86_MMU_P) != 0)) {
cur_flags |= addr;
}
*pte = (*pte & ~mask) | cur_flags;
if (flush) {
tlb_flush_page((void *)addr);
}
size -= MMU_PAGE_SIZE;
addr += MMU_PAGE_SIZE;
}
}
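/* Illustrative sketch (hypothetical, not part of this file): revoke write
* access to a single page-aligned page in the kernel tables and flush its
* TLB entry. Passing flags = 0 with mask = Z_X86_MMU_RW clears the writable
* bit while leaving the other attributes intact.
*/
#if 0
static void example_make_page_readonly(void *page)
{
	/* page must be MMU_PAGE_SIZE aligned, per the asserts above */
	z_x86_mmu_set_flags(&z_x86_kernel_ptables, page, MMU_PAGE_SIZE,
			    0, Z_X86_MMU_RW, true);
}
#endif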
static char __aligned(MMU_PAGE_SIZE)
page_pool[MMU_PAGE_SIZE * CONFIG_X86_MMU_PAGE_POOL_PAGES];
static char *page_pos = page_pool + sizeof(page_pool);
static void *get_page(void)
{
page_pos -= MMU_PAGE_SIZE;
__ASSERT(page_pos >= page_pool, "out of MMU pages\n");
return page_pos;
}
#ifdef CONFIG_X86_LONGMODE
#define PTABLES_ALIGN 4096
#else
#define PTABLES_ALIGN 32
#endif
__aligned(PTABLES_ALIGN) struct x86_page_tables z_x86_kernel_ptables;
#ifdef CONFIG_X86_KPTI
__aligned(PTABLES_ALIGN) struct x86_page_tables z_x86_user_ptables;
#endif
extern char z_shared_kernel_page_start[];
static inline bool is_within_system_ram(uintptr_t addr)
{
return (addr >= DT_PHYS_RAM_ADDR) &&
(addr < (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U)));
}
/* Ignored bit position at all levels */
#define IGNORED BIT64(11)
static void maybe_clear_xd(u64_t *entry, bool exec)
{
/* Execute disable bit needs special handling, we should only set it at
* intermediate levels if ALL containing pages have XD set (instead of
* just one).
*
* Use an ignored bit position in the PDE to store a marker on whether
* any configured region allows execution.
*/
if (exec) {
*entry |= IGNORED;
*entry &= ~Z_X86_MMU_XD;
} else if ((*entry & IGNORED) == 0) {
*entry |= Z_X86_MMU_XD;
}
}
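/* Example of the resulting behavior: if the first region mapped under a PDE
* is execute-disabled, the PDE gets XD set; as soon as any later region under
* that same PDE allows execution, the IGNORED marker is set and XD is cleared
* and stays cleared for that PDE.
*/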
static void add_mmu_region_page(struct x86_page_tables *ptables,
uintptr_t addr, u64_t flags, bool user_table)
{
#ifdef CONFIG_X86_LONGMODE
u64_t *pml4e;
#endif
struct x86_mmu_pdpt *pdpt;
u64_t *pdpte;
struct x86_mmu_pd *pd;
u64_t *pde;
struct x86_mmu_pt *pt;
u64_t *pte;
bool exec = (flags & Z_X86_MMU_XD) == 0;
#ifdef CONFIG_X86_KPTI
/* If we are generating a page table for user mode, and this address
* does not have the user flag set, and this address falls outside
* of system RAM, then don't bother generating any tables for it,
* we will never need them later as memory domains are limited to
* regions within system RAM.
*/
if (user_table && (flags & Z_X86_MMU_US) == 0 &&
!is_within_system_ram(addr)) {
return;
}
#endif
#ifdef CONFIG_X86_LONGMODE
pml4e = z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr);
if ((*pml4e & Z_X86_MMU_P) == 0) {
pdpt = get_page();
pml4e_update_pdpt(pml4e, pdpt);
} else {
pdpt = z_x86_pml4e_get_pdpt(*pml4e);
}
*pml4e |= (flags & PML4E_FLAGS_MASK);
maybe_clear_xd(pml4e, exec);
#else
pdpt = z_x86_get_pdpt(ptables, addr);
#endif
/* Setup the PDPTE entry for the address, creating a page directory
* if one didn't exist
*/
pdpte = z_x86_pdpt_get_pdpte(pdpt, addr);
if ((*pdpte & Z_X86_MMU_P) == 0) {
pd = get_page();
pdpte_update_pd(pdpte, pd);
} else {
pd = z_x86_pdpte_get_pd(*pdpte);
}
*pdpte |= (flags & PDPTE_FLAGS_MASK);
#ifdef CONFIG_X86_LONGMODE
maybe_clear_xd(pdpte, exec);
#endif
/* Setup the PDE entry for the address, creating a page table
* if necessary
*/
pde = z_x86_pd_get_pde(pd, addr);
if ((*pde & Z_X86_MMU_P) == 0) {
pt = get_page();
pde_update_pt(pde, pt);
} else {
pt = z_x86_pde_get_pt(*pde);
}
*pde |= (flags & PDE_FLAGS_MASK);
maybe_clear_xd(pde, exec);
#ifdef CONFIG_X86_KPTI
if (user_table && (flags & Z_X86_MMU_US) == 0 &&
addr != (uintptr_t)(&z_shared_kernel_page_start)) {
/* All non-user accessible pages except the shared page
* are marked non-present in the page table.
*/
return;
}
#else
ARG_UNUSED(user_table);
#endif
/* Finally set up the page table entry */
pte = z_x86_pt_get_pte(pt, addr);
pte_update_addr(pte, addr);
*pte |= (flags & PTE_FLAGS_MASK);
}
static void add_mmu_region(struct x86_page_tables *ptables,
struct mmu_region *rgn,
bool user_table)
{
size_t size;
u64_t flags;
uintptr_t addr;
__ASSERT((rgn->address & MMU_PAGE_MASK) == 0U,
"unaligned address provided");
__ASSERT((rgn->size & MMU_PAGE_MASK) == 0U,
"unaligned size provided");
addr = rgn->address;
flags = rgn->flags | Z_X86_MMU_P;
/* Iterate through the region a page at a time, creating entries as
* necessary.
*/
size = rgn->size;
while (size > 0) {
add_mmu_region_page(ptables, addr, flags, user_table);
size -= MMU_PAGE_SIZE;
addr += MMU_PAGE_SIZE;
}
}
/* Called from x86's z_arch_kernel_init() */
void z_x86_paging_init(void)
{
size_t pages_free;
Z_STRUCT_SECTION_FOREACH(mmu_region, rgn) {
add_mmu_region(&z_x86_kernel_ptables, rgn, false);
#ifdef CONFIG_X86_KPTI
add_mmu_region(&z_x86_user_ptables, rgn, true);
#endif
}
pages_free = (page_pos - page_pool) / MMU_PAGE_SIZE;
if (pages_free != 0) {
printk("Optimal CONFIG_X86_MMU_PAGE_POOL_PAGES %zu\n",
CONFIG_X86_MMU_PAGE_POOL_PAGES - pages_free);
}
#ifdef CONFIG_X86_LONGMODE
/* MMU already enabled at boot for long mode, we just need to
* program CR3 with our newly generated page tables.
*/
__asm__ volatile("movq %0, %%cr3\n\t"
: : "r" (&z_x86_kernel_ptables) : "memory");
#else
z_x86_enable_paging();
#endif
}
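/* Worked example (illustrative): if CONFIG_X86_MMU_PAGE_POOL_PAGES is 20 and
* 3 pool pages remain unused after the boot tables are built, the message
* above suggests lowering the option to 17 so the spare pages are not wasted.
*/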
#ifdef CONFIG_X86_USERSPACE
int z_arch_buffer_validate(void *addr, size_t size, int write)
{
return z_x86_mmu_validate(z_x86_thread_page_tables_get(_current), addr,
size, write != 0);
}
static uintptr_t thread_pd_create(uintptr_t pages,
struct x86_page_tables *thread_ptables,
struct x86_page_tables *master_ptables)
{
uintptr_t pos = pages, phys_addr = Z_X86_PD_START;
for (int i = 0; i < Z_X86_NUM_PD; i++, phys_addr += Z_X86_PD_AREA) {
u64_t *pdpte;
struct x86_mmu_pd *master_pd, *dest_pd;
/* Obtain PD in master tables for the address range and copy
* into the per-thread PD for this range
*/
master_pd = z_x86_get_pd(master_ptables, phys_addr);
dest_pd = (struct x86_mmu_pd *)pos;
(void)memcpy(dest_pd, master_pd, sizeof(struct x86_mmu_pd));
/* Update pointer in per-thread pdpt to point to the per-thread
* directory we just copied
*/
pdpte = z_x86_get_pdpte(thread_ptables, phys_addr);
pdpte_update_pd(pdpte, dest_pd);
pos += MMU_PAGE_SIZE;
}
return pos;
}
/* thread_ptables must be initialized, as well as all the page directories */
static uintptr_t thread_pt_create(uintptr_t pages,
struct x86_page_tables *thread_ptables,
struct x86_page_tables *master_ptables)
{
uintptr_t pos = pages, phys_addr = Z_X86_PT_START;
for (int i = 0; i < Z_X86_NUM_PT; i++, phys_addr += Z_X86_PT_AREA) {
u64_t *pde;
struct x86_mmu_pt *master_pt, *dest_pt;
/* Same as we did with the directories, obtain PT in master
* tables for the address range and copy into per-thread PT
* for this range
*/
master_pt = z_x86_get_pt(master_ptables, phys_addr);
dest_pt = (struct x86_mmu_pt *)pos;
(void)memcpy(dest_pt, master_pt, sizeof(struct x86_mmu_pt));
/* And then wire this up to the relevant per-thread
* page directory entry
*/
pde = z_x86_get_pde(thread_ptables, phys_addr);
pde_update_pt(pde, dest_pt);
pos += MMU_PAGE_SIZE;
}
return pos;
}
/* Initialize the page tables for a thread. Once this is done, they will
* contain the boot-time page table configuration for a user thread. There
* are no pre-conditions on the existing state of the per-thread tables.
*/
static void copy_page_tables(struct k_thread *thread,
struct x86_page_tables *master_ptables)
{
uintptr_t pos, start;
struct x86_page_tables *thread_ptables =
z_x86_thread_page_tables_get(thread);
struct z_x86_thread_stack_header *header =
(struct z_x86_thread_stack_header *)thread->stack_obj;
__ASSERT(thread->stack_obj != NULL, "no stack object assigned");
__ASSERT(z_x86_page_tables_get() != thread_ptables,
"tables are active");
__ASSERT(((uintptr_t)thread_ptables & 0x1f) == 0,
"unaligned page tables at %p", thread_ptables);
(void)memcpy(thread_ptables, master_ptables,
sizeof(struct x86_page_tables));
/* pos represents the page we are working with in the reserved area
* in the stack buffer for per-thread tables. As we create tables in
* this area, pos is incremented to the next free page.
*
* The layout of the stack object, when this is done:
*
* +---------------------------+ <- thread->stack_obj
* | PDE(0) |
* +---------------------------+
* | ... |
* +---------------------------+
* | PDE(Z_X86_NUM_PD - 1) |
* +---------------------------+
* | PTE(0) |
* +---------------------------+
* | ... |
* +---------------------------+
* | PTE(Z_X86_NUM_PT - 1) |
* +---------------------------+ <- pos once this logic completes
* | Stack guard |
* +---------------------------+
* | Privilege elevation stack |
* | PDPT |
* +---------------------------+ <- thread->stack_info.start
* | Thread stack |
* | ... |
*
*/
start = (uintptr_t)(&header->page_tables);
pos = thread_pd_create(start, thread_ptables, master_ptables);
pos = thread_pt_create(pos, thread_ptables, master_ptables);
__ASSERT(pos == (start + Z_X86_THREAD_PT_AREA),
"wrong amount of stack object memory used");
}
static void reset_mem_partition(struct x86_page_tables *thread_ptables,
struct k_mem_partition *partition)
{
uintptr_t addr = partition->start;
size_t size = partition->size;
__ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided");
__ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided");
while (size != 0) {
u64_t *thread_pte, *master_pte;
thread_pte = z_x86_get_pte(thread_ptables, addr);
master_pte = z_x86_get_pte(&USER_PTABLES, addr);
*thread_pte = *master_pte;
size -= MMU_PAGE_SIZE;
addr += MMU_PAGE_SIZE;
}
}
static void apply_mem_partition(struct x86_page_tables *ptables,
struct k_mem_partition *partition)
{
u64_t x86_attr;
u64_t mask;
if (IS_ENABLED(CONFIG_X86_KPTI)) {
x86_attr = partition->attr | Z_X86_MMU_P;
mask = K_MEM_PARTITION_PERM_MASK | Z_X86_MMU_P;
} else {
x86_attr = partition->attr;
mask = K_MEM_PARTITION_PERM_MASK;
}
__ASSERT(partition->start >= DT_PHYS_RAM_ADDR,
"region at %08lx[%u] extends below system ram start 0x%08x",
partition->start, partition->size, DT_PHYS_RAM_ADDR);
__ASSERT(((partition->start + partition->size) <=
(DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U))),
"region at %08lx[%u] end at %08lx extends beyond system ram end 0x%08x",
partition->start, partition->size,
partition->start + partition->size,
(DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U)));
z_x86_mmu_set_flags(ptables, (void *)partition->start, partition->size,
x86_attr, mask, false);
}
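/* Illustrative sketch (hypothetical names, not part of this file): a
* page-aligned partition that apply_mem_partition() could consume, granting
* user read/write access to a page-sized buffer within system RAM.
*/
#if 0
static char __aligned(MMU_PAGE_SIZE) example_buf[MMU_PAGE_SIZE];

K_MEM_PARTITION_DEFINE(example_part, example_buf, sizeof(example_buf),
		       K_MEM_PARTITION_P_RW_U_RW);
#endif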
void z_x86_apply_mem_domain(struct x86_page_tables *ptables,
struct k_mem_domain *mem_domain)
{
for (int i = 0, pcount = 0; pcount < mem_domain->num_partitions; i++) {
struct k_mem_partition *partition;
partition = &mem_domain->partitions[i];
if (partition->size == 0) {
continue;
}
pcount++;
apply_mem_partition(ptables, partition);
}
}
/* Called on creation of a user thread or when a supervisor thread drops to
* user mode.
*
* Sets up the per-thread page tables, such that when they are activated on
* context switch, everything is ready to go.
*/
void z_x86_thread_pt_init(struct k_thread *thread)
{
struct x86_page_tables *ptables = z_x86_thread_page_tables_get(thread);
/* USER_PTABLES contains the page tables with the boot-time memory
* policy. We use it as a template to set up the per-thread page
* tables.
*
* With KPTI, this is a distinct set of tables, z_x86_user_ptables, from
* the kernel page tables in z_x86_kernel_ptables; it has all non-user
* accessible pages except the trampoline page marked as non-present.
* Without KPTI, they are the same object.
*/
copy_page_tables(thread, &USER_PTABLES);
/* Enable access to the thread's own stack buffer */
z_x86_mmu_set_flags(ptables, (void *)thread->stack_info.start,
ROUND_UP(thread->stack_info.size, MMU_PAGE_SIZE),
Z_X86_MMU_P | K_MEM_PARTITION_P_RW_U_RW,
Z_X86_MMU_P | K_MEM_PARTITION_PERM_MASK,
false);
}
/*
* Memory domain interface
*
* In all cases, if one of these APIs is called on a supervisor thread,
* we don't need to do anything. If the thread later drops to user mode,
* the per-thread page tables will be generated and the memory domain
* configuration applied.
*/
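/* Illustrative sketch (hypothetical, not part of this file): the kernel-level
* memory domain API that ultimately drives the arch hooks below. Adding a
* user thread to a domain ends up in z_arch_mem_domain_thread_add().
*/
#if 0
static struct k_mem_domain example_domain;

static void example_setup_domain(struct k_mem_partition *part)
{
	/* One-partition domain containing the current thread */
	k_mem_domain_init(&example_domain, 1, &part);
	k_mem_domain_add_thread(&example_domain, k_current_get());
}
#endif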
void z_arch_mem_domain_partition_remove(struct k_mem_domain *domain,
u32_t partition_id)
{
sys_dnode_t *node, *next_node;
/* Removing a partition. Need to reset the relevant memory range
* to the defaults in USER_PTABLES for each thread.
*/
SYS_DLIST_FOR_EACH_NODE_SAFE(&domain->mem_domain_q, node, next_node) {
struct k_thread *thread =
CONTAINER_OF(node, struct k_thread, mem_domain_info);
if ((thread->base.user_options & K_USER) == 0) {
continue;
}
reset_mem_partition(z_x86_thread_page_tables_get(thread),
&domain->partitions[partition_id]);
}
}
void z_arch_mem_domain_destroy(struct k_mem_domain *domain)
{
for (int i = 0, pcount = 0; pcount < domain->num_partitions; i++) {
struct k_mem_partition *partition;
partition = &domain->partitions[i];
if (partition->size == 0) {
continue;
}
pcount++;
z_arch_mem_domain_partition_remove(domain, i);
}
}
void z_arch_mem_domain_thread_remove(struct k_thread *thread)
{
struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;
/* Non-user threads don't have per-thread page tables set up */
if ((thread->base.user_options & K_USER) == 0) {
return;
}
for (int i = 0, pcount = 0; pcount < domain->num_partitions; i++) {
struct k_mem_partition *partition;
partition = &domain->partitions[i];
if (partition->size == 0) {
continue;
}
pcount++;
reset_mem_partition(z_x86_thread_page_tables_get(thread),
partition);
}
}
void z_arch_mem_domain_partition_add(struct k_mem_domain *domain,
u32_t partition_id)
{
sys_dnode_t *node, *next_node;
SYS_DLIST_FOR_EACH_NODE_SAFE(&domain->mem_domain_q, node, next_node) {
struct k_thread *thread =
CONTAINER_OF(node, struct k_thread, mem_domain_info);
if ((thread->base.user_options & K_USER) == 0) {
continue;
}
apply_mem_partition(z_x86_thread_page_tables_get(thread),
&domain->partitions[partition_id]);
}
}
void z_arch_mem_domain_thread_add(struct k_thread *thread)
{
if ((thread->base.user_options & K_USER) == 0) {
return;
}
z_x86_apply_mem_domain(z_x86_thread_page_tables_get(thread),
thread->mem_domain_info.mem_domain);
}
int z_arch_mem_domain_max_partitions_get(void)
{
return CONFIG_MAX_DOMAIN_PARTITIONS;
}
#endif /* CONFIG_X86_USERSPACE */