x86: generate runtime 64-bit page tables

- Bring in CONFIG_X86_MMU and some related defines to
  common X86 Kconfig
- Don't set ARCH_HAS_USERSPACE for intel64 yet when
  X86_MMU is enabled
- Uplevel x86_mmu.c to common code
- Add logic for handling PML4 table and generating PDPTs
- Move z_x86_paging_init() to common kernel_arch_func.h
- Uplevel inclusion of mmustructs.h to common x86 arch.h,
  both need it for memory domain defines

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
Authored by Andrew Boie on 2019-10-10 13:05:43 -07:00; committed by Andrew Boie
Commit: f6e82ea1bd
10 changed files with 398 additions and 142 deletions
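As context for the PML4 and PDPT handling added below: in long mode the hardware walks four table levels, and each level is indexed by a 9-bit slice of the virtual address. The sketch below is illustrative only and not code from this commit; it shows the index decomposition that the new z_x86_pml4_get_pml4e() and related accessors rely on.

/* Illustrative sketch, not part of the commit: how a 48-bit long-mode
 * virtual address breaks down into PML4/PDPT/PD/PT indices plus a page
 * offset. Each table level holds 512 (2^9) entries of 8 bytes.
 */
#include <stdint.h>

#define LVL_MASK 0x1FFU		/* 9 index bits per level */

static inline unsigned int pml4_index(uint64_t vaddr)  { return (vaddr >> 39) & LVL_MASK; }
static inline unsigned int pdpt_index(uint64_t vaddr)  { return (vaddr >> 30) & LVL_MASK; }
static inline unsigned int pd_index(uint64_t vaddr)    { return (vaddr >> 21) & LVL_MASK; }
static inline unsigned int pt_index(uint64_t vaddr)    { return (vaddr >> 12) & LVL_MASK; }
static inline unsigned int page_offset(uint64_t vaddr) { return vaddr & 0xFFFU; }

For example, virtual address 0x200000 (2 MiB) decomposes to PML4/PDPT/PD/PT indices 0/0/1/0 with offset 0, which matches Z_X86_PT_AREA in the diff: one page table's 512 entries cover exactly 2 MiB.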


@@ -16,23 +16,23 @@ config ARCH
 config CPU_ATOM
 	bool # hidden
 	select CPU_HAS_FPU
-	select ARCH_HAS_STACK_PROTECTION if X86_MMU
-	select ARCH_HAS_USERSPACE if X86_MMU
+	select ARCH_HAS_STACK_PROTECTION if X86_MMU && !X86_LONGMODE
+	select ARCH_HAS_USERSPACE if X86_MMU && !X86_LONGMODE
 	help
 	  This option signifies the use of a CPU from the Atom family.

 config CPU_MINUTEIA
 	bool # hidden
-	select ARCH_HAS_STACK_PROTECTION if X86_MMU
-	select ARCH_HAS_USERSPACE if X86_MMU
+	select ARCH_HAS_STACK_PROTECTION if X86_MMU && !X86_LONGMODE
+	select ARCH_HAS_USERSPACE if X86_MMU && !X86_LONGMODE
 	help
 	  This option signifies the use of a CPU from the Minute IA family.

 config CPU_APOLLO_LAKE
 	bool # hidden
 	select CPU_HAS_FPU
-	select ARCH_HAS_STACK_PROTECTION if X86_MMU
-	select ARCH_HAS_USERSPACE if X86_MMU
+	select ARCH_HAS_STACK_PROTECTION if X86_MMU && !X86_LONGMODE
+	select ARCH_HAS_USERSPACE if X86_MMU && !X86_LONGMODE
 	help
 	  This option signifies the use of a CPU from the Apollo Lake family.
@@ -156,6 +156,71 @@ config X86_VERY_EARLY_CONSOLE
 	  printk to emit messages to the 16550 UART port 0 instance in device
 	  tree. This mini-driver assumes I/O to the UART is done via ports.

+config X86_MMU
+	bool "Enable Memory Management Unit"
+	select MEMORY_PROTECTION
+	help
+	  This option enables the memory management unit present in x86
+	  and creates a set of page tables at boot time that is runtime-
+	  mutable.
+
+config X86_MMU_PAGE_POOL_PAGES
+	int "Number of pages to reserve for building page tables"
+	default 16
+	depends on X86_MMU
+	help
+	  Building page tables at boot requires a pool of free memory pages
+	  to construct it. This can't be derived at build time; tune this
+	  to your SoC's specific memory map.
+
+config X86_NO_MELTDOWN
+	bool
+	help
+	  This hidden option should be set on a per-SOC basis to indicate that
+	  a particular SOC is not vulnerable to the Meltdown CPU vulnerability,
+	  as described in CVE-2017-5754.
+
+config X86_NO_SPECTRE_V1
+	bool
+	help
+	  This hidden option should be set on a per-SOC basis to indicate that
+	  a particular SOC is not vulnerable to the Spectre V1, V1.1, and V1.2
+	  CPU vulnerabilities as described in CVE-2017-5753 and CVE-2018-3693.
+
+config X86_NO_SPECTRE_V2
+	bool
+	help
+	  This hidden option should be set on a per-SOC basis to indicate that
+	  a particular SOC is not vulnerable to the Spectre V2 CPU
+	  vulnerability, as described in CVE-2017-5715.
+
+config X86_NO_SPECTRE_V4
+	bool
+	help
+	  This hidden option should be set on a per-SOC basis to indicate that
+	  a particular SOC is not vulnerable to the Spectre V4 CPU
+	  vulnerability, as described in CVE-2018-3639.
+
+config X86_NO_LAZY_FP
+	bool
+	help
+	  This hidden option should be set on a per-SOC basis to indicate
+	  that a particular SOC is not vulnerable to the Lazy FP CPU
+	  vulnerability, as described in CVE-2018-3665.
+
+config X86_NO_SPECULATIVE_VULNERABILITIES
+	bool
+	select X86_NO_MELTDOWN
+	select X86_NO_SPECTRE_V1
+	select X86_NO_SPECTRE_V2
+	select X86_NO_SPECTRE_V4
+	select X86_NO_LAZY_FP
+	help
+	  This hidden option should be set on a per-SOC basis to indicate that
+	  a particular SOC does not perform any kind of speculative execution,
+	  or is a newer chip which is immune to the class of vulnerabilities
+	  which exploit speculative execution side channel attacks.
+
 source "arch/x86/core/Kconfig.ia32"
 source "arch/x86/core/Kconfig.intel64"


@@ -15,6 +15,7 @@ zephyr_library_sources_if_kconfig(pcie.c)
 zephyr_library_sources_if_kconfig(reboot_rst_cnt.c)
 zephyr_library_sources_if_kconfig(multiboot.c)
 zephyr_library_sources_if_kconfig(acpi.c)
+zephyr_library_sources_if_kconfig(x86_mmu.c)
 zephyr_library_sources_ifdef(CONFIG_X86_VERY_EARLY_CONSOLE early_serial.c)


@@ -102,71 +102,6 @@ config X86_BOUNDS_CHECK_BYPASS_MITIGATION

 menu "Processor Capabilities"

-config X86_MMU
-	bool "Enable Memory Management Unit"
-	select MEMORY_PROTECTION
-	help
-	  This options enables the memory management unit present in x86
-	  and creates a set of page tables at boot time. Requires an MMU
-	  which supports PAE page tables.
-
-config X86_MMU_PAGE_POOL_PAGES
-	int "Number of pages to reserve for building page tables"
-	default 16
-	depends on X86_MMU
-	help
-	  Building page tables at boot requires a pool of free memory pages
-	  to construct it. This can't be derived at build time, tune this
-	  to your SoC's specific memory map.
-
-config X86_NO_MELTDOWN
-	bool
-	help
-	  This hidden option should be set on a per-SOC basis to indicate that
-	  a particular SOC is not vulnerable to the Meltdown CPU vulnerability,
-	  as described in CVE-2017-5754.
-
-config X86_NO_SPECTRE_V1
-	bool
-	help
-	  This hidden option should be set on a per-SOC basis to indicate that
-	  a particular SOC is not vulnerable to the Spectre V1, V1.1, and V1.2
-	  CPU vulnerabilities as described in CVE-2017-5753 and CVE-2018-3693.
-
-config X86_NO_SPECTRE_V2
-	bool
-	help
-	  This hidden option should be set on a per-SOC basis to indicate that
-	  a particular SOC is not vulnerable to the Spectre V2 CPU
-	  vulnerability, as described in CVE-2017-5715.
-
-config X86_NO_SPECTRE_V4
-	bool
-	help
-	  This hidden option should be set on a per-SOC basis to indicate that
-	  a particular SOC is not vulnerable to the Spectre V4 CPU
-	  vulnerability, as described in CVE-2018-3639.
-
-config X86_NO_LAZY_FP
-	bool
-	help
-	  This hidden option should be set on a per-SOC basis to indicate
-	  that a particular SOC is not vulnerable to the Lazy FP CPU
-	  vulnerability, as described in CVE-2018-3665.
-
-config X86_NO_SPECULATIVE_VULNERABILITIES
-	bool
-	select X86_NO_MELTDOWN
-	select X86_NO_SPECTRE_V1
-	select X86_NO_SPECTRE_V2
-	select X86_NO_SPECTRE_V4
-	select X86_NO_LAZY_FP
-	help
-	  This hidden option should be set on a per-SOC basis to indicate that
-	  a particular SOC does not perform any kind of speculative execution,
-	  or is a newer chip which is immune to the class of vulnerabilities
-	  which exploit speculative execution side channel attacks.
-
 config X86_ENABLE_TSS
 	bool
 	help


@@ -21,7 +21,6 @@ zephyr_library_sources(
 )

 zephyr_library_sources_ifdef(CONFIG_IRQ_OFFLOAD ia32/irq_offload.c)
-zephyr_library_sources_ifdef(CONFIG_X86_MMU ia32/x86_mmu.c)
 zephyr_library_sources_ifdef(CONFIG_X86_USERSPACE ia32/userspace.S)
 zephyr_library_sources_ifdef(CONFIG_LAZY_FP_SHARING ia32/float.c)


@@ -26,37 +26,67 @@ BUILD_ASSERT(DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024ULL) - 1ULL <=
 /* Mark text and rodata as read-only.
  * Userspace may read all text and rodata.
  */
-MMU_BOOT_REGION((u32_t)&_image_text_start, (u32_t)&_image_text_size,
+MMU_BOOT_REGION(&_image_text_start, &_image_text_size,
 		Z_X86_MMU_US);

-MMU_BOOT_REGION((u32_t)&_image_rodata_start, (u32_t)&_image_rodata_size,
+MMU_BOOT_REGION(&_image_rodata_start, &_image_rodata_size,
 		Z_X86_MMU_US | Z_X86_MMU_XD);

 #ifdef CONFIG_USERSPACE
-MMU_BOOT_REGION((u32_t)&_app_smem_start, (u32_t)&_app_smem_size,
+MMU_BOOT_REGION(&_app_smem_start, &_app_smem_size,
 		Z_X86_MMU_RW | Z_X86_MMU_XD);
 #endif

 #ifdef CONFIG_COVERAGE_GCOV
-MMU_BOOT_REGION((u32_t)&__gcov_bss_start, (u32_t)&__gcov_bss_size,
+MMU_BOOT_REGION(&__gcov_bss_start, &__gcov_bss_size,
 		Z_X86_MMU_RW | Z_X86_MMU_US | Z_X86_MMU_XD);
 #endif

+#ifdef CONFIG_X86_LONGMODE
+extern char _locore_start[];
+extern char _locore_size[];
+extern char _lorodata_start[];
+extern char _lorodata_size[];
+extern char _lodata_start[];
+extern char _lodata_size[];
+
+/* Early boot regions that need to be in low memory to be comprehensible
+ * by the CPU in 16-bit mode
+ */
+MMU_BOOT_REGION(&_locore_start, &_locore_size, 0);
+MMU_BOOT_REGION(&_lorodata_start, &_lorodata_size, Z_X86_MMU_XD);
+MMU_BOOT_REGION(&_lodata_start, &_lodata_size, Z_X86_MMU_RW | Z_X86_MMU_XD);
+#endif
+
 /* __kernel_ram_size includes all unused memory, which is used for heaps.
  * User threads cannot access this unless granted at runtime. This is done
  * automatically for stacks.
  */
-MMU_BOOT_REGION((u32_t)&__kernel_ram_start, (u32_t)&__kernel_ram_size,
+MMU_BOOT_REGION(&__kernel_ram_start, &__kernel_ram_size,
 		Z_X86_MMU_RW | Z_X86_MMU_XD);

 /*
  * Inline functions for setting memory addresses in page table structures
  */

+#ifdef CONFIG_X86_LONGMODE
+static inline void pml4e_update_pdpt(u64_t *pml4e, struct x86_mmu_pdpt *pdpt)
+{
+	uintptr_t pdpt_addr = (uintptr_t)pdpt;
+
+	*pml4e = ((*pml4e & ~Z_X86_MMU_PML4E_PDPT_MASK) |
+		  (pdpt_addr & Z_X86_MMU_PML4E_PDPT_MASK));
+}
+#endif /* CONFIG_X86_LONGMODE */
+
 static inline void pdpte_update_pd(u64_t *pdpte, struct x86_mmu_pd *pd)
 {
 	uintptr_t pd_addr = (uintptr_t)pd;

+#ifdef CONFIG_X86_LONGMODE
+	__ASSERT((*pdpte & Z_X86_MMU_PS) == 0, "PDPT is for 1GB page");
+#endif
 	*pdpte = ((*pdpte & ~Z_X86_MMU_PDPTE_PD_MASK) |
 		  (pd_addr & Z_X86_MMU_PDPTE_PD_MASK));
 }
@@ -121,15 +151,12 @@ static char get_entry_code(u64_t value)
 	return ret;
 }

-static void z_x86_dump_pt(struct x86_mmu_pt *pt, uintptr_t base, int index)
+static void print_entries(u64_t entries_array[], size_t count)
 {
 	int column = 0;

-	printk("Page table %d for 0x%08lX - 0x%08lX at %p\n",
-	       index, base, base + Z_X86_PT_AREA - 1, pt);
-	for (int i = 0; i < Z_X86_NUM_PT_ENTRIES; i++) {
-		printk("%c", get_entry_code(pt->entry[i]));
+	for (int i = 0; i < count; i++) {
+		printk("%c", get_entry_code(entries_array[i]));
 		column++;
 		if (column == 64) {
@@ -137,24 +164,26 @@ static void z_x86_dump_pt(struct x86_mmu_pt *pt, uintptr_t base, int index)
 			printk("\n");
 		}
 	}
+
+	if (column != 0) {
+		printk("\n");
+	}
+}
+
+static void z_x86_dump_pt(struct x86_mmu_pt *pt, uintptr_t base, int index)
+{
+	printk("Page table %d for 0x%016lX - 0x%016lX at %p\n",
+	       index, base, base + Z_X86_PT_AREA - 1, pt);
+	print_entries(pt->entry, Z_X86_NUM_PT_ENTRIES);
 }

 static void z_x86_dump_pd(struct x86_mmu_pd *pd, uintptr_t base, int index)
 {
-	int column = 0;
-
-	printk("Page directory %d for 0x%08lX - 0x%08lX at %p\n",
+	printk("Page directory %d for 0x%016lX - 0x%016lX at %p\n",
 	       index, base, base + Z_X86_PD_AREA - 1, pd);
-	for (int i = 0; i < Z_X86_NUM_PD_ENTRIES; i++) {
-		printk("%c", get_entry_code(pd->entry[i]));
-		column++;
-		if (column == 64) {
-			column = 0;
-			printk("\n");
-		}
-	}
+	print_entries(pd->entry, Z_X86_NUM_PD_ENTRIES);

 	for (int i = 0; i < Z_X86_NUM_PD_ENTRIES; i++) {
 		struct x86_mmu_pt *pt;
@@ -174,13 +203,11 @@ static void z_x86_dump_pd(struct x86_mmu_pd *pd, uintptr_t base, int index)

 static void z_x86_dump_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t base,
 			    int index)
 {
-	printk("Page directory pointer table %d for 0x%08lX - 0x%08lX at %p\n",
+	printk("Page directory pointer table %d for 0x%016lX - 0x%016lX at %p\n",
 	       index, base, base + Z_X86_PDPT_AREA - 1, pdpt);
-	for (int i = 0; i < Z_X86_NUM_PDPT_ENTRIES; i++) {
-		printk("%c", get_entry_code(pdpt->entry[i]));
-	}
-	printk("\n");
+	print_entries(pdpt->entry, Z_X86_NUM_PDPT_ENTRIES);
+
 	for (int i = 0; i < Z_X86_NUM_PDPT_ENTRIES; i++) {
 		struct x86_mmu_pd *pd;
 		u64_t pdpte = pdpt->entry[i];
@@ -188,15 +215,47 @@ static void z_x86_dump_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t base,
 		if ((pdpte & Z_X86_MMU_P) == 0) {
 			continue;
 		}
+
+#ifdef CONFIG_X86_LONGMODE
+		if ((pdpte & Z_X86_MMU_PS) != 0) {
+			continue;
+		}
+#endif
 		pd = z_x86_pdpte_get_pd(pdpte);
 		z_x86_dump_pd(pd, base + (i * Z_X86_PD_AREA), i);
 	}
 }

+#ifdef CONFIG_X86_LONGMODE
+static void z_x86_dump_pml4(struct x86_mmu_pml4 *pml4)
+{
+	printk("Page mapping level 4 at %p for all memory addresses\n", pml4);
+	print_entries(pml4->entry, Z_X86_NUM_PML4_ENTRIES);
+
+	for (int i = 0; i < Z_X86_NUM_PML4_ENTRIES; i++) {
+		struct x86_mmu_pdpt *pdpt;
+		u64_t pml4e = pml4->entry[i];
+
+		if ((pml4e & Z_X86_MMU_P) == 0) {
+			continue;
+		}
+
+		pdpt = z_x86_pml4e_get_pdpt(pml4e);
+		z_x86_dump_pdpt(pdpt, i * Z_X86_PDPT_AREA, i);
+	}
+}
+
+void z_x86_dump_page_tables(struct x86_page_tables *ptables)
+{
+	z_x86_dump_pml4(z_x86_get_pml4(ptables));
+}
+#else
 void z_x86_dump_page_tables(struct x86_page_tables *ptables)
 {
 	z_x86_dump_pdpt(z_x86_get_pdpt(ptables, 0), 0, 0);
 }
+#endif

 void z_x86_mmu_get_flags(struct x86_page_tables *ptables, void *addr,
 			 u64_t *pde_flags, u64_t *pte_flags)
@@ -313,7 +372,6 @@ static int x86_mmu_validate_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t addr,
 	while (remaining) {
 		u64_t pdpte = *z_x86_pdpt_get_pdpte(pdpt, pos);
-		struct x86_mmu_pd *pd;

 		if ((pdpte & Z_X86_MMU_P) == 0) {
 			/* Non-present */
@@ -321,13 +379,30 @@ static int x86_mmu_validate_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t addr,
 			break;
 		}

-		pd = z_x86_pdpte_get_pd(pdpte);
+#ifdef CONFIG_X86_LONGMODE
+		if ((pdpte & Z_X86_MMU_US) == 0 ||
+		    (write && (pdpte & Z_X86_MMU_RW) == 0)) {
+			ret = -1;
+			break;
+		}
+#endif
 		to_examine = get_table_max(pos, remaining, Z_X86_PD_AREA);

-		ret = x86_mmu_validate_pd(pd, pos, to_examine, write);
-		if (ret != 0) {
-			break;
-		}
+#ifdef CONFIG_X86_LONGMODE
+		/* Check if 1GB page, if not, examine linked page directory */
+		if ((pdpte & Z_X86_MMU_PS) == 0) {
+#endif
+			struct x86_mmu_pd *pd = z_x86_pdpte_get_pd(pdpte);
+
+			ret = x86_mmu_validate_pd(pd, pos, to_examine, write);
+			if (ret != 0) {
+				break;
+			}
+#ifdef CONFIG_X86_LONGMODE
+		} else {
+			ret = 0;
+		}
+#endif
 		remaining -= to_examine;
 		pos += to_examine;
 	}
@@ -335,14 +410,55 @@ static int x86_mmu_validate_pdpt(struct x86_mmu_pdpt *pdpt, uintptr_t addr,
 	return ret;
 }

+#ifdef CONFIG_X86_LONGMODE
+static int x86_mmu_validate_pml4(struct x86_mmu_pml4 *pml4, uintptr_t addr,
+				 size_t size, bool write)
+{
+	uintptr_t pos = addr;
+	size_t remaining = size;
+	int ret = 0;
+	size_t to_examine;
+
+	while (remaining) {
+		u64_t pml4e = *z_x86_pml4_get_pml4e(pml4, pos);
+		struct x86_mmu_pdpt *pdpt;
+
+		if ((pml4e & Z_X86_MMU_P) == 0 || (pml4e & Z_X86_MMU_US) == 0 ||
+		    (write && (pml4e & Z_X86_MMU_RW) == 0)) {
+			ret = -1;
+			break;
+		}
+
+		to_examine = get_table_max(pos, remaining, Z_X86_PDPT_AREA);
+		pdpt = z_x86_pml4e_get_pdpt(pml4e);
+
+		ret = x86_mmu_validate_pdpt(pdpt, pos, to_examine, write);
+		if (ret != 0) {
+			break;
+		}
+
+		remaining -= to_examine;
+		pos += to_examine;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_X86_LONGMODE */
+
 int z_x86_mmu_validate(struct x86_page_tables *ptables, void *addr, size_t size,
 		       bool write)
 {
 	int ret;
+	/* 32-bit just has one PDPT that covers the entire address space */
+#ifdef CONFIG_X86_LONGMODE
+	struct x86_mmu_pml4 *pml4 = z_x86_get_pml4(ptables);
+
+	ret = x86_mmu_validate_pml4(pml4, (uintptr_t)addr, size, write);
+#else
 	struct x86_mmu_pdpt *pdpt = z_x86_get_pdpt(ptables, (uintptr_t)addr);

 	ret = x86_mmu_validate_pdpt(pdpt, (uintptr_t)addr, size, write);
+#endif

 #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
 	__asm__ volatile ("lfence" : : : "memory");
@@ -361,10 +477,18 @@ static inline void tlb_flush_page(void *addr)
 	__asm__ ("invlpg %0" :: "m" (*page));
 }

+#ifdef CONFIG_X86_LONGMODE
+#define PML4E_FLAGS_MASK	(Z_X86_MMU_RW | Z_X86_MMU_US | Z_X86_MMU_P)
+
+#define PDPTE_FLAGS_MASK	PML4E_FLAGS_MASK
+
+#define PDE_FLAGS_MASK		PDPTE_FLAGS_MASK
+#else
 #define PDPTE_FLAGS_MASK	Z_X86_MMU_P

 #define PDE_FLAGS_MASK		(Z_X86_MMU_RW | Z_X86_MMU_US | \
 				 PDPTE_FLAGS_MASK)
+#endif

 #define PTE_FLAGS_MASK		(PDE_FLAGS_MASK | Z_X86_MMU_XD | \
 				 Z_X86_MMU_PWT | \
@@ -373,7 +497,7 @@ static inline void tlb_flush_page(void *addr)
 void z_x86_mmu_set_flags(struct x86_page_tables *ptables, void *ptr,
 			 size_t size, u64_t flags, u64_t mask, bool flush)
 {
-	u32_t addr = (u32_t)ptr;
+	uintptr_t addr = (uintptr_t)ptr;

 	__ASSERT((addr & MMU_PAGE_MASK) == 0U, "unaligned address provided");
 	__ASSERT((size & MMU_PAGE_MASK) == 0U, "unaligned size provided");
@@ -386,18 +510,43 @@ void z_x86_mmu_set_flags(struct x86_page_tables *ptables, void *ptr,
 		mask |= Z_X86_MMU_PTE_ADDR_MASK;
 	}

+	/* NOTE: All of this code assumes that 2MB or 1GB pages are not being
+	 * modified.
+	 */
 	while (size != 0) {
 		u64_t *pte;
 		u64_t *pde;
 		u64_t *pdpte;
+#ifdef CONFIG_X86_LONGMODE
+		u64_t *pml4e;
+#endif
 		u64_t cur_flags = flags;
+		bool exec = (flags & Z_X86_MMU_XD) == 0;
+
+#ifdef CONFIG_X86_LONGMODE
+		pml4e = z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr);
+		__ASSERT((*pml4e & Z_X86_MMU_P) != 0,
+			 "set flags on non-present PML4e");
+		*pml4e |= (flags & PML4E_FLAGS_MASK);
+
+		if (exec) {
+			*pml4e &= ~Z_X86_MMU_XD;
+		}
+
+		pdpte = z_x86_pdpt_get_pdpte(z_x86_pml4e_get_pdpt(*pml4e),
+					     addr);
+#else
 		pdpte = z_x86_pdpt_get_pdpte(z_x86_get_pdpt(ptables, addr),
 					     addr);
+#endif
 		__ASSERT((*pdpte & Z_X86_MMU_P) != 0,
 			 "set flags on non-present PDPTE");
 		*pdpte |= (flags & PDPTE_FLAGS_MASK);
+#ifdef CONFIG_X86_LONGMODE
+		if (exec) {
+			*pdpte &= ~Z_X86_MMU_XD;
+		}
+#endif
 		pde = z_x86_pd_get_pde(z_x86_pdpte_get_pd(*pdpte), addr);
 		__ASSERT((*pde & Z_X86_MMU_P) != 0,
 			 "set flags on non-present PDE");
@@ -406,7 +555,7 @@ void z_x86_mmu_set_flags(struct x86_page_tables *ptables, void *ptr,
 		/* If any flags enable execution, clear execute disable at the
 		 * page directory level
 		 */
-		if ((flags & Z_X86_MMU_XD) == 0) {
+		if (exec) {
 			*pde &= ~Z_X86_MMU_XD;
 		}
@@ -444,9 +593,15 @@ static void *get_page(void)
 	return page_pos;
 }

-__aligned(0x20) struct x86_page_tables z_x86_kernel_ptables;
+#ifdef CONFIG_X86_LONGMODE
+#define PTABLES_ALIGN	4096
+#else
+#define PTABLES_ALIGN	32
+#endif
+
+__aligned(PTABLES_ALIGN) struct x86_page_tables z_x86_kernel_ptables;
 #ifdef CONFIG_X86_KPTI
-__aligned(0x20) struct x86_page_tables z_x86_user_ptables;
+__aligned(PTABLES_ALIGN) struct x86_page_tables z_x86_user_ptables;
 #endif

 extern char z_shared_kernel_page_start[];
@@ -457,17 +612,39 @@ static inline bool is_within_system_ram(uintptr_t addr)
 	       (addr < (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U)));
 }

-#define PDE_IGNORED BIT64(11)
+/* Ignored bit position at all levels */
+#define IGNORED BIT64(11)
+
+static void maybe_clear_xd(u64_t *entry, bool exec)
+{
+	/* Execute disable bit needs special handling, we should only set it at
+	 * intermediate levels if ALL containing pages have XD set (instead of
+	 * just one).
+	 *
+	 * Use an ignored bit position in the PDE to store a marker on whether
+	 * any configured region allows execution.
+	 */
+	if (exec) {
+		*entry |= IGNORED;
+		*entry &= ~Z_X86_MMU_XD;
+	} else if ((*entry & IGNORED) == 0) {
+		*entry |= Z_X86_MMU_XD;
+	}
+}

 static void add_mmu_region_page(struct x86_page_tables *ptables,
 				uintptr_t addr, u64_t flags, bool user_table)
 {
+#ifdef CONFIG_X86_LONGMODE
+	u64_t *pml4e;
+#endif
 	struct x86_mmu_pdpt *pdpt;
 	u64_t *pdpte;
 	struct x86_mmu_pd *pd;
 	u64_t *pde;
 	struct x86_mmu_pt *pt;
 	u64_t *pte;
+	bool exec = (flags & Z_X86_MMU_XD) == 0;

 #ifdef CONFIG_X86_KPTI
 	/* If we are generating a page table for user mode, and this address
@@ -482,7 +659,19 @@ static void add_mmu_region_page(struct x86_page_tables *ptables,
 	}
 #endif

+#ifdef CONFIG_X86_LONGMODE
+	pml4e = z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr);
+	if ((*pml4e & Z_X86_MMU_P) == 0) {
+		pdpt = get_page();
+		pml4e_update_pdpt(pml4e, pdpt);
+	} else {
+		pdpt = z_x86_pml4e_get_pdpt(*pml4e);
+	}
+	*pml4e |= (flags & PML4E_FLAGS_MASK);
+	maybe_clear_xd(pml4e, exec);
+#else
 	pdpt = z_x86_get_pdpt(ptables, addr);
+#endif

 	/* Setup the PDPTE entry for the address, creating a page directory
 	 * if one didn't exist
@@ -495,6 +684,9 @@ static void add_mmu_region_page(struct x86_page_tables *ptables,
 		pd = z_x86_pdpte_get_pd(*pdpte);
 	}
 	*pdpte |= (flags & PDPTE_FLAGS_MASK);
+#ifdef CONFIG_X86_LONGMODE
+	maybe_clear_xd(pdpte, exec);
+#endif

 	/* Setup the PDE entry for the address, creating a page table
 	 * if necessary
@@ -507,20 +699,7 @@ static void add_mmu_region_page(struct x86_page_tables *ptables,
 		pt = z_x86_pde_get_pt(*pde);
 	}
 	*pde |= (flags & PDE_FLAGS_MASK);
-
-	/* Execute disable bit needs special handling, we should only set it at
-	 * the page directory level if ALL pages have XD set (instead of just
-	 * one).
-	 *
-	 * Use an ignored bit position in the PDE to store a marker on whether
-	 * any configured region allows execution.
-	 */
-	if ((flags & Z_X86_MMU_XD) == 0) {
-		*pde |= PDE_IGNORED;
-		*pde &= ~Z_X86_MMU_XD;
-	} else if ((*pde & PDE_IGNORED) == 0) {
-		*pde |= Z_X86_MMU_XD;
-	}
+	maybe_clear_xd(pde, exec);

 #ifdef CONFIG_X86_KPTI
 	if (user_table && (flags & Z_X86_MMU_US) == 0 &&
@@ -552,7 +731,6 @@ static void add_mmu_region(struct x86_page_tables *ptables,
 		 "unaligned address provided");
 	__ASSERT((rgn->size & MMU_PAGE_MASK) == 0U,
 		 "unaligned size provided");
-
 	addr = rgn->address;
 	flags = rgn->flags | Z_X86_MMU_P;
@@ -587,8 +765,15 @@ void z_x86_paging_init(void)
 			CONFIG_X86_MMU_PAGE_POOL_PAGES - pages_free);
 	}

+#ifdef CONFIG_X86_LONGMODE
+	/* MMU already enabled at boot for long mode, we just need to
+	 * program CR3 with our newly generated page tables.
+	 */
+	__asm__ volatile("movq %0, %%cr3\n\t"
+			 : : "r" (&z_x86_kernel_ptables) : "memory");
+#else
 	z_x86_enable_paging();
+#endif
 }

 #ifdef CONFIG_X86_USERSPACE
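One detail of the x86_mmu.c changes above that is easy to miss: execute-disable (XD) must not be set on a PML4E, PDPTE, or PDE unless everything mapped beneath that entry is non-executable, so the code records in a software-ignored bit whether any executable page has been seen under the entry. The sketch below restates that rule with simplified stand-in names; it mirrors the maybe_clear_xd() helper in the diff but is not the kernel code verbatim.

/* Sketch of the XD-propagation rule factored into maybe_clear_xd() above.
 * XD_BIT and SEEN_EXEC are stand-ins for Z_X86_MMU_XD and the IGNORED bit.
 */
#include <stdbool.h>
#include <stdint.h>

#define XD_BIT		(1ULL << 63)	/* execute disable */
#define SEEN_EXEC	(1ULL << 11)	/* ignored bit reused as a marker */

static void propagate_xd(uint64_t *entry, bool exec)
{
	if (exec) {
		/* An executable page lives under this entry: remember that
		 * and make sure execution is permitted at this level.
		 */
		*entry |= SEEN_EXEC;
		*entry &= ~XD_BIT;
	} else if ((*entry & SEEN_EXEC) == 0) {
		/* Nothing executable seen under this entry yet; keep XD. */
		*entry |= XD_BIT;
	}
}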


@@ -23,9 +23,6 @@ extern "C" {
 #define STACK_ROUND_UP(x) ROUND_UP(x, STACK_ALIGN_SIZE)
 #define STACK_ROUND_DOWN(x) ROUND_DOWN(x, STACK_ALIGN_SIZE)

-/* Create all page tables with boot configuration and enable paging */
-void z_x86_paging_init(void);
-
 static inline void z_arch_kernel_init(void)
 {
 	/* No-op on this arch */


@@ -36,6 +36,11 @@ extern FUNC_NORETURN void z_x86_prep_c(int dummy, struct multiboot_info *info);
 void z_x86_early_serial_init(void);
 #endif /* CONFIG_X86_VERY_EARLY_CONSOLE */

+#ifdef CONFIG_X86_MMU
+/* Create all page tables with boot configuration and enable paging */
+void z_x86_paging_init(void);
+#endif /* CONFIG_X86_MMU */
+
 #endif

 #endif /* ZEPHYR_ARCH_X86_INCLUDE_KERNEL_ARCH_FUNC_H_ */


@@ -15,6 +15,7 @@
 #include <stddef.h>
 #include <stdbool.h>
 #include <irq.h>
+#include <arch/x86/mmustructs.h>

 static ALWAYS_INLINE void z_arch_irq_unlock(unsigned int key)
 {


@@ -17,7 +17,6 @@
 #include "sys_io.h"
 #include <drivers/interrupt_controller/sysapic.h>
 #include <kernel_arch_thread.h>
-#include <arch/x86/mmustructs.h>
 #include <stdbool.h>
 #include <arch/common/ffs.h>
 #include <misc/util.h>


@@ -10,7 +10,7 @@
 #include <sys/util.h>

-#define MMU_PAGE_SIZE 4096U
+#define MMU_PAGE_SIZE 4096UL
 #define MMU_PAGE_MASK 0xfffU
 #define MMU_PAGE_SHIFT 12U
 #define PAGES(x) ((x) << (MMU_PAGE_SHIFT))
@@ -38,17 +38,32 @@
 #define Z_X86_MMU_G		BIT64(8)	/** Global */
 #define Z_X86_MMU_XD		BIT64(63)	/** Execute Disable */

+#ifdef CONFIG_X86_LONGMODE
+#define Z_X86_MMU_PROT_KEY_MASK	0x7800000000000000ULL
+#endif
+
 /*
  * Structure-specific flags / masks
  */

+#define Z_X86_MMU_PDPTE_PAT	BIT64(12)
 #define Z_X86_MMU_PDE_PAT	BIT64(12)
 #define Z_X86_MMU_PTE_PAT	BIT64(7)	/** Page Attribute Table */

-#define Z_X86_MMU_PDPTE_PD_MASK		0x00000000FFFFF000ULL
-#define Z_X86_MMU_PDE_PT_MASK		0x00000000FFFFF000ULL
-#define Z_X86_MMU_PDE_2MB_MASK		0x00000000FFC00000ULL
-#define Z_X86_MMU_PTE_ADDR_MASK		0x00000000FFFFF000ULL
+/* The true size of the mask depends on MAXADDR, which is found at run-time.
+ * As a simplification, roll the area for the memory address, and the
+ * reserved or ignored regions immediately above it, into a single area.
+ * This will work as expected if valid memory addresses are written.
+ */
+#ifdef CONFIG_X86_LONGMODE
+#define Z_X86_MMU_PML4E_PDPT_MASK	0x7FFFFFFFFFFFF000ULL
+#endif
+#define Z_X86_MMU_PDPTE_PD_MASK		0x7FFFFFFFFFFFF000ULL
+#ifdef CONFIG_X86_LONGMODE
+#define Z_X86_MMU_PDPTE_1G_MASK		0x07FFFFFFC0000000ULL
+#endif
+#define Z_X86_MMU_PDE_PT_MASK		0x7FFFFFFFFFFFF000ULL
+#define Z_X86_MMU_PDE_2MB_MASK		0x07FFFFFFFFC00000ULL
+#define Z_X86_MMU_PTE_ADDR_MASK		0x07FFFFFFFFFFF000ULL

 /*
  * These flags indicate intention when setting access properties.
@@ -148,9 +163,14 @@ struct mmu_region {
 #define MMU_BOOT_REGION(addr, region_size, permission_flags) \
 	Z_MMU_BOOT_REGION(__COUNTER__, addr, region_size, permission_flags)

-#define Z_X86_NUM_PDPT_ENTRIES 4
-#define Z_X86_NUM_PD_ENTRIES 512
-#define Z_X86_NUM_PT_ENTRIES 512
+#ifdef CONFIG_X86_LONGMODE
+#define Z_X86_NUM_PML4_ENTRIES 512U
+#define Z_X86_NUM_PDPT_ENTRIES 512U
+#else
+#define Z_X86_NUM_PDPT_ENTRIES 4U
+#endif
+#define Z_X86_NUM_PD_ENTRIES 512U
+#define Z_X86_NUM_PT_ENTRIES 512U

 /* Memory range covered by an instance of various table types */
 #define Z_X86_PT_AREA (MMU_PAGE_SIZE * Z_X86_NUM_PT_ENTRIES)
@@ -159,6 +179,12 @@ struct mmu_region {

 typedef u64_t k_mem_partition_attr_t;

+#ifdef CONFIG_X86_LONGMODE
+struct x86_mmu_pml4 {
+	u64_t entry[Z_X86_NUM_PML4_ENTRIES];
+};
+#endif
+
 struct x86_mmu_pdpt {
 	u64_t entry[Z_X86_NUM_PDPT_ENTRIES];
 };
@@ -172,12 +198,32 @@ struct x86_mmu_pt {
 };

 struct x86_page_tables {
+#ifdef CONFIG_X86_LONGMODE
+	struct x86_mmu_pml4 pml4;
+#else
 	struct x86_mmu_pdpt pdpt;
+#endif
 };

 /*
  * Inline functions for getting the next linked structure
  */
+#ifdef CONFIG_X86_LONGMODE
+static inline u64_t *z_x86_pml4_get_pml4e(struct x86_mmu_pml4 *pml4,
+					  uintptr_t addr)
+{
+	int index = (addr >> 39U) & (Z_X86_NUM_PML4_ENTRIES - 1);
+
+	return &pml4->entry[index];
+}
+
+static inline struct x86_mmu_pdpt *z_x86_pml4e_get_pdpt(u64_t pml4e)
+{
+	uintptr_t addr = pml4e & Z_X86_MMU_PML4E_PDPT_MASK;
+
+	return (struct x86_mmu_pdpt *)addr;
+}
+#endif

 static inline u64_t *z_x86_pdpt_get_pdpte(struct x86_mmu_pdpt *pdpt,
 					   uintptr_t addr)
@@ -191,6 +237,9 @@ static inline struct x86_mmu_pd *z_x86_pdpte_get_pd(u64_t pdpte)
 {
 	uintptr_t addr = pdpte & Z_X86_MMU_PDPTE_PD_MASK;

+#ifdef CONFIG_X86_LONGMODE
+	__ASSERT((pdpte & Z_X86_MMU_PS) == 0, "PDPT is for 1GB page");
+#endif
 	return (struct x86_mmu_pd *)addr;
 }
@@ -221,6 +270,25 @@ static inline u64_t *z_x86_pt_get_pte(struct x86_mmu_pt *pt, uintptr_t addr)
  * Inline functions for obtaining page table structures from the top-level
  */

+#ifdef CONFIG_X86_LONGMODE
+static inline struct x86_mmu_pml4 *
+z_x86_get_pml4(struct x86_page_tables *ptables)
+{
+	return &ptables->pml4;
+}
+
+static inline u64_t *z_x86_get_pml4e(struct x86_page_tables *ptables,
+				     uintptr_t addr)
+{
+	return z_x86_pml4_get_pml4e(z_x86_get_pml4(ptables), addr);
+}
+
+static inline struct x86_mmu_pdpt *
+z_x86_get_pdpt(struct x86_page_tables *ptables, uintptr_t addr)
+{
+	return z_x86_pml4e_get_pdpt(*z_x86_get_pml4e(ptables, addr));
+}
+#else
 static inline struct x86_mmu_pdpt *
 z_x86_get_pdpt(struct x86_page_tables *ptables, uintptr_t addr)
 {
@@ -228,6 +296,7 @@ z_x86_get_pdpt(struct x86_page_tables *ptables, uintptr_t addr)
 	return &ptables->pdpt;
 }
+#endif /* CONFIG_X86_LONGMODE */

 static inline u64_t *z_x86_get_pdpte(struct x86_page_tables *ptables,
 				     uintptr_t addr)