From d2a72273b787210d44bdbc5d4be481d2a9a22486 Mon Sep 17 00:00:00 2001
From: Andrew Boie
Date: Tue, 27 Oct 2020 11:27:37 -0700
Subject: [PATCH] x86: add support for common page tables

We provide an option for low-memory systems to use a single set of
page tables for all threads. This is only supported if KPTI and SMP
are disabled.

This configuration saves a considerable amount of RAM, especially if
multiple memory domains are used, at a cost of context switching
overhead. Some caching techniques are used to reduce the number of
context switch updates: the page tables aren't updated when switching
to a supervisor thread, and the page table configuration of the last
user thread switched in is cached.

Signed-off-by: Andrew Boie
---
 arch/Kconfig                            |   2 +-
 arch/x86/Kconfig                        |  12 ++
 arch/x86/core/offsets/ia32_offsets.c    |   2 +
 arch/x86/core/offsets/intel64_offsets.c |   2 +
 arch/x86/core/userspace.c               |  12 +-
 arch/x86/core/x86_mmu.c                 | 178 +++++++++++++++++++++++-
 arch/x86/include/x86_mmu.h              |   6 +-
 include/arch/x86/ia32/thread.h          |   2 +
 include/arch/x86/intel64/thread.h       |   2 +
 tests/arch/x86/pagetables/src/main.c    |   6 +-
 10 files changed, 213 insertions(+), 11 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 2d6ec48f2a0..f2c3d57c521 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -44,7 +44,7 @@ config X86
 	select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !X86_64
 	select ARCH_SUPPORTS_COREDUMP
 	select CPU_HAS_MMU
-	select ARCH_MEM_DOMAIN_DATA if USERSPACE
+	select ARCH_MEM_DOMAIN_DATA if USERSPACE && !X86_COMMON_PAGE_TABLE
 	select ARCH_MEM_DOMAIN_SYNCHRONOUS_API if USERSPACE
 	select ARCH_HAS_GDBSTUB if !X86_64
 	select ARCH_HAS_TIMING_FUNCTIONS
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 42f291349d5..294a99ec4de 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -213,6 +213,18 @@ config X86_MMU_PAGE_POOL_PAGES
 	  Unused pages in this pool cannot be used for other purposes.
 
+config X86_COMMON_PAGE_TABLE
+	bool "Use a single page table for all threads"
+	default n
+	depends on USERSPACE
+	depends on !SMP
+	depends on !X86_KPTI
+	help
+	  If this option is enabled, userspace memory domains will not have their
+	  own page tables. Instead, context switching operations will modify
+	  page tables in place. This is much slower, but uses much less RAM
+	  for page tables.
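+
+	  (The savings scale with the number of memory domains in use:
+	  without this option, every memory domain maintains its own
+	  complete copy of the page tables.)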
+
 config X86_NO_MELTDOWN
 	bool
 	help
diff --git a/arch/x86/core/offsets/ia32_offsets.c b/arch/x86/core/offsets/ia32_offsets.c
index 3cd68729748..3ac11219b7b 100644
--- a/arch/x86/core/offsets/ia32_offsets.c
+++ b/arch/x86/core/offsets/ia32_offsets.c
@@ -35,8 +35,10 @@ GEN_OFFSET_SYM(_thread_arch_t, excNestCount);
 
 #ifdef CONFIG_USERSPACE
 GEN_OFFSET_SYM(_thread_arch_t, psp);
+#ifndef CONFIG_X86_COMMON_PAGE_TABLE
 GEN_OFFSET_SYM(_thread_arch_t, ptables);
 #endif
+#endif
 
 GEN_OFFSET_SYM(_thread_arch_t, preempFloatReg);
diff --git a/arch/x86/core/offsets/intel64_offsets.c b/arch/x86/core/offsets/intel64_offsets.c
index b43747869e7..4a6e52c4eba 100644
--- a/arch/x86/core/offsets/intel64_offsets.c
+++ b/arch/x86/core/offsets/intel64_offsets.c
@@ -30,7 +30,9 @@ GEN_OFFSET_SYM(_thread_arch_t, sse);
 GEN_OFFSET_SYM(_thread_arch_t, ss);
 GEN_OFFSET_SYM(_thread_arch_t, cs);
 GEN_OFFSET_SYM(_thread_arch_t, psp);
+#ifndef CONFIG_X86_COMMON_PAGE_TABLE
 GEN_OFFSET_SYM(_thread_arch_t, ptables);
+#endif
 #endif /* CONFIG_USERSPACE */
 
 GEN_OFFSET_SYM(x86_tss64_t, ist1);
diff --git a/arch/x86/core/userspace.c b/arch/x86/core/userspace.c
index 5ea2f4cbead..98027ff50fa 100644
--- a/arch/x86/core/userspace.c
+++ b/arch/x86/core/userspace.c
@@ -29,24 +29,28 @@
  */
 void z_x86_swap_update_page_tables(struct k_thread *incoming)
 {
-	uintptr_t ptables_phys;
-
 #ifndef CONFIG_X86_64
 	/* Set initial stack pointer when elevating privileges from Ring 3
 	 * to Ring 0.
 	 */
 	_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
 #endif
+
+#ifdef CONFIG_X86_COMMON_PAGE_TABLE
+	z_x86_swap_update_common_page_table(incoming);
+#else
 	/* Check first that we actually need to do this, since setting
 	 * CR3 involves an expensive full TLB flush.
 	 */
-	ptables_phys = incoming->arch.ptables;
+	uintptr_t ptables_phys = incoming->arch.ptables;
 
+	__ASSERT(ptables_phys != 0, "NULL page tables for thread %p\n",
+		 incoming);
 	if (ptables_phys != z_x86_cr3_get()) {
 		z_x86_cr3_set(ptables_phys);
 	}
+#endif /* CONFIG_X86_COMMON_PAGE_TABLE */
 }
 #endif /* CONFIG_X86_KPTI */
@@ -63,11 +67,13 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
 	thread->arch.psp =
 		header->privilege_stack + sizeof(header->privilege_stack);
 
+#ifndef CONFIG_X86_COMMON_PAGE_TABLE
 	/* Important this gets cleared, so that arch_mem_domain_* APIs
 	 * can distinguish between new threads, and threads migrating
 	 * between domains
 	 */
 	thread->arch.ptables = (uintptr_t)NULL;
+#endif /* CONFIG_X86_COMMON_PAGE_TABLE */
 
 	if ((thread->base.user_options & K_USER) != 0U) {
 		initial_entry = arch_user_mode_enter;
diff --git a/arch/x86/core/x86_mmu.c b/arch/x86/core/x86_mmu.c
index 1f557405ef4..9e52782fe6c 100644
--- a/arch/x86/core/x86_mmu.c
+++ b/arch/x86/core/x86_mmu.c
@@ -53,7 +53,7 @@ LOG_MODULE_DECLARE(os);
 /* Protects x86_domain_list and serializes any changes to page tables */
 static struct k_spinlock x86_mmu_lock;
 
-#ifdef CONFIG_USERSPACE
+#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
 /* List of all active and initialized memory domains. This is used to make
  * sure all memory mappings are the same across all page tables when invoking
  * range_map()
@@ -994,7 +994,7 @@ static int range_map(void *virt, uintptr_t phys, size_t size,
 	 * Any new mappings need to be applied to all page tables.
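+	 * (With CONFIG_X86_COMMON_PAGE_TABLE there is only one set of page
+	 * tables, so the domain list walk below is compiled out and only
+	 * the kernel page tables are updated.)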
 	 */
 	key = k_spin_lock(&x86_mmu_lock);
-#ifdef CONFIG_USERSPACE
+#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
 	sys_snode_t *node;
 
 	SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
@@ -1014,7 +1014,7 @@ static int range_map(void *virt, uintptr_t phys, size_t size,
 #endif /* CONFIG_USERSPACE */
 	ret = range_map_ptables(z_x86_kernel_ptables, virt, phys, size,
 				entry_flags, mask, options);
-#ifdef CONFIG_USERSPACE
+#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
 out_unlock:
 #endif /* CONFIG_USERSPACE */
 	if (ret == 0 && (options & OPTION_ALLOC) != 0) {
@@ -1151,6 +1151,166 @@ int arch_buffer_validate(void *addr, size_t size, int write)
 	return ret;
 }
 
+#ifdef CONFIG_X86_COMMON_PAGE_TABLE
+/* Very low memory configuration. A single set of page tables is used for
+ * all threads. This relies on some assumptions:
+ *
+ * - No KPTI. If that were supported, we would need both a kernel and user
+ *   set of page tables.
+ * - No SMP. If that were supported, we would need per-core page tables.
+ * - Memory domains don't affect supervisor mode.
+ * - All threads have the same virtual-to-physical mappings.
+ * - Memory domain APIs can't be called by user mode.
+ *
+ * Because there is no SMP, only one set of page tables, and user threads can't
+ * modify their own memory domains, we don't have to do much when
+ * arch_mem_domain_* APIs are called. We do use a caching scheme to avoid
+ * updating page tables if the last user thread scheduled was in the same
+ * domain.
+ *
+ * We don't set CONFIG_ARCH_MEM_DOMAIN_DATA, since we aren't setting
+ * up any arch-specific memory domain data (per-domain page tables).
+ *
+ * This is all nice and simple and saves a lot of memory. The cost is that
+ * context switching is no longer a trivial CR3 update. We have to reset all
+ * partitions for the current domain configuration and then apply all the
+ * partitions for the incoming thread's domain if they are not the same. We
+ * also need to update permissions similarly on the thread stack region.
+ */
+
+static inline void reset_region(uintptr_t start, size_t size)
+{
+	(void)range_map((void *)start, 0, size, 0, 0,
+			OPTION_FLUSH | OPTION_RESET);
+}
+
+static inline void apply_region(uintptr_t start, size_t size, pentry_t attr)
+{
+	(void)range_map((void *)start, 0, size, attr, MASK_PERM, OPTION_FLUSH);
+}
+
+/* Cache of the current memory domain applied to the common page tables and
+ * the stack buffer region that had User access granted.
+ */
+static struct k_mem_domain *current_domain;
+static uintptr_t current_stack_start;
+static size_t current_stack_size;
+
+void z_x86_swap_update_common_page_table(struct k_thread *incoming)
+{
+	k_spinlock_key_t key;
+
+	if ((incoming->base.user_options & K_USER) == 0) {
+		/* Incoming thread is not a user thread. Memory domains don't
+		 * affect supervisor threads and we don't need to enable User
+		 * bits for its stack buffer; do nothing.
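+		 * Any User mappings left behind by the last user thread
+		 * stay in place; the cache variables still describe them,
+		 * and they are reset when a user thread with a different
+		 * stack or domain is switched in.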
+		 */
+		return;
+	}
+
+	/* Step 1: Make sure the thread stack is set up correctly for
+	 * the incoming thread
+	 */
+	if (incoming->stack_info.start != current_stack_start ||
+	    incoming->stack_info.size != current_stack_size) {
+		if (current_stack_size != 0U) {
+			reset_region(current_stack_start, current_stack_size);
+		}
+
+		/* The incoming thread's stack region needs User permissions */
+		apply_region(incoming->stack_info.start,
+			     incoming->stack_info.size,
+			     K_MEM_PARTITION_P_RW_U_RW);
+
+		/* Update cache */
+		current_stack_start = incoming->stack_info.start;
+		current_stack_size = incoming->stack_info.size;
+	}
+
+	/* Step 2: The page tables always have some memory domain applied to
+	 * them. If the incoming thread's memory domain is different,
+	 * update the page tables
+	 */
+	key = k_spin_lock(&z_mem_domain_lock);
+	if (incoming->mem_domain_info.mem_domain == current_domain) {
+		/* The incoming thread's domain is already applied */
+		goto out_unlock;
+	}
+
+	/* Reset the current memory domain regions... */
+	if (current_domain != NULL) {
+		for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
+			struct k_mem_partition *ptn =
+				&current_domain->partitions[i];
+
+			if (ptn->size == 0) {
+				continue;
+			}
+			reset_region(ptn->start, ptn->size);
+		}
+	}
+
+	/* ...and apply all the incoming domain's regions */
+	for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
+		struct k_mem_partition *ptn =
+			&incoming->mem_domain_info.mem_domain->partitions[i];
+
+		if (ptn->size == 0) {
+			continue;
+		}
+		apply_region(ptn->start, ptn->size, ptn->attr);
+	}
+	current_domain = incoming->mem_domain_info.mem_domain;
+out_unlock:
+	k_spin_unlock(&z_mem_domain_lock, key);
+}
+
+/* If a partition was added or removed in the cached domain, update the
+ * page tables.
+ */
+void arch_mem_domain_partition_remove(struct k_mem_domain *domain,
+				      uint32_t partition_id)
+{
+	struct k_mem_partition *ptn;
+
+	if (domain != current_domain) {
+		return;
+	}
+
+	ptn = &domain->partitions[partition_id];
+	reset_region(ptn->start, ptn->size);
+}
+
+void arch_mem_domain_partition_add(struct k_mem_domain *domain,
+				   uint32_t partition_id)
+{
+	struct k_mem_partition *ptn;
+
+	if (domain != current_domain) {
+		return;
+	}
+
+	ptn = &domain->partitions[partition_id];
+	apply_region(ptn->start, ptn->size, ptn->attr);
+}
+
+/* The rest of the APIs don't need to do anything */
+void arch_mem_domain_thread_add(struct k_thread *thread)
+{
+
+}
+
+void arch_mem_domain_thread_remove(struct k_thread *thread)
+{
+
+}
+
+void arch_mem_domain_destroy(struct k_mem_domain *domain)
+{
+
+}
+#else
+/* Memory domains each have a set of page tables assigned to them */
 
 /**
  * Duplicate an entire set of page tables
@@ -1416,7 +1576,7 @@ void arch_mem_domain_thread_add(struct k_thread *thread)
 				   thread->stack_info.size);
 	}
 
-#if !defined(CONFIG_X86_KPTI)
+#if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
 	/* Need to switch to using these new page tables, in case we drop
 	 * to user mode before we are ever context switched out.
	 * IPI takes care of this if the thread is currently running on some
@@ -1427,6 +1587,7 @@
 	}
 #endif /* CONFIG_X86_KPTI */
 }
+#endif /* !CONFIG_X86_COMMON_PAGE_TABLE */
 
 int arch_mem_domain_max_partitions_get(void)
 {
@@ -1445,11 +1606,18 @@ void z_x86_current_stack_perms(void)
 	/* Only now is it safe to grant access to the stack buffer since any
 	 * previous context has been erased.
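+	 * (Until this point the buffer may still hold privileged data from
+	 * a previous thread context, which must not become visible to user
+	 * mode.)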
 	 */
-
+#ifdef CONFIG_X86_COMMON_PAGE_TABLE
+	/* Re-run swap page table update logic since we're entering User mode.
+	 * This will grant stack and memory domain access if it wasn't set
+	 * already (in which case this returns very quickly).
+	 */
+	z_x86_swap_update_common_page_table(_current);
+#else
 	/* Memory domain access is already programmed into the page tables.
 	 * Need to enable access to this new user thread's stack buffer in
 	 * its domain-specific page tables.
 	 */
 	set_stack_perms(_current, z_x86_thread_page_tables_get(_current));
+#endif
 }
 #endif /* CONFIG_USERSPACE */
diff --git a/arch/x86/include/x86_mmu.h b/arch/x86/include/x86_mmu.h
index 84b51e87d57..db3028ad747 100644
--- a/arch/x86/include/x86_mmu.h
+++ b/arch/x86/include/x86_mmu.h
@@ -174,7 +174,7 @@ extern pentry_t z_x86_kernel_ptables[];
 /* Get the page tables used by this thread during normal execution */
 static inline pentry_t *z_x86_thread_page_tables_get(struct k_thread *thread)
 {
-#ifdef CONFIG_USERSPACE
+#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
 	return (pentry_t *)(thread->arch.ptables);
 #else
 	return z_x86_kernel_ptables;
@@ -185,4 +185,8 @@ static inline pentry_t *z_x86_thread_page_tables_get(struct k_thread *thread)
 /* Handling function for TLB shootdown inter-processor interrupts. */
 void z_x86_tlb_ipi(const void *arg);
 #endif
+
+#ifdef CONFIG_X86_COMMON_PAGE_TABLE
+void z_x86_swap_update_common_page_table(struct k_thread *incoming);
+#endif
 #endif /* ZEPHYR_ARCH_X86_INCLUDE_X86_MMU_H */
diff --git a/include/arch/x86/ia32/thread.h b/include/arch/x86/ia32/thread.h
index 6b4e00ea87b..756202a0b22 100644
--- a/include/arch/x86/ia32/thread.h
+++ b/include/arch/x86/ia32/thread.h
@@ -215,8 +215,10 @@ struct _thread_arch {
 	uint8_t flags;
 
 #ifdef CONFIG_USERSPACE
+#ifndef CONFIG_X86_COMMON_PAGE_TABLE
 	/* Physical address of the page tables used by this thread */
 	uintptr_t ptables;
+#endif /* CONFIG_X86_COMMON_PAGE_TABLE */
 
 	/* Initial privilege mode stack pointer when doing a system call.
 	 * Un-set for supervisor threads.
diff --git a/include/arch/x86/intel64/thread.h b/include/arch/x86/intel64/thread.h
index ad99adc9e58..fdaefce1230 100644
--- a/include/arch/x86/intel64/thread.h
+++ b/include/arch/x86/intel64/thread.h
@@ -115,8 +115,10 @@ struct _thread_arch {
 	uint8_t flags;
 
 #ifdef CONFIG_USERSPACE
+#ifndef CONFIG_X86_COMMON_PAGE_TABLE
 	/* Physical address of the page tables used by this thread */
 	uintptr_t ptables;
+#endif /* CONFIG_X86_COMMON_PAGE_TABLE */
 
 	/* Initial privilege mode stack pointer when doing a system call.
 	 * Un-set for supervisor threads.
diff --git a/tests/arch/x86/pagetables/src/main.c b/tests/arch/x86/pagetables/src/main.c
index 22ad2684d7b..223aa543d7c 100644
--- a/tests/arch/x86/pagetables/src/main.c
+++ b/tests/arch/x86/pagetables/src/main.c
@@ -100,7 +100,7 @@ void test_ram_perms(void)
 		expected = MMU_P | MMU_XD;
 	}
 #endif /* CONFIG_X86_64 */
-#ifndef CONFIG_X86_KPTI
+#if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
 	} else if (IN_REGION(_app_smem, pos)) {
 		/* If KPTI is not enabled, then the default memory
 		 * domain affects our page tables even though we are
@@ -109,6 +109,10 @@
 		 * partitions within it would be active in
 		 * k_mem_domain_default (ztest_partition and any libc
 		 * partitions)
+		 *
+		 * If we have a common page table, no thread has
+		 * entered user mode yet and no domain regions
+		 * will be programmed.
 		 */
 		expected = MMU_P | MMU_US | MMU_RW | MMU_XD;
 #endif /* CONFIG_X86_KPTI */
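
For illustration only, here is a minimal sketch of application code that
would exercise this option; it is not part of the patch, and all names
(app_part, shared_counter, user_entry, start_user_thread) as well as the
stack size and priority are hypothetical. With CONFIG_X86_COMMON_PAGE_TABLE=y,
app_domain gets no page tables of its own: its partitions are applied to the
shared page tables the first time one of its user threads is switched in, and
reset when a user thread from a different domain runs.

	#include <zephyr.h>
	#include <app_memory/app_memdomain.h>

	K_APPMEM_PARTITION_DEFINE(app_part);	  /* hypothetical partition */
	K_APP_DMEM(app_part) int shared_counter;  /* data placed inside it */

	static struct k_mem_domain app_domain;
	static struct k_mem_partition *app_parts[] = { &app_part };

	K_THREAD_STACK_DEFINE(user_stack, 1024);
	static struct k_thread user_thread;

	static void user_entry(void *p1, void *p2, void *p3)
	{
		shared_counter++;	/* access granted via app_part */
	}

	void start_user_thread(void)
	{
		k_tid_t tid;

		k_mem_domain_init(&app_domain, ARRAY_SIZE(app_parts),
				  app_parts);

		/* Create the thread paused, place it in the domain, then
		 * start it; the domain's partitions reach the common page
		 * tables on the context switch into the thread.
		 */
		tid = k_thread_create(&user_thread, user_stack,
				      K_THREAD_STACK_SIZEOF(user_stack),
				      user_entry, NULL, NULL, NULL,
				      7, K_USER, K_FOREVER);
		k_mem_domain_add_thread(&app_domain, tid);
		k_thread_start(tid);
	}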