x86: add support for common page tables

We provide an option for low-memory systems to use a single set
of page tables for all threads. This is only supported if
KPTI and SMP are disabled. This configuration saves a considerable
amount of RAM, especially if multiple memory domains are used,
at the cost of additional context switching overhead.

Some caching techniques are used to reduce the number of page table
updates at context switch time; the page tables aren't updated when
switching to
a supervisor thread, and the page table configuration of the last
user thread switched in is cached.

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
commit d2a72273b7
Andrew Boie, 2020-10-27 11:27:37 -07:00, committed by Anas Nashif
10 changed files with 213 additions and 11 deletions
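The caching scheme amounts to remembering what was last programmed into the
shared page tables and skipping the expensive update when nothing has changed.
Below is a minimal, self-contained sketch of that pattern for illustration
only; it is not the Zephyr implementation (the real logic is
z_x86_swap_update_common_page_table, shown further down in this diff), and
every name in it (struct thread, pt_reset_region, update_common_tables, ...)
is a hypothetical stand-in.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the real kernel structures */
struct domain {
	int id;
};

struct thread {
	bool is_user;           /* stands in for (options & K_USER) */
	struct domain *domain;  /* memory domain membership */
	uintptr_t stack_start;  /* stack buffer base address */
	size_t stack_size;      /* stack buffer size */
};

/* Cache of what is currently programmed into the common page tables */
static struct domain *cached_domain;
static uintptr_t cached_stack_start;
static size_t cached_stack_size;

/* Stubs standing in for the expensive page table updates */
static void pt_reset_region(uintptr_t start, size_t size)
{
	printf("reset %#lx..%#lx\n", (unsigned long)start,
	       (unsigned long)(start + size));
}

static void pt_apply_region(uintptr_t start, size_t size)
{
	printf("apply %#lx..%#lx\n", (unsigned long)start,
	       (unsigned long)(start + size));
}

/* Called when a thread is switched in */
static void update_common_tables(const struct thread *incoming)
{
	if (!incoming->is_user) {
		/* Memory domains don't affect supervisor threads,
		 * so leave the tables exactly as they are.
		 */
		return;
	}

	/* Reprogram the stack region only if it differs from the cache */
	if (incoming->stack_start != cached_stack_start ||
	    incoming->stack_size != cached_stack_size) {
		if (cached_stack_size != 0U) {
			pt_reset_region(cached_stack_start, cached_stack_size);
		}
		pt_apply_region(incoming->stack_start, incoming->stack_size);
		cached_stack_start = incoming->stack_start;
		cached_stack_size = incoming->stack_size;
	}

	/* Reprogram domain partitions only if the domain changed; the
	 * per-partition reset/apply loops are elided in this sketch.
	 */
	if (incoming->domain != cached_domain) {
		cached_domain = incoming->domain;
	}
}

int main(void)
{
	struct domain d0 = { .id = 0 };
	struct thread t1 = { true, &d0, 0x100000, 0x1000 };
	struct thread t2 = { true, &d0, 0x200000, 0x1000 };

	update_common_tables(&t1); /* programs t1's stack region */
	update_common_tables(&t1); /* cache hit: nothing happens */
	update_common_tables(&t2); /* stack differs: reset then apply */
	return 0;
}

In the actual diff the same idea is carried by current_domain,
current_stack_start and current_stack_size, with reset_region() and
apply_region() performing the real page table writes through range_map().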


@@ -44,7 +44,7 @@ config X86
select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !X86_64
select ARCH_SUPPORTS_COREDUMP
select CPU_HAS_MMU
select ARCH_MEM_DOMAIN_DATA if USERSPACE
select ARCH_MEM_DOMAIN_DATA if USERSPACE && !X86_COMMON_PAGE_TABLE
select ARCH_MEM_DOMAIN_SYNCHRONOUS_API if USERSPACE
select ARCH_HAS_GDBSTUB if !X86_64
select ARCH_HAS_TIMING_FUNCTIONS


@@ -213,6 +213,18 @@ config X86_MMU_PAGE_POOL_PAGES
Unused pages in this pool cannot be used for other purposes.
config X86_COMMON_PAGE_TABLE
bool "Use a single page table for all threads"
default n
depends on USERSPACE
depends on !SMP
depends on !X86_KPTI
help
If this option is enabled, userspace memory domains will not have their
own page tables. Instead, context switching operations will modify
page tables in place. This is much slower, but uses much less RAM
for page tables.
config X86_NO_MELTDOWN
bool
help


@@ -35,8 +35,10 @@ GEN_OFFSET_SYM(_thread_arch_t, excNestCount);
#ifdef CONFIG_USERSPACE
GEN_OFFSET_SYM(_thread_arch_t, psp);
#ifndef CONFIG_X86_COMMON_PAGE_TABLE
GEN_OFFSET_SYM(_thread_arch_t, ptables);
#endif
#endif
GEN_OFFSET_SYM(_thread_arch_t, preempFloatReg);


@@ -30,7 +30,9 @@ GEN_OFFSET_SYM(_thread_arch_t, sse);
GEN_OFFSET_SYM(_thread_arch_t, ss);
GEN_OFFSET_SYM(_thread_arch_t, cs);
GEN_OFFSET_SYM(_thread_arch_t, psp);
#ifndef CONFIG_X86_COMMON_PAGE_TABLE
GEN_OFFSET_SYM(_thread_arch_t, ptables);
#endif
#endif /* CONFIG_USERSPACE */
GEN_OFFSET_SYM(x86_tss64_t, ist1);


@@ -29,24 +29,28 @@
*/
void z_x86_swap_update_page_tables(struct k_thread *incoming)
{
uintptr_t ptables_phys;
#ifndef CONFIG_X86_64
/* Set initial stack pointer when elevating privileges from Ring 3
* to Ring 0.
*/
_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
#endif
#ifdef CONFIG_X86_COMMON_PAGE_TABLE
z_x86_swap_update_common_page_table(incoming);
#else
/* Check first that we actually need to do this, since setting
* CR3 involves an expensive full TLB flush.
*/
ptables_phys = incoming->arch.ptables;
uintptr_t ptables_phys = incoming->arch.ptables;
__ASSERT(ptables_phys != 0, "NULL page tables for thread %p\n",
incoming);
if (ptables_phys != z_x86_cr3_get()) {
z_x86_cr3_set(ptables_phys);
}
#endif /* CONFIG_X86_COMMON_PAGE_TABLE */
}
#endif /* CONFIG_X86_KPTI */
@@ -63,11 +67,13 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
thread->arch.psp =
header->privilege_stack + sizeof(header->privilege_stack);
#ifndef CONFIG_X86_COMMON_PAGE_TABLE
/* Important this gets cleared, so that arch_mem_domain_* APIs
* can distinguish between new threads and threads migrating
* between domains
*/
thread->arch.ptables = (uintptr_t)NULL;
#endif /* CONFIG_X86_COMMON_PAGE_TABLE */
if ((thread->base.user_options & K_USER) != 0U) {
initial_entry = arch_user_mode_enter;


@@ -53,7 +53,7 @@ LOG_MODULE_DECLARE(os);
/* Protects x86_domain_list and serializes any changes to page tables */
static struct k_spinlock x86_mmu_lock;
#ifdef CONFIG_USERSPACE
#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
/* List of all active and initialized memory domains. This is used to make
* sure all memory mappings are the same across all page tables when invoking
* range_map()
@@ -994,7 +994,7 @@ static int range_map(void *virt, uintptr_t phys, size_t size,
* Any new mappings need to be applied to all page tables.
*/
key = k_spin_lock(&x86_mmu_lock);
#ifdef CONFIG_USERSPACE
#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
sys_snode_t *node;
SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
@@ -1014,7 +1014,7 @@ static int range_map(void *virt, uintptr_t phys, size_t size,
#endif /* CONFIG_USERSPACE */
ret = range_map_ptables(z_x86_kernel_ptables, virt, phys, size,
entry_flags, mask, options);
#ifdef CONFIG_USERSPACE
#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
out_unlock:
#endif /* CONFIG_USERSPACE */
if (ret == 0 && (options & OPTION_ALLOC) != 0) {
@@ -1151,6 +1151,166 @@ int arch_buffer_validate(void *addr, size_t size, int write)
return ret;
}
#ifdef CONFIG_X86_COMMON_PAGE_TABLE
/* Very low memory configuration. A single set of page tables is used for
* all threads. This relies on some assumptions:
*
* - No KPTI. If that were supported, we would need both a kernel and user
* set of page tables.
* - No SMP. If that were supported, we would need per-core page tables.
* - Memory domains don't affect supervisor mode.
* - All threads have the same virtual-to-physical mappings.
* - Memory domain APIs can't be called by user mode.
*
* Because there is no SMP, only one set of page tables, and user threads can't
* modify their own memory domains, we don't have to do much when
* arch_mem_domain_* APIs are called. We do use a caching scheme to avoid
* updating page tables if the last user thread scheduled was in the same
* domain.
*
* We don't set CONFIG_ARCH_MEM_DOMAIN_DATA, since we aren't setting
* up any arch-specific memory domain data (per-domain page tables).
*
* This is all nice and simple and saves a lot of memory. The cost is that
* context switching is not a trivial CR3 update. We have to reset all partitions
* for the current domain configuration and then apply all the partitions for
* the incoming thread's domain if they are not the same. We also need to
* update permissions similarly on the thread stack region.
*/
static inline void reset_region(uintptr_t start, size_t size)
{
(void)range_map((void *)start, 0, size, 0, 0,
OPTION_FLUSH | OPTION_RESET);
}
static inline void apply_region(uintptr_t start, size_t size, pentry_t attr)
{
(void)range_map((void *)start, 0, size, attr, MASK_PERM, OPTION_FLUSH);
}
/* Cache of the current memory domain applied to the common page tables and
* the stack buffer region that had User access granted.
*/
static struct k_mem_domain *current_domain;
static uintptr_t current_stack_start;
static size_t current_stack_size;
void z_x86_swap_update_common_page_table(struct k_thread *incoming)
{
k_spinlock_key_t key;
if ((incoming->base.user_options & K_USER) == 0) {
/* Incoming thread is not a user thread. Memory domains don't
* affect supervisor threads and we don't need to enable User
* bits for its stack buffer; do nothing.
*/
return;
}
/* Step 1: Make sure the thread stack is set up correctly for the
* incoming thread
*/
if (incoming->stack_info.start != current_stack_start ||
incoming->stack_info.size != current_stack_size) {
if (current_stack_size != 0U) {
reset_region(current_stack_start, current_stack_size);
}
/* The incoming thread's stack region needs User permissions */
apply_region(incoming->stack_info.start,
incoming->stack_info.size,
K_MEM_PARTITION_P_RW_U_RW);
/* Update cache */
current_stack_start = incoming->stack_info.start;
current_stack_size = incoming->stack_info.size;
}
/* Step 2: The page tables always have some memory domain applied to
* them. If the incoming thread's memory domain is different,
* update the page tables
*/
key = k_spin_lock(&z_mem_domain_lock);
if (incoming->mem_domain_info.mem_domain == current_domain) {
/* The incoming thread's domain is already applied */
goto out_unlock;
}
/* Reset the current memory domain regions... */
if (current_domain != NULL) {
for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
struct k_mem_partition *ptn =
&current_domain->partitions[i];
if (ptn->size == 0) {
continue;
}
reset_region(ptn->start, ptn->size);
}
}
/* ...and apply all the incoming domain's regions */
for (int i = 0; i < CONFIG_MAX_DOMAIN_PARTITIONS; i++) {
struct k_mem_partition *ptn =
&incoming->mem_domain_info.mem_domain->partitions[i];
if (ptn->size == 0) {
continue;
}
apply_region(ptn->start, ptn->size, ptn->attr);
}
current_domain = incoming->mem_domain_info.mem_domain;
out_unlock:
k_spin_unlock(&z_mem_domain_lock, key);
}
/* If a partition was added or removed in the cached domain, update the
* page tables.
*/
void arch_mem_domain_partition_remove(struct k_mem_domain *domain,
uint32_t partition_id)
{
struct k_mem_partition *ptn;
if (domain != current_domain) {
return;
}
ptn = &domain->partitions[partition_id];
reset_region(ptn->start, ptn->size);
}
void arch_mem_domain_partition_add(struct k_mem_domain *domain,
uint32_t partition_id)
{
struct k_mem_partition *ptn;
if (domain != current_domain) {
return;
}
ptn = &domain->partitions[partition_id];
apply_region(ptn->start, ptn->size, ptn->attr);
}
/* Rest of the APIs don't need to do anything */
void arch_mem_domain_thread_add(struct k_thread *thread)
{
}
void arch_mem_domain_thread_remove(struct k_thread *thread)
{
}
void arch_mem_domain_destroy(struct k_mem_domain *domain)
{
}
#else
/* Memory domains each have a set of page tables assigned to them */
/**
* Duplicate an entire set of page tables
@@ -1416,7 +1576,7 @@ void arch_mem_domain_thread_add(struct k_thread *thread)
thread->stack_info.size);
}
#if !defined(CONFIG_X86_KPTI)
#if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
/* Need to switch to using these new page tables, in case we drop
* to user mode before we are ever context switched out.
* IPI takes care of this if the thread is currently running on some
@@ -1427,6 +1587,7 @@ void arch_mem_domain_thread_add(struct k_thread *thread)
}
#endif /* CONFIG_X86_KPTI */
}
#endif /* !CONFIG_X86_COMMON_PAGE_TABLE */
int arch_mem_domain_max_partitions_get(void)
{
@@ -1445,11 +1606,18 @@ void z_x86_current_stack_perms(void)
/* Only now is it safe to grant access to the stack buffer since any
* previous context has been erased.
*/
#ifdef CONFIG_X86_COMMON_PAGE_TABLE
/* Re-run the swap page table update logic since we're entering User mode.
* This will grant stack and memory domain access if it wasn't set
* already (in which case this returns very quickly).
*/
z_x86_swap_update_common_page_table(_current);
#else
/* Memory domain access is already programmed into the page tables.
* Need to enable access to this new user thread's stack buffer in
* its domain-specific page tables.
*/
set_stack_perms(_current, z_x86_thread_page_tables_get(_current));
#endif
}
#endif /* CONFIG_USERSPACE */


@@ -174,7 +174,7 @@ extern pentry_t z_x86_kernel_ptables[];
/* Get the page tables used by this thread during normal execution */
static inline pentry_t *z_x86_thread_page_tables_get(struct k_thread *thread)
{
#ifdef CONFIG_USERSPACE
#if defined(CONFIG_USERSPACE) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
return (pentry_t *)(thread->arch.ptables);
#else
return z_x86_kernel_ptables;
@@ -185,4 +185,8 @@ static inline pentry_t *z_x86_thread_page_tables_get(struct k_thread *thread)
/* Handling function for TLB shootdown inter-processor interrupts. */
void z_x86_tlb_ipi(const void *arg);
#endif
#ifdef CONFIG_X86_COMMON_PAGE_TABLE
void z_x86_swap_update_common_page_table(struct k_thread *incoming);
#endif
#endif /* ZEPHYR_ARCH_X86_INCLUDE_X86_MMU_H */


@@ -215,8 +215,10 @@ struct _thread_arch {
uint8_t flags;
#ifdef CONFIG_USERSPACE
#ifndef CONFIG_X86_COMMON_PAGE_TABLE
/* Physical address of the page tables used by this thread */
uintptr_t ptables;
#endif /* CONFIG_X86_COMMON_PAGE_TABLE */
/* Initial privilege mode stack pointer when doing a system call.
* Un-set for supervisor threads.


@@ -115,8 +115,10 @@ struct _thread_arch {
uint8_t flags;
#ifdef CONFIG_USERSPACE
#ifndef CONFIG_X86_COMMON_PAGE_TABLE
/* Physical address of the page tables used by this thread */
uintptr_t ptables;
#endif /* CONFIG_X86_COMMON_PAGE_TABLE */
/* Initial privilege mode stack pointer when doing a system call.
* Un-set for supervisor threads.


@@ -100,7 +100,7 @@ void test_ram_perms(void)
expected = MMU_P | MMU_XD;
}
#endif /* CONFIG_X86_64 */
#ifndef CONFIG_X86_KPTI
#if !defined(CONFIG_X86_KPTI) && !defined(CONFIG_X86_COMMON_PAGE_TABLE)
} else if (IN_REGION(_app_smem, pos)) {
/* If KPTI is not enabled, then the default memory
* domain affects our page tables even though we are
@@ -109,6 +109,10 @@ void test_ram_perms(void)
* partitions within it would be active in
* k_mem_domain_default (ztest_partition and any libc
* partitions)
*
* If we have a common page table, no thread has
* entered user mode yet and no domain regions
* will be programmed.
*/
expected = MMU_P | MMU_US | MMU_RW | MMU_XD;
#endif /* CONFIG_X86_KPTI */