x86: move page tables to the memory domain level

- z_x86_userspace_enter() for both 32-bit and 64-bit now
  calls into C code to clear the stack buffer and set the
  US bits in the page tables for the memory range.

- Page tables are now associated with memory domains,
  instead of having separate page tables per thread.
  A spinlock protects write access to these page tables,
  and read/write access to the list of active page
  tables.

- arch_mem_domain_init() implemented, allocating and
  copying page tables from the boot page tables
  (see the sketch just after the commit metadata below).

- struct arch_mem_domain defined for x86. It has
  a page table link and also a list node for iterating
  over them.

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
Andrew Boie 2020-10-20 13:28:50 -07:00 committed by Anas Nashif
commit b8242bff64
9 changed files with 651 additions and 579 deletions
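
The arch_mem_domain_init() named above is part of the large collapsed diff further down, so its body is not visible on this page. Below is a minimal sketch of the flow the commit message describes, not the real implementation: the lock, list, and copy_page_tables() names are invented for illustration, the int return convention is assumed, and ARCH_MEM_DOMAIN_DATA (selected in the Kconfig hunk below) is what embeds struct arch_mem_domain into struct k_mem_domain as its arch member.

#include <kernel.h>
#include <sys/slist.h>
#include <arch/x86/mmu.h>

extern pentry_t z_x86_kernel_ptables[];    /* boot/kernel page tables */

static struct k_spinlock x86_mmu_lock;     /* illustrative: guards tables + list */
static sys_slist_t x86_domain_list;        /* illustrative: all initialized domains */

/* Hypothetical helper: allocate paging structures for this domain and
 * copy the supplied tables into them.
 */
int copy_page_tables(struct arch_mem_domain *domain, pentry_t *src);

int arch_mem_domain_init(struct k_mem_domain *domain)
{
        k_spinlock_key_t key = k_spin_lock(&x86_mmu_lock);
        int ret;

        /* Clone the kernel's boot page tables for this domain... */
        ret = copy_page_tables(&domain->arch, z_x86_kernel_ptables);
        if (ret == 0) {
                /* ...and remember the domain so later partition changes
                 * can be applied to every live set of tables.
                 */
                sys_slist_append(&x86_domain_list, &domain->arch.node);
        }

        k_spin_unlock(&x86_mmu_lock, key);
        return ret;
}

Doing the copy and the list append under one lock is what makes it safe for the partition add/remove hooks to walk the same list later.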

View file

@@ -44,6 +44,7 @@ config X86
         select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !X86_64
         select ARCH_SUPPORTS_COREDUMP
         select CPU_HAS_MMU
+        select ARCH_MEM_DOMAIN_DATA if USERSPACE
         select ARCH_MEM_DOMAIN_SYNCHRONOUS_API if USERSPACE
         select ARCH_HAS_GDBSTUB if !X86_64
         select ARCH_HAS_TIMING_FUNCTIONS

View file

@@ -305,42 +305,18 @@ SECTION_FUNC(TEXT, z_x86_userspace_enter)
          * want to leak any information.
          */
         mov %edi, %esp
 
-#ifdef CONFIG_X86_PAE
-        /* Skip over the toplevel PDPT stored here */
-        subl $0x20, %esp
-#endif /* CONFIG_X86_PAE */
-
-        /* Stash some registers we are going to need to erase the user
-         * stack.
-         */
+        /* Erase and enable US bit in page tables for the stack buffer */
         push %ecx
-        push %edi
         push %eax
-
-        /* Compute size of user stack in 4-byte chunks and put in ECX */
-        mov %ebx, %ecx
-        sub %edi, %ecx
-        shr $2, %ecx    /* Divide by 4 */
-
-#ifdef CONFIG_INIT_STACKS
-        mov $0xAAAAAAAA, %eax
-#else
-        xor %eax, %eax
-#endif
-        /* Copy 4 bytes of memory at a time, starting at ES:EDI, with whatever
-         * is in EAX. Repeat this ECX times. Stack sizes are always at least
-         * 4-byte aligned.
-         */
-        cld
-        rep stosl
-
-        /* Restore registers */
+        push %edx
+        call z_x86_current_stack_perms
+        pop %edx
         pop %eax
-        pop %edi
         pop %ecx
 
-        /* Now set stack pointer to the base of the user stack. Now that this
-         * is set we won't need EBX any more.
+        /* Set stack pointer to the base of the freshly-erased user stack.
+         * Now that this is set we won't need EBX any more.
          */
         mov %ebx, %esp

View file

@@ -286,24 +286,20 @@ z_x86_userspace_enter:
          */
         movq %r9, %rsp
 
-        /* Need RDI temporarily */
-        pushq %rdi
-
-        /* Compute size of user stack in 8-byte chunks and put in RCX */
-        movq %r9, %rdi          /* Start address for rep stosq in RDI */
-        movq %r8, %rcx          /* Ending address */
-        subq %rdi, %rcx         /* Subtract starting address */
-        shrq $3, %rcx           /* Divide by 8 */
-
-        movq $0xAAAAAAAAAAAAAAAA, %rax  /* Fill value */
-        /* Copy 8 bytes of memory at a time, starting at ES:RDI, with whatever
-         * is in RAX. Repeat this RCX times. Stack sizes are always at least
-         * 8-byte aligned.
+        /* Push callee-saved regs and go back into C code to erase the stack
+         * buffer and set US bit in page tables for it
          */
-        cld
-        rep stosq
+        pushq %rdx
+        pushq %rsi
+        pushq %rdi
+        pushq %r8
+        pushq %r10
+        callq z_x86_current_stack_perms
+        popq %r10
+        popq %r8
         popq %rdi
+        popq %rsi
+        popq %rdx
 
         /* Reset to the beginning of the user stack */
         movq %r8, %rsp
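
Both the 32-bit and 64-bit stubs now defer the erase-and-grant work to z_x86_current_stack_perms(), whose C body sits in the collapsed diff further down. A hedged sketch of what such a helper has to cover, pieced together from the commit message and the deleted fill loops; set_stack_perms() and the fill pattern are stand-ins, not the real code:

#include <kernel.h>
#include <string.h>

/* Hypothetical helper: set the US bit on the current thread's stack
 * region in the page tables that thread will run on.
 */
void set_stack_perms(struct k_thread *thread);

/* Called from the 32-bit and 64-bit userspace-enter stubs while still
 * running on the privilege-elevation stack.
 */
void z_x86_current_stack_perms(void)
{
        size_t size = _current->stack_info.size - _current->stack_info.delta;

        /* Wipe any supervisor-mode data left in the stack buffer before
         * user code can ever see it (fill pattern assumed here).
         */
        (void)memset((void *)_current->stack_info.start, 0xAA, size);

        /* Only now is it safe to grant user-mode access to the buffer */
        set_stack_perms(_current);
}

The ordering is the point: the stack must be wiped before the US bit is set, otherwise user code could observe leftover supervisor data.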

View file

@@ -15,12 +15,11 @@
 /* Update the to the incoming thread's page table, and update the location of
  * the privilege elevation stack.
  *
- * May be called ONLY during context switch and when supervisor threads drop
- * synchronously to user mode. Hot code path!
+ * May be called ONLY during context switch. Hot code path!
  *
  * Nothing to do here if KPTI is enabled. We are in supervisor mode, so the
  * active page tables are the kernel's page tables. If the incoming thread is
- * in user mode we are going to switch CR3 to the thread-specific tables when
+ * in user mode we are going to switch CR3 to the domain-specific tables when
  * we go through z_x86_trampoline_to_user.
  *
  * We don't need to update the privilege mode initial stack pointer either,
@@ -33,18 +32,17 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
         uintptr_t ptables_phys;
 
 #ifndef CONFIG_X86_64
-        /* 64-bit uses syscall/sysret which switches stacks manually,
-         * tss64.psp is updated unconditionally in __resume
+        /* Set initial stack pointer when elevating privileges from Ring 3
+         * to Ring 0.
          */
-        if ((incoming->base.user_options & K_USER) != 0) {
-                _main_tss.esp0 = (uintptr_t)incoming->arch.psp;
-        }
+        _main_tss.esp0 = (uintptr_t)incoming->arch.psp;
 #endif
 
         /* Check first that we actually need to do this, since setting
          * CR3 involves an expensive full TLB flush.
          */
         ptables_phys = incoming->arch.ptables;
+        __ASSERT(ptables_phys != 0, "NULL page tables for thread %p\n",
+                 incoming);
 
         if (ptables_phys != z_x86_cr3_get()) {
                 z_x86_cr3_set(ptables_phys);
@@ -52,23 +50,6 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
 }
 #endif /* CONFIG_X86_KPTI */
 
-FUNC_NORETURN static void drop_to_user(k_thread_entry_t user_entry,
-                                       void *p1, void *p2, void *p3)
-{
-        uint32_t stack_end;
-
-        /* Transition will reset stack pointer to initial, discarding
-         * any old context since this is a one-way operation
-         */
-        stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
-                                      _current->stack_info.size -
-                                      _current->stack_info.delta);
-
-        z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
-                              _current->stack_info.start);
-        CODE_UNREACHABLE;
-}
-
 /* Preparation steps needed for all threads if user mode is turned on.
  *
  * Returns the initial entry point to swap into.
@@ -82,11 +63,15 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
         thread->arch.psp =
                 header->privilege_stack + sizeof(header->privilege_stack);
 
+        /* Important this gets cleared, so that arch_mem_domain_* APIs
+         * can distinguish between new threads, and threads migrating
+         * between domains
+         */
+        thread->arch.ptables = (uintptr_t)NULL;
+
         if ((thread->base.user_options & K_USER) != 0U) {
-                z_x86_thread_pt_init(thread);
-                initial_entry = drop_to_user;
+                initial_entry = arch_user_mode_enter;
         } else {
-                thread->arch.ptables = (uintptr_t)&z_x86_kernel_ptables;
                 initial_entry = z_thread_entry;
         }
 
@@ -96,32 +81,16 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
 FUNC_NORETURN void arch_user_mode_enter(k_thread_entry_t user_entry,
                                         void *p1, void *p2, void *p3)
 {
-        k_spinlock_key_t key;
+        uint32_t stack_end;
 
-        z_x86_thread_pt_init(_current);
-
-        key = k_spin_lock(&z_mem_domain_lock);
-        /* Apply memory domain configuration, if assigned. Threads that
-         * started in user mode already had this done via z_setup_new_thread()
+        /* Transition will reset stack pointer to initial, discarding
+         * any old context since this is a one-way operation
          */
-        z_x86_apply_mem_domain(_current, _current->mem_domain_info.mem_domain);
-        k_spin_unlock(&z_mem_domain_lock, key);
+        stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
+                                      _current->stack_info.size -
+                                      _current->stack_info.delta);
 
-#ifndef CONFIG_X86_KPTI
-        /* We're synchronously dropping into user mode from a thread that
-         * used to be in supervisor mode. K_USER flag has now been set, but
-         * Need to swap from the kernel's page tables to the per-thread page
-         * tables.
-         *
-         * Safe to update page tables from here, all tables are identity-
-         * mapped and memory areas used before the ring 3 transition all
-         * have the same attributes wrt supervisor mode access.
-         *
-         * Threads that started in user mode already had this applied on
-         * initial context switch.
-         */
-        z_x86_swap_update_page_tables(_current);
-#endif
-
-        drop_to_user(user_entry, p1, p2, p3);
+        z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
+                              _current->stack_info.start);
+        CODE_UNREACHABLE;
 }

File diff suppressed because it is too large
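
The collapsed diff is where the new page-table plumbing described in the commit message lands: the per-domain table copies, the spinlock-protected list of active domains, and the synchronous arch_mem_domain_*() hooks. The "Important this gets cleared" comment added to z_x86_userspace_prepare_thread() above hints at how threads get attached to a domain's tables; here is a hedged sketch of an arch_mem_domain_thread_add() along those lines (reset_stack_perms() and the exact policy are assumptions):

#include <kernel.h>
#include <x86_mmu.h>   /* private arch header: z_x86_cr3_get()/z_x86_cr3_set() */

/* Hypothetical helper: revoke user access to the thread's stack buffer
 * in the page tables it was previously running on.
 */
void reset_stack_perms(struct k_thread *thread);

void arch_mem_domain_thread_add(struct k_thread *thread)
{
        struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;
        bool is_user = (thread->base.user_options & K_USER) != 0U;
        bool is_migration = (thread->arch.ptables != 0U);

        if (is_user && is_migration) {
                /* Thread is moving between domains: pull its stack mapping
                 * out of the old domain's tables first.
                 */
                reset_stack_perms(thread);
        }

        /* From here on the thread runs on its domain's page tables. The
         * tables are identity-mapped at this point in the tree, so the
         * cast stands in for a virtual-to-physical conversion.
         */
        thread->arch.ptables = (uintptr_t)domain->arch.ptables;

        /* If the running thread's tables just changed, switch CR3 now */
        if (thread == _current && thread->arch.ptables != z_x86_cr3_get()) {
                z_x86_cr3_set(thread->arch.ptables);
        }
}

If the hooks work roughly this way, every thread ends up with a non-zero ptables value once it joins a domain, which is what the __ASSERT() added to z_x86_swap_update_page_tables() checks.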

View file

@@ -40,7 +40,8 @@
 #define MMU_PCD BITL(4)  /** Page Cache Disable */
 #define MMU_A   BITL(5)  /** Accessed */
 #define MMU_D   BITL(6)  /** Dirty */
-#define MMU_PS  BITL(7)  /** Page Size */
+#define MMU_PS  BITL(7)  /** Page Size (non PTE)*/
+#define MMU_PAT BITL(7)  /** Page Attribute (PTE) */
 #define MMU_G   BITL(8)  /** Global */
 #ifdef XD_SUPPORTED
 #define MMU_XD  BITL(63) /** Execute Disable */
@@ -122,18 +123,14 @@ void z_x86_set_stack_guard(k_thread_stack_t *stack);
  */
 extern uint8_t z_shared_kernel_page_start;
 #endif /* CONFIG_X86_KPTI */
-
-/* Set up per-thread page tables just prior to entering user mode */
-void z_x86_thread_pt_init(struct k_thread *thread);
-
-/* Apply a memory domain policy to a set of thread page tables.
- *
- * Must be called with z_mem_domain_lock held.
- */
-void z_x86_apply_mem_domain(struct k_thread *thread,
-                            struct k_mem_domain *mem_domain);
 #endif /* CONFIG_USERSPACE */
 
+#ifdef CONFIG_X86_PAE
+#define PTABLES_ALIGN 0x1fU
+#else
+#define PTABLES_ALIGN 0xfffU
+#endif
+
 /* Set CR3 to a physical address. There must be a valid top-level paging
  * structure here or the CPU will triple fault. The incoming page tables must
  * have the same kernel mappings wrt supervisor mode. Don't use this function
@@ -141,6 +138,7 @@ void z_x86_apply_mem_domain(struct k_thread *thread,
  */
 static inline void z_x86_cr3_set(uintptr_t phys)
 {
+        __ASSERT((phys & PTABLES_ALIGN) == 0U, "unaligned page tables");
 #ifdef CONFIG_X86_64
         __asm__ volatile("movq %0, %%cr3\n\t" : : "r" (phys) : "memory");
 #else

View file

@@ -215,17 +215,9 @@ struct _thread_arch {
         uint8_t flags;
 
 #ifdef CONFIG_USERSPACE
-        /* Physical address of the page tables used by this thread. Supervisor
-         * threads always use the kernel's page table, user thread use
-         * per-thread tables stored in the stack object.
-         */
+        /* Physical address of the page tables used by this thread */
         uintptr_t ptables;
 
-        /* Track available unused space in the stack object used for building
-         * thread-specific page tables.
-         */
-        uint8_t *mmu_pos;
-
         /* Initial privilege mode stack pointer when doing a system call.
          * Un-set for supervisor threads.
          */

View file

@@ -115,17 +115,9 @@ struct _thread_arch {
         uint8_t flags;
 
 #ifdef CONFIG_USERSPACE
-        /* Physical address to page tables used by this thread. Supervisor
-         * threads always use the kernel's page table, user thread use
-         * per-thread tables stored in the stack object
-         */
+        /* Physical address of the page tables used by this thread */
         uintptr_t ptables;
 
-        /* Track available unused space in the stack object used for building
-         * thread-specific page tables.
-         */
-        uint8_t *mmu_pos;
-
         /* Initial privilege mode stack pointer when doing a system call.
          * Un-set for supervisor threads.
          */

View file

@@ -148,6 +148,8 @@
                          Z_X86_MMU_XD)
 
 #ifndef _ASMLANGUAGE
+#include <sys/slist.h>
+
 /* Page table entry data type at all levels. Defined here due to
  * k_mem_partition_attr_t, eventually move to private x86_mmu.h
  */
@@ -157,5 +159,21 @@ typedef uint64_t pentry_t;
 typedef uint32_t pentry_t;
 #endif
 typedef pentry_t k_mem_partition_attr_t;
+
+struct arch_mem_domain {
+#ifdef CONFIG_X86_PAE
+        /* 4-entry, 32-byte top-level PDPT */
+        pentry_t pdpt[4];
+#endif
+        /* Pointer to top-level structure, either a PML4, PDPT, PD */
+        pentry_t *ptables;
+
+        /* Linked list of all active memory domains */
+        sys_snode_t node;
+#ifdef CONFIG_X86_PAE
+} __aligned(32);
+#else
+};
+#endif /* CONFIG_X86_PAE */
 #endif /* _ASMLANGUAGE */
 #endif /* ZEPHYR_INCLUDE_ARCH_X86_MMU_H */
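
The node member is what lets the collapsed page-table code keep the "list of active page tables" mentioned in the commit message: with ARCH_MEM_DOMAIN_DATA selected, this struct is embedded in struct k_mem_domain as its arch member, and every initialized domain can be chained onto one slist. A hedged sketch of walking that list to propagate a partition change, using Zephyr's standard container macro; the list name and apply_to_ptables() are assumptions:

#include <kernel.h>
#include <sys/slist.h>
#include <arch/x86/mmu.h>

static sys_slist_t x86_domain_list;  /* illustrative: filled by arch_mem_domain_init() */

/* Hypothetical helper: update the mapping of [start, start + size) in
 * one set of page tables with the given attributes.
 */
void apply_to_ptables(pentry_t *ptables, uintptr_t start, size_t size,
                      pentry_t attrs);

/* Sketch: apply a partition add/remove to every active domain, e.g. from
 * arch_mem_domain_partition_add(). Runs under the same spinlock that
 * protects the list and the tables themselves.
 */
static void propagate_to_domains(uintptr_t start, size_t size, pentry_t attrs)
{
        struct arch_mem_domain *domain;

        SYS_SLIST_FOR_EACH_CONTAINER(&x86_domain_list, domain, node) {
                apply_to_ptables(domain->ptables, start, size, attrs);
        }
}

SYS_SLIST_FOR_EACH_CONTAINER() recovers each struct arch_mem_domain from its embedded node, which is why the struct carries a sys_snode_t instead of the MMU code keeping a separate registry of domains.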