x86: move page tables to the memory domain level

- The 32-bit and 64-bit versions of z_x86_userspace_enter()
  now call into C code to clear the stack buffer and set the
  US bits in the page tables for the stack's memory range.

- Page tables are now associated with memory domains,
  instead of having separate page tables per thread.
  A spinlock protects write access to these page tables,
  and read/write access to the list of active page
  tables.

- arch_mem_domain_init() implemented; it allocates the
  domain's page tables and copies them from the boot page
  tables (see the sketch after this list).

- struct arch_mem_domain defined for x86. It has
  a page table link and also a list node for iterating
  over them.
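
A rough sketch of what the arch_mem_domain_init() bullet implies. The real
implementation is in the large diff this page suppresses (likely
arch/x86/core/x86_mmu.c), so the lock name, list head, and copy helper
below are assumptions, and the int return follows the current
arch_mem_domain_init() prototype rather than necessarily this commit's.
The domain->arch member is what ARCH_MEM_DOMAIN_DATA (selected in the
Kconfig hunk below) adds to struct k_mem_domain.

#include <kernel.h>
#include <spinlock.h>
#include <sys/slist.h>

static struct k_spinlock x86_mmu_lock;   /* assumed: guards page table writes */
static sys_slist_t x86_domain_list;      /* assumed: list of active domains */

/* Hypothetical helper: allocate this domain's paging structures and seed
 * them with the boot (kernel) mappings.
 */
int copy_boot_page_tables(struct arch_mem_domain *dom);

int arch_mem_domain_init(struct k_mem_domain *domain)
{
        k_spinlock_key_t key = k_spin_lock(&x86_mmu_lock);
        int ret = copy_boot_page_tables(&domain->arch);

        if (ret == 0) {
                /* Make the new domain visible to code that iterates over
                 * all active page tables.
                 */
                sys_slist_append(&x86_domain_list, &domain->arch.node);
        }
        k_spin_unlock(&x86_mmu_lock, key);

        return ret;
}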

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
Andrew Boie 2020-10-20 13:28:50 -07:00 committed by Anas Nashif
commit b8242bff64
9 changed files with 651 additions and 579 deletions


@@ -44,6 +44,7 @@ config X86
select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !X86_64
select ARCH_SUPPORTS_COREDUMP
select CPU_HAS_MMU
select ARCH_MEM_DOMAIN_DATA if USERSPACE
select ARCH_MEM_DOMAIN_SYNCHRONOUS_API if USERSPACE
select ARCH_HAS_GDBSTUB if !X86_64
select ARCH_HAS_TIMING_FUNCTIONS
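
For context on the two new selects: ARCH_MEM_DOMAIN_DATA adds an
arch-specific member (struct arch_mem_domain, defined in the mmu.h hunk at
the end of this commit) to every struct k_mem_domain, and
ARCH_MEM_DOMAIN_SYNCHRONOUS_API makes the kernel invoke the
arch_mem_domain_*() hooks whenever domains or their partitions change.
A minimal sketch of what the first select buys the x86 code; the accessor
name is made up for illustration:

#include <kernel.h>
#include <arch/x86/mmu.h>

/* With CONFIG_ARCH_MEM_DOMAIN_DATA=y, each memory domain carries its own
 * top-level page tables, reachable through the domain's arch member.
 */
static inline pentry_t *domain_ptables(struct k_mem_domain *domain)
{
        return domain->arch.ptables;
}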


@@ -305,42 +305,18 @@ SECTION_FUNC(TEXT, z_x86_userspace_enter)
* want to leak any information.
*/
mov %edi, %esp
#ifdef CONFIG_X86_PAE
/* Skip over the toplevel PDPT stored here */
subl $0x20, %esp
#endif /* CONFIG_X86_PAE */
/* Stash some registers we are going to need to erase the user
* stack.
*/
/* Erase and enable US bit in page tables for the stack buffer */
push %ecx
push %edi
push %eax
/* Compute size of user stack in 4-byte chunks and put in ECX */
mov %ebx, %ecx
sub %edi, %ecx
shr $2, %ecx /* Divide by 4 */
#ifdef CONFIG_INIT_STACKS
mov $0xAAAAAAAA, %eax
#else
xor %eax, %eax
#endif
/* Copy 4 bytes of memory at a time, starting at ES:EDI, with whatever
* is in EAX. Repeat this ECX times. Stack sizes are always at least
* 4-byte aligned.
*/
cld
rep stosl
/* Restore registers */
push %edx
call z_x86_current_stack_perms
pop %edx
pop %eax
pop %edi
pop %ecx
/* Now set stack pointer to the base of the user stack. Now that this
* is set we won't need EBX any more.
/* Set stack pointer to the base of the freshly-erased user stack.
* Now that this is set we won't need EBX any more.
*/
mov %ebx, %esp
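
Both this 32-bit stub and the 64-bit one below now defer the work to a
single C routine, z_x86_current_stack_perms(), whose definition is in the
suppressed diff (likely arch/x86/core/x86_mmu.c). The following is only a
plausible shape, reusing the stack bounds the removed assembly computed;
stack_buf_set_user() is a stand-in name, not a real symbol.

#include <kernel.h>
#include <string.h>

/* Hypothetical helper: set the US bit on [start, start + size) in the
 * current domain's page tables so ring 3 may touch the stack buffer.
 */
void stack_buf_set_user(uintptr_t start, size_t size);

void z_x86_current_stack_perms(void)
{
        uintptr_t start = _current->stack_info.start;
        size_t size = Z_STACK_PTR_ALIGN(start + _current->stack_info.size -
                                        _current->stack_info.delta) - start;

        /* Erase the buffer with the same fill the old assembly used:
         * 0xAA bytes with CONFIG_INIT_STACKS, zeroes otherwise.
         */
#ifdef CONFIG_INIT_STACKS
        memset((void *)start, 0xAA, size);
#else
        memset((void *)start, 0, size);
#endif

        stack_buf_set_user(start, size);
}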


@@ -286,24 +286,20 @@ z_x86_userspace_enter:
*/
movq %r9, %rsp
/* Need RDI temporarily */
pushq %rdi
/* Compute size of user stack in 8-byte chunks and put in RCX */
movq %r9, %rdi /* Start address for rep stosq in RDI */
movq %r8, %rcx /* Ending address */
subq %rdi, %rcx /* Subtract starting address */
shrq $3, %rcx /* Divide by 8 */
movq $0xAAAAAAAAAAAAAAAA, %rax /* Fill value */
/* Copy 8 bytes of memory at a time, starting at ES:RDI, with whatever
* is in RAX. Repeat this RCX times. Stack sizes are always at least
* 8-byte aligned.
/* Save the caller-saved registers we still need, then go back into C
* code to erase the stack buffer and set the US bit in its page tables
*/
cld
rep stosq
pushq %rdx
pushq %rsi
pushq %rdi
pushq %r8
pushq %r10
callq z_x86_current_stack_perms
popq %r10
popq %r8
popq %rdi
popq %rsi
popq %rdx
/* Reset to the beginning of the user stack */
movq %r8, %rsp


@@ -15,12 +15,11 @@
/* Switch to the incoming thread's page tables, and update the location of
* the privilege elevation stack.
*
* May be called ONLY during context switch and when supervisor threads drop
* synchronously to user mode. Hot code path!
* May be called ONLY during context switch. Hot code path!
*
* Nothing to do here if KPTI is enabled. We are in supervisor mode, so the
* active page tables are the kernel's page tables. If the incoming thread is
* in user mode we are going to switch CR3 to the thread-specific tables when
* in user mode we are going to switch CR3 to the domain-specific tables when
* we go through z_x86_trampoline_to_user.
*
* We don't need to update the privilege mode initial stack pointer either,
@@ -33,18 +32,17 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
uintptr_t ptables_phys;
#ifndef CONFIG_X86_64
/* 64-bit uses syscall/sysret which switches stacks manually,
* tss64.psp is updated unconditionally in __resume
/* Set initial stack pointer when elevating privileges from Ring 3
* to Ring 0.
*/
if ((incoming->base.user_options & K_USER) != 0) {
_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
}
_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
#endif
/* Check first that we actually need to do this, since setting
* CR3 involves an expensive full TLB flush.
*/
ptables_phys = incoming->arch.ptables;
__ASSERT(ptables_phys != 0, "NULL page tables for thread %p\n",
incoming);
if (ptables_phys != z_x86_cr3_get()) {
z_x86_cr3_set(ptables_phys);
@@ -52,23 +50,6 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
}
#endif /* CONFIG_X86_KPTI */
FUNC_NORETURN static void drop_to_user(k_thread_entry_t user_entry,
void *p1, void *p2, void *p3)
{
uint32_t stack_end;
/* Transition will reset stack pointer to initial, discarding
* any old context since this is a one-way operation
*/
stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
_current->stack_info.size -
_current->stack_info.delta);
z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
_current->stack_info.start);
CODE_UNREACHABLE;
}
/* Preparation steps needed for all threads if user mode is turned on.
*
* Returns the initial entry point to swap into.
@@ -82,11 +63,15 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
thread->arch.psp =
header->privilege_stack + sizeof(header->privilege_stack);
/* Important this gets cleared, so that arch_mem_domain_* APIs
* can distinguish between new threads, and threads migrating
* between domains
*/
thread->arch.ptables = (uintptr_t)NULL;
if ((thread->base.user_options & K_USER) != 0U) {
z_x86_thread_pt_init(thread);
initial_entry = drop_to_user;
initial_entry = arch_user_mode_enter;
} else {
thread->arch.ptables = (uintptr_t)&z_x86_kernel_ptables;
initial_entry = z_thread_entry;
}
@@ -96,32 +81,16 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
FUNC_NORETURN void arch_user_mode_enter(k_thread_entry_t user_entry,
void *p1, void *p2, void *p3)
{
k_spinlock_key_t key;
uint32_t stack_end;
z_x86_thread_pt_init(_current);
key = k_spin_lock(&z_mem_domain_lock);
/* Apply memory domain configuration, if assigned. Threads that
* started in user mode already had this done via z_setup_new_thread()
/* Transition will reset stack pointer to initial, discarding
* any old context since this is a one-way operation
*/
z_x86_apply_mem_domain(_current, _current->mem_domain_info.mem_domain);
k_spin_unlock(&z_mem_domain_lock, key);
stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
_current->stack_info.size -
_current->stack_info.delta);
#ifndef CONFIG_X86_KPTI
/* We're synchronously dropping into user mode from a thread that
* used to be in supervisor mode. K_USER flag has now been set, but
* Need to swap from the kernel's page tables to the per-thread page
* tables.
*
* Safe to update page tables from here, all tables are identity-
* mapped and memory areas used before the ring 3 transition all
* have the same attributes wrt supervisor mode access.
*
* Threads that started in user mode already had this applied on
* initial context switch.
*/
z_x86_swap_update_page_tables(_current);
#endif
drop_to_user(user_entry, p1, p2, p3);
z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
_current->stack_info.start);
CODE_UNREACHABLE;
}
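
The reworked arch_user_mode_enter() above is the arch backend of the
public k_thread_user_mode_enter() API, so the one-way drop looks like this
from application code (a generic usage sketch, not taken from this
commit):

#include <zephyr.h>
#include <sys/printk.h>

/* Runs in ring 3 once the transition completes */
static void user_payload(void *p1, void *p2, void *p3)
{
        ARG_UNUSED(p2);
        ARG_UNUSED(p3);
        printk("in user mode, arg %p\n", p1);
}

/* Entry point of a thread that starts out in supervisor mode */
void supervisor_entry(void *p1, void *p2, void *p3)
{
        /* ... privileged setup done with full kernel rights ... */

        /* One-way transition: the stack is reset and erased, its US bits
         * are set, and this call never returns.
         */
        k_thread_user_mode_enter(user_payload, p1, p2, p3);
}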

File diff suppressed because it is too large


@@ -40,7 +40,8 @@
#define MMU_PCD BITL(4) /** Page Cache Disable */
#define MMU_A BITL(5) /** Accessed */
#define MMU_D BITL(6) /** Dirty */
#define MMU_PS BITL(7) /** Page Size */
#define MMU_PS BITL(7) /** Page Size (non-PTE) */
#define MMU_PAT BITL(7) /** Page Attribute (PTE) */
#define MMU_G BITL(8) /** Global */
#ifdef XD_SUPPORTED
#define MMU_XD BITL(63) /** Execute Disable */
@@ -122,18 +123,14 @@ void z_x86_set_stack_guard(k_thread_stack_t *stack);
*/
extern uint8_t z_shared_kernel_page_start;
#endif /* CONFIG_X86_KPTI */
/* Set up per-thread page tables just prior to entering user mode */
void z_x86_thread_pt_init(struct k_thread *thread);
/* Apply a memory domain policy to a set of thread page tables.
*
* Must be called with z_mem_domain_lock held.
*/
void z_x86_apply_mem_domain(struct k_thread *thread,
struct k_mem_domain *mem_domain);
#endif /* CONFIG_USERSPACE */
#ifdef CONFIG_X86_PAE
#define PTABLES_ALIGN 0x1fU
#else
#define PTABLES_ALIGN 0xfffU
#endif
/* Set CR3 to a physical address. There must be a valid top-level paging
* structure here or the CPU will triple fault. The incoming page tables must
* have the same kernel mappings wrt supervisor mode. Don't use this function
@@ -141,6 +138,7 @@ void z_x86_apply_mem_domain(struct k_thread *thread,
*/
static inline void z_x86_cr3_set(uintptr_t phys)
{
__ASSERT((phys & PTABLES_ALIGN) == 0U, "unaligned page tables");
#ifdef CONFIG_X86_64
__asm__ volatile("movq %0, %%cr3\n\t" : : "r" (phys) : "memory");
#else


@@ -215,17 +215,9 @@ struct _thread_arch {
uint8_t flags;
#ifdef CONFIG_USERSPACE
/* Physical address of the page tables used by this thread. Supervisor
* threads always use the kernel's page tables; user threads use
* per-thread tables stored in the stack object.
*/
/* Physical address of the page tables used by this thread */
uintptr_t ptables;
/* Track available unused space in the stack object used for building
* thread-specific page tables.
*/
uint8_t *mmu_pos;
/* Initial privilege mode stack pointer when doing a system call.
* Un-set for supervisor threads.
*/


@@ -115,17 +115,9 @@ struct _thread_arch {
uint8_t flags;
#ifdef CONFIG_USERSPACE
/* Physical address of the page tables used by this thread. Supervisor
* threads always use the kernel's page tables; user threads use
* per-thread tables stored in the stack object
*/
/* Physical address of the page tables used by this thread */
uintptr_t ptables;
/* Track available unused space in the stack object used for building
* thread-specific page tables.
*/
uint8_t *mmu_pos;
/* Initial privilege mode stack pointer when doing a system call.
* Un-set for supervisor threads.
*/


@@ -148,6 +148,8 @@
Z_X86_MMU_XD)
#ifndef _ASMLANGUAGE
#include <sys/slist.h>
/* Page table entry data type at all levels. Defined here due to
* k_mem_partition_attr_t, eventually move to private x86_mmu.h
*/
@@ -157,5 +159,21 @@ typedef uint64_t pentry_t;
typedef uint32_t pentry_t;
#endif
typedef pentry_t k_mem_partition_attr_t;
struct arch_mem_domain {
#ifdef CONFIG_X86_PAE
/* 4-entry, 32-byte top-level PDPT */
pentry_t pdpt[4];
#endif
/* Pointer to top-level structure: either a PML4, PDPT, or PD */
pentry_t *ptables;
/* Linked list of all active memory domains */
sys_snode_t node;
#ifdef CONFIG_X86_PAE
} __aligned(32);
#else
};
#endif /* CONFIG_X86_PAE */
#endif /* _ASMLANGUAGE */
#endif /* ZEPHYR_INCLUDE_ARCH_X86_MMU_H */
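
Finally, the node member above is the "list node for iterating over them"
from the commit message. A sketch, with an assumed list head, of how the
x86 MMU code could walk every active domain's top-level tables using the
slist helpers pulled in by the new sys/slist.h include:

#include <sys/slist.h>
#include <arch/x86/mmu.h>

static sys_slist_t x86_domain_list;   /* assumed: appended to by arch_mem_domain_init() */

static void for_each_domain_ptables(void (*fn)(pentry_t *ptables))
{
        struct arch_mem_domain *domain;

        /* e.g. propagate a kernel mapping change into every active domain */
        SYS_SLIST_FOR_EACH_CONTAINER(&x86_domain_list, domain, node) {
                fn(domain->ptables);
        }
}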