x86: move page tables to the memory domain level
- z_x86_userspace_enter() for both 32-bit and 64-bit now calls into C code
  to clear the stack buffer and set the US bits in the page tables for the
  memory range.
- Page tables are now associated with memory domains, instead of having
  separate page tables per thread. A spinlock protects write access to
  these page tables, and read/write access to the list of active page
  tables.
- arch_mem_domain_init() implemented, allocating and copying page tables
  from the boot page tables.
- struct arch_mem_domain defined for x86. It has a pointer to the domain's
  page tables and a list node for iterating over all active domains.

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
Parent: 86cfd90026
Commit: b8242bff64

9 changed files with 651 additions and 579 deletions
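Both assembly entry paths below now call z_x86_current_stack_perms(), whose implementation lands in the x86_mmu.c diff that is suppressed further down because of its size. As a rough orientation only, here is a minimal sketch of the two steps the commit message describes. memset(), MMU_US and MMU_RW are real Zephyr names, but set_region_perms() and the exact body are assumptions, not the actual code:

#include <kernel.h>
#include <string.h>

/* Placeholder declaration for whatever page-table update helper
 * x86_mmu.c actually provides (assumption, not a real API).
 */
extern void set_region_perms(uintptr_t start, size_t size, pentry_t flags);

void z_x86_current_stack_perms(void)
{
	/* _current is the arch-internal pointer to the running thread.
	 * Wipe any stale supervisor data so nothing leaks to user mode;
	 * 0xAA matches the CONFIG_INIT_STACKS fill pattern used in the
	 * old assembly below.
	 */
	(void)memset((void *)_current->stack_info.start, 0xAA,
		     _current->stack_info.size);

	/* Grant user-mode access (US bit) to every page covering the
	 * stack buffer in the page tables this thread will run under.
	 */
	set_region_perms(_current->stack_info.start,
			 _current->stack_info.size, MMU_US | MMU_RW);
}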
@@ -44,6 +44,7 @@ config X86
 	select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !X86_64
 	select ARCH_SUPPORTS_COREDUMP
 	select CPU_HAS_MMU
+	select ARCH_MEM_DOMAIN_DATA if USERSPACE
 	select ARCH_MEM_DOMAIN_SYNCHRONOUS_API if USERSPACE
 	select ARCH_HAS_GDBSTUB if !X86_64
 	select ARCH_HAS_TIMING_FUNCTIONS
@@ -305,42 +305,18 @@ SECTION_FUNC(TEXT, z_x86_userspace_enter)
 	 * want to leak any information.
 	 */
 	mov %edi, %esp
-#ifdef CONFIG_X86_PAE
-	/* Skip over the toplevel PDPT stored here */
-	subl $0x20, %esp
-#endif /* CONFIG_X86_PAE */
 
-	/* Stash some registers we are going to need to erase the user
-	 * stack.
-	 */
+	/* Erase and enable US bit in page tables for the stack buffer */
 	push %ecx
 	push %edi
 	push %eax
-
-	/* Compute size of user stack in 4-byte chunks and put in ECX */
-	mov %ebx, %ecx
-	sub %edi, %ecx
-	shr $2, %ecx		/* Divide by 4 */
-
-#ifdef CONFIG_INIT_STACKS
-	mov $0xAAAAAAAA, %eax
-#else
-	xor %eax, %eax
-#endif
-	/* Copy 4 bytes of memory at a time, starting at ES:EDI, with whatever
-	 * is in EAX. Repeat this ECX times. Stack sizes are always at least
-	 * 4-byte aligned.
-	 */
-	cld
-	rep stosl
-
-	/* Restore registers */
+	push %edx
+	call z_x86_current_stack_perms
+	pop %edx
 	pop %eax
 	pop %edi
 	pop %ecx
 
-	/* Now set stack pointer to the base of the user stack. Now that this
-	 * is set we won't need EBX any more.
+	/* Set stack pointer to the base of the freshly-erased user stack.
+	 * Now that this is set we won't need EBX any more.
 	 */
 	mov %ebx, %esp
 
@@ -286,24 +286,20 @@ z_x86_userspace_enter:
 	 */
 	movq %r9, %rsp
 
-	/* Need RDI temporarily */
-	pushq %rdi
-
-	/* Compute size of user stack in 8-byte chunks and put in RCX */
-	movq %r9, %rdi		/* Start address for rep stosq in RDI */
-	movq %r8, %rcx		/* Ending address */
-	subq %rdi, %rcx		/* Subtract starting address */
-	shrq $3, %rcx		/* Divide by 8 */
-
-	movq $0xAAAAAAAAAAAAAAAA, %rax	/* Fill value */
-	/* Copy 8 bytes of memory at a time, starting at ES:RDI, with whatever
-	 * is in RAX. Repeat this RCX times. Stack sizes are always at least
-	 * 8-byte aligned.
+	/* Push callee-saved regs and go back into C code to erase the stack
+	 * buffer and set US bit in page tables for it
 	 */
-	cld
-	rep stosq
-
+	pushq %rdx
+	pushq %rsi
+	pushq %rdi
+	pushq %r8
+	pushq %r10
+	callq z_x86_current_stack_perms
+	popq %r10
+	popq %r8
+	popq %rdi
+	popq %rsi
+	popq %rdx
 
 	/* Reset to the beginning of the user stack */
 	movq %r8, %rsp
 
@@ -15,12 +15,11 @@
 /* Update the to the incoming thread's page table, and update the location of
  * the privilege elevation stack.
  *
- * May be called ONLY during context switch and when supervisor threads drop
- * synchronously to user mode. Hot code path!
+ * May be called ONLY during context switch. Hot code path!
  *
  * Nothing to do here if KPTI is enabled. We are in supervisor mode, so the
  * active page tables are the kernel's page tables. If the incoming thread is
- * in user mode we are going to switch CR3 to the thread-specific tables when
+ * in user mode we are going to switch CR3 to the domain-specific tables when
  * we go through z_x86_trampoline_to_user.
  *
  * We don't need to update the privilege mode initial stack pointer either,
@@ -33,18 +32,17 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
 	uintptr_t ptables_phys;
 
 #ifndef CONFIG_X86_64
-	/* 64-bit uses syscall/sysret which switches stacks manually,
-	 * tss64.psp is updated unconditionally in __resume
+	/* Set initial stack pointer when elevating privileges from Ring 3
+	 * to Ring 0.
 	 */
-	if ((incoming->base.user_options & K_USER) != 0) {
-		_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
-	}
+	_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
 #endif
 
 	/* Check first that we actually need to do this, since setting
 	 * CR3 involves an expensive full TLB flush.
 	 */
 	ptables_phys = incoming->arch.ptables;
+	__ASSERT(ptables_phys != 0, "NULL page tables for thread %p\n",
+		 incoming);
 
 	if (ptables_phys != z_x86_cr3_get()) {
 		z_x86_cr3_set(ptables_phys);
@@ -52,23 +50,6 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
 	}
 #endif /* CONFIG_X86_KPTI */
 
-FUNC_NORETURN static void drop_to_user(k_thread_entry_t user_entry,
-				       void *p1, void *p2, void *p3)
-{
-	uint32_t stack_end;
-
-	/* Transition will reset stack pointer to initial, discarding
-	 * any old context since this is a one-way operation
-	 */
-	stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
-				      _current->stack_info.size -
-				      _current->stack_info.delta);
-
-	z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
-			      _current->stack_info.start);
-	CODE_UNREACHABLE;
-}
-
 /* Preparation steps needed for all threads if user mode is turned on.
  *
  * Returns the initial entry point to swap into.
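Putting the three hunks above together, z_x86_swap_update_page_tables() reads roughly as follows after this change. This is reassembled from the visible diff context only; per the surrounding lines the whole function sits under an #ifndef CONFIG_X86_KPTI guard, and anything outside the hunks is assumed unchanged:

void z_x86_swap_update_page_tables(struct k_thread *incoming)
{
	uintptr_t ptables_phys;

#ifndef CONFIG_X86_64
	/* Set initial stack pointer when elevating privileges from Ring 3
	 * to Ring 0.
	 */
	_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
#endif

	/* Check first that we actually need to do this, since setting
	 * CR3 involves an expensive full TLB flush.
	 */
	ptables_phys = incoming->arch.ptables;
	__ASSERT(ptables_phys != 0, "NULL page tables for thread %p\n",
		 incoming);

	if (ptables_phys != z_x86_cr3_get()) {
		z_x86_cr3_set(ptables_phys);
	}
}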
@@ -82,11 +63,15 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
 	thread->arch.psp =
 		header->privilege_stack + sizeof(header->privilege_stack);
 
+	/* Important this gets cleared, so that arch_mem_domain_* APIs
+	 * can distinguish between new threads, and threads migrating
+	 * between domains
+	 */
+	thread->arch.ptables = (uintptr_t)NULL;
+
 	if ((thread->base.user_options & K_USER) != 0U) {
-		z_x86_thread_pt_init(thread);
-		initial_entry = drop_to_user;
+		initial_entry = arch_user_mode_enter;
 	} else {
 		thread->arch.ptables = (uintptr_t)&z_x86_kernel_ptables;
 		initial_entry = z_thread_entry;
 	}
 
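The (uintptr_t)NULL reset above matters because the domain-side hooks in the suppressed x86_mmu.c diff need to tell a brand-new user thread from one migrating between domains. A hedged sketch of how such a check might look; arch_mem_domain_thread_add() is the arch memory-domain hook name, but the body, the is_migration naming, and the identity-mapped cast are assumptions, not the actual code:

#include <kernel.h>
#include <stdbool.h>

void arch_mem_domain_thread_add(struct k_thread *thread)
{
	struct k_mem_domain *domain = thread->mem_domain_info.mem_domain;

	/* arch.ptables was reset to NULL in z_x86_userspace_prepare_thread(),
	 * so a non-NULL value here means the thread is migrating from
	 * another domain rather than being added for the first time.
	 */
	bool is_migration = (thread->arch.ptables != (uintptr_t)NULL);

	/* Point the thread at its domain's page tables; the cast assumes
	 * the tables are identity-mapped, the real code would convert to
	 * a physical address.
	 */
	thread->arch.ptables = (uintptr_t)domain->arch.ptables;

	if (is_migration) {
		/* Placeholder: re-grant US access to this thread's stack
		 * buffer in the new domain's tables.
		 */
	}
}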
@@ -96,32 +81,16 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
 FUNC_NORETURN void arch_user_mode_enter(k_thread_entry_t user_entry,
 					void *p1, void *p2, void *p3)
 {
-	k_spinlock_key_t key;
+	uint32_t stack_end;
 
-	z_x86_thread_pt_init(_current);
-
-	key = k_spin_lock(&z_mem_domain_lock);
-	/* Apply memory domain configuration, if assigned. Threads that
-	 * started in user mode already had this done via z_setup_new_thread()
+	/* Transition will reset stack pointer to initial, discarding
+	 * any old context since this is a one-way operation
 	 */
-	z_x86_apply_mem_domain(_current, _current->mem_domain_info.mem_domain);
-	k_spin_unlock(&z_mem_domain_lock, key);
+	stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
+				      _current->stack_info.size -
+				      _current->stack_info.delta);
 
-#ifndef CONFIG_X86_KPTI
-	/* We're synchronously dropping into user mode from a thread that
-	 * used to be in supervisor mode. K_USER flag has now been set, but
-	 * Need to swap from the kernel's page tables to the per-thread page
-	 * tables.
-	 *
-	 * Safe to update page tables from here, all tables are identity-
-	 * mapped and memory areas used before the ring 3 transition all
-	 * have the same attributes wrt supervisor mode access.
-	 *
-	 * Threads that started in user mode already had this applied on
-	 * initial context switch.
-	 */
-	z_x86_swap_update_page_tables(_current);
-#endif
-
-	drop_to_user(user_entry, p1, p2, p3);
+	z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
+			      _current->stack_info.start);
 	CODE_UNREACHABLE;
 }
 
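After the change, arch_user_mode_enter() no longer takes z_mem_domain_lock or touches the page tables at all; it only computes the initial user stack pointer and jumps into the assembly entry, which in turn calls z_x86_current_stack_perms(). Reassembled from the hunk above, with lines outside the hunk assumed unchanged:

FUNC_NORETURN void arch_user_mode_enter(k_thread_entry_t user_entry,
					void *p1, void *p2, void *p3)
{
	uint32_t stack_end;

	/* Transition will reset stack pointer to initial, discarding
	 * any old context since this is a one-way operation
	 */
	stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
				      _current->stack_info.size -
				      _current->stack_info.delta);

	z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
			      _current->stack_info.start);
	CODE_UNREACHABLE;
}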
(File diff suppressed because it is too large.)
@@ -40,7 +40,8 @@
 #define MMU_PCD BITL(4) /** Page Cache Disable */
 #define MMU_A BITL(5) /** Accessed */
 #define MMU_D BITL(6) /** Dirty */
-#define MMU_PS BITL(7) /** Page Size */
+#define MMU_PS BITL(7) /** Page Size (non PTE)*/
+#define MMU_PAT BITL(7) /** Page Attribute (PTE) */
 #define MMU_G BITL(8) /** Global */
 #ifdef XD_SUPPORTED
 #define MMU_XD BITL(63) /** Execute Disable */
@@ -122,18 +123,14 @@ void z_x86_set_stack_guard(k_thread_stack_t *stack);
  */
 extern uint8_t z_shared_kernel_page_start;
 #endif /* CONFIG_X86_KPTI */
 
-/* Set up per-thread page tables just prior to entering user mode */
-void z_x86_thread_pt_init(struct k_thread *thread);
-
-/* Apply a memory domain policy to a set of thread page tables.
- *
- * Must be called with z_mem_domain_lock held.
- */
-void z_x86_apply_mem_domain(struct k_thread *thread,
-			    struct k_mem_domain *mem_domain);
 #endif /* CONFIG_USERSPACE */
 
+#ifdef CONFIG_X86_PAE
+#define PTABLES_ALIGN 0x1fU
+#else
+#define PTABLES_ALIGN 0xfffU
+#endif
+
 /* Set CR3 to a physical address. There must be a valid top-level paging
  * structure here or the CPU will triple fault. The incoming page tables must
  * have the same kernel mappings wrt supervisor mode. Don't use this function
@@ -141,6 +138,7 @@ void z_x86_apply_mem_domain(struct k_thread *thread,
  */
 static inline void z_x86_cr3_set(uintptr_t phys)
 {
+	__ASSERT((phys & PTABLES_ALIGN) == 0U, "unaligned page tables");
 #ifdef CONFIG_X86_64
 	__asm__ volatile("movq %0, %%cr3\n\t" : : "r" (phys) : "memory");
 #else
@@ -215,17 +215,9 @@ struct _thread_arch {
 	uint8_t flags;
 
 #ifdef CONFIG_USERSPACE
-	/* Physical address of the page tables used by this thread. Supervisor
-	 * threads always use the kernel's page table, user thread use
-	 * per-thread tables stored in the stack object.
-	 */
+	/* Physical address of the page tables used by this thread */
 	uintptr_t ptables;
 
-	/* Track available unused space in the stack object used for building
-	 * thread-specific page tables.
-	 */
-	uint8_t *mmu_pos;
-
 	/* Initial privilege mode stack pointer when doing a system call.
 	 * Un-set for supervisor threads.
 	 */
@@ -115,17 +115,9 @@ struct _thread_arch {
 	uint8_t flags;
 
 #ifdef CONFIG_USERSPACE
-	/* Physical address to page tables used by this thread. Supervisor
-	 * threads always use the kernel's page table, user thread use
-	 * per-thread tables stored in the stack object
-	 */
+	/* Physical address of the page tables used by this thread */
 	uintptr_t ptables;
 
-	/* Track available unused space in the stack object used for building
-	 * thread-specific page tables.
-	 */
-	uint8_t *mmu_pos;
-
 	/* Initial privilege mode stack pointer when doing a system call.
 	 * Un-set for supervisor threads.
 	 */
@@ -148,6 +148,8 @@
 			 Z_X86_MMU_XD)
 
 #ifndef _ASMLANGUAGE
+#include <sys/slist.h>
+
 /* Page table entry data type at all levels. Defined here due to
  * k_mem_partition_attr_t, eventually move to private x86_mmu.h
  */
@@ -157,5 +159,21 @@ typedef uint64_t pentry_t;
 typedef uint32_t pentry_t;
 #endif
 typedef pentry_t k_mem_partition_attr_t;
+
+struct arch_mem_domain {
+#ifdef CONFIG_X86_PAE
+	/* 4-entry, 32-byte top-level PDPT */
+	pentry_t pdpt[4];
+#endif
+	/* Pointer to top-level structure, either a PML4, PDPT, PD */
+	pentry_t *ptables;
+
+	/* Linked list of all active memory domains */
+	sys_snode_t node;
+#ifdef CONFIG_X86_PAE
+} __aligned(32);
+#else
+};
+#endif /* CONFIG_X86_PAE */
 #endif /* _ASMLANGUAGE */
 #endif /* ZEPHYR_INCLUDE_ARCH_X86_MMU_H */
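struct arch_mem_domain is the per-domain state the commit message refers to: the page tables now live here rather than in each thread's stack object, and node links every initialized domain into a list so the MMU code can walk all active page tables when partitions change. Under PAE the __aligned(32) matters because a PDPT loaded through CR3 must be 32-byte aligned. The arch_mem_domain_init() implementation itself is in the suppressed x86_mmu.c diff; the sketch below only illustrates the commit-message description, and the lock name, list name, copy_page_tables() helper, and exact signature are assumptions:

#include <kernel.h>
#include <errno.h>
#include <sys/slist.h>

/* Assumed names, not the actual x86_mmu.c internals */
static struct k_spinlock x86_mmu_lock;
static sys_slist_t x86_domain_list;

extern pentry_t z_x86_kernel_ptables[];

/* Hypothetical helper: deep-copy the boot page tables into freshly
 * allocated pages and return the new top-level structure, or NULL.
 */
extern pentry_t *copy_page_tables(pentry_t *src);

int arch_mem_domain_init(struct k_mem_domain *domain)
{
	k_spinlock_key_t key = k_spin_lock(&x86_mmu_lock);
	int ret = 0;

	/* Every domain starts with a private copy of the kernel's boot
	 * page tables; partition permissions are then applied on top.
	 */
	domain->arch.ptables = copy_page_tables(z_x86_kernel_ptables);
	if (domain->arch.ptables == NULL) {
		ret = -ENOMEM;
	} else {
		/* Track the domain so later partition add/remove operations
		 * can update every set of active page tables.
		 */
		sys_slist_append(&x86_domain_list, &domain->arch.node);
	}

	k_spin_unlock(&x86_mmu_lock, key);
	return ret;
}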