x86: move page tables to the memory domain level
- z_x86_userspace_enter() for both 32-bit and 64-bit now calls into C code
  to clear the stack buffer and set the US bits in the page tables for the
  memory range.
- Page tables are now associated with memory domains, instead of having
  separate page tables per thread. A spinlock protects write access to
  these page tables, and read/write access to the list of active page
  tables.
- arch_mem_domain_init() implemented, allocating and copying page tables
  from the boot page tables.
- struct arch_mem_domain defined for x86. It has a page table link and
  also a list node for iterating over them.

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
parent 86cfd90026
commit b8242bff64
9 changed files with 651 additions and 579 deletions
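Both entry stubs below now end with a call into z_x86_current_stack_perms(). Its body lands in the one file whose diff is suppressed on this page, so as orientation only, here is a minimal C sketch of what the commit message describes (erase the stack buffer, then set the US bits for it). The fill value and the set_stack_perms() helper are assumptions, not the actual implementation:

	#include <kernel.h>
	#include <string.h>

	/* Hypothetical helper: grant Ring 3 access to the thread's stack
	 * buffer in its domain's page tables.
	 */
	static void set_stack_perms(struct k_thread *thread);

	/* Sketch only: erase the current thread's user stack buffer, then
	 * enable the US (user/supervisor) bit on its pages.
	 */
	void z_x86_current_stack_perms(void)
	{
		/* Erase any supervisor-mode context left in the buffer so it
		 * cannot leak to user mode (0xAA mirrors CONFIG_INIT_STACKS).
		 */
		(void)memset((void *)_current->stack_info.start, 0xAA,
			     _current->stack_info.size);

		/* Only after erasing is it safe to grant user access */
		set_stack_perms(_current);
	}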
@@ -44,6 +44,7 @@ config X86
 	select ARCH_HAS_CUSTOM_SWAP_TO_MAIN if !X86_64
 	select ARCH_SUPPORTS_COREDUMP
 	select CPU_HAS_MMU
+	select ARCH_MEM_DOMAIN_DATA if USERSPACE
 	select ARCH_MEM_DOMAIN_SYNCHRONOUS_API if USERSPACE
 	select ARCH_HAS_GDBSTUB if !X86_64
 	select ARCH_HAS_TIMING_FUNCTIONS
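The newly-selected ARCH_MEM_DOMAIN_DATA tells the core kernel that struct k_mem_domain carries an architecture-specific `arch` member — the struct arch_mem_domain added at the end of this diff. ARCH_MEM_DOMAIN_SYNCHRONOUS_API, already selected, is what makes the kernel invoke the arch_mem_domain_*() hooks as domains are modified.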
@@ -305,42 +305,18 @@ SECTION_FUNC(TEXT, z_x86_userspace_enter)
 	 * want to leak any information.
 	 */
 	mov %edi, %esp
-#ifdef CONFIG_X86_PAE
-	/* Skip over the toplevel PDPT stored here */
-	subl $0x20, %esp
-#endif /* CONFIG_X86_PAE */
 
-	/* Stash some registers we are going to need to erase the user
-	 * stack.
-	 */
+	/* Erase and enable US bit in page tables for the stack buffer */
 	push %ecx
-	push %edi
 	push %eax
-
-	/* Compute size of user stack in 4-byte chunks and put in ECX */
-	mov %ebx, %ecx
-	sub %edi, %ecx
-	shr $2, %ecx /* Divide by 4 */
-
-#ifdef CONFIG_INIT_STACKS
-	mov $0xAAAAAAAA, %eax
-#else
-	xor %eax, %eax
-#endif
-	/* Copy 4 bytes of memory at a time, starting at ES:EDI, with whatever
-	 * is in EAX. Repeat this ECX times. Stack sizes are always at least
-	 * 4-byte aligned.
-	 */
-	cld
-	rep stosl
-
-	/* Restore registers */
+	push %edx
+	call z_x86_current_stack_perms
+	pop %edx
 	pop %eax
-	pop %edi
 	pop %ecx
 
-	/* Now set stack pointer to the base of the user stack. Now that this
-	 * is set we won't need EBX any more.
+	/* Set stack pointer to the base of the freshly-erased user stack.
+	 * Now that this is set we won't need EBX any more.
 	 */
 	mov %ebx, %esp
 
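Note on the 32-bit sequence: EAX, ECX and EDX are the call-clobbered registers of the IA32 calling convention, so they are saved by hand around the call into C. EBX, which still holds the stack base loaded into ESP right afterwards, is callee-saved and survives the call on its own.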
@@ -286,24 +286,20 @@ z_x86_userspace_enter:
 	 */
 	movq %r9, %rsp
 
-	/* Need RDI temporarily */
-	pushq %rdi
-
-	/* Compute size of user stack in 8-byte chunks and put in RCX */
-	movq %r9, %rdi /* Start address for rep stosq in RDI */
-	movq %r8, %rcx /* Ending address */
-	subq %rdi, %rcx /* Subtract starting address */
-	shrq $3, %rcx /* Divide by 8 */
-
-	movq $0xAAAAAAAAAAAAAAAA, %rax /* Fill value */
-	/* Copy 8 bytes of memory at a time, starting at ES:RDI, with whatever
-	 * is in RAX. Repeat this RCX times. Stack sizes are always at least
-	 * 8-byte aligned.
+	/* Push callee-saved regs and go back into C code to erase the stack
+	 * buffer and set US bit in page tables for it
 	 */
-	cld
-	rep stosq
-
+	pushq %rdx
+	pushq %rsi
+	pushq %rdi
+	pushq %r8
+	pushq %r10
+	callq z_x86_current_stack_perms
+	popq %r10
+	popq %r8
 	popq %rdi
+	popq %rsi
+	popq %rdx
 
 	/* Reset to the beginning of the user stack */
 	movq %r8, %rsp
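Strictly speaking, despite the new comment, RDX, RSI, RDI, R8 and R10 are caller-saved (call-clobbered) registers in the SysV AMD64 ABI; they are pushed here precisely because they still hold z_x86_userspace_enter()'s arguments and would otherwise be lost across the call into C.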
@@ -15,12 +15,11 @@
 /* Update the to the incoming thread's page table, and update the location of
  * the privilege elevation stack.
  *
- * May be called ONLY during context switch and when supervisor threads drop
- * synchronously to user mode. Hot code path!
+ * May be called ONLY during context switch. Hot code path!
  *
  * Nothing to do here if KPTI is enabled. We are in supervisor mode, so the
  * active page tables are the kernel's page tables. If the incoming thread is
- * in user mode we are going to switch CR3 to the thread-specific tables when
+ * in user mode we are going to switch CR3 to the domain-specific tables when
  * we go through z_x86_trampoline_to_user.
 *
 * We don't need to update the privilege mode initial stack pointer either,
@@ -33,18 +32,17 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
 	uintptr_t ptables_phys;
 
 #ifndef CONFIG_X86_64
-	/* 64-bit uses syscall/sysret which switches stacks manually,
-	 * tss64.psp is updated unconditionally in __resume
+	/* Set initial stack pointer when elevating privileges from Ring 3
+	 * to Ring 0.
 	 */
-	if ((incoming->base.user_options & K_USER) != 0) {
-		_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
-	}
+	_main_tss.esp0 = (uintptr_t)incoming->arch.psp;
 #endif
 
 	/* Check first that we actually need to do this, since setting
 	 * CR3 involves an expensive full TLB flush.
 	 */
 	ptables_phys = incoming->arch.ptables;
+	__ASSERT(ptables_phys != 0, "NULL page tables for thread %p\n",
+		 incoming);
 
 	if (ptables_phys != z_x86_cr3_get()) {
 		z_x86_cr3_set(ptables_phys);
@@ -52,23 +50,6 @@ void z_x86_swap_update_page_tables(struct k_thread *incoming)
 	}
 }
 #endif /* CONFIG_X86_KPTI */
 
-FUNC_NORETURN static void drop_to_user(k_thread_entry_t user_entry,
-				       void *p1, void *p2, void *p3)
-{
-	uint32_t stack_end;
-
-	/* Transition will reset stack pointer to initial, discarding
-	 * any old context since this is a one-way operation
-	 */
-	stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
-				      _current->stack_info.size -
-				      _current->stack_info.delta);
-
-	z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
-			      _current->stack_info.start);
-	CODE_UNREACHABLE;
-}
-
 /* Preparation steps needed for all threads if user mode is turned on.
  *
  * Returns the initial entry point to swap into.
@@ -82,11 +63,15 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
 	thread->arch.psp =
 		header->privilege_stack + sizeof(header->privilege_stack);
 
+	/* Important this gets cleared, so that arch_mem_domain_* APIs
+	 * can distinguish between new threads, and threads migrating
+	 * between domains
+	 */
+	thread->arch.ptables = (uintptr_t)NULL;
+
 	if ((thread->base.user_options & K_USER) != 0U) {
-		z_x86_thread_pt_init(thread);
-		initial_entry = drop_to_user;
+		initial_entry = arch_user_mode_enter;
 	} else {
-		thread->arch.ptables = (uintptr_t)&z_x86_kernel_ptables;
 		initial_entry = z_thread_entry;
 	}
 
@@ -96,32 +81,16 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread)
 FUNC_NORETURN void arch_user_mode_enter(k_thread_entry_t user_entry,
 					void *p1, void *p2, void *p3)
 {
-	k_spinlock_key_t key;
+	uint32_t stack_end;
 
-	z_x86_thread_pt_init(_current);
-
-	key = k_spin_lock(&z_mem_domain_lock);
-	/* Apply memory domain configuration, if assigned. Threads that
-	 * started in user mode already had this done via z_setup_new_thread()
+	/* Transition will reset stack pointer to initial, discarding
+	 * any old context since this is a one-way operation
 	 */
-	z_x86_apply_mem_domain(_current, _current->mem_domain_info.mem_domain);
-	k_spin_unlock(&z_mem_domain_lock, key);
-
-#ifndef CONFIG_X86_KPTI
-	/* We're synchronously dropping into user mode from a thread that
-	 * used to be in supervisor mode. K_USER flag has now been set, but
-	 * Need to swap from the kernel's page tables to the per-thread page
-	 * tables.
-	 *
-	 * Safe to update page tables from here, all tables are identity-
-	 * mapped and memory areas used before the ring 3 transition all
-	 * have the same attributes wrt supervisor mode access.
-	 *
-	 * Threads that started in user mode already had this applied on
-	 * initial context switch.
-	 */
-	z_x86_swap_update_page_tables(_current);
-#endif
-
-	drop_to_user(user_entry, p1, p2, p3);
+	stack_end = Z_STACK_PTR_ALIGN(_current->stack_info.start +
+				      _current->stack_info.size -
+				      _current->stack_info.delta);
+
+	z_x86_userspace_enter(user_entry, p1, p2, p3, stack_end,
+			      _current->stack_info.start);
+	CODE_UNREACHABLE;
 }
(One file's diff suppressed because it is too large.)
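The suppressed file carries the bulk of the commit: per the message above, it implements arch_mem_domain_init(), which allocates page tables for a new domain by copying the boot page tables, and puts the domain on a list so later partition updates reach every active table set, all under a spinlock. A minimal sketch of that shape, with the lock, list, and copy helper names all assumed rather than taken from the real code:

	#include <kernel.h>
	#include <sys/slist.h>

	/* Assumed names: one spinlock serializing page-table writes and
	 * guarding the list of active domains' page tables.
	 */
	static struct k_spinlock x86_mmu_lock;
	static sys_slist_t x86_domain_list;

	/* Hypothetical helper: deep-copy the boot (kernel) page tables into
	 * freshly allocated tables for this domain.
	 */
	static int copy_page_tables(struct arch_mem_domain *arch_domain);

	int arch_mem_domain_init(struct k_mem_domain *domain)
	{
		k_spinlock_key_t key = k_spin_lock(&x86_mmu_lock);
		int ret;

		ret = copy_page_tables(&domain->arch);
		if (ret == 0) {
			/* Track the domain so partition add/remove operations
			 * can be replayed on every active set of page tables.
			 */
			sys_slist_append(&x86_domain_list, &domain->arch.node);
		}

		k_spin_unlock(&x86_mmu_lock, key);
		return ret;
	}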
@@ -40,7 +40,8 @@
 #define MMU_PCD		BITL(4)		/** Page Cache Disable */
 #define MMU_A		BITL(5)		/** Accessed */
 #define MMU_D		BITL(6)		/** Dirty */
-#define MMU_PS		BITL(7)		/** Page Size */
+#define MMU_PS		BITL(7)		/** Page Size (non PTE) */
+#define MMU_PAT		BITL(7)		/** Page Attribute (PTE) */
 #define MMU_G		BITL(8)		/** Global */
 #ifdef XD_SUPPORTED
 #define MMU_XD		BITL(63)	/** Execute Disable */
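Bit 7 means different things at different levels of the paging hierarchy, which is why it now gets two names: in page-directory (and higher) entries it is Page Size, selecting a large page, while in 4K PTEs — where there is no larger size to select — the same bit position is the PAT (Page Attribute Table) index bit, hence the separate MMU_PAT define.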
@@ -122,18 +123,14 @@ void z_x86_set_stack_guard(k_thread_stack_t *stack);
  */
 extern uint8_t z_shared_kernel_page_start;
 #endif /* CONFIG_X86_KPTI */
 
-/* Set up per-thread page tables just prior to entering user mode */
-void z_x86_thread_pt_init(struct k_thread *thread);
-
-/* Apply a memory domain policy to a set of thread page tables.
- *
- * Must be called with z_mem_domain_lock held.
- */
-void z_x86_apply_mem_domain(struct k_thread *thread,
-			    struct k_mem_domain *mem_domain);
 #endif /* CONFIG_USERSPACE */
 
+#ifdef CONFIG_X86_PAE
+#define PTABLES_ALIGN 0x1fU
+#else
+#define PTABLES_ALIGN 0xfffU
+#endif
+
 /* Set CR3 to a physical address. There must be a valid top-level paging
  * structure here or the CPU will triple fault. The incoming page tables must
  * have the same kernel mappings wrt supervisor mode. Don't use this function
@@ -141,6 +138,7 @@ void z_x86_apply_mem_domain(struct k_thread *thread,
  */
 static inline void z_x86_cr3_set(uintptr_t phys)
 {
+	__ASSERT((phys & PTABLES_ALIGN) == 0U, "unaligned page tables");
 #ifdef CONFIG_X86_64
 	__asm__ volatile("movq %0, %%cr3\n\t" : : "r" (phys) : "memory");
 #else
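The new assertion checks the alignment CR3 itself demands: under PAE, CR3 points at a 4-entry, 32-byte PDPT that must be 32-byte aligned (hence the 0x1f mask), while non-PAE 32-bit and x86_64 load a 4KiB-aligned page directory or PML4 (the 0xfff mask). The same constraint is why struct arch_mem_domain at the end of this diff is declared __aligned(32) in the PAE case.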
@@ -215,17 +215,9 @@ struct _thread_arch {
 	uint8_t flags;
 
 #ifdef CONFIG_USERSPACE
-	/* Physical address of the page tables used by this thread. Supervisor
-	 * threads always use the kernel's page table, user thread use
-	 * per-thread tables stored in the stack object.
-	 */
+	/* Physical address of the page tables used by this thread */
 	uintptr_t ptables;
 
-	/* Track available unused space in the stack object used for building
-	 * thread-specific page tables.
-	 */
-	uint8_t *mmu_pos;
-
 	/* Initial privilege mode stack pointer when doing a system call.
 	 * Un-set for supervisor threads.
 	 */
@@ -115,17 +115,9 @@ struct _thread_arch {
 	uint8_t flags;
 
 #ifdef CONFIG_USERSPACE
-	/* Physical address to page tables used by this thread. Supervisor
-	 * threads always use the kernel's page table, user thread use
-	 * per-thread tables stored in the stack object
-	 */
+	/* Physical address of the page tables used by this thread */
 	uintptr_t ptables;
 
-	/* Track available unused space in the stack object used for building
-	 * thread-specific page tables.
-	 */
-	uint8_t *mmu_pos;
-
 	/* Initial privilege mode stack pointer when doing a system call.
 	 * Un-set for supervisor threads.
 	 */
@@ -148,6 +148,8 @@
 			 Z_X86_MMU_XD)
 
 #ifndef _ASMLANGUAGE
+#include <sys/slist.h>
+
 /* Page table entry data type at all levels. Defined here due to
  * k_mem_partition_attr_t, eventually move to private x86_mmu.h
  */

@@ -157,5 +159,21 @@ typedef uint64_t pentry_t;
 typedef uint32_t pentry_t;
 #endif
 typedef pentry_t k_mem_partition_attr_t;
+
+struct arch_mem_domain {
+#ifdef CONFIG_X86_PAE
+	/* 4-entry, 32-byte top-level PDPT */
+	pentry_t pdpt[4];
+#endif
+	/* Pointer to top-level structure, either a PML4, PDPT, PD */
+	pentry_t *ptables;
+
+	/* Linked list of all active memory domains */
+	sys_snode_t node;
+#ifdef CONFIG_X86_PAE
+} __aligned(32);
+#else
+};
+#endif /* CONFIG_X86_PAE */
 #endif /* _ASMLANGUAGE */
 #endif /* ZEPHYR_INCLUDE_ARCH_X86_MMU_H */
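The sys_snode_t member is the "list node for iterating over them" from the commit message: with every active domain on one list, MMU code can replay a page-table change against each domain's tables. A sketch of that walk using Zephyr's sys/slist.h iteration macros, with the list and update helper names assumed rather than real:

	#include <sys/slist.h>
	#include <sys/util.h>

	extern sys_slist_t x86_domain_list;		/* hypothetical */
	static void update_tables(pentry_t *ptables);	/* hypothetical */

	/* Sketch: replay one page-table update on every active domain */
	static void apply_to_all_domains(void)
	{
		sys_snode_t *node;

		SYS_SLIST_FOR_EACH_NODE(&x86_domain_list, node) {
			struct arch_mem_domain *domain =
				CONTAINER_OF(node, struct arch_mem_domain, node);

			update_tables(domain->ptables);
		}
	}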