x86: implement user mode on 64-bit

- In early boot, enable the syscall instruction and set up
  necessary MSRs
- Add a hook to update page tables on context switch
- Properly initialize threads based on whether they will
  start in user or supervisor mode
- Add a landing function for system calls to execute the
  desired handler
- Implement arch_user_string_nlen()
- Implement logic for dropping a thread down to user mode
- Reserve per-CPU storage space for the user and privilege
  elevation stack pointers, which is necessary for handling
  syscalls when no free registers are available
- Properly handle the GS register when transitioning
  between privilege levels

Kernel page table isolation (KPTI) is not yet implemented.

Signed-off-by: Andrew Boie <andrew.p.boie@intel.com>
commit 3d80208025
Author: Andrew Boie <andrew.p.boie@intel.com>
Date: 2019-12-18 14:30:41 -08:00
Committed by: Anas Nashif

10 changed files with 406 additions and 15 deletions
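
As context for the boot-code changes below: on x86-64 the 'syscall' instruction is steered entirely by MSRs. A minimal C sketch of the MSR programming this commit performs in early boot (hedged: wrmsr() and the function name are illustrative, not Zephyr API; the MSR numbers are the architectural IA32_* values; EFER.SCE must also be set, which the EFER change below handles; the real code is the assembly added to the locore):

#include <stdint.h>

#define IA32_STAR  0xC0000081u /* syscall/sysret segment selectors */
#define IA32_LSTAR 0xC0000082u /* 64-bit syscall entry point */
#define IA32_FMASK 0xC0000084u /* RFLAGS bits cleared on syscall entry */

/* Illustrative wrapper around the WRMSR instruction (EDX:EAX = value) */
static inline void wrmsr(uint32_t msr, uint64_t val)
{
	__asm__ volatile("wrmsr" :: "c"(msr), "a"((uint32_t)val),
			 "d"((uint32_t)(val >> 32)));
}

extern void z_x86_syscall_entry_stub(void);

void syscall_msr_setup(uint64_t star_upper) /* pass X86_STAR_UPPER */
{
	wrmsr(IA32_LSTAR, (uint64_t)z_x86_syscall_entry_stub);
	wrmsr(IA32_STAR, star_upper << 32); /* selectors live in bits 63:32 */
	wrmsr(IA32_FMASK, 0x600);           /* mask IF (0x200) and DF (0x400) */
}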


@@ -17,14 +17,14 @@ config CPU_ATOM
bool
select CPU_HAS_FPU
select ARCH_HAS_STACK_PROTECTION if X86_MMU
-select ARCH_HAS_USERSPACE if X86_MMU && !X86_64
+select ARCH_HAS_USERSPACE if X86_MMU
help
This option signifies the use of a CPU from the Atom family.
config CPU_MINUTEIA
bool
select ARCH_HAS_STACK_PROTECTION if X86_MMU
-select ARCH_HAS_USERSPACE if X86_MMU && !X86_64
+select ARCH_HAS_USERSPACE if X86_MMU
help
This option signifies the use of a CPU from the Minute IA family.
@@ -32,7 +32,7 @@ config CPU_APOLLO_LAKE
bool
select CPU_HAS_FPU
select ARCH_HAS_STACK_PROTECTION if X86_MMU
-select ARCH_HAS_USERSPACE if X86_MMU && !X86_64
+select ARCH_HAS_USERSPACE if X86_MMU
help
This option signifies the use of a CPU from the Apollo Lake family.
@@ -269,6 +269,7 @@ config X86_KPTI
default y
depends on USERSPACE
depends on !X86_NO_MELTDOWN
+depends on !X86_64
help
Implements kernel page table isolation to mitigate Meltdown exploits
to read Kernel RAM. Incurs a significant performance cost for


@@ -15,3 +15,5 @@ zephyr_library_sources(
intel64/thread.c
intel64/fatal.c
)
+zephyr_library_sources_ifdef(CONFIG_USERSPACE intel64/userspace.S)


@@ -24,3 +24,14 @@ void z_x86_exception(z_arch_esf_t *esf)
CODE_UNREACHABLE;
}
}
#ifdef CONFIG_USERSPACE
void arch_syscall_oops(void *ssf_ptr)
{
struct x86_ssf *ssf = ssf_ptr;
LOG_ERR("Bad system call from RIP 0x%lx", ssf->rip);
z_x86_fatal_error(K_ERR_KERNEL_OOPS, NULL);
}
#endif /* CONFIG_USERSPACE */


@@ -115,9 +115,9 @@ go64: movl %cr4, %eax /* enable PAE and SSE */
#endif
movl %eax, %cr3
-movl $X86_EFER_MSR, %ecx /* enable long mode and no-execute */
+movl $X86_EFER_MSR, %ecx /* enable long mode, no-execute, syscall */
rdmsr
-orl $(X86_EFER_MSR_LME | X86_EFER_MSR_NXE), %eax
+orl $(X86_EFER_MSR_LME | X86_EFER_MSR_NXE | X86_EFER_MSR_SCE), %eax
wrmsr
movl %cr0, %eax /* enable paging */
@@ -169,6 +169,30 @@ go64: movl %cr4, %eax /* enable PAE and SSE */
call z_loapic_enable
#endif
#ifdef CONFIG_USERSPACE
/* Set landing site for system calls made with 'syscall' instruction */
movq $z_x86_syscall_entry_stub, %rax
movq %rax, %rdx
shrq $32, %rdx
movl $X86_LSTAR_MSR, %ecx
/* LSTAR set to 64-bit address denoted by EDX:EAX */
wrmsr
/* Set segment descriptors in STAR */
xorl %eax, %eax /* Zero low bits, reserved */
movl $X86_STAR_UPPER, %edx
movl $X86_STAR_MSR, %ecx
wrmsr
/* Set EFLAGS mask applied when making system calls. Currently we
* mask interrupts and clear direction flag.
*/
movl $0x600, %eax
xorl %edx, %edx
movl $X86_FMASK_MSR, %ecx
wrmsr
#endif /* CONFIG_USERSPACE */
/* don't replace CALL with JMP; honor the ABI stack alignment! */
incl __x86_cpuboot_t_ready_OFFSET(%rbp)
@@ -222,7 +246,13 @@ z_x86_switch:
movq %r13, _thread_offset_to_r13(%rsi)
movq %r14, _thread_offset_to_r14(%rsi)
movq %r15, _thread_offset_to_r15(%rsi)
#ifdef CONFIG_USERSPACE
/* We're always in supervisor mode if we get here; the other case
* is when __resume is invoked from irq_dispatch
*/
movq $X86_KERNEL_CS, _thread_offset_to_cs(%rsi)
movq $X86_KERNEL_DS, _thread_offset_to_ss(%rsi)
#endif
movq %gs:__x86_tss64_t_ist1_OFFSET, %rsp
/* fall through to __resume */
@@ -234,10 +264,23 @@ z_x86_switch:
*/
__resume:
#ifdef CONFIG_USERSPACE
pushq %rdi /* Caller-saved, stash it */
call z_x86_swap_update_page_tables
popq %rdi
/* Set up exception return stack frame */
pushq _thread_offset_to_ss(%rdi) /* SS */
#else
pushq $X86_KERNEL_DS /* SS */
#endif
pushq _thread_offset_to_rsp(%rdi) /* RSP */
pushq _thread_offset_to_rflags(%rdi) /* RFLAGS */
#ifdef CONFIG_USERSPACE
pushq _thread_offset_to_cs(%rdi) /* CS */
#else
pushq $X86_KERNEL_CS /* CS */
#endif
pushq _thread_offset_to_rip(%rdi) /* RIP */
movq _thread_offset_to_rbx(%rdi), %rbx
@@ -261,6 +304,13 @@ __resume:
movq _thread_offset_to_r11(%rdi), %r11
movq _thread_offset_to_rdi(%rdi), %rdi /* do last :-) */
#ifdef CONFIG_USERSPACE
/* Swap GS register values if we are returning to user mode */
testb $0x3, 8(%rsp)
jz 1f
swapgs
#endif /* CONFIG_USERSPACE */
1: iretq
@@ -273,7 +323,13 @@ except: /*
* finish struct NANO_ESF on stack. 'vector' .. 'ss' are
* already there from hardware trap and EXCEPT_*() stub.
*/
#ifdef CONFIG_USERSPACE
/* Swap GS register values if we came in from user mode */
testb $0x3, 24(%rsp)
jz 1f
swapgs
1:
#endif /* CONFIG_USERSPACE */
pushq %r15
subq $X86_FXSAVE_SIZE, %rsp
fxsave (%rsp)
@@ -323,6 +379,15 @@ except: /*
/* Drop the vector/err code pushed by the HW or EXCEPT_*() stub */
add $16, %rsp
#ifdef CONFIG_USERSPACE
/* Swap GS register values if we are returning to user mode */
testb $0x3, 8(%rsp)
jz 1f
cli
swapgs
1:
#endif /* CONFIG_USERSPACE */
iretq
EXCEPT ( 0); EXCEPT ( 1); EXCEPT ( 2); EXCEPT ( 3)
@@ -356,6 +421,13 @@ EXCEPT(Z_X86_OOPS_VECTOR);
.globl x86_irq_args /* .. for these definitions */
irq:
#ifdef CONFIG_USERSPACE
/* Swap GS register values if we came in from user mode */
testb $0x3, 16(%rsp)
jz 1f
swapgs
1:
#endif /* CONFIG_USERSPACE */
pushq %rsi
movq %gs:__x86_tss64_t_cpu_OFFSET, %rsi
@@ -422,12 +494,18 @@ irq_enter_unnested: /* Not nested: dump state to thread struct for __resume */
popq %rcx /* vector number */
popq %rax /* RIP */
movq %rax, _thread_offset_to_rip(%rsi)
-popq %rax /* CS: discard */
+popq %rax /* CS */
+#ifdef CONFIG_USERSPACE
+movq %rax, _thread_offset_to_cs(%rsi)
+#endif
popq %rax /* RFLAGS */
movq %rax, _thread_offset_to_rflags(%rsi)
popq %rax /* RSP */
movq %rax, _thread_offset_to_rsp(%rsi)
-popq %rax /* SS: discard */
+popq %rax /* SS */
+#ifdef CONFIG_USERSPACE
+movq %rax, _thread_offset_to_ss(%rsi)
+#endif
irq_dispatch:
movq x86_irq_funcs(,%rcx,8), %rbx
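
An editorial note on the pattern repeated in the locore changes above: 'testb $0x3, N(%rsp)' inspects the CS selector saved on the interrupt or exception frame. A hedged C rendering of that check (hypothetical helper, for illustration only):

#include <stdbool.h>
#include <stdint.h>

/* The low two bits of a segment selector are its requested privilege
 * level (RPL). On an interrupt frame, the saved CS has RPL 3 if the CPU
 * was running user code and RPL 0 for kernel code; only when arriving
 * from or returning to user mode must 'swapgs' run, so that GS always
 * points at the kernel's per-CPU area while in the kernel.
 */
static inline bool frame_is_user_mode(uint64_t saved_cs)
{
	return (saved_cs & 0x3) != 0;
}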


@@ -15,27 +15,36 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
void *parameter1, void *parameter2, void *parameter3,
int priority, unsigned int options)
{
-#if defined(CONFIG_X86_USERSPACE) || defined(CONFIG_X86_STACK_PROTECTION)
-struct z_x86_thread_stack_header *header =
-(struct z_x86_thread_stack_header *)stack;
-#endif
+void *switch_entry;
Z_ASSERT_VALID_PRIO(priority, entry);
z_new_thread_init(thread, Z_THREAD_STACK_BUFFER(stack),
stack_size, priority, options);
#if CONFIG_X86_STACK_PROTECTION
+struct z_x86_thread_stack_header *header =
+(struct z_x86_thread_stack_header *)stack;
/* Set guard area to read-only to catch stack overflows */
z_x86_mmu_set_flags(&z_x86_kernel_ptables, &header->guard_page,
MMU_PAGE_SIZE, MMU_ENTRY_READ, Z_X86_MMU_RW,
true);
#endif
+#ifdef CONFIG_USERSPACE
+switch_entry = z_x86_userspace_prepare_thread(thread);
+thread->arch.cs = X86_KERNEL_CS;
+thread->arch.ss = X86_KERNEL_DS;
+#else
+switch_entry = z_thread_entry;
+#endif
thread->callee_saved.rsp = (long) Z_THREAD_STACK_BUFFER(stack);
thread->callee_saved.rsp += (stack_size - 8); /* fake RIP for ABI */
-thread->callee_saved.rip = (long) z_thread_entry;
+thread->callee_saved.rip = (long) switch_entry;
thread->callee_saved.rflags = EFLAGS_INITIAL;
/* Parameters to entry point, which is populated in
* thread->callee_saved.rip
*/
thread->arch.rdi = (long) entry;
thread->arch.rsi = (long) parameter1;
thread->arch.rdx = (long) parameter2;
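
With arch_new_thread() now choosing the switch entry point, the standard kernel API for user threads works on x86-64. A usage sketch (standard Zephyr API of this era; the stack size, priority, and object names are illustrative):

#include <zephyr.h>

K_THREAD_STACK_DEFINE(user_stack, 1024);
static struct k_thread user_thread;

static void user_fn(void *p1, void *p2, void *p3)
{
	/* Runs at CPL 3; system calls enter via z_x86_syscall_entry_stub */
}

void start_user_thread(void)
{
	k_thread_create(&user_thread, user_stack,
			K_THREAD_STACK_SIZEOF(user_stack),
			user_fn, NULL, NULL, NULL,
			K_PRIO_PREEMPT(7), K_USER, K_NO_WAIT);
}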


@@ -0,0 +1,234 @@
/*
* Copyright (c) 2017 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <arch/cpu.h>
#include <offsets_short.h>
#include <syscall.h>
/* Landing site for 'syscall' instruction
*
* Call id is in RAX
* Arguments are in RDI, RSI, RDX, R10, R8, R9
* Return address stored by CPU in RCX
* User RFLAGS stored by CPU in R11
* Current RFLAGS has been masked with ~X86_FMASK_MSR
*/
.global z_x86_syscall_entry_stub
z_x86_syscall_entry_stub:
swapgs
/* Switch to the privilege mode stack pointer stored in
* x86_tss64.psp and store the user mode stack pointer in
* x86_tss64.usp, immediately pushing it once the stack switch
* is done since this is a per-cpu and not per-thread area.
*
* This dance is necessary as upon entry we have no free registers
* nor a stack we can push to.
*/
movq %rsp, %gs:__x86_tss64_t_usp_OFFSET
movq %gs:__x86_tss64_t_psp_OFFSET, %rsp
pushq %gs:__x86_tss64_t_usp_OFFSET
sti /* re-enable interrupts */
/* call_id is in RAX. bounds-check it, must be less than
* K_SYSCALL_LIMIT.
*/
cmp $K_SYSCALL_LIMIT, %rax
jae _bad_syscall
_id_ok:
#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
/* Prevent speculation with bogus system call IDs */
lfence
#endif
/* Remaining registers not involved in the syscall operation are
* RBX, RBP, R12-R15, plus floating point / SIMD registers.
*
* We save the caller-saved registers so we can restore their original
* values when we execute 'sysretq' at the end.
*/
pushq %rdi
subq $X86_FXSAVE_SIZE, %rsp
fxsave (%rsp)
pushq %rsi
pushq %rdx
pushq %r8
pushq %r9
pushq %r10
pushq %r11 /* RFLAGS */
pushq %rcx /* Return address stored by 'syscall' */
pushq %rsp /* SSF parameter */
/* All other args are in the right registers, except arg4 which
* we had to put in r10 instead of RCX
*/
movq %r10, %rcx
/* from the call ID in RAX, load R10 with the actual function pointer
* to call by looking it up in the system call dispatch table
*/
xorq %r11, %r11
movq _k_syscall_table(%r11, %rax, 8), %r10
/* Run the marshal function, which is some entry in _k_syscall_table */
call *%r10
/* RAX now contains the return value
*
* Callee-saved registers are untouched from their original values per the
* C calling convention, but sensitive data may lurk in caller-saved regs
* RDI, RSI, RDX, R8, R9, R10, XMM* after we have serviced the system
* call. We saved them earlier; restore their original values from when
* the syscall was made. This also preserves these registers if they
* were not used as arguments.
*
* We also can't have RCX and R11 clobbered as we need the original
* values to successfully 'sysretq'.
*/
addq $8, %rsp /* Discard ssf */
popq %rcx /* Restore return address for 'sysretq' */
popq %r11 /* Restore RFLAGS for 'sysretq' */
popq %r10
popq %r9
popq %r8
popq %rdx
popq %rsi
fxrstor (%rsp)
addq $X86_FXSAVE_SIZE, %rsp
popq %rdi
/* Restore user stack pointer */
popq %rsp
/* Return to user mode, locking interrupts as the normal interrupt
* handling path will get very confused if an interrupt occurs between
* 'swapgs' and 'sysretq'
*/
cli
swapgs
sysretq
_bad_syscall:
/* RAX had a bogus syscall value in it, replace with the bad syscall
* handler's ID, and put the bad ID as its first argument.
*
* TODO: On this and all other arches, simply immediately return
* with -ENOSYS, once all syscalls have a return value
*/
movq %rax, %rdi
movq $K_SYSCALL_BAD, %rax
jmp _id_ok
/*
* size_t arch_user_string_nlen(const char *s, size_t maxsize, int *err_arg)
* ^ RDI ^ RSI ^ RDX
*/
.global arch_user_string_nlen
arch_user_string_nlen:
/* Initial error value, strlen_done adjusts this if we succeed */
movl $-1, %r8d
/* use RAX as our length count (this function's return value) */
xor %rax, %rax
/* This code might page fault */
strlen_loop:
.global z_x86_user_string_nlen_fault_start
z_x86_user_string_nlen_fault_start:
cmpb $0x0, (%rdi, %rax, 1) /* *(RDI + RAX) == 0? Could fault. */
.global z_x86_user_string_nlen_fault_end
z_x86_user_string_nlen_fault_end:
je strlen_done
cmp %rsi, %rax /* Max length reached? */
je strlen_done
inc %rax /* RAX++ and loop again */
jmp strlen_loop
strlen_done:
/* Set error value to 0 since we succeeded */
xorl %r8d, %r8d
.global z_x86_user_string_nlen_fixup
z_x86_user_string_nlen_fixup:
/* Write error value to 32-bit integer err pointer parameter */
movl %r8d, (%rdx)
retq
/*
* Trampoline function to put the p3 parameter in the register expected
* by the calling convention; we couldn't use RCX when we called 'sysretq'
*/
z_x86_userspace_landing_site:
/* Place argument 4 in the correct position */
movq %r10, %rcx
call z_thread_entry
/* FUNC_NORETURN void z_x86_userspace_enter(
* k_thread_entry_t user_entry, <- RDI
* void *p1, void *p2, void *p3, <- RSI, RDX, RCX
* uintptr_t stack_end, <- R8
* uintptr_t stack_start) <- R9
*
* A one-way trip to userspace.
*/
.global z_x86_userspace_enter
z_x86_userspace_enter:
/* RCX is sysret return address, pass along p3 in r10,
* z_x86_userspace_landing_site will fix this up
*/
movq %rcx, %r10
/* switch to the privilege mode stack so we can erase the thread stack
* buffer; the privilege stack is the page immediately before the thread stack
*/
movq %r9, %rsp
/* Need RDI temporarily */
pushq %rdi
/* Compute size of user stack in 8-byte chunks and put in RCX */
movq %r9, %rdi /* Start address for rep stosq in RDI */
movq %r8, %rcx /* Ending address */
subq %rdi, %rcx /* Subtract starting address */
shrq $3, %rcx /* Divide by 8 */
movq $0xAAAAAAAAAAAAAAAA, %rax /* Fill value */
/* Fill 8 bytes of memory at a time, starting at ES:RDI, with whatever
* is in RAX. Repeat this RCX times. Stack sizes are always at least
* 8-byte aligned.
*/
cld
rep stosq
popq %rdi
/* Reset to the beginning of the user stack */
movq %r8, %rsp
/* set sysret entry point */
movq $z_x86_userspace_landing_site, %rcx
/* Copy RFLAGS into r11, required by sysret */
pushfq
movq (%rsp), %r11
movq $0, (%rsp) /* Now a debugger-friendly return address */
/* cleanse other registers */
xorq %rax, %rax
xorq %rbx, %rbx
xorq %rbp, %rbp
xorq %r12, %r12
xorq %r13, %r13
xorq %r14, %r14
xorq %r15, %r15
cli
swapgs
sysretq
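
For readers, a C-level equivalent of arch_user_string_nlen() above (a sketch only: the real routine must be assembly so the page fault handler can recognize faults between the fault_start/fault_end labels and resume at the fixup with the error flagged):

#include <stddef.h>

static size_t user_string_nlen_sketch(const char *s, size_t maxsize, int *err)
{
	size_t len = 0;

	/* Each read of s[len] may fault on a bad user pointer; the fixup
	 * path would then store -1 through err and return early.
	 */
	while (s[len] != '\0' && len != maxsize) {
		len++;
	}
	*err = 0; /* success */
	return len;
}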


@@ -23,9 +23,17 @@ GEN_OFFSET_SYM(_thread_arch_t, r9);
GEN_OFFSET_SYM(_thread_arch_t, r10);
GEN_OFFSET_SYM(_thread_arch_t, r11);
GEN_OFFSET_SYM(_thread_arch_t, sse);
#ifdef CONFIG_USERSPACE
GEN_OFFSET_SYM(_thread_arch_t, ss);
GEN_OFFSET_SYM(_thread_arch_t, cs);
#endif
GEN_OFFSET_SYM(x86_tss64_t, ist1);
GEN_OFFSET_SYM(x86_tss64_t, cpu);
#ifdef CONFIG_USERSPACE
GEN_OFFSET_SYM(x86_tss64_t, psp);
GEN_OFFSET_SYM(x86_tss64_t, usp);
#endif
GEN_ABSOLUTE_SYM(__X86_TSS64_SIZEOF, sizeof(x86_tss64_t));
GEN_OFFSET_SYM(x86_cpuboot_t, ready);


@@ -65,4 +65,10 @@
#define _thread_offset_to_sse \
(___thread_t_arch_OFFSET + ___thread_arch_t_sse_OFFSET)
#define _thread_offset_to_ss \
(___thread_t_arch_OFFSET + ___thread_arch_t_ss_OFFSET)
#define _thread_offset_to_cs \
(___thread_t_arch_OFFSET + ___thread_arch_t_cs_OFFSET)
#endif /* ZEPHYR_ARCH_X86_INCLUDE_INTEL64_OFFSETS_SHORT_ARCH_H_ */


@@ -59,6 +59,19 @@ struct x86_esf {
typedef struct x86_esf z_arch_esf_t;
struct x86_ssf {
unsigned long rip;
unsigned long rflags;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long rdx;
unsigned long rsi;
char fxsave[X86_FXSAVE_SIZE];
unsigned long rdi;
unsigned long rsp;
};
#define ARCH_EXCEPT(reason_p) do { \
__asm__ volatile( \
"movq %[reason], %%rax\n\t" \


@@ -20,6 +20,11 @@
#define X86_USER_DS 0x30 /* 64-bit user mode data */
#define X86_USER_CS 0x38 /* 64-bit user mode code */
/* Value programmed into bits 63:32 of STAR MSR with proper segment
* descriptors for implementing user mode with syscall/sysret
*/
#define X86_STAR_UPPER ((X86_USER_CS_32 << 16) | X86_KERNEL_CS)
#define X86_KERNEL_CPU0_TR 0x40 /* 64-bit task state segment */
#define X86_KERNEL_CPU1_TR 0x50 /* 64-bit task state segment */
#define X86_KERNEL_CPU2_TR 0x60 /* 64-bit task state segment */
@@ -73,6 +78,13 @@ struct x86_tss64 {
*/
struct _cpu *cpu;
#ifdef CONFIG_USERSPACE
/* Privilege mode stack pointer value when doing a system call */
char *psp;
/* Storage area for user mode stack pointer when doing a syscall */
char *usp;
#endif
} __packed __aligned(8);
typedef struct x86_tss64 x86_tss64_t;
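
The psp/usp members above exist because 'syscall' leaves no free registers and no kernel stack to work with. A toy C model of the stack dance at the top of z_x86_syscall_entry_stub (hypothetical names; the real code must be assembly, since C cannot run without a valid stack):

#include <stdint.h>

struct tss_sketch {
	char *psp; /* privilege mode stack, preset per current thread */
	char *usp; /* scratch slot for the incoming user stack pointer */
};

static uint64_t *syscall_stack_switch(struct tss_sketch *tss,
				      uint64_t *user_sp)
{
	uint64_t *ksp;

	tss->usp = (char *)user_sp;           /* movq %rsp, %gs:usp */
	ksp = (uint64_t *)tss->psp;           /* movq %gs:psp, %rsp */
	*--ksp = (uint64_t)user_sp;           /* pushq %gs:usp: the user SP
					       * now lives on the kernel stack,
					       * freeing the per-CPU slot before
					       * interrupts are re-enabled */
	return ksp;
}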
@@ -101,6 +113,23 @@ typedef struct _callee_saved _callee_saved_t;
struct _thread_arch {
u8_t flags;
#ifdef CONFIG_USERSPACE
/* Pointer to page tables used by this thread. Supervisor threads
* always use the kernel's page tables; user threads use per-thread
* tables stored in the stack object
*/
struct x86_page_tables *ptables;
/* Initial privilege mode stack pointer when doing a system call.
* Unset for supervisor threads.
*/
char *psp;
/* SS and CS selectors for this thread when restoring context */
u64_t ss;
u64_t cs;
#endif
u64_t rax;
u64_t rcx;
u64_t rdx;