From 3d8020802561b12b3bc425f970f493eb6697a24e Mon Sep 17 00:00:00 2001
From: Andrew Boie
Date: Wed, 18 Dec 2019 14:30:41 -0800
Subject: [PATCH] x86: implement user mode on 64-bit

- In early boot, enable the syscall instruction and set up necessary MSRs
- Add a hook to update page tables on context switch
- Properly initialize thread based on whether it will start in
  user or supervisor mode
- Add landing function for system calls to execute the desired handler
- Implement arch_user_string_nlen()
- Implement logic for dropping a thread down to user mode
- Reserve per-CPU storage space for user and privilege elevation stack
  pointers, necessary for handling syscalls when no free registers are
  available
- Proper handling of gs register considerations when transitioning
  privilege levels

Kernel page table isolation (KPTI) is not yet implemented.

Signed-off-by: Andrew Boie
---
 arch/x86/Kconfig                              |   7 +-
 arch/x86/core/intel64.cmake                   |   2 +
 arch/x86/core/intel64/fatal.c                 |  11 +
 arch/x86/core/intel64/locore.S                |  90 ++++++-
 arch/x86/core/intel64/thread.c                |  21 +-
 arch/x86/core/intel64/userspace.S             | 234 ++++++++++++++++++
 arch/x86/core/offsets/intel64_offsets.c       |   8 +
 arch/x86/include/intel64/offsets_short_arch.h |   6 +
 include/arch/x86/intel64/arch.h               |  13 +
 include/arch/x86/intel64/thread.h             |  29 +++
 10 files changed, 406 insertions(+), 15 deletions(-)
 create mode 100644 arch/x86/core/intel64/userspace.S

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a33d3ca2f33..64869cc6385 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -17,14 +17,14 @@ config CPU_ATOM
 	bool
 	select CPU_HAS_FPU
 	select ARCH_HAS_STACK_PROTECTION if X86_MMU
-	select ARCH_HAS_USERSPACE if X86_MMU && !X86_64
+	select ARCH_HAS_USERSPACE if X86_MMU
 	help
 	  This option signifies the use of a CPU from the Atom family.
 
 config CPU_MINUTEIA
 	bool
 	select ARCH_HAS_STACK_PROTECTION if X86_MMU
-	select ARCH_HAS_USERSPACE if X86_MMU && !X86_64
+	select ARCH_HAS_USERSPACE if X86_MMU
 	help
 	  This option signifies the use of a CPU from the Minute IA family.
 
@@ -32,7 +32,7 @@ config CPU_APOLLO_LAKE
 	bool
 	select CPU_HAS_FPU
 	select ARCH_HAS_STACK_PROTECTION if X86_MMU
-	select ARCH_HAS_USERSPACE if X86_MMU && !X86_64
+	select ARCH_HAS_USERSPACE if X86_MMU
 	help
 	  This option signifies the use of a CPU from the Apollo Lake family.
 
@@ -269,6 +269,7 @@ config X86_KPTI
 	default y
 	depends on USERSPACE
 	depends on !X86_NO_MELTDOWN
+	depends on !X86_64
 	help
 	  Implements kernel page table isolation to mitigate Meltdown exploits
 	  to read Kernel RAM. Incurs a significant performance cost for
diff --git a/arch/x86/core/intel64.cmake b/arch/x86/core/intel64.cmake
index b6227d2ecde..5fc04a49699 100644
--- a/arch/x86/core/intel64.cmake
+++ b/arch/x86/core/intel64.cmake
@@ -15,3 +15,5 @@ zephyr_library_sources(
 	intel64/thread.c
 	intel64/fatal.c
 )
+
+zephyr_library_sources_ifdef(CONFIG_USERSPACE intel64/userspace.S)
diff --git a/arch/x86/core/intel64/fatal.c b/arch/x86/core/intel64/fatal.c
index 7310b979edc..15580fa2215 100644
--- a/arch/x86/core/intel64/fatal.c
+++ b/arch/x86/core/intel64/fatal.c
@@ -24,3 +24,14 @@ void z_x86_exception(z_arch_esf_t *esf)
 		CODE_UNREACHABLE;
 	}
 }
+
+#ifdef CONFIG_USERSPACE
+void arch_syscall_oops(void *ssf_ptr)
+{
+	struct x86_ssf *ssf = ssf_ptr;
+
+	LOG_ERR("Bad system call from RIP 0x%lx", ssf->rip);
+
+	z_x86_fatal_error(K_ERR_KERNEL_OOPS, NULL);
+}
+#endif /* CONFIG_USERSPACE */
diff --git a/arch/x86/core/intel64/locore.S b/arch/x86/core/intel64/locore.S
index 3df456f048a..8e5ef3d6e98 100644
--- a/arch/x86/core/intel64/locore.S
+++ b/arch/x86/core/intel64/locore.S
@@ -115,9 +115,9 @@ go64:	movl %cr4, %eax		/* enable PAE and SSE */
 #endif
 	movl %eax, %cr3
 
-	movl $X86_EFER_MSR, %ecx	/* enable long mode and no-execute */
+	movl $X86_EFER_MSR, %ecx	/* enable long mode, no-execute, syscall */
 	rdmsr
-	orl $(X86_EFER_MSR_LME | X86_EFER_MSR_NXE), %eax
+	orl $(X86_EFER_MSR_LME | X86_EFER_MSR_NXE | X86_EFER_MSR_SCE), %eax
 	wrmsr
 
 	movl %cr0, %eax			/* enable paging */
@@ -169,6 +169,30 @@ go64:	movl %cr4, %eax		/* enable PAE and SSE */
 	call z_loapic_enable
 #endif
 
+#ifdef CONFIG_USERSPACE
+	/* Set landing site for system calls made with 'syscall' instruction */
+	movq $z_x86_syscall_entry_stub, %rax
+	movq %rax, %rdx
+	shrq $32, %rdx
+	movl $X86_LSTAR_MSR, %ecx
+	/* LSTAR set to 64-bit address denoted by EDX:EAX */
+	wrmsr
+
+	/* Set segment descriptors in STAR */
+	xorl %eax, %eax			/* Zero low bits, reserved */
+	movl $X86_STAR_UPPER, %edx
+	movl $X86_STAR_MSR, %ecx
+	wrmsr
+
+	/* Set EFLAGS mask applied when making system calls. Currently we
+	 * mask interrupts and clear direction flag.
+	 */
+	movl $0x600, %eax
+	xorl %edx, %edx
+	movl $X86_FMASK_MSR, %ecx
+	wrmsr
+#endif /* CONFIG_USERSPACE */
+
 	/* don't replace CALL with JMP; honor the ABI stack alignment! */
 
 	incl __x86_cpuboot_t_ready_OFFSET(%rbp)
@@ -222,7 +246,13 @@ z_x86_switch:
 	movq %r13, _thread_offset_to_r13(%rsi)
 	movq %r14, _thread_offset_to_r14(%rsi)
 	movq %r15, _thread_offset_to_r15(%rsi)
-
+#ifdef CONFIG_USERSPACE
+	/* We're always in supervisor mode if we get here, the other case
+	 * is when __resume is invoked from irq_dispatch
+	 */
+	movq $X86_KERNEL_CS, _thread_offset_to_cs(%rsi)
+	movq $X86_KERNEL_DS, _thread_offset_to_ss(%rsi)
+#endif
 	movq %gs:__x86_tss64_t_ist1_OFFSET, %rsp
 
 	/* fall through to __resume */
@@ -234,10 +264,23 @@ z_x86_switch:
 	 */
 
__resume:
+#ifdef CONFIG_USERSPACE
+	pushq %rdi	/* Caller-saved, stash it */
+	call z_x86_swap_update_page_tables
+	popq %rdi
+
+	/* Set up exception return stack frame */
+	pushq _thread_offset_to_ss(%rdi)	/* SS */
+#else
 	pushq $X86_KERNEL_DS			/* SS */
+#endif
 	pushq _thread_offset_to_rsp(%rdi)	/* RSP */
 	pushq _thread_offset_to_rflags(%rdi)	/* RFLAGS */
+#ifdef CONFIG_USERSPACE
+	pushq _thread_offset_to_cs(%rdi)	/* CS */
+#else
 	pushq $X86_KERNEL_CS			/* CS */
+#endif
 	pushq _thread_offset_to_rip(%rdi)	/* RIP */
 
 	movq _thread_offset_to_rbx(%rdi), %rbx
@@ -261,6 +304,13 @@ __resume:
 	movq _thread_offset_to_r11(%rdi), %r11
 	movq _thread_offset_to_rdi(%rdi), %rdi	/* do last :-) */
 
+#ifdef CONFIG_USERSPACE
+	/* Swap GS register values if we are returning to user mode */
+	testb $0x3, 8(%rsp)
+	jz 1f
+	swapgs
+#endif /* CONFIG_USERSPACE */
+
 1:	iretq
 
@@ -273,7 +323,13 @@ except: /*
 	 * finish struct NANO_ESF on stack. 'vector' .. 'ss' are
 	 * already there from hardware trap and EXCEPT_*() stub.
 	 */
-
+#ifdef CONFIG_USERSPACE
+	/* Swap GS register values if we came in from user mode */
+	testb $0x3, 24(%rsp)
+	jz 1f
+	swapgs
+1:
+#endif /* CONFIG_USERSPACE */
 	pushq %r15
 	subq $X86_FXSAVE_SIZE, %rsp
 	fxsave (%rsp)
@@ -323,6 +379,15 @@ except: /*
 	/* Drop the vector/err code pushed by the HW or EXCEPT_*() stub */
 	add $16, %rsp
 
+#ifdef CONFIG_USERSPACE
+	/* Swap GS register values if we are returning to user mode */
+	testb $0x3, 8(%rsp)
+	jz 1f
+	cli
+	swapgs
+1:
+#endif /* CONFIG_USERSPACE */
+
 	iretq
 
 EXCEPT ( 0); EXCEPT ( 1); EXCEPT ( 2); EXCEPT ( 3)
@@ -356,6 +421,13 @@ EXCEPT(Z_X86_OOPS_VECTOR);
 .globl x86_irq_args	/* .. for these definitions */
 
 irq:
+#ifdef CONFIG_USERSPACE
+	/* Swap GS register values if we came in from user mode */
+	testb $0x3, 16(%rsp)
+	jz 1f
+	swapgs
+1:
+#endif /* CONFIG_USERSPACE */
 	pushq %rsi
 	movq %gs:__x86_tss64_t_cpu_OFFSET, %rsi
 
@@ -422,12 +494,18 @@ irq_enter_unnested: /* Not nested: dump state to thread struct for __resume */
 	popq %rcx	/* vector number */
 	popq %rax	/* RIP */
 	movq %rax, _thread_offset_to_rip(%rsi)
-	popq %rax	/* CS: discard */
+	popq %rax	/* CS */
+#ifdef CONFIG_USERSPACE
+	movq %rax, _thread_offset_to_cs(%rsi)
+#endif
 	popq %rax	/* RFLAGS */
 	movq %rax, _thread_offset_to_rflags(%rsi)
 	popq %rax	/* RSP */
 	movq %rax, _thread_offset_to_rsp(%rsi)
-	popq %rax	/* SS: discard */
+	popq %rax	/* SS */
+#ifdef CONFIG_USERSPACE
+	movq %rax, _thread_offset_to_ss(%rsi)
+#endif
 
 irq_dispatch:
 	movq x86_irq_funcs(,%rcx,8), %rbx
diff --git a/arch/x86/core/intel64/thread.c b/arch/x86/core/intel64/thread.c
index 44d284ed8a1..0a34bf6b371 100644
--- a/arch/x86/core/intel64/thread.c
+++ b/arch/x86/core/intel64/thread.c
@@ -15,27 +15,36 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
 		     void *parameter1, void *parameter2, void *parameter3,
 		     int priority, unsigned int options)
 {
-#if defined(CONFIG_X86_USERSPACE) || defined(CONFIG_X86_STACK_PROTECTION)
-	struct z_x86_thread_stack_header *header =
-		(struct z_x86_thread_stack_header *)stack;
-#endif
+	void *switch_entry;
 
 	Z_ASSERT_VALID_PRIO(priority, entry);
 	z_new_thread_init(thread, Z_THREAD_STACK_BUFFER(stack), stack_size,
 			  priority, options);
 
 #if CONFIG_X86_STACK_PROTECTION
+	struct z_x86_thread_stack_header *header =
+		(struct z_x86_thread_stack_header *)stack;
+
 	/* Set guard area to read-only to catch stack overflows */
 	z_x86_mmu_set_flags(&z_x86_kernel_ptables, &header->guard_page,
 			    MMU_PAGE_SIZE, MMU_ENTRY_READ, Z_X86_MMU_RW,
 			    true);
 #endif
-
+#ifdef CONFIG_USERSPACE
+	switch_entry = z_x86_userspace_prepare_thread(thread);
+	thread->arch.cs = X86_KERNEL_CS;
+	thread->arch.ss = X86_KERNEL_DS;
+#else
+	switch_entry = z_thread_entry;
+#endif
 	thread->callee_saved.rsp = (long) Z_THREAD_STACK_BUFFER(stack);
 	thread->callee_saved.rsp += (stack_size - 8); /* fake RIP for ABI */
-	thread->callee_saved.rip = (long) z_thread_entry;
+	thread->callee_saved.rip = (long) switch_entry;
 	thread->callee_saved.rflags = EFLAGS_INITIAL;
 
+	/* Parameters to entry point, which is populated in
+	 * thread->callee_saved.rip
+	 */
 	thread->arch.rdi = (long) entry;
 	thread->arch.rsi = (long) parameter1;
 	thread->arch.rdx = (long) parameter2;
diff --git a/arch/x86/core/intel64/userspace.S b/arch/x86/core/intel64/userspace.S
new file mode 100644
index 00000000000..86939894811
--- /dev/null
+++ b/arch/x86/core/intel64/userspace.S
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2017 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include
+#include
+#include
+
+
+/* Landing site for 'syscall' instruction
+ *
+ * Call id is in RAX
+ * Arguments are in RDI, RSI, RDX, R10, R8, R9
+ * Return address stored by CPU in RCX
+ * User RFLAGS stored by CPU in R11
+ * Current RFLAGS has been masked with ~X86_FMASK_MSR
+ */
+.global z_x86_syscall_entry_stub
+z_x86_syscall_entry_stub:
+	swapgs
+
+	/* Switch to the privilege mode stack pointer stored in
+	 * x86_tss64.psp and store the user mode stack pointer in
+	 * x86_tss64.usp, immediately pushing it once the stack switch
+	 * is done since this is a per-cpu and not per-thread area.
+	 *
+	 * This dance is necessary as upon entry we have no free registers
+	 * nor a stack we can push to.
+	 */
+	movq %rsp, %gs:__x86_tss64_t_usp_OFFSET
+	movq %gs:__x86_tss64_t_psp_OFFSET, %rsp
+	pushq %gs:__x86_tss64_t_usp_OFFSET
+
+	sti			/* re-enable interrupts */
+
+	/* call_id is in RAX. Bounds-check it, must be less than
+	 * K_SYSCALL_LIMIT.
+	 */
+	cmp $K_SYSCALL_LIMIT, %rax
+	jae _bad_syscall
+
+_id_ok:
+#ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION
+	/* Prevent speculation with bogus system call IDs */
+	lfence
+#endif
+
+	/* Remaining registers not involved in the syscall operation are
+	 * RBX, RBP, R12-R15, plus floating point / SIMD registers.
+	 *
+	 * We save caller-saved registers so we can restore them to their
+	 * original values when we call 'sysretq' at the end.
+	 */
+	pushq %rdi
+	subq $X86_FXSAVE_SIZE, %rsp
+	fxsave (%rsp)
+	pushq %rsi
+	pushq %rdx
+	pushq %r8
+	pushq %r9
+	pushq %r10
+	pushq %r11	/* RFLAGS */
+	pushq %rcx	/* Return address stored by 'syscall' */
+	pushq %rsp	/* SSF parameter */
+
+	/* All other args are in the right registers, except arg4 which
+	 * we had to put in r10 instead of RCX
+	 */
+	movq %r10, %rcx
+
+	/* From the call ID in RAX, load R10 with the actual function pointer
+	 * to call by looking it up in the system call dispatch table
+	 */
+	xorq %r11, %r11
+	movq _k_syscall_table(%r11, %rax, 8), %r10
+
+	/* Run the marshal function, which is some entry in _k_syscall_table */
+	call *%r10
+
+	/* RAX now contains the return value
+	 *
+	 * Callee-saved registers are un-touched from original values per C
+	 * calling convention, but sensitive data may lurk in caller-saved regs
+	 * RDI, RSI, RDX, R8, R9, R10, XMM* after we have serviced the system
+	 * call. We saved them earlier; restore their original values from when
+	 * the syscall was made. This also preserves these registers if they
+	 * were not used as arguments.
+	 *
+	 * We also can't have RCX and R11 clobbered as we need the original
+	 * values to successfully 'sysretq'.
+	 */
+	addq $8, %rsp	/* Discard ssf */
+	popq %rcx	/* Restore return address for 'sysretq' */
+	popq %r11	/* Restore RFLAGS for 'sysretq' */
+	popq %r10
+	popq %r9
+	popq %r8
+	popq %rdx
+	popq %rsi
+	fxrstor (%rsp)
+	addq $X86_FXSAVE_SIZE, %rsp
+	popq %rdi
+
+	/* Restore user stack pointer */
+	popq %rsp
+
+	/* Return to user mode, locking interrupts as the normal interrupt
+	 * handling path will get very confused if it occurs between
+	 * 'swapgs' and 'sysretq'
+	 */
+	cli
+	swapgs
+	sysretq
+
+_bad_syscall:
+	/* RAX had a bogus syscall value in it, replace with the bad syscall
+	 * handler's ID, and put the bad ID as its first argument.
+	 *
+	 * TODO: On this and all other arches, simply immediately return
+	 * with -ENOSYS, once all syscalls have a return value
+	 */
+	movq %rax, %rdi
+	movq $K_SYSCALL_BAD, %rax
+	jmp _id_ok
+
+/*
+ * size_t arch_user_string_nlen(const char *s, size_t maxsize, int *err_arg)
+ *                                     ^ RDI         ^ RSI         ^ RDX
+ */
+.global arch_user_string_nlen
+arch_user_string_nlen:
+	/* Initial error value, strlen_done adjusts this if we succeed */
+	movl $-1, %r8d
+
+	/* use RAX as our length count (this function's return value) */
+	xor %rax, %rax
+
+	/* This code might page fault */
+strlen_loop:
+.global z_x86_user_string_nlen_fault_start
+z_x86_user_string_nlen_fault_start:
+	cmpb $0x0, (%rdi, %rax, 1)	/* *(RDI + RAX) == 0? Could fault. */
+
+.global z_x86_user_string_nlen_fault_end
+z_x86_user_string_nlen_fault_end:
+	je strlen_done
+	cmp %rsi, %rax		/* Max length reached? */
+	je strlen_done
+	inc %rax		/* RAX++ and loop again */
+	jmp strlen_loop
+
+strlen_done:
+	/* Set error value to 0 since we succeeded */
+	xorl %r8d, %r8d
+
+.global z_x86_user_string_nlen_fixup
+z_x86_user_string_nlen_fixup:
+	/* Write error value to 32-bit integer err pointer parameter */
+	movl %r8d, (%rdx)
+	retq
+
+/*
+ * Trampoline function to put the p3 parameter in the register expected
+ * by the calling convention; we couldn't use RCX when we called 'sysret'
+ */
+z_x86_userspace_landing_site:
+	/* Place argument 4 in the correct position */
+	movq %r10, %rcx
+	call z_thread_entry
+
+/* FUNC_NORETURN void z_x86_userspace_enter(
+ *		k_thread_entry_t user_entry,	<- RDI
+ *		void *p1, void *p2, void *p3,	<- RSI, RDX, RCX
+ *		uintptr_t stack_end,		<- R8
+ *		uintptr_t stack_start)		<- R9
+ *
+ * A one-way trip to userspace.
+ */
+.global z_x86_userspace_enter
+z_x86_userspace_enter:
+	/* RCX is the sysret return address; pass along p3 in r10,
+	 * z_x86_userspace_landing_site will fix this up
+	 */
+	movq %rcx, %r10
+
+	/* switch to privilege mode stack so we can erase the thread stack
+	 * buffer; the buffer is the page immediately before the thread stack
+	 */
+	movq %r9, %rsp
+
+	/* Need RDI temporarily */
+	pushq %rdi
+
+	/* Compute size of user stack in 8-byte chunks and put in RCX */
+	movq %r9, %rdi	/* Start address for rep stosq in RDI */
+	movq %r8, %rcx	/* Ending address */
+	subq %rdi, %rcx	/* Subtract starting address */
+	shrq $3, %rcx	/* Divide by 8 */
+
+	movq $0xAAAAAAAAAAAAAAAA, %rax	/* Fill value */
+	/* Copy 8 bytes of memory at a time, starting at ES:RDI, with whatever
+	 * is in RAX. Repeat this RCX times. Stack sizes are always at least
+	 * 8-byte aligned.
+	 */
+	cld
+	rep stosq
+
+	popq %rdi
+
+	/* Reset to the beginning of the user stack */
+	movq %r8, %rsp
+
+	/* set sysret entry point */
+	movq $z_x86_userspace_landing_site, %rcx
+
+	/* Copy RFLAGS into r11, required by sysret */
+	pushfq
+	movq (%rsp), %r11
+	movq $0, (%rsp)	/* Now a debugger-friendly return address */
+
+	/* cleanse other registers */
+	xorq %rax, %rax
+	xorq %rbx, %rbx
+	xorq %rbp, %rbp
+	xorq %r12, %r12
+	xorq %r13, %r13
+	xorq %r14, %r14
+	xorq %r15, %r15
+
+	cli
+	swapgs
+	sysretq
diff --git a/arch/x86/core/offsets/intel64_offsets.c b/arch/x86/core/offsets/intel64_offsets.c
index 308b69f4adb..398d4bf4659 100644
--- a/arch/x86/core/offsets/intel64_offsets.c
+++ b/arch/x86/core/offsets/intel64_offsets.c
@@ -23,9 +23,17 @@ GEN_OFFSET_SYM(_thread_arch_t, r9);
 GEN_OFFSET_SYM(_thread_arch_t, r10);
 GEN_OFFSET_SYM(_thread_arch_t, r11);
 GEN_OFFSET_SYM(_thread_arch_t, sse);
+#ifdef CONFIG_USERSPACE
+GEN_OFFSET_SYM(_thread_arch_t, ss);
+GEN_OFFSET_SYM(_thread_arch_t, cs);
+#endif
 
 GEN_OFFSET_SYM(x86_tss64_t, ist1);
 GEN_OFFSET_SYM(x86_tss64_t, cpu);
+#ifdef CONFIG_USERSPACE
+GEN_OFFSET_SYM(x86_tss64_t, psp);
+GEN_OFFSET_SYM(x86_tss64_t, usp);
+#endif
 GEN_ABSOLUTE_SYM(__X86_TSS64_SIZEOF, sizeof(x86_tss64_t));
 
 GEN_OFFSET_SYM(x86_cpuboot_t, ready);
diff --git a/arch/x86/include/intel64/offsets_short_arch.h b/arch/x86/include/intel64/offsets_short_arch.h
index bef3fc6d2fe..4252ac687db 100644
--- a/arch/x86/include/intel64/offsets_short_arch.h
+++ b/arch/x86/include/intel64/offsets_short_arch.h
@@ -65,4 +65,10 @@
 #define _thread_offset_to_sse \
 	(___thread_t_arch_OFFSET + ___thread_arch_t_sse_OFFSET)
 
+#define _thread_offset_to_ss \
+	(___thread_t_arch_OFFSET + ___thread_arch_t_ss_OFFSET)
+
+#define _thread_offset_to_cs \
+	(___thread_t_arch_OFFSET + ___thread_arch_t_cs_OFFSET)
+
 #endif /* ZEPHYR_ARCH_X86_INCLUDE_INTEL64_OFFSETS_SHORT_ARCH_H_ */
diff --git a/include/arch/x86/intel64/arch.h b/include/arch/x86/intel64/arch.h
index 78e23d6fefa..988208823fb 100644
--- a/include/arch/x86/intel64/arch.h
+++ b/include/arch/x86/intel64/arch.h
@@ -59,6 +59,19 @@ struct x86_esf {
 
 typedef struct x86_esf z_arch_esf_t;
 
+struct x86_ssf {
+	unsigned long rip;
+	unsigned long rflags;
+	unsigned long r10;
+	unsigned long r9;
+	unsigned long r8;
+	unsigned long rdx;
+	unsigned long rsi;
+	char fxsave[X86_FXSAVE_SIZE];
+	unsigned long rdi;
+	unsigned long rsp;
+};
+
 #define ARCH_EXCEPT(reason_p) do { \
 	__asm__ volatile( \
 		"movq %[reason], %%rax\n\t" \
diff --git a/include/arch/x86/intel64/thread.h b/include/arch/x86/intel64/thread.h
index 0b38da0f7c2..f5fb48e39fc 100644
--- a/include/arch/x86/intel64/thread.h
+++ b/include/arch/x86/intel64/thread.h
@@ -20,6 +20,11 @@
 #define X86_USER_DS		0x30	/* 64-bit user mode data */
 #define X86_USER_CS		0x38	/* 64-bit user mode code */
 
+/* Value programmed into bits 63:32 of STAR MSR with proper segment
+ * descriptors for implementing user mode with syscall/sysret
+ */
+#define X86_STAR_UPPER		((X86_USER_CS_32 << 16) | X86_KERNEL_CS)
+
 #define X86_KERNEL_CPU0_TR	0x40	/* 64-bit task state segment */
 #define X86_KERNEL_CPU1_TR	0x50	/* 64-bit task state segment */
 #define X86_KERNEL_CPU2_TR	0x60	/* 64-bit task state segment */
@@ -73,6 +78,13 @@ struct x86_tss64 {
 	 */
 	struct _cpu *cpu;
 
+#ifdef CONFIG_USERSPACE
+	/* Privilege mode stack pointer value when doing a system call */
+	char *psp;
+
+	/* Storage area for user mode stack pointer when doing a syscall */
+	char *usp;
+#endif
 } __packed __aligned(8);
 
 typedef struct x86_tss64 x86_tss64_t;
@@ -101,6 +113,23 @@ typedef struct _callee_saved _callee_saved_t;
 struct _thread_arch {
 	u8_t flags;
 
+#ifdef CONFIG_USERSPACE
+	/* Pointer to page tables used by this thread. Supervisor threads
+	 * always use the kernel's page tables; user threads use per-thread
+	 * tables stored in the stack object
+	 */
+	struct x86_page_tables *ptables;
+
+	/* Initial privilege mode stack pointer when doing a system call.
+	 * Un-set for supervisor threads.
+	 */
+	char *psp;
+
+	/* SS and CS selectors for this thread when restoring context */
+	u64_t ss;
+	u64_t cs;
+#endif
+
 	u64_t rax;
 	u64_t rcx;
 	u64_t rdx;
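
A quick sanity check on the X86_STAR_UPPER value above, kept separate from the patch itself: 'syscall' loads CS from STAR[47:32] and SS from STAR[47:32] + 8, while 64-bit 'sysret' loads CS from STAR[63:48] + 16 and SS from STAR[63:48] + 8, so the GDT ordering has to line up with that fixed arithmetic. The C sketch below spells this out; the selector values are assumptions inferred from the layout implied by this header (only X86_USER_DS and X86_USER_CS are visible in the hunk), not values stated by the patch.

/* Illustrative sketch only -- selector values are assumed, not taken from
 * the patch. Any C11 compiler will check the static asserts, which encode
 * the syscall/sysret selector arithmetic from the Intel SDM.
 */
enum gdt_sel {
	SEL_KERNEL_CS  = 0x18,	/* assumed 64-bit kernel code */
	SEL_KERNEL_DS  = 0x20,	/* assumed kernel data */
	SEL_USER_CS_32 = 0x28,	/* assumed 32-bit user code, placed so that */
	SEL_USER_DS    = 0x30,	/*   USER_CS_32 + 8  == USER_DS, and        */
	SEL_USER_CS    = 0x38	/*   USER_CS_32 + 16 == USER_CS             */
};

#define STAR_UPPER ((SEL_USER_CS_32 << 16) | SEL_KERNEL_CS)

/* 'syscall': kernel CS/SS come from bits 47:32 of STAR */
_Static_assert((STAR_UPPER & 0xFFFF) == SEL_KERNEL_CS, "syscall CS");
_Static_assert((STAR_UPPER & 0xFFFF) + 8 == SEL_KERNEL_DS, "syscall SS");

/* 'sysretq': user CS/SS are derived from bits 63:48 of STAR */
_Static_assert((STAR_UPPER >> 16) + 16 == SEL_USER_CS, "sysret CS");
_Static_assert((STAR_UPPER >> 16) + 8 == SEL_USER_DS, "sysret SS");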