diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cd41becaf51..860e1143cde 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,14 +17,14 @@ config CPU_ATOM bool select CPU_HAS_FPU select ARCH_HAS_STACK_PROTECTION if X86_MMU - select ARCH_HAS_USERSPACE if X86_MMU && (!X86_64 || (X86_64 && X86_NO_MELTDOWN)) + select ARCH_HAS_USERSPACE if X86_MMU help This option signifies the use of a CPU from the Atom family. config CPU_MINUTEIA bool select ARCH_HAS_STACK_PROTECTION if X86_MMU - select ARCH_HAS_USERSPACE if X86_MMU && (!X86_64 || (X86_64 && X86_NO_MELTDOWN)) + select ARCH_HAS_USERSPACE if X86_MMU help This option signifies the use of a CPU from the Minute IA family. @@ -32,7 +32,7 @@ config CPU_APOLLO_LAKE bool select CPU_HAS_FPU select ARCH_HAS_STACK_PROTECTION if X86_MMU - select ARCH_HAS_USERSPACE if X86_MMU && (!X86_64 || (X86_64 && X86_NO_MELTDOWN)) + select ARCH_HAS_USERSPACE if X86_MMU help This option signifies the use of a CPU from the Apollo Lake family. @@ -270,7 +270,6 @@ config X86_KPTI default y depends on USERSPACE depends on !X86_NO_MELTDOWN - depends on !X86_64 help Implements kernel page table isolation to mitigate Meltdown exploits to read Kernel RAM. Incurs a significant performance cost for diff --git a/arch/x86/core/intel64/cpu.c b/arch/x86/core/intel64/cpu.c index 019ae124777..293c9954e8f 100644 --- a/arch/x86/core/intel64/cpu.c +++ b/arch/x86/core/intel64/cpu.c @@ -26,8 +26,18 @@ extern u8_t _exception_stack1[]; extern u8_t _exception_stack2[]; extern u8_t _exception_stack3[]; +#ifdef CONFIG_X86_KPTI +extern u8_t z_x86_trampoline_stack[]; +extern u8_t z_x86_trampoline_stack1[]; +extern u8_t z_x86_trampoline_stack2[]; +extern u8_t z_x86_trampoline_stack3[]; +#endif /* CONFIG_X86_KPTI */ + Z_GENERIC_SECTION(.tss) struct x86_tss64 tss0 = { +#ifdef CONFIG_X86_KPTI + .ist2 = (u64_t) z_x86_trampoline_stack + Z_X86_TRAMPOLINE_STACK_SIZE, +#endif .ist7 = (u64_t) _exception_stack + CONFIG_EXCEPTION_STACK_SIZE, .iomapb = 0xFFFF, .cpu = &(_kernel.cpus[0]) @@ -36,6 +46,9 @@ struct x86_tss64 tss0 = { #if CONFIG_MP_NUM_CPUS > 1 Z_GENERIC_SECTION(.tss) struct x86_tss64 tss1 = { +#ifdef CONFIG_X86_KPTI + .ist2 = (u64_t) z_x86_trampoline_stack1 + Z_X86_TRAMPOLINE_STACK_SIZE, +#endif .ist7 = (u64_t) _exception_stack1 + CONFIG_EXCEPTION_STACK_SIZE, .iomapb = 0xFFFF, .cpu = &(_kernel.cpus[1]) @@ -45,6 +58,9 @@ struct x86_tss64 tss1 = { #if CONFIG_MP_NUM_CPUS > 2 Z_GENERIC_SECTION(.tss) struct x86_tss64 tss2 = { +#ifdef CONFIG_X86_KPTI + .ist2 = (u64_t) z_x86_trampoline_stack2 + Z_X86_TRAMPOLINE_STACK_SIZE, +#endif .ist7 = (u64_t) _exception_stack2 + CONFIG_EXCEPTION_STACK_SIZE, .iomapb = 0xFFFF, .cpu = &(_kernel.cpus[2]) @@ -54,6 +70,9 @@ struct x86_tss64 tss2 = { #if CONFIG_MP_NUM_CPUS > 3 Z_GENERIC_SECTION(.tss) struct x86_tss64 tss3 = { +#ifdef CONFIG_X86_KPTI + .ist2 = (u64_t) z_x86_trampoline_stack3 + Z_X86_TRAMPOLINE_STACK_SIZE, +#endif .ist7 = (u64_t) _exception_stack3 + CONFIG_EXCEPTION_STACK_SIZE, .iomapb = 0xFFFF, .cpu = &(_kernel.cpus[3]) diff --git a/arch/x86/core/intel64/locore.S b/arch/x86/core/intel64/locore.S index e38a0fa993d..72884cbdad7 100644 --- a/arch/x86/core/intel64/locore.S +++ b/arch/x86/core/intel64/locore.S @@ -232,15 +232,21 @@ z_x86_switch: __resume: #ifdef CONFIG_USERSPACE +#ifndef CONFIG_X86_KPTI + /* If KPTI is enabled we're always on the kernel's page tables in + * this context and the appropriate page table switch takes place + * when trampolining back to user mode + */ pushq %rdi /* Caller-saved, stash it */ call 
z_x86_swap_update_page_tables popq %rdi +#endif /* CONFIG_X86_KPTI */ /* Set up exception return stack frame */ pushq _thread_offset_to_ss(%rdi) /* SS */ #else pushq $X86_KERNEL_DS /* SS */ -#endif +#endif /* CONFIG_USERSPACE */ pushq _thread_offset_to_rsp(%rdi) /* RSP */ pushq _thread_offset_to_rflags(%rdi) /* RFLAGS */ #ifdef CONFIG_USERSPACE @@ -256,6 +262,13 @@ __resume: movq _thread_offset_to_r13(%rdi), %r13 movq _thread_offset_to_r14(%rdi), %r14 movq _thread_offset_to_r15(%rdi), %r15 +#ifdef CONFIG_USERSPACE + /* Set correct privilege elevation stack to manually switch to in + * z_x86_syscall_entry_stub() + */ + movq _thread_offset_to_psp(%rdi), %rax + movq %rax, %gs:__x86_tss64_t_psp_OFFSET +#endif testb $X86_THREAD_FLAG_ALL, _thread_offset_to_flags(%rdi) jz 1f @@ -275,7 +288,11 @@ __resume: /* Swap GS register values if we are returning to user mode */ testb $0x3, 8(%rsp) jz 1f +#ifdef CONFIG_X86_KPTI + jmp z_x86_trampoline_to_user +#else swapgs +#endif /* CONFIG_X86_KPTI */ #endif /* CONFIG_USERSPACE */ 1: #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION @@ -290,22 +307,66 @@ __resume: #define EXCEPT_CODE(nr) vector_ ## nr: pushq $nr; jmp except #define EXCEPT(nr) vector_ ## nr: pushq $0; pushq $nr; jmp except +/* + * When we arrive at 'except' from one of the EXCEPT(X) stubs, + * we're on the exception stack with irqs unlocked (or the trampoline stack + * with irqs locked if KPTI is enabled) and it contains: + * + * SS + * RSP + * RFLAGS + * CS + * RIP + * Error Code if pushed by CPU, else 0 + * Vector number <- RSP points here + * + */ + except: /* * finish struct NANO_ESF on stack. 'vector' .. 'ss' are * already there from hardware trap and EXCEPT_*() stub. */ + + pushq %r15 + #ifdef CONFIG_USERSPACE - /* Swap GS register values if we came in from user mode */ - testb $0x3, 24(%rsp) + /* Swap GS register values and page tables if we came from user mode */ + testb $0x3, 32(%rsp) jz 1f swapgs +#ifdef CONFIG_X86_KPTI + /* Load kernel's page table */ + movq $z_x86_kernel_ptables, %r15 + movq %r15, %cr3 +#endif /* CONFIG_X86_KPTI */ 1: #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION /* swapgs variant of Spectre V1. Disable speculation past this point */ lfence #endif /* CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION */ +#ifdef CONFIG_X86_KPTI + /* Save old trampoline stack pointer in R15 */ + movq %rsp, %r15 + + /* Switch to the exception stack */ + movq %gs:__x86_tss64_t_ist7_OFFSET, %rsp + + /* Transplant trampoline stack contents */ + pushq 56(%r15) /* SS */ + pushq 48(%r15) /* RSP */ + pushq 40(%r15) /* RFLAGS */ + pushq 32(%r15) /* CS */ + pushq 24(%r15) /* RIP */ + pushq 16(%r15) /* Error code */ + pushq 8(%r15) /* Vector */ + pushq (%r15) /* Stashed R15 */ + movq $0, (%r15) /* Cover our tracks */ + + /* We're done, it's safe to re-enable interrupts. 
*/ + sti +#endif /* CONFIG_X86_KPTI */ #endif /* CONFIG_USERSPACE */ - pushq %r15 + subq $X86_FXSAVE_SIZE, %rsp fxsave (%rsp) pushq %r14 @@ -359,7 +420,11 @@ except: /* testb $0x3, 8(%rsp) jz 1f cli +#ifdef CONFIG_X86_KPTI + jmp z_x86_trampoline_to_user +#else swapgs +#endif /* CONFIG_X86_KPTI */ 1: #endif /* CONFIG_USERSPACE */ @@ -381,7 +446,8 @@ EXCEPT(Z_X86_OOPS_VECTOR); /* * When we arrive at 'irq' from one of the IRQ(X) stubs, - * we're on the "freshest" IRQ stack and it contains: + * we're on the "freshest" IRQ stack (or the trampoline stack if we came from + * user mode and KPTI is enabled) and it contains: * * SS * RSP @@ -389,25 +455,48 @@ EXCEPT(Z_X86_OOPS_VECTOR); * CS * RIP * (vector number - IV_IRQS) <-- RSP points here - * RSI <-- we push this on entry */ .globl x86_irq_funcs /* see irq_manage.c .. */ .globl x86_irq_args /* .. for these definitions */ irq: + pushq %rsi + #ifdef CONFIG_USERSPACE /* Swap GS register values if we came in from user mode */ - testb $0x3, 16(%rsp) + testb $0x3, 24(%rsp) jz 1f swapgs +#ifdef CONFIG_X86_KPTI + /* Load kernel's page table */ + movq $z_x86_kernel_ptables, %rsi + movq %rsi, %cr3 +#endif /* CONFIG_X86_KPTI */ 1: #ifdef CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION /* swapgs variant of Spectre V1. Disable speculation past this point */ lfence #endif /* CONFIG_X86_BOUNDS_CHECK_BYPASS_MITIGATION */ +#ifdef CONFIG_X86_KPTI + /* Save old trampoline stack pointer in RSI */ + movq %rsp, %rsi + + /* Switch to the interrupt stack */ + movq %gs:__x86_tss64_t_ist1_OFFSET, %rsp + + /* Transplant trampoline stack contents */ + pushq 48(%rsi) /* SS */ + pushq 40(%rsi) /* RSP */ + pushq 32(%rsi) /* RFLAGS */ + pushq 24(%rsi) /* CS */ + pushq 16(%rsi) /* RIP */ + pushq 8(%rsi) /* Vector */ + pushq (%rsi) /* Stashed RSI value */ + movq $0, (%rsi) /* Cover our tracks, stashed RSI might be sensitive */ +#endif /* CONFIG_X86_KPTI */ #endif /* CONFIG_USERSPACE */ - pushq %rsi + movq %gs:__x86_tss64_t_cpu_OFFSET, %rsi /* @@ -564,84 +653,178 @@ IRQ(248); IRQ(249); IRQ(250); IRQ(251); IRQ(252); IRQ(253); IRQ(254); IRQ(255) * IDT. */ -#define TRAP 0x8f -#define INTR 0x8e -#define USER_INTR 0xee +/* Descriptor type. Traps don't implicitly disable interrupts. User variants + * can be invoked by software running in user mode (ring 3). + * + * For KPTI everything lands on the trampoline stack and we must get off of + * it before re-enabling interrupts; use interrupt gates for everything. + */ +#define INTR 0x8e +#define USER_INTR 0xee +#ifdef CONFIG_X86_KPTI +#define TRAP INTR +#define USER_TRAP USER_INTR +#else +#define TRAP 0x8f +#define USER_TRAP 0xef +#endif #define IDT(nr, type, ist) \ .word vector_ ## nr, X86_KERNEL_CS; \ .byte ist, type; \ .word 0, 0, 0, 0, 0 +/* Which IST entry in TSS to use for automatic stack switching, or 0 if + * no automatic switch is to take place. Stack page must be present in + * the current page tables, if KPTI is on only the trampoline stack and + * the current user stack can be accessed. + */ +#ifdef CONFIG_X86_KPTI +/* Everything lands on ist2, which is set to the trampoline stack. 
+ * Interrupt/exception entry updates page tables and manually switches to + * the irq/exception stacks stored in ist1/ist7 + */ +#define IRQ_STACK 2 +#define EXC_STACK 2 +#define BAD_STACK 2 +#else +#define IRQ_STACK 1 +#define EXC_STACK 7 +#define BAD_STACK 7 /* Horrible things: NMIs, double faults, MCEs */ +#endif + .align 16 idt: - IDT( 0, TRAP, 7); IDT( 1, TRAP, 7); IDT( 2, TRAP, 7); IDT( 3, TRAP, 7) - IDT( 4, TRAP, 7); IDT( 5, TRAP, 7); IDT( 6, TRAP, 7); IDT( 7, TRAP, 7) - IDT( 8, TRAP, 7); IDT( 9, TRAP, 7); IDT( 10, TRAP, 7); IDT( 11, TRAP, 7) - IDT( 12, TRAP, 7); IDT( 13, TRAP, 7); IDT( 14, TRAP, 7); IDT( 15, TRAP, 7) - IDT( 16, TRAP, 7); IDT( 17, TRAP, 7); IDT( 18, TRAP, 7); IDT( 19, TRAP, 7) - IDT( 20, TRAP, 7); IDT( 21, TRAP, 7); IDT( 22, TRAP, 7); IDT( 23, TRAP, 7) - IDT( 24, TRAP, 7); IDT( 25, TRAP, 7); IDT( 26, TRAP, 7); IDT( 27, TRAP, 7) - IDT( 28, TRAP, 7); IDT( 29, TRAP, 7); IDT( 30, TRAP, 7); IDT( 31, TRAP, 7) + IDT( 0, TRAP, EXC_STACK); IDT( 1, TRAP, EXC_STACK) + IDT( 2, TRAP, BAD_STACK); IDT( 3, TRAP, EXC_STACK) + IDT( 4, TRAP, EXC_STACK); IDT( 5, TRAP, EXC_STACK) + IDT( 6, TRAP, EXC_STACK); IDT( 7, TRAP, EXC_STACK) + IDT( 8, TRAP, BAD_STACK); IDT( 9, TRAP, EXC_STACK) + IDT( 10, TRAP, EXC_STACK); IDT( 11, TRAP, EXC_STACK) + IDT( 12, TRAP, EXC_STACK); IDT( 13, TRAP, EXC_STACK) + IDT( 14, TRAP, EXC_STACK); IDT( 15, TRAP, EXC_STACK) + IDT( 16, TRAP, EXC_STACK); IDT( 17, TRAP, EXC_STACK) + IDT( 18, TRAP, BAD_STACK); IDT( 19, TRAP, EXC_STACK) + IDT( 20, TRAP, EXC_STACK); IDT( 21, TRAP, EXC_STACK) + IDT( 22, TRAP, EXC_STACK); IDT( 23, TRAP, EXC_STACK) + IDT( 24, TRAP, EXC_STACK); IDT( 25, TRAP, EXC_STACK) + IDT( 26, TRAP, EXC_STACK); IDT( 27, TRAP, EXC_STACK) + IDT( 28, TRAP, EXC_STACK); IDT( 29, TRAP, EXC_STACK) + IDT( 30, TRAP, EXC_STACK); IDT( 31, TRAP, EXC_STACK) - /* Oops vector can be invoked from Ring 3 and runs on exception stack */ - IDT(Z_X86_OOPS_VECTOR, USER_INTR, 7); - IDT( 33, INTR, 1); IDT( 34, INTR, 1); IDT( 35, INTR, 1) - IDT( 36, INTR, 1); IDT( 37, INTR, 1); IDT( 38, INTR, 1); IDT( 39, INTR, 1) - IDT( 40, INTR, 1); IDT( 41, INTR, 1); IDT( 42, INTR, 1); IDT( 43, INTR, 1) - IDT( 44, INTR, 1); IDT( 45, INTR, 1); IDT( 46, INTR, 1); IDT( 47, INTR, 1) - IDT( 48, INTR, 1); IDT( 49, INTR, 1); IDT( 50, INTR, 1); IDT( 51, INTR, 1) - IDT( 52, INTR, 1); IDT( 53, INTR, 1); IDT( 54, INTR, 1); IDT( 55, INTR, 1) - IDT( 56, INTR, 1); IDT( 57, INTR, 1); IDT( 58, INTR, 1); IDT( 59, INTR, 1) - IDT( 60, INTR, 1); IDT( 61, INTR, 1); IDT( 62, INTR, 1); IDT( 63, INTR, 1) - IDT( 64, INTR, 1); IDT( 65, INTR, 1); IDT( 66, INTR, 1); IDT( 67, INTR, 1) - IDT( 68, INTR, 1); IDT( 69, INTR, 1); IDT( 70, INTR, 1); IDT( 71, INTR, 1) - IDT( 72, INTR, 1); IDT( 73, INTR, 1); IDT( 74, INTR, 1); IDT( 75, INTR, 1) - IDT( 76, INTR, 1); IDT( 77, INTR, 1); IDT( 78, INTR, 1); IDT( 79, INTR, 1) - IDT( 80, INTR, 1); IDT( 81, INTR, 1); IDT( 82, INTR, 1); IDT( 83, INTR, 1) - IDT( 84, INTR, 1); IDT( 85, INTR, 1); IDT( 86, INTR, 1); IDT( 87, INTR, 1) - IDT( 88, INTR, 1); IDT( 89, INTR, 1); IDT( 90, INTR, 1); IDT( 91, INTR, 1) - IDT( 92, INTR, 1); IDT( 93, INTR, 1); IDT( 94, INTR, 1); IDT( 95, INTR, 1) - IDT( 96, INTR, 1); IDT( 97, INTR, 1); IDT( 98, INTR, 1); IDT( 99, INTR, 1) - IDT(100, INTR, 1); IDT(101, INTR, 1); IDT(102, INTR, 1); IDT(103, INTR, 1) - IDT(104, INTR, 1); IDT(105, INTR, 1); IDT(106, INTR, 1); IDT(107, INTR, 1) - IDT(108, INTR, 1); IDT(109, INTR, 1); IDT(110, INTR, 1); IDT(111, INTR, 1) - IDT(112, INTR, 1); IDT(113, INTR, 1); IDT(114, INTR, 1); IDT(115, INTR, 1) - IDT(116, INTR, 1); IDT(117, 
INTR, 1); IDT(118, INTR, 1); IDT(119, INTR, 1) - IDT(120, INTR, 1); IDT(121, INTR, 1); IDT(122, INTR, 1); IDT(123, INTR, 1) - IDT(124, INTR, 1); IDT(125, INTR, 1); IDT(126, INTR, 1); IDT(127, INTR, 1) - IDT(128, INTR, 1); IDT(129, INTR, 1); IDT(130, INTR, 1); IDT(131, INTR, 1) - IDT(132, INTR, 1); IDT(133, INTR, 1); IDT(134, INTR, 1); IDT(135, INTR, 1) - IDT(136, INTR, 1); IDT(137, INTR, 1); IDT(138, INTR, 1); IDT(139, INTR, 1) - IDT(140, INTR, 1); IDT(141, INTR, 1); IDT(142, INTR, 1); IDT(143, INTR, 1) - IDT(144, INTR, 1); IDT(145, INTR, 1); IDT(146, INTR, 1); IDT(147, INTR, 1) - IDT(148, INTR, 1); IDT(149, INTR, 1); IDT(150, INTR, 1); IDT(151, INTR, 1) - IDT(152, INTR, 1); IDT(153, INTR, 1); IDT(154, INTR, 1); IDT(155, INTR, 1) - IDT(156, INTR, 1); IDT(157, INTR, 1); IDT(158, INTR, 1); IDT(159, INTR, 1) - IDT(160, INTR, 1); IDT(161, INTR, 1); IDT(162, INTR, 1); IDT(163, INTR, 1) - IDT(164, INTR, 1); IDT(165, INTR, 1); IDT(166, INTR, 1); IDT(167, INTR, 1) - IDT(168, INTR, 1); IDT(169, INTR, 1); IDT(170, INTR, 1); IDT(171, INTR, 1) - IDT(172, INTR, 1); IDT(173, INTR, 1); IDT(174, INTR, 1); IDT(175, INTR, 1) - IDT(176, INTR, 1); IDT(177, INTR, 1); IDT(178, INTR, 1); IDT(179, INTR, 1) - IDT(180, INTR, 1); IDT(181, INTR, 1); IDT(182, INTR, 1); IDT(183, INTR, 1) - IDT(184, INTR, 1); IDT(185, INTR, 1); IDT(186, INTR, 1); IDT(187, INTR, 1) - IDT(188, INTR, 1); IDT(189, INTR, 1); IDT(190, INTR, 1); IDT(191, INTR, 1) - IDT(192, INTR, 1); IDT(193, INTR, 1); IDT(194, INTR, 1); IDT(195, INTR, 1) - IDT(196, INTR, 1); IDT(197, INTR, 1); IDT(198, INTR, 1); IDT(199, INTR, 1) - IDT(200, INTR, 1); IDT(201, INTR, 1); IDT(202, INTR, 1); IDT(203, INTR, 1) - IDT(204, INTR, 1); IDT(205, INTR, 1); IDT(206, INTR, 1); IDT(207, INTR, 1) - IDT(208, INTR, 1); IDT(209, INTR, 1); IDT(210, INTR, 1); IDT(211, INTR, 1) - IDT(212, INTR, 1); IDT(213, INTR, 1); IDT(214, INTR, 1); IDT(215, INTR, 1) - IDT(216, INTR, 1); IDT(217, INTR, 1); IDT(218, INTR, 1); IDT(219, INTR, 1) - IDT(220, INTR, 1); IDT(221, INTR, 1); IDT(222, INTR, 1); IDT(223, INTR, 1) - IDT(224, INTR, 1); IDT(225, INTR, 1); IDT(226, INTR, 1); IDT(227, INTR, 1) - IDT(228, INTR, 1); IDT(229, INTR, 1); IDT(230, INTR, 1); IDT(231, INTR, 1) - IDT(232, INTR, 1); IDT(233, INTR, 1); IDT(234, INTR, 1); IDT(235, INTR, 1) - IDT(236, INTR, 1); IDT(237, INTR, 1); IDT(238, INTR, 1); IDT(239, INTR, 1) - IDT(240, INTR, 1); IDT(241, INTR, 1); IDT(242, INTR, 1); IDT(243, INTR, 1) - IDT(244, INTR, 1); IDT(245, INTR, 1); IDT(246, INTR, 1); IDT(247, INTR, 1) - IDT(248, INTR, 1); IDT(249, INTR, 1); IDT(250, INTR, 1); IDT(251, INTR, 1) - IDT(252, INTR, 1); IDT(253, INTR, 1); IDT(254, INTR, 1); IDT(255, INTR, 1) + /* Oops vector can be invoked from Ring 3 and runs on exception stack */ + IDT(Z_X86_OOPS_VECTOR, USER_INTR, EXC_STACK); IDT( 33, INTR, IRQ_STACK) + IDT( 34, INTR, IRQ_STACK); IDT( 35, INTR, IRQ_STACK) + IDT( 36, INTR, IRQ_STACK); IDT( 37, INTR, IRQ_STACK) + IDT( 38, INTR, IRQ_STACK); IDT( 39, INTR, IRQ_STACK) + IDT( 40, INTR, IRQ_STACK); IDT( 41, INTR, IRQ_STACK) + IDT( 42, INTR, IRQ_STACK); IDT( 43, INTR, IRQ_STACK) + IDT( 44, INTR, IRQ_STACK); IDT( 45, INTR, IRQ_STACK) + IDT( 46, INTR, IRQ_STACK); IDT( 47, INTR, IRQ_STACK) + IDT( 48, INTR, IRQ_STACK); IDT( 49, INTR, IRQ_STACK) + IDT( 50, INTR, IRQ_STACK); IDT( 51, INTR, IRQ_STACK) + IDT( 52, INTR, IRQ_STACK); IDT( 53, INTR, IRQ_STACK) + IDT( 54, INTR, IRQ_STACK); IDT( 55, INTR, IRQ_STACK) + IDT( 56, INTR, IRQ_STACK); IDT( 57, INTR, IRQ_STACK) + IDT( 58, INTR, IRQ_STACK); IDT( 59, INTR, IRQ_STACK) + IDT( 60, INTR, 
IRQ_STACK); IDT( 61, INTR, IRQ_STACK) + IDT( 62, INTR, IRQ_STACK); IDT( 63, INTR, IRQ_STACK) + IDT( 64, INTR, IRQ_STACK); IDT( 65, INTR, IRQ_STACK) + IDT( 66, INTR, IRQ_STACK); IDT( 67, INTR, IRQ_STACK) + IDT( 68, INTR, IRQ_STACK); IDT( 69, INTR, IRQ_STACK) + IDT( 70, INTR, IRQ_STACK); IDT( 71, INTR, IRQ_STACK) + IDT( 72, INTR, IRQ_STACK); IDT( 73, INTR, IRQ_STACK) + IDT( 74, INTR, IRQ_STACK); IDT( 75, INTR, IRQ_STACK) + IDT( 76, INTR, IRQ_STACK); IDT( 77, INTR, IRQ_STACK) + IDT( 78, INTR, IRQ_STACK); IDT( 79, INTR, IRQ_STACK) + IDT( 80, INTR, IRQ_STACK); IDT( 81, INTR, IRQ_STACK) + IDT( 82, INTR, IRQ_STACK); IDT( 83, INTR, IRQ_STACK) + IDT( 84, INTR, IRQ_STACK); IDT( 85, INTR, IRQ_STACK) + IDT( 86, INTR, IRQ_STACK); IDT( 87, INTR, IRQ_STACK) + IDT( 88, INTR, IRQ_STACK); IDT( 89, INTR, IRQ_STACK) + IDT( 90, INTR, IRQ_STACK); IDT( 91, INTR, IRQ_STACK) + IDT( 92, INTR, IRQ_STACK); IDT( 93, INTR, IRQ_STACK) + IDT( 94, INTR, IRQ_STACK); IDT( 95, INTR, IRQ_STACK) + IDT( 96, INTR, IRQ_STACK); IDT( 97, INTR, IRQ_STACK) + IDT( 98, INTR, IRQ_STACK); IDT( 99, INTR, IRQ_STACK) + IDT(100, INTR, IRQ_STACK); IDT(101, INTR, IRQ_STACK) + IDT(102, INTR, IRQ_STACK); IDT(103, INTR, IRQ_STACK) + IDT(104, INTR, IRQ_STACK); IDT(105, INTR, IRQ_STACK) + IDT(106, INTR, IRQ_STACK); IDT(107, INTR, IRQ_STACK) + IDT(108, INTR, IRQ_STACK); IDT(109, INTR, IRQ_STACK) + IDT(110, INTR, IRQ_STACK); IDT(111, INTR, IRQ_STACK) + IDT(112, INTR, IRQ_STACK); IDT(113, INTR, IRQ_STACK) + IDT(114, INTR, IRQ_STACK); IDT(115, INTR, IRQ_STACK) + IDT(116, INTR, IRQ_STACK); IDT(117, INTR, IRQ_STACK) + IDT(118, INTR, IRQ_STACK); IDT(119, INTR, IRQ_STACK) + IDT(120, INTR, IRQ_STACK); IDT(121, INTR, IRQ_STACK) + IDT(122, INTR, IRQ_STACK); IDT(123, INTR, IRQ_STACK) + IDT(124, INTR, IRQ_STACK); IDT(125, INTR, IRQ_STACK) + IDT(126, INTR, IRQ_STACK); IDT(127, INTR, IRQ_STACK) + IDT(128, INTR, IRQ_STACK); IDT(129, INTR, IRQ_STACK) + IDT(130, INTR, IRQ_STACK); IDT(131, INTR, IRQ_STACK) + IDT(132, INTR, IRQ_STACK); IDT(133, INTR, IRQ_STACK) + IDT(134, INTR, IRQ_STACK); IDT(135, INTR, IRQ_STACK) + IDT(136, INTR, IRQ_STACK); IDT(137, INTR, IRQ_STACK) + IDT(138, INTR, IRQ_STACK); IDT(139, INTR, IRQ_STACK) + IDT(140, INTR, IRQ_STACK); IDT(141, INTR, IRQ_STACK) + IDT(142, INTR, IRQ_STACK); IDT(143, INTR, IRQ_STACK) + IDT(144, INTR, IRQ_STACK); IDT(145, INTR, IRQ_STACK) + IDT(146, INTR, IRQ_STACK); IDT(147, INTR, IRQ_STACK) + IDT(148, INTR, IRQ_STACK); IDT(149, INTR, IRQ_STACK) + IDT(150, INTR, IRQ_STACK); IDT(151, INTR, IRQ_STACK) + IDT(152, INTR, IRQ_STACK); IDT(153, INTR, IRQ_STACK) + IDT(154, INTR, IRQ_STACK); IDT(155, INTR, IRQ_STACK) + IDT(156, INTR, IRQ_STACK); IDT(157, INTR, IRQ_STACK) + IDT(158, INTR, IRQ_STACK); IDT(159, INTR, IRQ_STACK) + IDT(160, INTR, IRQ_STACK); IDT(161, INTR, IRQ_STACK) + IDT(162, INTR, IRQ_STACK); IDT(163, INTR, IRQ_STACK) + IDT(164, INTR, IRQ_STACK); IDT(165, INTR, IRQ_STACK) + IDT(166, INTR, IRQ_STACK); IDT(167, INTR, IRQ_STACK) + IDT(168, INTR, IRQ_STACK); IDT(169, INTR, IRQ_STACK) + IDT(170, INTR, IRQ_STACK); IDT(171, INTR, IRQ_STACK) + IDT(172, INTR, IRQ_STACK); IDT(173, INTR, IRQ_STACK) + IDT(174, INTR, IRQ_STACK); IDT(175, INTR, IRQ_STACK) + IDT(176, INTR, IRQ_STACK); IDT(177, INTR, IRQ_STACK) + IDT(178, INTR, IRQ_STACK); IDT(179, INTR, IRQ_STACK) + IDT(180, INTR, IRQ_STACK); IDT(181, INTR, IRQ_STACK) + IDT(182, INTR, IRQ_STACK); IDT(183, INTR, IRQ_STACK) + IDT(184, INTR, IRQ_STACK); IDT(185, INTR, IRQ_STACK) + IDT(186, INTR, IRQ_STACK); IDT(187, INTR, IRQ_STACK) + IDT(188, INTR, IRQ_STACK); IDT(189, INTR, 
IRQ_STACK) + IDT(190, INTR, IRQ_STACK); IDT(191, INTR, IRQ_STACK) + IDT(192, INTR, IRQ_STACK); IDT(193, INTR, IRQ_STACK) + IDT(194, INTR, IRQ_STACK); IDT(195, INTR, IRQ_STACK) + IDT(196, INTR, IRQ_STACK); IDT(197, INTR, IRQ_STACK) + IDT(198, INTR, IRQ_STACK); IDT(199, INTR, IRQ_STACK) + IDT(200, INTR, IRQ_STACK); IDT(201, INTR, IRQ_STACK) + IDT(202, INTR, IRQ_STACK); IDT(203, INTR, IRQ_STACK) + IDT(204, INTR, IRQ_STACK); IDT(205, INTR, IRQ_STACK) + IDT(206, INTR, IRQ_STACK); IDT(207, INTR, IRQ_STACK) + IDT(208, INTR, IRQ_STACK); IDT(209, INTR, IRQ_STACK) + IDT(210, INTR, IRQ_STACK); IDT(211, INTR, IRQ_STACK) + IDT(212, INTR, IRQ_STACK); IDT(213, INTR, IRQ_STACK) + IDT(214, INTR, IRQ_STACK); IDT(215, INTR, IRQ_STACK) + IDT(216, INTR, IRQ_STACK); IDT(217, INTR, IRQ_STACK) + IDT(218, INTR, IRQ_STACK); IDT(219, INTR, IRQ_STACK) + IDT(220, INTR, IRQ_STACK); IDT(221, INTR, IRQ_STACK) + IDT(222, INTR, IRQ_STACK); IDT(223, INTR, IRQ_STACK) + IDT(224, INTR, IRQ_STACK); IDT(225, INTR, IRQ_STACK) + IDT(226, INTR, IRQ_STACK); IDT(227, INTR, IRQ_STACK) + IDT(228, INTR, IRQ_STACK); IDT(229, INTR, IRQ_STACK) + IDT(230, INTR, IRQ_STACK); IDT(231, INTR, IRQ_STACK) + IDT(232, INTR, IRQ_STACK); IDT(233, INTR, IRQ_STACK) + IDT(234, INTR, IRQ_STACK); IDT(235, INTR, IRQ_STACK) + IDT(236, INTR, IRQ_STACK); IDT(237, INTR, IRQ_STACK) + IDT(238, INTR, IRQ_STACK); IDT(239, INTR, IRQ_STACK) + IDT(240, INTR, IRQ_STACK); IDT(241, INTR, IRQ_STACK) + IDT(242, INTR, IRQ_STACK); IDT(243, INTR, IRQ_STACK) + IDT(244, INTR, IRQ_STACK); IDT(245, INTR, IRQ_STACK) + IDT(246, INTR, IRQ_STACK); IDT(247, INTR, IRQ_STACK) + IDT(248, INTR, IRQ_STACK); IDT(249, INTR, IRQ_STACK) + IDT(250, INTR, IRQ_STACK); IDT(251, INTR, IRQ_STACK) + IDT(252, INTR, IRQ_STACK); IDT(253, INTR, IRQ_STACK) + IDT(254, INTR, IRQ_STACK); IDT(255, INTR, IRQ_STACK) idt48: .word (idt48 - idt - 1) @@ -670,7 +853,7 @@ pdp: .long 0x00000083 /* 0x83 = 1GB, R/W, P */ .long 0 .fill 4064, 1, 0 -.section .lodata,"ad" +.section .gdt,"ad" /* * GDT - a single GDT is shared by all threads (and, eventually, all CPUs). @@ -727,6 +910,8 @@ gdt48: .word (gdt48 - gdt - 1) .long gdt +.section .lodata,"ad" + /* * Known-good stack for handling CPU exceptions. 
*/ @@ -756,3 +941,33 @@ _exception_stack2: .fill CONFIG_EXCEPTION_STACK_SIZE, 1, 0xAA _exception_stack3: .fill CONFIG_EXCEPTION_STACK_SIZE, 1, 0xAA #endif + +#ifdef CONFIG_X86_KPTI +.section .trampolines,"ad" + +.global z_x86_trampoline_stack +.align 16 +z_x86_trampoline_stack: + .fill Z_X86_TRAMPOLINE_STACK_SIZE, 1, 0xAA + +#if CONFIG_MP_NUM_CPUS > 1 +.global z_x86_trampoline_stack1 +.align 16 +z_x86_trampoline_stack1: + .fill Z_X86_TRAMPOLINE_STACK_SIZE, 1, 0xAA +#endif + +#if CONFIG_MP_NUM_CPUS > 2 +.global z_x86_trampoline_stack2 +.align 16 +z_x86_trampoline_stack2: + .fill Z_X86_TRAMPOLINE_STACK_SIZE, 1, 0xAA +#endif + +#if CONFIG_MP_NUM_CPUS > 3 +.global z_x86_trampoline_stack3 +.align 16 +z_x86_trampoline_stack3: + .fill Z_X86_TRAMPOLINE_STACK_SIZE, 1, 0xAA +#endif +#endif /* CONFIG_X86_KPTI */ diff --git a/arch/x86/core/intel64/thread.c b/arch/x86/core/intel64/thread.c index 0a34bf6b371..96351cca0c3 100644 --- a/arch/x86/core/intel64/thread.c +++ b/arch/x86/core/intel64/thread.c @@ -7,6 +7,7 @@ #include #include #include +#include extern void x86_sse_init(struct k_thread *); /* in locore.S */ diff --git a/arch/x86/core/intel64/userspace.S b/arch/x86/core/intel64/userspace.S index 86939894811..d0b15f82e0c 100644 --- a/arch/x86/core/intel64/userspace.S +++ b/arch/x86/core/intel64/userspace.S @@ -8,6 +8,53 @@ #include #include +#ifdef CONFIG_X86_KPTI +/* Copy interrupt return stack context to the trampoline stack, switch back + * to the user page table, and only then 'iret'. We jump to this instead + * of calling 'iret' if KPTI is turned on. This must be invoked with interrupts + * locked. + * + * Stack layout is expected to be what 'iretq' expects, which is as follows: + * + * 32 SS + * 24 RSP + * 16 RFLAGS + * 8 CS + * 0 RIP + */ +.global z_x86_trampoline_to_user +z_x86_trampoline_to_user: + /* Stash RDI, need a free register */ + pushq %rdi + + /* Store old stack pointer and switch to trampoline stack */ + movq %rsp, %rdi + movq %gs:__x86_tss64_t_ist2_OFFSET, %rsp + + /* Copy context */ + pushq 40(%rdi) /* SS */ + pushq 32(%rdi) /* RSP */ + pushq 24(%rdi) /* RFLAGS */ + pushq 16(%rdi) /* CS */ + pushq 8(%rdi) /* RIP */ + xchgq %rdi, (%rdi) /* Exchange old rdi to restore it and leave + the old stack pointer in its old storage + area (the value left there is unimportant) */ + + /* Switch to thread's page table */ + pushq %rax + movq %gs:__x86_tss64_t_cpu_OFFSET, %rax + movq ___cpu_t_current_OFFSET(%rax), %rax + movq _thread_offset_to_ptables(%rax), %rax + movq %rax, %cr3 + popq %rax + movq $0, -8(%rsp) /* Delete stashed RAX data */ + + /* Trampoline stack should have nothing sensitive in it at this point */ + swapgs + iretq +#endif /* CONFIG_X86_KPTI */ + /* Landing site for 'syscall' instruction * @@ -21,17 +68,39 @@ z_x86_syscall_entry_stub: swapgs - /* Switch to the privilege mode stack pointer stored in - * x86_tss64.psp and store the user mode stack pointer in - * x86_tss64.usp, immediately pushing it once the stack switch - * is done since this is a per-cpu and not per-thread area. - * - * This dance is necessary as upon entry we have no free registers - * nor a stack we can push to. + /* Save original stack pointer from user mode in memory, at the + * moment we have no free registers or stack to save it to. This + * eventually gets put on the stack before we re-enable interrupts + * as this is a per-cpu and not per-thread area. 
*/ movq %rsp, %gs:__x86_tss64_t_usp_OFFSET + +#ifdef CONFIG_X86_KPTI + /* We need to switch to the trampoline stack so that we can + * switch to the kernel's page table + */ + movq %gs:__x86_tss64_t_ist2_OFFSET, %rsp + + /* Load kernel's page table */ + pushq %rax + movq $z_x86_kernel_ptables, %rax + movq %rax, %cr3 + popq %rax + movq $0, -8(%rsp) /* Delete stashed RAX data */ +#endif /* CONFIG_X86_KPTI */ + + /* Switch to the privilege mode stack pointer stored in + * x86_tss64.psp + */ movq %gs:__x86_tss64_t_psp_OFFSET, %rsp + + /* We're now on the privilege mode stack; push the old user stack + * pointer onto it + */ pushq %gs:__x86_tss64_t_usp_OFFSET +#ifdef CONFIG_X86_KPTI + movq $0, %gs:__x86_tss64_t_usp_OFFSET +#endif sti /* re-enable interrupts */ @@ -103,6 +172,29 @@ _id_ok: addq $X86_FXSAVE_SIZE, %rsp popq %rdi +#ifdef CONFIG_X86_KPTI + /* Lock IRQs as we are using per-cpu memory areas and the + * trampoline stack + */ + cli + + /* Stash user stack pointer and switch to trampoline stack */ + popq %gs:__x86_tss64_t_usp_OFFSET + movq %gs:__x86_tss64_t_ist2_OFFSET, %rsp + + /* Switch to thread's page table */ + pushq %rax + movq %gs:__x86_tss64_t_cpu_OFFSET, %rax + movq ___cpu_t_current_OFFSET(%rax), %rax + movq _thread_offset_to_ptables(%rax), %rax + movq %rax, %cr3 + popq %rax + movq $0, -8(%rsp) /* Delete stashed RAX data */ + + /* Restore saved user stack pointer */ + movq %gs:__x86_tss64_t_usp_OFFSET, %rsp + movq $0, %gs:__x86_tss64_t_usp_OFFSET +#else /* Restore user stack pointer */ popq %rsp @@ -111,6 +203,8 @@ _id_ok: * 'swapgs' and 'sysretq' */ cli +#endif /* CONFIG_X86_KPTI */ + swapgs sysretq @@ -221,7 +315,6 @@ z_x86_userspace_enter: movq $0, (%rsp) /* Now a debugger-friendly return address */ /* cleanse other registers */ - xorq %rax, %rax xorq %rbx, %rbx xorq %rbp, %rbp xorq %r12, %r12 @@ -230,5 +323,14 @@ z_x86_userspace_enter: xorq %r15, %r15 cli +#ifdef CONFIG_X86_KPTI + /* Switch to thread's page table. We have free registers so no need + * to involve the trampoline stack. 
+ */ + movq %gs:__x86_tss64_t_cpu_OFFSET, %rax + movq ___cpu_t_current_OFFSET(%rax), %rax + movq _thread_offset_to_ptables(%rax), %rax + movq %rax, %cr3 +#endif swapgs sysretq diff --git a/arch/x86/core/offsets/intel64_offsets.c b/arch/x86/core/offsets/intel64_offsets.c index 398d4bf4659..8edc8eb8b50 100644 --- a/arch/x86/core/offsets/intel64_offsets.c +++ b/arch/x86/core/offsets/intel64_offsets.c @@ -26,14 +26,18 @@ GEN_OFFSET_SYM(_thread_arch_t, sse); #ifdef CONFIG_USERSPACE GEN_OFFSET_SYM(_thread_arch_t, ss); GEN_OFFSET_SYM(_thread_arch_t, cs); -#endif +GEN_OFFSET_SYM(_thread_arch_t, psp); +GEN_OFFSET_SYM(_thread_arch_t, ptables); +#endif /* CONFIG_USERSPACE */ GEN_OFFSET_SYM(x86_tss64_t, ist1); +GEN_OFFSET_SYM(x86_tss64_t, ist2); +GEN_OFFSET_SYM(x86_tss64_t, ist7); GEN_OFFSET_SYM(x86_tss64_t, cpu); #ifdef CONFIG_USERSPACE GEN_OFFSET_SYM(x86_tss64_t, psp); GEN_OFFSET_SYM(x86_tss64_t, usp); -#endif +#endif /* CONFIG_USERSPACE */ GEN_ABSOLUTE_SYM(__X86_TSS64_SIZEOF, sizeof(x86_tss64_t)); GEN_OFFSET_SYM(x86_cpuboot_t, ready); diff --git a/arch/x86/core/userspace.c b/arch/x86/core/userspace.c index 1bd98d5a08f..4b772348795 100644 --- a/arch/x86/core/userspace.c +++ b/arch/x86/core/userspace.c @@ -25,17 +25,6 @@ static inline void page_tables_set(struct x86_page_tables *ptables) #endif } -/* Set initial stack pointer for privilege mode elevations */ -static inline void set_initial_psp(char *psp) -{ -#ifdef CONFIG_X86_64 - __asm__ volatile("movq %0, %%gs:__x86_tss64_t_psp_OFFSET\n\t" - : : "r" (psp)); -#else - _main_tss.esp0 = (uintptr_t)psp; -#endif -} - /* Update the to the incoming thread's page table, and update the location of * the privilege elevation stack. * @@ -49,16 +38,21 @@ static inline void set_initial_psp(char *psp) * * We don't need to update the privilege mode initial stack pointer either, * privilege elevation always lands on the trampoline stack and the irq/sycall - * code has to manually transition off of it to the thread's kernel stack after + * code has to manually transition off of it to the appropriate stack after * switching page tables. */ void z_x86_swap_update_page_tables(struct k_thread *incoming) { struct x86_page_tables *ptables; +#ifndef CONFIG_X86_64 + /* 64-bit uses syscall/sysret which switches stacks manually, + * tss64.psp is updated unconditionally in __resume + */ if ((incoming->base.user_options & K_USER) != 0) { - set_initial_psp(incoming->arch.psp); + _main_tss.esp0 = (uintptr_t)incoming->arch.psp; } +#endif /* Check first that we actually need to do this, since setting * CR3 involves an expensive full TLB flush. @@ -87,34 +81,6 @@ FUNC_NORETURN static void drop_to_user(k_thread_entry_t user_entry, CODE_UNREACHABLE; } -/* Does the following: - * - * - Initialize per-thread page tables and update thread->arch.ptables to - * point to them. - * - Set thread->arch.psp to point to the initial stack pointer for user - * mode privilege elevation for system calls; supervisor mode threads leave - * this uninitailized. 
- */ -static void prepare_user_thread(struct k_thread *thread) -{ - struct z_x86_thread_stack_header *header = - (struct z_x86_thread_stack_header *)thread->stack_obj; - - __ASSERT((thread->base.user_options & K_USER) != 0, - "not a user thread"); - - /* Create and program into the MMU the per-thread page tables */ - z_x86_thread_pt_init(thread); - - thread->arch.psp = - header->privilege_stack + sizeof(header->privilege_stack); -} - -static void prepare_supervisor_thread(struct k_thread *thread) -{ - thread->arch.ptables = &z_x86_kernel_ptables; -} - /* Preparation steps needed for all threads if user mode is turned on. * * Returns the initial entry point to swap into. @@ -122,12 +88,17 @@ static void prepare_supervisor_thread(struct k_thread *thread) void *z_x86_userspace_prepare_thread(struct k_thread *thread) { void *initial_entry; + struct z_x86_thread_stack_header *header = + (struct z_x86_thread_stack_header *)thread->stack_obj; + + thread->arch.psp = + header->privilege_stack + sizeof(header->privilege_stack); if ((thread->base.user_options & K_USER) != 0U) { - prepare_user_thread(thread); + z_x86_thread_pt_init(thread); initial_entry = drop_to_user; } else { - prepare_supervisor_thread(thread); + thread->arch.ptables = &z_x86_kernel_ptables; initial_entry = z_thread_entry; } @@ -137,7 +108,7 @@ void *z_x86_userspace_prepare_thread(struct k_thread *thread) FUNC_NORETURN void arch_user_mode_enter(k_thread_entry_t user_entry, void *p1, void *p2, void *p3) { - prepare_user_thread(_current); + z_x86_thread_pt_init(_current); /* Apply memory domain configuration, if assigned. Threads that * started in user mode already had this done via z_setup_new_thread() diff --git a/arch/x86/core/x86_mmu.c b/arch/x86/core/x86_mmu.c index 27dfcdacb5a..4a205762f18 100644 --- a/arch/x86/core/x86_mmu.c +++ b/arch/x86/core/x86_mmu.c @@ -663,8 +663,13 @@ extern char z_shared_kernel_page_start[]; static inline bool is_within_system_ram(uintptr_t addr) { +#ifdef CONFIG_X86_64 + /* FIXME: locore not included in CONFIG_SRAM_BASE_ADDRESS */ + return addr < (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U)); +#else return (addr >= DT_PHYS_RAM_ADDR) && (addr < (DT_PHYS_RAM_ADDR + (DT_RAM_SIZE * 1024U))); +#endif } /* Ignored bit posiition at all levels */ @@ -758,9 +763,17 @@ static void add_mmu_region_page(struct x86_page_tables *ptables, #ifdef CONFIG_X86_KPTI if (user_table && (flags & Z_X86_MMU_US) == 0 && +#ifdef CONFIG_X86_64 + addr >= (uintptr_t)&_lodata_start && +#endif addr != (uintptr_t)(&z_shared_kernel_page_start)) { /* All non-user accessible pages except the shared page * are marked non-present in the page table. + * + * For x86_64 we also make the locore text/rodata areas + * present even though they don't have user mode access, + * they contain necessary tables and program text for + * successfully handling exceptions and interrupts. 
*/ return; } diff --git a/arch/x86/include/intel64/kernel_arch_data.h b/arch/x86/include/intel64/kernel_arch_data.h index bf8f7740a41..c46cddadefc 100644 --- a/arch/x86/include/intel64/kernel_arch_data.h +++ b/arch/x86/include/intel64/kernel_arch_data.h @@ -36,4 +36,8 @@ extern u8_t x86_cpu_loapics[]; /* CPU logical ID -> local APIC ID */ #endif /* _ASMLANGUAGE */ +#ifdef CONFIG_X86_KPTI +#define Z_X86_TRAMPOLINE_STACK_SIZE 128 +#endif + #endif /* ZEPHYR_ARCH_X86_INCLUDE_INTEL64_KERNEL_ARCH_DATA_H_ */ diff --git a/boards/x86/qemu_x86/Kconfig.board b/boards/x86/qemu_x86/Kconfig.board index b56e114d246..6671686fa5f 100644 --- a/boards/x86/qemu_x86/Kconfig.board +++ b/boards/x86/qemu_x86/Kconfig.board @@ -11,5 +11,4 @@ config BOARD_QEMU_X86_64 bool "QEMU x86_64" depends on SOC_IA32 select QEMU_TARGET - select X86_NO_MELTDOWN # Until KPTI is enabled, allow testing select X86_64 diff --git a/include/arch/x86/intel64/linker.ld b/include/arch/x86/intel64/linker.ld index 3b88637c829..3524ddda025 100644 --- a/include/arch/x86/intel64/linker.ld +++ b/include/arch/x86/intel64/linker.ld @@ -36,8 +36,35 @@ SECTIONS *(.lorodata) MMU_PAGE_ALIGN _lodata_start = .; - *(.tss) + *(.lodata) + +#ifdef CONFIG_X86_KPTI + /* Special page containing supervisor data that is still mapped in + * user mode page tables. GDT, TSSes, trampoline stack, and + * any LDT must go here as they always must live in a page that is + * marked 'present'. Still not directly user accessible, but + * no sensitive data should be here as Meltdown exploits may read it. + * + * On x86-64 the IDT is in rodata and doesn't need to be in the + * trampoline page. + */ + MMU_PAGE_ALIGN + z_shared_kernel_page_start = .; +#endif /* CONFIG_X86_KPTI */ + + *(.tss) + *(.gdt) + +#ifdef CONFIG_X86_KPTI + *(.trampolines) + MMU_PAGE_ALIGN + z_shared_kernel_page_end = .; + + ASSERT(z_shared_kernel_page_end - z_shared_kernel_page_start == 4096, + "shared kernel area is not one memory page"); +#endif /* CONFIG_X86_KPTI */ + MMU_PAGE_ALIGN _lodata_end = .; } diff --git a/include/arch/x86/intel64/thread.h b/include/arch/x86/intel64/thread.h index f5fb48e39fc..b5d0e7605e3 100644 --- a/include/arch/x86/intel64/thread.h +++ b/include/arch/x86/intel64/thread.h @@ -84,7 +84,7 @@ struct x86_tss64 { /* Storage area for user mode stack pointer when doing a syscall */ char *usp; -#endif +#endif /* CONFIG_USERSPACE */ } __packed __aligned(8); typedef struct x86_tss64 x86_tss64_t;