diff --git a/arch/x86/core/intel64/locore.S b/arch/x86/core/intel64/locore.S index 1dc965a7727..e0936836026 100644 --- a/arch/x86/core/intel64/locore.S +++ b/arch/x86/core/intel64/locore.S @@ -53,11 +53,12 @@ __start: movl $(exception_stack + CONFIG_EXCEPTION_STACK_SIZE), %esp - /* transition to long mode, by the book. */ + /* transition to long mode. along the way, we enable SSE. */ - movl %cr4, %eax /* enable PAE */ - orl $CR4_PAE, %eax + movl %cr4, %eax /* enable PAE and SSE */ + orl $(CR4_PAE | CR4_OSFXSR), %eax movl %eax, %cr4 + clts movl $pml4, %eax /* load page base */ movl %eax, %cr3 @@ -105,12 +106,38 @@ __start: 1: #endif /* CONFIG_X86_MULTIBOOT_INFO */ + /* + * set up SSE in case something uses the floating-point unit during + * early initialization (either directly, or if GCC gets clever) + */ + + xorl %edi, %edi + call x86_sse_init + /* don't replace CALL with JMP; honor the ABI stack alignment! */ call z_cstart stop: jmp stop +/* + * void x86_sse_init(struct k_thread *thread); + * + * Reset the x87 FPU (FNINIT) and load MXCSR with X86_MXCSR_SANE. If 'thread' + * is not NULL, the resulting FP state is then FXSAVEd to thread->arch.sse. + */ + +.global x86_sse_init +x86_sse_init: + fninit + ldmxcsr mxcsr + testq %rdi, %rdi + jz 1f + fxsave _thread_offset_to_sse(%rdi) +1: retq + +mxcsr: .long X86_MXCSR_SANE + /* * FIXME: The multiboot header is identical (for obvious reasons) to the * version in ia32/crt0.S. They should be refactored into a common file. @@ -188,6 +215,7 @@ __resume: testb $_THREAD_SWAPPED, _thread_offset_to_thread_state(%rsi) jnz 1f + fxrstor _thread_offset_to_sse(%rsi) movq _thread_offset_to_rcx(%rsi), %rcx movq _thread_offset_to_rdx(%rsi), %rdx movq _thread_offset_to_rdi(%rsi), %rdi @@ -403,6 +431,18 @@ irq: 1: cmpl $1, _kernel_offset_to_nested(%rsi) je irq_enter_unnested + /* + * if we're a nested interrupt, we have to dump the state to the + * stack. 
we play some games here to re-arrange the stack thusly: + * + * SS RSP RFLAGS CS RIP RAX RSI RCX RDX RBX + * RDI RBP R8 R9 R10 R11 R12 R13 R14 R15 + * X86_FXSAVE_SIZE bytes of SSE data <-- RSP points here + * + * note that the final value of RSP must be 16-byte aligned here, + * both to satisfy FXSAVE/FXRSTOR and to honor the C ABI. + */ + irq_enter_nested: /* Nested IRQ: dump register state to stack. */ pushq %rcx movq 16(%rsp), %rcx /* RCX = vector */ @@ -419,11 +459,14 @@ irq_enter_nested: /* Nested IRQ: dump register state to stack. */ pushq %r13 pushq %r14 pushq %r15 + subq $X86_FXSAVE_SIZE, %rsp + fxsave (%rsp) jmp irq_dispatch irq_enter_unnested: /* Not nested: dump state to thread struct for __resume */ movq _kernel_offset_to_current(%rsi), %rsi andb $(~_THREAD_SWAPPED), _thread_offset_to_thread_state(%rsi) + fxsave _thread_offset_to_sse(%rsi) movq %rbx, _thread_offset_to_rbx(%rsi) movq %rbp, _thread_offset_to_rbp(%rsi) movq %r12, _thread_offset_to_r12(%rsi) @@ -469,6 +512,8 @@ irq_dispatch: jz __resume /* not nested, just __resume (might change threads) */ irq_exit_nested: + fxrstor (%rsp) + addq $X86_FXSAVE_SIZE, %rsp popq %r15 popq %r14 popq %r13 diff --git a/arch/x86/core/intel64/thread.c b/arch/x86/core/intel64/thread.c index 033a7c44bc2..9a222bb45b4 100644 --- a/arch/x86/core/intel64/thread.c +++ b/arch/x86/core/intel64/thread.c @@ -8,6 +8,8 @@ #include #include +extern void x86_sse_init(struct k_thread *); /* in locore.S */ + void z_new_thread(struct k_thread *thread, k_thread_stack_t *stack, size_t stack_size, k_thread_entry_t entry, void *parameter1, void *parameter2, void *parameter3, @@ -26,4 +28,6 @@ void z_new_thread(struct k_thread *thread, k_thread_stack_t *stack, thread->arch.rsi = (long) parameter1; thread->arch.rdx = (long) parameter2; thread->arch.rcx = (long) parameter3; + + x86_sse_init(thread); } diff --git a/arch/x86/core/offsets/intel64_offsets.c b/arch/x86/core/offsets/intel64_offsets.c index 7824f92cade..74d864d6fc4 100644 --- 
a/arch/x86/core/offsets/intel64_offsets.c +++ b/arch/x86/core/offsets/intel64_offsets.c @@ -26,3 +26,4 @@ GEN_OFFSET_SYM(_thread_arch_t, r8); GEN_OFFSET_SYM(_thread_arch_t, r9); GEN_OFFSET_SYM(_thread_arch_t, r10); GEN_OFFSET_SYM(_thread_arch_t, r11); +GEN_OFFSET_SYM(_thread_arch_t, sse); diff --git a/arch/x86/include/intel64/kernel_arch_data.h b/arch/x86/include/intel64/kernel_arch_data.h index eeb9d313bad..c551cd32cf3 100644 --- a/arch/x86/include/intel64/kernel_arch_data.h +++ b/arch/x86/include/intel64/kernel_arch_data.h @@ -6,6 +6,14 @@ #ifndef ZEPHYR_ARCH_X86_INCLUDE_INTEL64_KERNEL_ARCH_DATA_H_ #define ZEPHYR_ARCH_X86_INCLUDE_INTEL64_KERNEL_ARCH_DATA_H_ +/* + * Some SSE definitions. Ideally these will ultimately be shared with 32-bit. + */ + +#define X86_FXSAVE_SIZE 512 /* size and alignment of buffer ... */ +#define X86_FXSAVE_ALIGN 16 /* ... for FXSAVE/FXRSTOR ops */ +#define X86_MXCSR_SANE 0x1dc0 /* DAZ on; unmask divide-by-zero only */ + /* * A flag for k_thread.thread_state to tell __resume that the thread * voluntarily switched itself out, so only a portion of the register diff --git a/arch/x86/include/intel64/kernel_arch_thread.h b/arch/x86/include/intel64/kernel_arch_thread.h index 4e28675f6ac..07d47a5faad 100644 --- a/arch/x86/include/intel64/kernel_arch_thread.h +++ b/arch/x86/include/intel64/kernel_arch_thread.h @@ -9,6 +9,7 @@ #ifndef _ASMLANGUAGE #include +#include /* * The _callee_saved registers are unconditionally saved/restored across @@ -41,6 +42,7 @@ struct _thread_arch { u64_t r9; u64_t r10; u64_t r11; + char __aligned(X86_FXSAVE_ALIGN) sse[X86_FXSAVE_SIZE]; }; typedef struct _thread_arch _thread_arch_t; diff --git a/arch/x86/include/intel64/offsets_short_arch.h b/arch/x86/include/intel64/offsets_short_arch.h index c186fecba3b..24b97c8ba37 100644 --- a/arch/x86/include/intel64/offsets_short_arch.h +++ b/arch/x86/include/intel64/offsets_short_arch.h @@ -62,4 +62,7 @@ #define _thread_offset_to_r11 \ (___thread_t_arch_OFFSET + 
___thread_arch_t_r11_OFFSET) +#define _thread_offset_to_sse \ + (___thread_t_arch_OFFSET + ___thread_arch_t_sse_OFFSET) + #endif /* ZEPHYR_ARCH_X86_INCLUDE_INTEL64_OFFSETS_SHORT_ARCH_H_ */ diff --git a/arch/x86/include/kernel_arch_data.h b/arch/x86/include/kernel_arch_data.h index 4bb85677028..c4b8ee29b61 100644 --- a/arch/x86/include/kernel_arch_data.h +++ b/arch/x86/include/kernel_arch_data.h @@ -49,6 +49,7 @@ #define CR0_PG 0x80000000 /* enable paging */ #define CR0_WP 0x00010000 /* honor W bit even when supervisor */ #define CR4_PAE 0x00000020 /* enable PAE */ +#define CR4_OSFXSR 0x00000200 /* enable SSE (OS FXSAVE/FXRSTOR) */ #ifdef CONFIG_X86_LONGMODE #include