arch/x86: add SSE floating-point to Intel64 subarch

This is a naive implementation which does "eager" context switching
for floating-point context, which, of course, introduces performance
concerns. Other approaches have security concerns, SMP implications,
and impact the x86 arch and Zephyr project as a whole. Discussion is
needed, so punting with the straightforward solution for now.

Signed-off-by: Charles E. Youse <charles.youse@intel.com>
This commit is contained in:
Charles E. Youse 2019-07-24 11:40:24 -07:00 committed by Andrew Boie
commit a5eea17dda
7 changed files with 67 additions and 3 deletions

View file

@@ -53,11 +53,12 @@ __start:
movl $(exception_stack + CONFIG_EXCEPTION_STACK_SIZE), %esp
/* transition to long mode, by the book. */
/* transition to long mode. along the way, we enable SSE. */
movl %cr4, %eax /* enable PAE */
orl $CR4_PAE, %eax
movl %cr4, %eax /* enable PAE and SSE */
orl $(CR4_PAE | CR4_OSFXSR), %eax
movl %eax, %cr4
clts
movl $pml4, %eax /* load page base */
movl %eax, %cr3
@@ -105,12 +106,38 @@ __start:
1:
#endif /* CONFIG_X86_MULTIBOOT_INFO */
/*
* set up SSE in case something uses the floating-point unit during
* early initialization (either directly, or if GCC gets clever)
*/
xorl %edi, %edi
call x86_sse_init
/* don't replace CALL with JMP; honor the ABI stack alignment! */
call z_cstart
stop: jmp stop
/*
 * void x86_sse_init(struct k_thread *thread);
 *
 * Initialize floating-point state to something sane. If 'thread' is
 * not NULL, then the resulting FP state is saved to thread->arch.sse.
 *
 * ABI: SysV AMD64 — %rdi = thread (may be NULL, as in the early-boot
 * call site above which passes 0 via "xorl %edi, %edi").
 * Clobbers: x87/SSE state (by design), flags. No stack usage beyond
 * the return address, so alignment on entry is whatever the caller had.
 */
.global x86_sse_init
x86_sse_init:
fninit /* reset x87: control word, status, tag word, FP exceptions */
ldmxcsr mxcsr /* load MXCSR from the constant word below (X86_MXCSR_SANE,
               * which leaves divide-by-zero unmasked). NOTE(review):
               * absolute, not RIP-relative, addressing — fine for this
               * non-PIC kernel image, but worth confirming if relocated. */
testq %rdi, %rdi /* thread == NULL? then nothing to snapshot */
jz 1f
fxsave _thread_offset_to_sse(%rdi) /* store the freshly-initialized FP/SSE
                                    * context into thread->arch.sse; the
                                    * buffer is __aligned(X86_FXSAVE_ALIGN)
                                    * = 16 as FXSAVE requires */
1: retq
mxcsr: .long X86_MXCSR_SANE /* 32-bit MXCSR image consumed by ldmxcsr above */
/*
* FIXME: The multiboot header is identical (for obvious reasons) to the
* version in ia32/crt0.S. They should be refactored into a common file.
@@ -188,6 +215,7 @@ __resume:
testb $_THREAD_SWAPPED, _thread_offset_to_thread_state(%rsi)
jnz 1f
fxrstor _thread_offset_to_sse(%rsi)
movq _thread_offset_to_rcx(%rsi), %rcx
movq _thread_offset_to_rdx(%rsi), %rdx
movq _thread_offset_to_rdi(%rsi), %rdi
@@ -403,6 +431,18 @@ irq:
1: cmpl $1, _kernel_offset_to_nested(%rsi)
je irq_enter_unnested
/*
* if we're a nested interrupt, we have to dump the state to the
* stack. we play some games here to re-arrange the stack thusly:
*
* SS RSP RFLAGS CS RIP RAX RSI RCX RDX RBX
* RDI RBP R8 R9 R10 R11 R12 R13 R14 R15
* X86_FXSAVE_SIZE bytes of SSE data <-- RSP points here
*
* note that the final value of RSP must be 16-byte aligned here,
* both to satisfy FXSAVE/FXRSTOR but also to honor the C ABI.
*/
irq_enter_nested: /* Nested IRQ: dump register state to stack. */
pushq %rcx
movq 16(%rsp), %rcx /* RCX = vector */
@@ -419,11 +459,14 @@ irq_enter_nested: /* Nested IRQ: dump register state to stack. */
pushq %r13
pushq %r14
pushq %r15
subq $X86_FXSAVE_SIZE, %rsp
fxsave (%rsp)
jmp irq_dispatch
irq_enter_unnested: /* Not nested: dump state to thread struct for __resume */
movq _kernel_offset_to_current(%rsi), %rsi
andb $(~_THREAD_SWAPPED), _thread_offset_to_thread_state(%rsi)
fxsave _thread_offset_to_sse(%rsi)
movq %rbx, _thread_offset_to_rbx(%rsi)
movq %rbp, _thread_offset_to_rbp(%rsi)
movq %r12, _thread_offset_to_r12(%rsi)
@@ -469,6 +512,8 @@ irq_dispatch:
jz __resume /* not nested, just __resume (might change threads) */
irq_exit_nested:
fxrstor (%rsp)
addq $X86_FXSAVE_SIZE, %rsp
popq %r15
popq %r14
popq %r13

View file

@@ -8,6 +8,8 @@
#include <kernel_structs.h>
#include <kernel_internal.h>
extern void x86_sse_init(struct k_thread *); /* in locore.S */
void z_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
size_t stack_size, k_thread_entry_t entry,
void *parameter1, void *parameter2, void *parameter3,
@@ -26,4 +28,6 @@ void z_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
thread->arch.rsi = (long) parameter1;
thread->arch.rdx = (long) parameter2;
thread->arch.rcx = (long) parameter3;
x86_sse_init(thread);
}

View file

@@ -26,3 +26,4 @@ GEN_OFFSET_SYM(_thread_arch_t, r8);
GEN_OFFSET_SYM(_thread_arch_t, r9);
GEN_OFFSET_SYM(_thread_arch_t, r10);
GEN_OFFSET_SYM(_thread_arch_t, r11);
GEN_OFFSET_SYM(_thread_arch_t, sse);

View file

@@ -6,6 +6,14 @@
#ifndef ZEPHYR_ARCH_X86_INCLUDE_INTEL64_KERNEL_ARCH_DATA_H_
#define ZEPHYR_ARCH_X86_INCLUDE_INTEL64_KERNEL_ARCH_DATA_H_
/*
* Some SSE definitions. Ideally these will ultimately be shared with 32-bit.
*/
#define X86_FXSAVE_SIZE 512 /* size and alignment of buffer ... */
#define X86_FXSAVE_ALIGN 16 /* ... for FXSAVE/FXRSTOR ops */
#define X86_MXCSR_SANE 0x1dc0 /* enable division-by-zero exception */
/*
* A flag for k_thread.thread_state to tell __resume that the thread
* voluntarily switched itself out, so only a portion of the register

View file

@@ -9,6 +9,7 @@
#ifndef _ASMLANGUAGE
#include <zephyr/types.h>
#include <kernel_arch_data.h>
/*
* The _callee_saved registers are unconditionally saved/restored across
@@ -41,6 +42,7 @@ struct _thread_arch {
u64_t r9;
u64_t r10;
u64_t r11;
char __aligned(X86_FXSAVE_ALIGN) sse[X86_FXSAVE_SIZE];
};
typedef struct _thread_arch _thread_arch_t;

View file

@@ -62,4 +62,7 @@
#define _thread_offset_to_r11 \
(___thread_t_arch_OFFSET + ___thread_arch_t_r11_OFFSET)
#define _thread_offset_to_sse \
(___thread_t_arch_OFFSET + ___thread_arch_t_sse_OFFSET)
#endif /* ZEPHYR_ARCH_X86_INCLUDE_INTEL64_OFFSETS_SHORT_ARCH_H_ */

View file

@@ -49,6 +49,7 @@
#define CR0_PG 0x80000000 /* enable paging */
#define CR0_WP 0x00010000 /* honor W bit even when supervisor */
#define CR4_PAE 0x00000020 /* enable PAE */
#define CR4_OSFXSR 0x00000200 /* enable SSE (OS FXSAVE/RSTOR) */
#ifdef CONFIG_X86_LONGMODE
#include <intel64/kernel_arch_data.h>