From 88ba97fea45beb00a91239611171e65b12dae15c Mon Sep 17 00:00:00 2001
From: Bradley Bolen
Date: Thu, 10 Dec 2020 08:45:20 -0500
Subject: [PATCH] arch: arm: aarch32: cortex_a_r: Add shared FPU support

This adds lazy floating point context switching.  On svc/irq entrance,
the VFP is disabled and a pointer to the exception stack frame is saved
away.  If the esf pointer is still valid on exception exit, then no
other context used the VFP, so the context is still valid and nothing
needs to be restored.  If the esf pointer is NULL on exception exit,
then some other context used the VFP and the floating point context is
restored from the esf.

The undefined instruction handler is responsible for saving away the
floating point context if needed.  If the handler is in the first
irq/svc context and the current thread uses the VFP, then the float
context needs to be saved.  Also, if the handler is in a nested context
and the previous context was using the VFP, save the float context.

Signed-off-by: Bradley Bolen
---
 arch/arm/core/aarch32/cortex_a_r/exc.S      |  77 ++++++++++++++-
 arch/arm/core/aarch32/cortex_a_r/exc_exit.S |  53 ++++++++++
 arch/arm/core/aarch32/cortex_a_r/fault.c    |  88 +++++++++++++++++
 arch/arm/core/aarch32/isr_wrapper.S         |  25 +++++
 arch/arm/core/aarch32/swap_helper.S         | 102 +++++++++++++++++++-
 arch/arm/core/aarch32/thread.c              |  15 ++-
 include/zephyr/kernel_structs.h             |   4 +
 kernel/include/kernel_offsets.h             |   4 +
 kernel/include/offsets_short.h              |   5 +
 9 files changed, 363 insertions(+), 10 deletions(-)

diff --git a/arch/arm/core/aarch32/cortex_a_r/exc.S b/arch/arm/core/aarch32/cortex_a_r/exc.S
index ee407ed7166..8017d0c855d 100644
--- a/arch/arm/core/aarch32/cortex_a_r/exc.S
+++ b/arch/arm/core/aarch32/cortex_a_r/exc.S
@@ -30,6 +30,9 @@
 
 _ASM_FILE_PROLOGUE
 
+#if defined(CONFIG_FPU_SHARING)
+GTEXT(z_arm_fault_undef_instruction_fp)
+#endif
 GTEXT(z_arm_fault_undef_instruction)
 GTEXT(z_arm_fault_prefetch)
 GTEXT(z_arm_fault_data)
@@ -47,6 +50,19 @@ GTEXT(z_arm_data_abort)
 	stmfd sp, {r0-r3, r12, lr}^
 	sub sp, #24
 
+#if defined(CONFIG_FPU_SHARING)
+	sub sp, #___fpu_t_SIZEOF
+
+	vmrs r1, fpexc
+	mov r0, #FPEXC_EN
+	vmsr fpexc, r0
+	vmrs r0, fpscr
+
+	mov r2, sp
+	vstmia r2!, {s0-s15}
+	stm r2, {r0, r1}
+#endif
+
 #if defined(CONFIG_EXTRA_EXCEPTION_INFO)
 	/* Pointer to extra esf info */
 	sub sp, #___extra_esf_info_t_SIZEOF
@@ -100,7 +116,56 @@ SECTION_SUBSEC_FUNC(TEXT, __exc, z_arm_undef_instruction)
 	subne lr, #2	/* Thumb (T_BIT) */
 	pop {r0}
 
-	exception_entry MODE_UND
+	/*
+	 * Store r0-r3, r12, lr, lr_und and spsr_und into the stack to
+	 * construct an exception stack frame.
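+	 * (srsdb pushes lr_und and spsr_und; the stmfd variant with ^
+	 * stores the user-mode registers and cannot use writeback, which
+	 * is why sp is adjusted manually afterwards.)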
+	 */
+	srsdb sp!, #MODE_UND
+	stmfd sp, {r0-r3, r12, lr}^
+	sub sp, #24
+
+	/* Increment exception nesting count */
+	ldr r2, =_kernel
+	ldr r1, [r2, #_kernel_offset_to_nested]
+	add r1, r1, #1
+	str r1, [r2, #_kernel_offset_to_nested]
+
+#if defined(CONFIG_FPU_SHARING)
+	sub sp, #___fpu_t_SIZEOF
+
+	bl z_arm_fault_undef_instruction_fp
+	cmp r0, #0
+	beq z_arm_exc_exit
+
+	vmrs r1, fpexc
+	mov r0, #FPEXC_EN
+	vmsr fpexc, r0
+	vmrs r0, fpscr
+
+	mov r2, sp
+	vstmia r2!, {s0-s15}
+	stm r2, {r0, r1}
+#endif
+
+#if defined(CONFIG_EXTRA_EXCEPTION_INFO)
+	/* Pointer to extra esf info */
+	sub sp, #___extra_esf_info_t_SIZEOF
+	mov r0, #0
+	str r0, [sp, #4]
+	str r0, [sp, #8]
+
+	sub r1, sp, #___callee_saved_t_SIZEOF
+	str r1, [sp]
+	cps #MODE_SYS
+	stm r1, {r4-r11, sp}
+	cps #MODE_UND
+
+	mov r0, sp
+	mov sp, r1
+#else
+	mov r0, sp
+#endif
 
 	bl z_arm_fault_undef_instruction
 
 	exception_exit
@@ -125,6 +190,12 @@ SECTION_SUBSEC_FUNC(TEXT, __exc, z_arm_prefetch_abort)
 
 	b z_arm_exc_exit
 
+#if defined(CONFIG_FPU_SHARING)
+#define FPU_SF_SIZE ___fpu_t_SIZEOF
+#else
+#define FPU_SF_SIZE 0
+#endif
+
 /**
  * @brief Data abort exception handler
  *
@@ -148,10 +219,10 @@ SECTION_SUBSEC_FUNC(TEXT, __exc, z_arm_data_abort)
 	 * the true esf from the one passed to z_arm_fault_data.
 	 */
 	cmp r0, #0
-	ldreq r1, [sp, #24]
+	ldreq r1, [sp, #24 + FPU_SF_SIZE]
 
 	exception_exit
 
-	streq r1, [sp, #24]
+	streq r1, [sp, #24 + FPU_SF_SIZE]
 
 	b z_arm_exc_exit
 
diff --git a/arch/arm/core/aarch32/cortex_a_r/exc_exit.S b/arch/arm/core/aarch32/cortex_a_r/exc_exit.S
index 438f763be95..43d4183ee58 100644
--- a/arch/arm/core/aarch32/cortex_a_r/exc_exit.S
+++ b/arch/arm/core/aarch32/cortex_a_r/exc_exit.S
@@ -62,6 +62,45 @@ system_thread_exit\@:
 #endif
 .endm
 
+.macro fpu_exc_exit
+#if defined(CONFIG_FPU_SHARING)
+	/*
+	 * If the floating point context pointer is null, then another
+	 * context used the VFP and spilled this context to the exception
+	 * stack frame, so restore the float context from there.
+	 */
+	ldr r2, =_kernel
+	ldr r1, [r2, #_kernel_offset_to_fp_ctx]
+	cmp r1, #0
+	beq vfp_restore\@
+
+	/*
+	 * If leaving the last interrupt context, remove the floating point
+	 * context pointer.
+	 */
+	cmp r0, #0
+	moveq r1, #0
+	streq r1, [r2, #_kernel_offset_to_fp_ctx]
+	b vfp_exit\@
+
+vfp_restore\@:
+	add r3, sp, #___fpu_sf_t_fpscr_OFFSET
+	ldm r3, {r1, r2}
+	tst r2, #FPEXC_EN
+	beq vfp_exit\@
+
+	vmsr fpexc, r2
+	vmsr fpscr, r1
+	vldmia sp, {s0-s15}
+
+vfp_exit\@:
+	/* Leave the VFP disabled when exiting */
+	mov r1, #0
+	vmsr fpexc, r1
+
+	add sp, sp, #___fpu_t_SIZEOF
+#endif
+.endm
+
 /**
  * @brief Kernel housekeeping when exiting interrupt handler installed directly
  * in the vector table
@@ -133,6 +172,11 @@ __EXIT_INT:
 	 * out or they are the args to _new_thread for a new thread.
 	 */
 	cps #MODE_SYS
+
+#if defined(CONFIG_FPU_SHARING)
+	fpu_exc_exit
+#endif
+
 	pop {r0-r3, r12, lr}
 	userspace_exc_exit
 	rfeia sp!
@@ -173,6 +217,9 @@ SECTION_SUBSEC_FUNC(TEXT, _HandlerModeExit, z_arm_exc_exit)
 	 */
 
 	/* Clean up exception stack frame */
+#if defined(CONFIG_FPU_SHARING)
+	add sp, sp, #___fpu_t_SIZEOF
+#endif
 	add sp, #32
 
 	/*
@@ -193,6 +240,9 @@ SECTION_SUBSEC_FUNC(TEXT, _HandlerModeExit, z_arm_exc_exit)
 
 	/* Return to the switched thread */
 	cps #MODE_SYS
+#if defined(CONFIG_FPU_SHARING)
+	fpu_exc_exit
+#endif
 	pop {r0-r3, r12, lr}
 	userspace_exc_exit
 	rfeia sp!
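
For review, the decision the fpu_exc_exit macro above encodes can be
modeled in C roughly as follows.  This is only a sketch: fpu_frame
mirrors the struct __fpu_sf layout this patch introduces, fp_ctx stands
in for _kernel.cpus[0].fp_ctx, and last_irq_ctx stands in for the
r0 == 0 test; the register restore itself is left as a comment.

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    #define FPEXC_EN (1u << 30)

    /* Mirrors the s0-s15/fpscr/fpexc layout saved on irq/svc entry. */
    struct fpu_frame {
            uint32_t s[16];
            uint32_t fpscr;
            uint32_t fpexc;
    };

    static struct fpu_frame *fp_ctx; /* models _kernel.cpus[0].fp_ctx */

    static void fpu_exc_exit_model(struct fpu_frame *frame, bool last_irq_ctx)
    {
            if (fp_ctx == NULL) {
                    /*
                     * Another context used the VFP, so the interrupted
                     * context was spilled into the frame; reload it only
                     * if it was live (FPEXC_EN set when it was saved).
                     */
                    if (frame->fpexc & FPEXC_EN) {
                            /* vmsr fpexc/fpscr, vldmia {s0-s15} */
                    }
            } else if (last_irq_ctx) {
                    /*
                     * Nothing touched the VFP while this frame was
                     * current: the registers are still valid, so just
                     * drop the spill pointer.
                     */
                    fp_ctx = NULL;
            }

            /* The VFP is always left disabled on exception exit. */
    }
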
@@ -203,6 +253,9 @@ __EXIT_EXC:
 	sub r0, r0, #1
 	str r0, [r3, #_kernel_offset_to_nested]
 
+#if defined(CONFIG_FPU_SHARING)
+	add sp, sp, #___fpu_t_SIZEOF
+#endif
 	/*
 	 * Restore r0-r3, r12, lr, lr_und and spsr_und from the exception stack
 	 * and return to the current thread.
diff --git a/arch/arm/core/aarch32/cortex_a_r/fault.c b/arch/arm/core/aarch32/cortex_a_r/fault.c
index f4a77fe8bed..ae037f75d56 100644
--- a/arch/arm/core/aarch32/cortex_a_r/fault.c
+++ b/arch/arm/core/aarch32/cortex_a_r/fault.c
@@ -86,6 +86,80 @@ static void dump_fault(uint32_t status, uint32_t addr)
 }
 #endif
 
+#if defined(CONFIG_FPU_SHARING)
+/**
+ * @brief FPU undefined instruction fault handler
+ *
+ * @return Returns true if the FPU is already enabled
+ *           implying a true undefined instruction
+ *         Returns false if the FPU was disabled
+ */
+bool z_arm_fault_undef_instruction_fp(void)
+{
+	/*
+	 * Assume this is a floating point instruction that faulted because
+	 * the FP unit was disabled.  Enable the FP unit and try again.  If
+	 * the FP was already enabled then this was an actual undefined
+	 * instruction.
+	 */
+	if (__get_FPEXC() & FPEXC_EN) {
+		return true;
+	}
+
+	__set_FPEXC(FPEXC_EN);
+
+	if (_kernel.cpus[0].nested > 1) {
+		/*
+		 * If the nested count is greater than 1, the undefined
+		 * instruction exception came from an irq/svc context.  (The
+		 * irq/svc handler would have the nested count at 1 and then
+		 * the undef exception would increment it to 2.)
+		 */
+		struct __fpu_sf *spill_esf =
+			(struct __fpu_sf *)_kernel.cpus[0].fp_ctx;
+
+		if (spill_esf == NULL) {
+			return false;
+		}
+
+		_kernel.cpus[0].fp_ctx = NULL;
+
+		/*
+		 * If the nested count is 2 and the current thread has used the
+		 * VFP (whether or not it was actually using the VFP before the
+		 * current exception) OR if the nested count is greater than 2
+		 * and the VFP was enabled on the irq/svc entrance for the
+		 * saved exception stack frame, then save the floating point
+		 * context because it is about to be overwritten.
+		 */
+		if (((_kernel.cpus[0].nested == 2)
+		     && (_current->base.user_options & K_FP_REGS))
+		    || ((_kernel.cpus[0].nested > 2)
+			&& (spill_esf->undefined & FPEXC_EN))) {
+			/*
+			 * Spill VFP registers to the specified exception
+			 * stack frame
+			 */
+			spill_esf->undefined |= FPEXC_EN;
+			spill_esf->fpscr = __get_FPSCR();
+			__asm__ volatile (
+				"vstmia %0, {s0-s15};\n"
+				: : "r" (&spill_esf->s[0])
+				: "memory"
+				);
+		}
+	} else {
+		/*
+		 * If the nested count is one, a thread was the faulting
+		 * context.  Just flag that this thread uses the VFP.  This
+		 * means that a thread that uses the VFP does not have to,
+		 * but should, set K_FP_REGS on thread creation.
+		 */
+		_current->base.user_options |= K_FP_REGS;
+	}
+
+	return false;
+}
+#endif
+
 /**
  * @brief Undefined instruction fault handler
  *
@@ -93,6 +167,20 @@ static void dump_fault(uint32_t status, uint32_t addr)
  */
 bool z_arm_fault_undef_instruction(z_arch_esf_t *esf)
 {
+#if defined(CONFIG_FPU_SHARING)
+	/*
+	 * This is a true undefined instruction and we will be crashing,
+	 * so save away the VFP registers.
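+	 * (z_arm_fault_undef_instruction_fp has already ruled out the
+	 * case of a fault caused by a disabled FP unit at this point.)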
+	 */
+	esf->fpu.undefined = __get_FPEXC();
+	esf->fpu.fpscr = __get_FPSCR();
+	__asm__ volatile (
+		"vstmia %0, {s0-s15};\n"
+		: : "r" (&esf->fpu.s[0])
+		: "memory"
+		);
+#endif
+
 	/* Print fault information */
 	LOG_ERR("***** UNDEFINED INSTRUCTION ABORT *****");
 
diff --git a/arch/arm/core/aarch32/isr_wrapper.S b/arch/arm/core/aarch32/isr_wrapper.S
index 89012709915..1c7027c80ec 100644
--- a/arch/arm/core/aarch32/isr_wrapper.S
+++ b/arch/arm/core/aarch32/isr_wrapper.S
@@ -88,6 +88,31 @@ isr_system_thread:
 	cps #MODE_SYS
 	push {r0-r3, r12, lr}
 
+#if defined(CONFIG_FPU_SHARING)
+	sub sp, sp, #___fpu_t_SIZEOF
+
+	/*
+	 * Note whether this handler was entered with the VFP unit enabled.
+	 * The undefined instruction handler uses this to know whether it
+	 * needs to save the current floating context.
+	 */
+	vmrs r0, fpexc
+	str r0, [sp, #___fpu_t_SIZEOF - 4]
+
+	/* Disable VFP */
+	mov r0, #0
+	vmsr fpexc, r0
+
+	/*
+	 * Mark where to store the floating context for the undefined
+	 * instruction handler
+	 */
+	ldr r2, =_kernel
+	ldr r0, [r2, #_kernel_offset_to_fp_ctx]
+	cmp r0, #0
+	streq sp, [r2, #_kernel_offset_to_fp_ctx]
+#endif /* CONFIG_FPU_SHARING */
+
 	/*
 	 * Use SVC mode stack for predictable interrupt behaviour; running ISRs
 	 * in the SYS/USR mode stack (i.e. interrupted thread stack) leaves the
diff --git a/arch/arm/core/aarch32/swap_helper.S b/arch/arm/core/aarch32/swap_helper.S
index 2f9518af265..7cf61b4d565 100644
--- a/arch/arm/core/aarch32/swap_helper.S
+++ b/arch/arm/core/aarch32/swap_helper.S
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 
 #if defined(CONFIG_CPU_CORTEX_M)
 #include
@@ -126,6 +127,45 @@ out_fp_endif:
 	cps #MODE_SYS
 	stm r0, {r4-r11, sp}
 	cps #MODE_SVC
+
+#if defined(CONFIG_FPU_SHARING)
+	ldrb r0, [r2, #_thread_offset_to_user_options]
+	tst r0, #K_FP_REGS /* _current->base.user_options & K_FP_REGS */
+	beq out_fp_inactive
+
+	mov ip, #FPEXC_EN
+	vmsr fpexc, ip
+
+	/*
+	 * If the float context pointer is not null, then the VFP has not
+	 * been used by another context since this thread last used it.
+	 * Consequently, the caller-saved float registers have not been
+	 * saved away, so write them to the exception stack frame.
+	 */
+	ldr r0, [r1, #_kernel_offset_to_fp_ctx]
+	cmp r0, #0
+	beq out_store_thread_context
+
+	vstmia r0!, {s0-s15}
+	vmrs r3, fpscr
+	stm r0, {r3, ip}
+
+out_store_thread_context:
+	/* Store s16-s31 to thread context */
+	add r0, r2, #_thread_offset_to_preempt_float
+	vstmia r0, {s16-s31}
+
+	mov ip, #0
+	vmsr fpexc, ip
+
+out_fp_inactive:
+	/*
+	 * Any live floating context has now been saved to the exception
+	 * stack frame, so zero out the global pointer to note this.
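+	 * A null pointer tells the exception exit code that the VFP
+	 * registers no longer hold this context and any restore must come
+	 * from the exception stack frame.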
+	 */
+	mov r0, #0
+	str r0, [r1, #_kernel_offset_to_fp_ctx]
+#endif /* CONFIG_FPU_SHARING */
 #else
 #error Unknown ARM architecture
 #endif /* CONFIG_ARMV6_M_ARMV8_M_BASELINE */
@@ -362,6 +402,24 @@ _thread_irq_disabled:
 	ldm r0, {r4-r11, sp}
 	cps #MODE_SVC
 
+#if defined(CONFIG_FPU_SHARING)
+	ldrb r0, [r2, #_thread_offset_to_user_options]
+	tst r0, #K_FP_REGS /* _current->base.user_options & K_FP_REGS */
+	beq in_fp_inactive
+
+	mov r3, #FPEXC_EN
+	vmsr fpexc, r3
+
+	/* Restore s16-s31 from thread context */
+	add r0, r2, #_thread_offset_to_preempt_float
+	vldmia r0, {s16-s31}
+
+	mov r3, #0
+	vmsr fpexc, r3
+
+in_fp_inactive:
+#endif /* CONFIG_FPU_SHARING */
+
 #if defined (CONFIG_ARM_MPU)
 	/* r2 contains k_thread */
 	mov r0, r2
@@ -608,6 +666,12 @@ valid_syscall_id:
 #elif defined(CONFIG_ARMV7_R) || defined(CONFIG_AARCH32_ARMV8_R) \
 	|| defined(CONFIG_ARMV7_A)
 
+#if defined(CONFIG_FPU_SHARING)
+#define FPU_SF_SIZE ___fpu_t_SIZEOF
+#else
+#define FPU_SF_SIZE 0
+#endif
+
 /**
  *
  * @brief Service call handler
@@ -650,7 +714,34 @@ svc_system_thread:
 	srsdb #MODE_SYS!
 	cps #MODE_SYS
 	push {r0-r3, r12, lr}
+
+#if defined(CONFIG_FPU_SHARING)
+	sub sp, sp, #___fpu_t_SIZEOF
+
+	/*
+	 * Note whether this handler was entered with the VFP unit enabled.
+	 * The undefined instruction handler uses this to know whether it
+	 * needs to save the current floating context.
+	 */
+	vmrs r0, fpexc
+	str r0, [sp, #___fpu_t_SIZEOF - 4]
+
+	/* Disable VFP */
+	mov r0, #0
+	vmsr fpexc, r0
+
+	/*
+	 * Mark where to store the floating context for the undefined
+	 * instruction handler
+	 */
+	ldr r2, =_kernel
+	ldr r0, [r2, #_kernel_offset_to_fp_ctx]
+	cmp r0, #0
+	streq sp, [r2, #_kernel_offset_to_fp_ctx]
+#endif /* CONFIG_FPU_SHARING */
+
 	mov ip, sp
+
 	cps #MODE_SVC
 
 /*
@@ -735,7 +826,7 @@ _oops:
  * the SVC.
  *
  * On SVC exception, the USER/SYSTEM stack looks like the following:
- * r0 - r1 - r2 - r3 - r12 - LR - PC - SPSR
+ * { possible FPU space } - r0 - r1 - r2 - r3 - r12 - LR - PC - SPSR
  *
  * Registers look like:
  * r0 - arg1
@@ -748,10 +839,11 @@ _oops:
  * r8 - saved link register
  */
_do_syscall:
-	ldr r8, [ip, #24]	/* grab address of LR from stack frame */
+	/* grab address of LR from stack frame */
+	ldr r8, [ip, #(FPU_SF_SIZE + ___basic_sf_t_pc_OFFSET)]
 
 	/* Make the exception return to system state */
-	ldr r1, [ip, #28]
+	ldr r1, [ip, #(FPU_SF_SIZE + ___basic_sf_t_xpsr_OFFSET)]
 
 	/* If leaving thumb mode, set the return address to thumb mode */
 	tst r1, #T_BIT
@@ -759,14 +851,14 @@ _do_syscall:
 	bic r1, #(MODE_MASK | T_BIT)
 	orr r1, r1, #MODE_SYS
 
-	str r1, [ip, #28]
+	str r1, [ip, #(FPU_SF_SIZE + ___basic_sf_t_xpsr_OFFSET)]
 
 	/*
 	 * Store the address of z_arm_do_syscall for the exit so the exception
 	 * return goes there in system state.
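	 * (The PC and SPSR slots sit above any FPU save area in the frame,
	 * which is why these offsets include FPU_SF_SIZE.)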
*/ ldr r1, =z_arm_do_syscall - str r1, [ip, #24] /* overwrite the LR to point to z_arm_do_syscall */ + str r1, [ip, #(FPU_SF_SIZE + ___basic_sf_t_pc_OFFSET)] /* validate syscall limit, only set priv mode if valid */ ldr ip, =K_SYSCALL_LIMIT diff --git a/arch/arm/core/aarch32/thread.c b/arch/arm/core/aarch32/thread.c index 68032314bd2..0f127f47a64 100644 --- a/arch/arm/core/aarch32/thread.c +++ b/arch/arm/core/aarch32/thread.c @@ -112,6 +112,13 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack, #endif /* CONFIG_COMPILER_ISA_THUMB2 */ #endif /* CONFIG_CPU_CORTEX_M */ +#if !defined(CONFIG_CPU_CORTEX_M) \ + && defined(CONFIG_FPU) && defined(CONFIG_FPU_SHARING) + iframe = (struct __basic_sf *) + ((uintptr_t)iframe - sizeof(struct __fpu_sf)); + memset(iframe, 0, sizeof(struct __fpu_sf)); +#endif + thread->callee_saved.psp = (uint32_t)iframe; thread->arch.basepri = 0; @@ -470,7 +477,11 @@ int arch_float_disable(struct k_thread *thread) thread->base.user_options &= ~K_FP_REGS; +#if defined(CONFIG_CPU_CORTEX_M) __set_CONTROL(__get_CONTROL() & (~CONTROL_FPCA_Msk)); +#else + __set_FPEXC(0); +#endif /* No need to add an ISB barrier after setting the CONTROL * register; arch_irq_unlock() already adds one. @@ -483,7 +494,7 @@ int arch_float_disable(struct k_thread *thread) int arch_float_enable(struct k_thread *thread, unsigned int options) { - /* This is not supported in Cortex-M and Cortex-R does not have FPU */ + /* This is not supported in Cortex-M */ return -ENOTSUP; } #endif /* CONFIG_FPU && CONFIG_FPU_SHARING */ @@ -508,7 +519,7 @@ static void z_arm_prepare_switch_to_main(void) #else __set_FPSCR(0); #endif -#if defined(CONFIG_FPU_SHARING) +#if defined(CONFIG_CPU_CORTEX_M) && defined(CONFIG_FPU_SHARING) /* In Sharing mode clearing FPSCR may set the CONTROL.FPCA flag. */ __set_CONTROL(__get_CONTROL() & (~(CONTROL_FPCA_Msk))); __ISB(); diff --git a/include/zephyr/kernel_structs.h b/include/zephyr/kernel_structs.h index 52f630ba399..cd2e13689bc 100644 --- a/include/zephyr/kernel_structs.h +++ b/include/zephyr/kernel_structs.h @@ -126,6 +126,10 @@ struct _cpu { uint8_t id; +#if defined(CONFIG_FPU_SHARING) + void *fp_ctx; +#endif + #ifdef CONFIG_SMP /* True when _current is allowed to context switch */ uint8_t swap_ok; diff --git a/kernel/include/kernel_offsets.h b/kernel/include/kernel_offsets.h index d7298863c38..df83112c136 100644 --- a/kernel/include/kernel_offsets.h +++ b/kernel/include/kernel_offsets.h @@ -34,6 +34,10 @@ GEN_ABSOLUTE_SYM(___cpu_t_SIZEOF, sizeof(struct _cpu)); GEN_OFFSET_SYM(_kernel_t, cpus); +#if defined(CONFIG_FPU_SHARING) +GEN_OFFSET_SYM(_cpu_t, fp_ctx); +#endif + #if defined(CONFIG_THREAD_MONITOR) GEN_OFFSET_SYM(_kernel_t, threads); #endif diff --git a/kernel/include/offsets_short.h b/kernel/include/offsets_short.h index 4e4125760fe..f9fb901db6b 100644 --- a/kernel/include/offsets_short.h +++ b/kernel/include/offsets_short.h @@ -24,6 +24,11 @@ #define _kernel_offset_to_current \ (___cpu_t_current_OFFSET) + +#if defined(CONFIG_FPU_SHARING) +#define _kernel_offset_to_fp_ctx \ + (___cpu_t_fp_ctx_OFFSET) +#endif /* CONFIG_FPU_SHARING */ #endif /* CONFIG_SMP */ #define _kernel_offset_to_idle \
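
A usage note on the lazy-marking behaviour in fault.c above: a thread
that executes VFP instructions is tagged with K_FP_REGS on its first
undefined instruction trap, but passing K_FP_REGS at creation time
remains the recommended pattern.  A minimal sketch follows (the thread
name, stack size and priority are arbitrary and not part of this patch):

    #include <zephyr/kernel.h>

    #define FP_STACK_SIZE 1024
    #define FP_PRIORITY   5

    K_THREAD_STACK_DEFINE(fp_stack, FP_STACK_SIZE);
    static struct k_thread fp_thread;

    static void fp_entry(void *p1, void *p2, void *p3)
    {
            volatile float acc = 0.0f;

            for (int i = 0; i < 100; i++) {
                    acc += (float)i * 0.5f; /* first VFP use may trap once */
            }
    }

    void start_fp_thread(void)
    {
            /*
             * K_FP_REGS marks the thread as a VFP user up front instead
             * of relying on the undefined instruction handler to set it.
             */
            k_thread_create(&fp_thread, fp_stack,
                            K_THREAD_STACK_SIZEOF(fp_stack),
                            fp_entry, NULL, NULL, NULL,
                            FP_PRIORITY, K_FP_REGS, K_NO_WAIT);
    }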