From 8affac64a72c4992770e18eb5cc4e1b55fb5293b Mon Sep 17 00:00:00 2001
From: Nicolas Pitre
Date: Mon, 14 Mar 2022 12:51:40 -0500
Subject: [PATCH] arm64: improved arch_switch() implementation

Make it optimal without the need for an SVC/exception roundtrip on
every context switch.

Performance numbers from tests/benchmarks/sched:

Before:
unpend  85 ready  58 switch 258 pend 231 tot 632 (avg 699)

After:
unpend  85 ready  59 switch 115 pend 138 tot 397 (avg 478)

Signed-off-by: Nicolas Pitre
---
 arch/arm64/core/fpu.c                   | 39 +++++++++++++++-----
 arch/arm64/core/offsets/offsets.c       |  2 +-
 arch/arm64/core/reset.S                 |  3 ++
 arch/arm64/core/switch.S                | 49 ++++++++++---------------
 arch/arm64/core/thread.c                |  2 +
 arch/arm64/include/exc.h                |  3 --
 arch/arm64/include/kernel_arch_func.h   |  8 +++-
 arch/arm64/include/offsets_short_arch.h |  4 +-
 include/arch/arm64/syscall.h            |  1 -
 include/arch/arm64/thread.h             |  1 +
 10 files changed, 63 insertions(+), 49 deletions(-)

diff --git a/arch/arm64/core/fpu.c b/arch/arm64/core/fpu.c
index a37575ae3f3..cf904f51191 100644
--- a/arch/arm64/core/fpu.c
+++ b/arch/arm64/core/fpu.c
@@ -268,20 +268,19 @@ void z_arm64_fpu_trap(z_arch_esf_t *esf)
 /*
  * Perform lazy FPU context switching by simply granting or denying
  * access to FP regs based on FPU ownership before leaving the last
- * exception level. If current thread doesn't own the FP regs then
- * it will trap on its first access and then the actual FPU context
- * switching will occur.
- *
- * This is called on every exception exit except for z_arm64_fpu_trap().
+ * exception level in case of exceptions, or during a thread context
+ * switch with the exception level of the new thread being 0.
+ * If current thread doesn't own the FP regs then it will trap on its
+ * first access and then the actual FPU context switching will occur.
  */
-void z_arm64_fpu_exit_exc(void)
+static void fpu_access_update(unsigned int exc_update_level)
 {
 	__ASSERT(read_daif() & DAIF_IRQ_BIT, "must be called with IRQs disabled");
 
 	uint64_t cpacr = read_cpacr_el1();
 
-	if (arch_exception_depth() == 1) {
-		/* We're about to leave exception mode */
+	if (arch_exception_depth() == exc_update_level) {
+		/* We're about to execute non-exception code */
 		if (_current_cpu->arch.fpu_owner == _current) {
 			/* turn on FPU access */
 			write_cpacr_el1(cpacr | CPACR_EL1_FPEN_NOTRAP);
@@ -291,14 +290,34 @@ void z_arm64_fpu_exit_exc(void)
 		}
 	} else {
 		/*
-		 * Shallower exception levels should always trap on FPU
+		 * Any new exception level should always trap on FPU
 		 * access as we want to make sure IRQs are disabled before
-		 * granting them access.
+		 * granting it access (see z_arm64_fpu_trap() documentation).
 		 */
 		write_cpacr_el1(cpacr & ~CPACR_EL1_FPEN_NOTRAP);
 	}
 }
 
+/*
+ * This is called on every exception exit except for z_arm64_fpu_trap().
+ * In that case the exception level of interest is 1 (soon to be 0).
+ */
+void z_arm64_fpu_exit_exc(void)
+{
+	fpu_access_update(1);
+}
+
+/*
+ * This is called from z_arm64_context_switch(). FPU access may be granted
+ * only if exception level is 0. If we switch to a thread that is still in
+ * some exception context then FPU access would be re-evaluated at exception
+ * exit time via z_arm64_fpu_exit_exc().
+ */
+void z_arm64_fpu_thread_context_switch(void)
+{
+	fpu_access_update(0);
+}
+
 int arch_float_disable(struct k_thread *thread)
 {
 	if (thread != NULL) {
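Note on the lazy-switching protocol above: fpu_access_update() only flips the
CPACR_EL1 FPEN trap bit; the actual FPU register save/restore still happens in
the FPU trap handler the first time the new owner touches an FP register. A
minimal sketch of that handler's core logic, assuming helper names like
z_arm64_fpu_save()/z_arm64_fpu_restore() and an arch.saved_fp_context field
(these names are not shown in this patch and may differ; SMP flushing omitted):

	static void fpu_trap_sketch(void)
	{
		/* grant access so the save/restore below can touch FP regs */
		write_cpacr_el1(read_cpacr_el1() | CPACR_EL1_FPEN_NOTRAP);

		/* flush the previous owner's registers, if any */
		if (_current_cpu->arch.fpu_owner != NULL) {
			z_arm64_fpu_save(
				&_current_cpu->arch.fpu_owner->arch.saved_fp_context);
		}

		/* make the faulting thread the owner and load its context */
		z_arm64_fpu_restore(&_current->arch.saved_fp_context);
		_current_cpu->arch.fpu_owner = _current;
	}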
diff --git a/arch/arm64/core/offsets/offsets.c b/arch/arm64/core/offsets/offsets.c
index fe31dbc0723..dc390b24524 100644
--- a/arch/arm64/core/offsets/offsets.c
+++ b/arch/arm64/core/offsets/offsets.c
@@ -38,7 +38,7 @@ GEN_NAMED_OFFSET_SYM(_callee_saved_t, x23, x23_x24);
 GEN_NAMED_OFFSET_SYM(_callee_saved_t, x25, x25_x26);
 GEN_NAMED_OFFSET_SYM(_callee_saved_t, x27, x27_x28);
 GEN_NAMED_OFFSET_SYM(_callee_saved_t, x29, x29_sp_el0);
-GEN_NAMED_OFFSET_SYM(_callee_saved_t, sp_elx, sp_elx);
+GEN_NAMED_OFFSET_SYM(_callee_saved_t, sp_elx, sp_elx_lr);
 
 GEN_ABSOLUTE_SYM(___callee_saved_t_SIZEOF, sizeof(struct _callee_saved));
diff --git a/arch/arm64/core/reset.S b/arch/arm64/core/reset.S
index 4668209ae79..869c5601d76 100644
--- a/arch/arm64/core/reset.S
+++ b/arch/arm64/core/reset.S
@@ -165,6 +165,9 @@ switch_el:
 	/* EL1 init */
 	bl	z_arm64_el1_init
 
+	/* We want to use SP_ELx from now on */
+	msr	SPSel, #1
+
 	/* Enable SError interrupts */
 	msr	DAIFClr, #(DAIFCLR_ABT_BIT)
 	isb
diff --git a/arch/arm64/core/switch.S b/arch/arm64/core/switch.S
index 694bed2d9d4..81df04e1387 100644
--- a/arch/arm64/core/switch.S
+++ b/arch/arm64/core/switch.S
@@ -24,7 +24,9 @@ _ASM_FILE_PROLOGUE
  * Routine to handle context switches
  *
  * This function is directly called either by _isr_wrapper() in case of
- * preemption, or z_arm64_sync_exc() in case of cooperative switching.
+ * preemption, or arch_switch() in case of cooperative switching.
+ *
+ * void z_arm64_context_switch(struct k_thread *new, struct k_thread *old);
  */
 
 GTEXT(z_arm64_context_switch)
@@ -40,9 +42,9 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
 	stp	x27, x28, [x1, #_thread_offset_to_callee_saved_x27_x28]
 	stp	x29, x4, [x1, #_thread_offset_to_callee_saved_x29_sp_el0]
 
-	/* Save the current SP_ELx */
+	/* Save the current SP_ELx and return address */
 	mov	x4, sp
-	str	x4, [x1, #_thread_offset_to_callee_saved_sp_elx]
+	stp	x4, lr, [x1, #_thread_offset_to_callee_saved_sp_elx_lr]
 
 	/* save current thread's exception depth */
 	mrs	x4, tpidrro_el0
@@ -55,6 +57,17 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
 	orr	x4, x4, x2, lsl #TPIDRROEL0_EXC_SHIFT
 	msr	tpidrro_el0, x4
 
+#ifdef CONFIG_FPU_SHARING
+	/*
+	 * Do this after tpidrro_el0 is updated with the new exception
+	 * depth value, and before old->switch_handle is updated (making
+	 * it available for grab by another CPU) as we still use its stack.
+	 */
+	stp	x0, x1, [sp, #-16]!
+	bl	z_arm64_fpu_thread_context_switch
+	ldp	x0, x1, [sp], #16
+#endif
+
 #ifdef CONFIG_SMP
 	/* save old thread into switch handle which is required by
 	 * wait_for_switch
@@ -83,8 +96,8 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
 	/* Restore SP_EL0 */
 	msr	sp_el0, x4
 
-	/* Restore SP_EL1 */
-	ldr	x4, [x0, #_thread_offset_to_callee_saved_sp_elx]
+	/* Restore SP_EL1 and return address */
+	ldp	x4, lr, [x0, #_thread_offset_to_callee_saved_sp_elx_lr]
 	mov	sp, x4
 
 #ifdef CONFIG_USERSPACE
@@ -99,7 +112,7 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
 	ldp	xzr, x30, [sp], #16
 #endif
 
-	/* Return to z_arm64_sync_exc() or _isr_wrapper() */
+	/* Return to arch_switch() or _isr_wrapper() */
 	ret
 
 /*
@@ -131,9 +144,6 @@ SECTION_FUNC(TEXT, z_arm64_sync_exc)
 	/* Demux the SVC call */
 	and	x1, x0, #0xff
 
-	cmp	x1, #_SVC_CALL_CONTEXT_SWITCH
-	beq	context_switch
-
 	cmp	x1, #_SVC_CALL_RUNTIME_EXCEPT
 	beq	oops
 
@@ -179,22 +189,6 @@ oops:
 	mov	x0, sp
 	b	z_arm64_do_kernel_oops
 
-context_switch:
-	/*
-	 * Retrieve x0 and x1 from the stack:
-	 *
-	 *  - x0 = new_thread->switch_handle = switch_to thread
-	 *  - x1 = &old_thread->switch_handle = current thread
-	 */
-	ldp	x0, x1, [sp, ___esf_t_x0_x1_OFFSET]
-
-	/* Get old thread from x1 */
-	sub	x1, x1, ___thread_t_switch_handle_OFFSET
-
-	/* Switch thread */
-	bl	z_arm64_context_switch
-	b	z_arm64_exit_exc
-
 inv:
 	mov	x0, #0 /* K_ERR_CPU_EXCEPTION */
 	mov	x1, sp
@@ -202,8 +196,3 @@ inv:
 
 	/* Return here only in case of recoverable error */
 	b	z_arm64_exit_exc
-
-GTEXT(z_arm64_call_svc)
-SECTION_FUNC(TEXT, z_arm64_call_svc)
-	svc	#_SVC_CALL_CONTEXT_SWITCH
-	ret
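With the SVC demux path removed above, a cooperative switch is now a plain
function call: the lr value live when arch_switch() is entered is exactly what
z_arm64_context_switch saves with stp and later returns through with ret. A
rough caller-side sketch of that contract (swap_sketch() is a hypothetical
stand-in for the kernel's swap logic, not code from this patch):

	#include <kernel.h>

	static void swap_sketch(struct k_thread *new, struct k_thread *old)
	{
		/*
		 * IRQs are locked here. lr points back into this function,
		 * so z_arm64_context_switch() can save it, and a plain ret
		 * resumes the old thread right here on its next switch-in.
		 */
		arch_switch(new->switch_handle, &old->switch_handle);
	}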
diff --git a/arch/arm64/core/thread.c b/arch/arm64/core/thread.c
index 395ad00dc8e..14b87d94964 100644
--- a/arch/arm64/core/thread.c
+++ b/arch/arm64/core/thread.c
@@ -70,6 +70,7 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
 		     char *stack_ptr, k_thread_entry_t entry,
 		     void *p1, void *p2, void *p3)
 {
+	extern void z_arm64_exit_exc(void);
 	z_arch_esf_t *pInitCtx;
 
 	/*
@@ -118,6 +119,7 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
 	 * z_arm64_userspace_enter() (see comments there)
 	 */
 	thread->callee_saved.sp_elx = (uint64_t)pInitCtx;
+	thread->callee_saved.lr = (uint64_t)z_arm64_exit_exc;
 
 	thread->switch_handle = thread;
 }
diff --git a/arch/arm64/include/exc.h b/arch/arm64/include/exc.h
index 2a43e9f8481..5f9adc50645 100644
--- a/arch/arm64/include/exc.h
+++ b/arch/arm64/include/exc.h
@@ -31,9 +31,6 @@ static ALWAYS_INLINE bool arch_is_in_isr(void)
 	return arch_curr_cpu()->nested != 0U;
 }
 
-
-extern void z_arm64_call_svc(void *switch_to, void **switched_from);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/arch/arm64/include/kernel_arch_func.h b/arch/arm64/include/kernel_arch_func.h
index 13cc4ee5ba0..a210416bbc7 100644
--- a/arch/arm64/include/kernel_arch_func.h
+++ b/arch/arm64/include/kernel_arch_func.h
@@ -34,9 +34,13 @@ static ALWAYS_INLINE void arch_kernel_init(void)
 
 static inline void arch_switch(void *switch_to, void **switched_from)
 {
-	z_arm64_call_svc(switch_to, switched_from);
+	extern void z_arm64_context_switch(struct k_thread *new,
+					   struct k_thread *old);
+	struct k_thread *new = switch_to;
+	struct k_thread *old = CONTAINER_OF(switched_from, struct k_thread,
+					    switch_handle);
 
-	return;
+	z_arm64_context_switch(new, old);
 }
 
 extern void z_arm64_fatal_error(z_arch_esf_t *esf, unsigned int reason);
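The new arch_switch() above recovers the old thread from the
&old->switch_handle pointer it is handed, using CONTAINER_OF(). Conceptually
that macro just subtracts the member offset; a simplified sketch (Zephyr's
real CONTAINER_OF() adds type validation on top of this):

	#include <stddef.h>

	/* illustrative only: recover the enclosing struct from a member pointer */
	#define CONTAINER_OF_SKETCH(ptr, type, field) \
		((type *)(((char *)(ptr)) - offsetof(type, field)))

So CONTAINER_OF(switched_from, struct k_thread, switch_handle) turns a pointer
to the switch_handle member back into a pointer to its enclosing k_thread,
which replaces the "sub x1, x1, ___thread_t_switch_handle_OFFSET" previously
done in assembly by the deleted context_switch path.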
diff --git a/arch/arm64/include/offsets_short_arch.h b/arch/arm64/include/offsets_short_arch.h
index 9c449b0247b..aeae8430a8e 100644
--- a/arch/arm64/include/offsets_short_arch.h
+++ b/arch/arm64/include/offsets_short_arch.h
@@ -24,7 +24,7 @@
 	(___thread_t_callee_saved_OFFSET + ___callee_saved_t_x27_x28_OFFSET)
 #define _thread_offset_to_callee_saved_x29_sp_el0 \
 	(___thread_t_callee_saved_OFFSET + ___callee_saved_t_x29_sp_el0_OFFSET)
-#define _thread_offset_to_callee_saved_sp_elx \
-	(___thread_t_callee_saved_OFFSET + ___callee_saved_t_sp_elx_OFFSET)
+#define _thread_offset_to_callee_saved_sp_elx_lr \
+	(___thread_t_callee_saved_OFFSET + ___callee_saved_t_sp_elx_lr_OFFSET)
 
 #endif /* ZEPHYR_ARCH_ARM64_INCLUDE_OFFSETS_SHORT_ARCH_H_ */
diff --git a/include/arch/arm64/syscall.h b/include/arch/arm64/syscall.h
index 54a2d054a2d..5e7a43e2b8a 100644
--- a/include/arch/arm64/syscall.h
+++ b/include/arch/arm64/syscall.h
@@ -16,7 +16,6 @@
 #ifndef ZEPHYR_INCLUDE_ARCH_ARM64_SYSCALL_H_
 #define ZEPHYR_INCLUDE_ARCH_ARM64_SYSCALL_H_
 
-#define _SVC_CALL_CONTEXT_SWITCH	0
 #define _SVC_CALL_IRQ_OFFLOAD		1
 #define _SVC_CALL_RUNTIME_EXCEPT	2
 #define _SVC_CALL_SYSTEM_CALL		3
diff --git a/include/arch/arm64/thread.h b/include/arch/arm64/thread.h
index 3227d451d44..377f4c37687 100644
--- a/include/arch/arm64/thread.h
+++ b/include/arch/arm64/thread.h
@@ -36,6 +36,7 @@ struct _callee_saved {
 	uint64_t x29;
 	uint64_t sp_el0;
 	uint64_t sp_elx;
+	uint64_t lr;
 };
 
 typedef struct _callee_saved _callee_saved_t;
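The stp x4, lr / ldp x4, lr pairs in switch.S depend on lr sitting immediately
after sp_elx in struct _callee_saved, which the thread.h hunk above provides
(two adjacent uint64_t members, so no padding). A build-time check along these
lines (not part of this patch, assuming Zephyr's BUILD_ASSERT()) would
document that dependency:

	#include <stddef.h>

	/* sp_elx and lr are saved/restored as one 16-byte stp/ldp pair */
	BUILD_ASSERT(offsetof(struct _callee_saved, lr) ==
		     offsetof(struct _callee_saved, sp_elx) + sizeof(uint64_t),
		     "sp_elx and lr must be adjacent for the stp/ldp pair in switch.S");

This layout also explains the thread.c hunk: a freshly created thread has
never been switched out, so its saved lr is seeded with z_arm64_exit_exc, and
the first z_arm64_context_switch() into it "returns" into the exception-exit
code, which unwinds the initial register frame built by arch_new_thread().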