arm64: improved arch_switch() implementation

Make it optimal by removing the need for an SVC/exception roundtrip on
every context switch. Performance numbers from tests/benchmarks/sched:

Before:
unpend   85 ready   58 switch  258 pend  231 tot  632 (avg  699)

After:
unpend   85 ready   59 switch  115 pend  138 tot  397 (avg  478)
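(For scale: the switch phase alone goes from 258 down to 115, roughly 55 % less, and the total from 632 down to 397, roughly 37 % less.)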

Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
Nicolas Pitre, 2022-03-14 12:51:40 -05:00; committed by Anas Nashif
commit 8affac64a7
10 changed files with 63 additions and 49 deletions


@@ -268,20 +268,19 @@ void z_arm64_fpu_trap(z_arch_esf_t *esf)
 /*
  * Perform lazy FPU context switching by simply granting or denying
  * access to FP regs based on FPU ownership before leaving the last
- * exception level. If current thread doesn't own the FP regs then
- * it will trap on its first access and then the actual FPU context
- * switching will occur.
- *
- * This is called on every exception exit except for z_arm64_fpu_trap().
+ * exception level in case of exceptions, or during a thread context
+ * switch with the exception level of the new thread being 0.
+ * If current thread doesn't own the FP regs then it will trap on its
+ * first access and then the actual FPU context switching will occur.
  */
-void z_arm64_fpu_exit_exc(void)
+static void fpu_access_update(unsigned int exc_update_level)
 {
         __ASSERT(read_daif() & DAIF_IRQ_BIT, "must be called with IRQs disabled");
 
         uint64_t cpacr = read_cpacr_el1();
 
-        if (arch_exception_depth() == 1) {
-                /* We're about to leave exception mode */
+        if (arch_exception_depth() == exc_update_level) {
+                /* We're about to execute non-exception code */
                 if (_current_cpu->arch.fpu_owner == _current) {
                         /* turn on FPU access */
                         write_cpacr_el1(cpacr | CPACR_EL1_FPEN_NOTRAP);

@@ -291,14 +290,34 @@ void z_arm64_fpu_exit_exc(void)
                 }
         } else {
                 /*
-                 * Shallower exception levels should always trap on FPU
+                 * Any new exception level should always trap on FPU
                  * access as we want to make sure IRQs are disabled before
-                 * granting them access.
+                 * granting it access (see z_arm64_fpu_trap() documentation).
                  */
                 write_cpacr_el1(cpacr & ~CPACR_EL1_FPEN_NOTRAP);
         }
 }
 
+/*
+ * This is called on every exception exit except for z_arm64_fpu_trap().
+ * In that case the exception level of interest is 1 (soon to be 0).
+ */
+void z_arm64_fpu_exit_exc(void)
+{
+        fpu_access_update(1);
+}
+
+/*
+ * This is called from z_arm64_context_switch(). FPU access may be granted
+ * only if exception level is 0. If we switch to a thread that is still in
+ * some exception context then FPU access would be re-evaluated at exception
+ * exit time via z_arm64_fpu_exit_exc().
+ */
+void z_arm64_fpu_thread_context_switch(void)
+{
+        fpu_access_update(0);
+}
+
 int arch_float_disable(struct k_thread *thread)
 {
         if (thread != NULL) {
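For readability, here is the new FPU access logic from the two hunks above, reassembled out of diff form. This is a sketch: the few unchanged lines between the two hunks are not shown by the diff and are assumed here to simply deny FPU access when the current thread is not the owner.

    static void fpu_access_update(unsigned int exc_update_level)
    {
            __ASSERT(read_daif() & DAIF_IRQ_BIT, "must be called with IRQs disabled");

            uint64_t cpacr = read_cpacr_el1();

            if (arch_exception_depth() == exc_update_level) {
                    /* We're about to execute non-exception code */
                    if (_current_cpu->arch.fpu_owner == _current) {
                            /* turn on FPU access */
                            write_cpacr_el1(cpacr | CPACR_EL1_FPEN_NOTRAP);
                    } else {
                            /* assumed (elided by the diff): deny FPU access */
                            write_cpacr_el1(cpacr & ~CPACR_EL1_FPEN_NOTRAP);
                    }
            } else {
                    /* any new exception level must trap on FPU access */
                    write_cpacr_el1(cpacr & ~CPACR_EL1_FPEN_NOTRAP);
            }
    }

    /* exception exit: non-exception code runs once depth drops from 1 to 0 */
    void z_arm64_fpu_exit_exc(void)
    {
            fpu_access_update(1);
    }

    /* thread switch: grant access only if the new thread is at depth 0 */
    void z_arm64_fpu_thread_context_switch(void)
    {
            fpu_access_update(0);
    }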


@@ -38,7 +38,7 @@ GEN_NAMED_OFFSET_SYM(_callee_saved_t, x23, x23_x24);
 GEN_NAMED_OFFSET_SYM(_callee_saved_t, x25, x25_x26);
 GEN_NAMED_OFFSET_SYM(_callee_saved_t, x27, x27_x28);
 GEN_NAMED_OFFSET_SYM(_callee_saved_t, x29, x29_sp_el0);
-GEN_NAMED_OFFSET_SYM(_callee_saved_t, sp_elx, sp_elx);
+GEN_NAMED_OFFSET_SYM(_callee_saved_t, sp_elx, sp_elx_lr);
 
 GEN_ABSOLUTE_SYM(___callee_saved_t_SIZEOF, sizeof(struct _callee_saved));


@@ -165,6 +165,9 @@ switch_el:
         /* EL1 init */
         bl      z_arm64_el1_init
 
+        /* We want to use SP_ELx from now on */
+        msr     SPSel, #1
+
         /* Enable SError interrupts */
         msr     DAIFClr, #(DAIFCLR_ABT_BIT)
         isb
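The SPSel change above is presumably needed because z_arm64_context_switch() is no longer entered through an SVC exception (which always selects SP_ELx): thread-level EL1 code now has to run with SPSel = 1 so that the sp saved and restored during a switch is the SP_ELx stack pointer.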


@@ -24,7 +24,9 @@ _ASM_FILE_PROLOGUE
  * Routine to handle context switches
  *
  * This function is directly called either by _isr_wrapper() in case of
- * preemption, or z_arm64_sync_exc() in case of cooperative switching.
+ * preemption, or arch_switch() in case of cooperative switching.
+ *
+ * void z_arm64_context_switch(struct k_thread *new, struct k_thread *old);
  */
 
 GTEXT(z_arm64_context_switch)

@@ -40,9 +42,9 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
         stp     x27, x28, [x1, #_thread_offset_to_callee_saved_x27_x28]
         stp     x29, x4, [x1, #_thread_offset_to_callee_saved_x29_sp_el0]
 
-        /* Save the current SP_ELx */
+        /* Save the current SP_ELx and return address */
         mov     x4, sp
-        str     x4, [x1, #_thread_offset_to_callee_saved_sp_elx]
+        stp     x4, lr, [x1, #_thread_offset_to_callee_saved_sp_elx_lr]
 
         /* save current thread's exception depth */
         mrs     x4, tpidrro_el0

@@ -55,6 +57,17 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
         orr     x4, x4, x2, lsl #TPIDRROEL0_EXC_SHIFT
         msr     tpidrro_el0, x4
 
+#ifdef CONFIG_FPU_SHARING
+        /*
+         * Do this after tpidrro_el0 is updated with the new exception
+         * depth value, and before old->switch_handle is updated (making
+         * it available for grab by another CPU) as we still use its stack.
+         */
+        stp     x0, x1, [sp, #-16]!
+        bl      z_arm64_fpu_thread_context_switch
+        ldp     x0, x1, [sp], #16
+#endif
+
 #ifdef CONFIG_SMP
         /* save old thread into switch handle which is required by
          * wait_for_switch

@@ -83,8 +96,8 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
         /* Restore SP_EL0 */
         msr     sp_el0, x4
 
-        /* Restore SP_EL1 */
-        ldr     x4, [x0, #_thread_offset_to_callee_saved_sp_elx]
+        /* Restore SP_EL1 and return address */
+        ldp     x4, lr, [x0, #_thread_offset_to_callee_saved_sp_elx_lr]
         mov     sp, x4
 
 #ifdef CONFIG_USERSPACE

@@ -99,7 +112,7 @@ SECTION_FUNC(TEXT, z_arm64_context_switch)
         ldp     xzr, x30, [sp], #16
 #endif
 
-        /* Return to z_arm64_sync_exc() or _isr_wrapper() */
+        /* Return to arch_switch() or _isr_wrapper() */
         ret
 
 /*

@@ -131,9 +144,6 @@ SECTION_FUNC(TEXT, z_arm64_sync_exc)
         /* Demux the SVC call */
         and     x1, x0, #0xff
 
-        cmp     x1, #_SVC_CALL_CONTEXT_SWITCH
-        beq     context_switch
-
         cmp     x1, #_SVC_CALL_RUNTIME_EXCEPT
         beq     oops
 

@@ -179,22 +189,6 @@ oops:
         mov     x0, sp
         b       z_arm64_do_kernel_oops
 
-context_switch:
-        /*
-         * Retrieve x0 and x1 from the stack:
-         *
-         *  - x0 = new_thread->switch_handle = switch_to thread
-         *  - x1 = &old_thread->switch_handle = current thread
-         */
-        ldp     x0, x1, [sp, ___esf_t_x0_x1_OFFSET]
-
-        /* Get old thread from x1 */
-        sub     x1, x1, ___thread_t_switch_handle_OFFSET
-
-        /* Switch thread */
-        bl      z_arm64_context_switch
-
-        b       z_arm64_exit_exc
-
 inv:
         mov     x0, #0 /* K_ERR_CPU_EXCEPTION */
         mov     x1, sp

@@ -202,8 +196,3 @@ inv:
 
         /* Return here only in case of recoverable error */
         b       z_arm64_exit_exc
-
-GTEXT(z_arm64_call_svc)
-SECTION_FUNC(TEXT, z_arm64_call_svc)
-        svc     #_SVC_CALL_CONTEXT_SWITCH
-        ret
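Taken together, the context-switch assembly changes above retire the SVC-based path: previously a cooperative switch issued svc #_SVC_CALL_CONTEXT_SWITCH, was demuxed by z_arm64_sync_exc(), branched to z_arm64_context_switch and left through z_arm64_exit_exc. Now z_arm64_context_switch() is a plain function called directly by arch_switch() or _isr_wrapper() and ending in ret; since lr is saved and restored with the rest of the callee-saved context, a restored thread resumes wherever it came in from: arch_switch() for a cooperative switch, _isr_wrapper() after preemption, or z_arm64_exit_exc for a brand new thread (see the arch_new_thread() hunks below).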


@@ -70,6 +70,7 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
                      char *stack_ptr, k_thread_entry_t entry,
                      void *p1, void *p2, void *p3)
 {
+        extern void z_arm64_exit_exc(void);
         z_arch_esf_t *pInitCtx;
 
         /*

@@ -118,6 +119,7 @@ void arch_new_thread(struct k_thread *thread, k_thread_stack_t *stack,
          * z_arm64_userspace_enter() (see comments there)
          */
         thread->callee_saved.sp_elx = (uint64_t)pInitCtx;
+        thread->callee_saved.lr = (uint64_t)z_arm64_exit_exc;
 
         thread->switch_handle = thread;
 }
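With lr part of the callee-saved context, brand new threads need no special-casing in the switch code: arch_new_thread() points sp_elx at the initial ESF it builds (pInitCtx) and lr at z_arm64_exit_exc, so the first z_arm64_context_switch() into such a thread simply ret's into the exception exit code, which unwinds that ESF into the thread's entry point, the same exception-exit path a preempted thread takes.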


@@ -31,9 +31,6 @@ static ALWAYS_INLINE bool arch_is_in_isr(void)
         return arch_curr_cpu()->nested != 0U;
 }
 
-extern void z_arm64_call_svc(void *switch_to, void **switched_from);
-
 #ifdef __cplusplus
 }
 #endif


@@ -34,9 +34,13 @@ static ALWAYS_INLINE void arch_kernel_init(void)
 
 static inline void arch_switch(void *switch_to, void **switched_from)
 {
-        z_arm64_call_svc(switch_to, switched_from);
+        extern void z_arm64_context_switch(struct k_thread *new,
+                                           struct k_thread *old);
+        struct k_thread *new = switch_to;
+        struct k_thread *old = CONTAINER_OF(switched_from, struct k_thread,
+                                            switch_handle);
 
-        return;
+        z_arm64_context_switch(new, old);
 }
 
 extern void z_arm64_fatal_error(z_arch_esf_t *esf, unsigned int reason);
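For readability, the new arch_switch() from the hunk above, reassembled out of diff form (a sketch; CONTAINER_OF() and struct k_thread are the ones already used by the tree):

    static inline void arch_switch(void *switch_to, void **switched_from)
    {
            extern void z_arm64_context_switch(struct k_thread *new,
                                               struct k_thread *old);
            struct k_thread *new = switch_to;
            /* switched_from is &old->switch_handle, so back out to the thread */
            struct k_thread *old = CONTAINER_OF(switched_from, struct k_thread,
                                                switch_handle);

            z_arm64_context_switch(new, old);
    }

The CONTAINER_OF() step mirrors what the deleted context_switch assembly stub did with "sub x1, x1, ___thread_t_switch_handle_OFFSET": the caller hands over &old_thread->switch_handle, and subtracting the member offset recovers the enclosing struct k_thread that z_arm64_context_switch() expects in x1.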


@@ -24,7 +24,7 @@
         (___thread_t_callee_saved_OFFSET + ___callee_saved_t_x27_x28_OFFSET)
 #define _thread_offset_to_callee_saved_x29_sp_el0 \
         (___thread_t_callee_saved_OFFSET + ___callee_saved_t_x29_sp_el0_OFFSET)
-#define _thread_offset_to_callee_saved_sp_elx \
-        (___thread_t_callee_saved_OFFSET + ___callee_saved_t_sp_elx_OFFSET)
+#define _thread_offset_to_callee_saved_sp_elx_lr \
+        (___thread_t_callee_saved_OFFSET + ___callee_saved_t_sp_elx_lr_OFFSET)
 
 #endif /* ZEPHYR_ARCH_ARM64_INCLUDE_OFFSETS_SHORT_ARCH_H_ */


@@ -16,7 +16,6 @@
 #ifndef ZEPHYR_INCLUDE_ARCH_ARM64_SYSCALL_H_
 #define ZEPHYR_INCLUDE_ARCH_ARM64_SYSCALL_H_
 
-#define _SVC_CALL_CONTEXT_SWITCH        0
 #define _SVC_CALL_IRQ_OFFLOAD           1
 #define _SVC_CALL_RUNTIME_EXCEPT        2
 #define _SVC_CALL_SYSTEM_CALL           3


@@ -36,6 +36,7 @@ struct _callee_saved {
         uint64_t x29;
         uint64_t sp_el0;
         uint64_t sp_elx;
+        uint64_t lr;
 };
 
 typedef struct _callee_saved _callee_saved_t;
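The new lr member sits directly after sp_elx on purpose: the two adjacent 64-bit fields can be saved and restored together with the single stp/ldp at _thread_offset_to_callee_saved_sp_elx_lr seen in the context-switch assembly above. A sketch of the resulting tail of the structure (the earlier callee-saved register pairs are elided, as in the hunk):

    struct _callee_saved {
            /* ... x19..x28 saved in pairs (see the GEN_NAMED_OFFSET_SYM hunk) ... */
            uint64_t x29;
            uint64_t sp_el0;
            uint64_t sp_elx;        /* saved SP_EL1 ... */
            uint64_t lr;            /* ... and return address: one stp/ldp pair */
    };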