arch/xtensa: Optimize cache management on context switch
Making context switch cache-coherent in SMP is hard. The KERNEL_COHERENCE handling was conservatively invalidating the stack region of a thread that was being switched in. This was because it might have (1) run on this CPU in the past, but (2) run most recently on a different CPU. In that case we might have stale data still in our local dcache!

But this has a performance impact in the (very common!) case of a thread being switched out briefly and then back in (e.g. k_sleep() for a small duration). It will come back having lost all of its cached stack context, and will have to fetch all that information back from shared SRAM!

Address this by tracking a "last_cpu" for each thread in the arch part of the thread struct. If we're coming back to the same CPU we left, we know we can skip the invalidate.

Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
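In outline, the change boils down to the conditional invalidate sketched here (a condensed illustration, not the literal patch: the helper name cohere_inbound_stack() and its reduced argument list are hypothetical, while z_xtensa_cache_inv(), _current_cpu->id, the nsp/nstack/nsz values and the arch.last_cpu field all come from the diff below; the real arch_cohere_stacks() also handles the outgoing thread's stack, which is elided here):

        /* Condensed sketch: invalidate the inbound thread's live stack
         * region only when it last ran on a different CPU, then record
         * that the outgoing thread last ran here.
         */
        static ALWAYS_INLINE void cohere_inbound_stack(struct k_thread *old_thread,
                                                       struct k_thread *new_thread,
                                                       size_t nstack, size_t nsz,
                                                       size_t nsp)
        {
                int32_t curr_cpu = _current_cpu->id;

                if (curr_cpu != new_thread->arch.last_cpu) {
                        /* The thread migrated since it last ran: our local
                         * dcache may hold stale lines for its live stack
                         * region, so drop them.
                         */
                        z_xtensa_cache_inv((void *)nsp, (nstack + nsz) - nsp);
                }

                /* The outgoing thread is running on this CPU right now;
                 * remember that so a quick switch-out-and-back (e.g. a short
                 * k_sleep()) can skip the invalidate.
                 */
                old_thread->arch.last_cpu = curr_cpu;
        }

The very first switch-in still gets invalidated, because xtensa_init_stack() seeds last_cpu with -1, which can never match a real CPU id.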
parent d490358e28
commit 64a3159dee
3 changed files with 18 additions and 6 deletions
@@ -20,6 +20,14 @@ void *xtensa_init_stack(struct k_thread *thread, int *stack_top,
                              void (*entry)(void *, void *, void *),
                              void *arg1, void *arg2, void *arg3)
 {
+        /* Not-a-cpu ID.  Ensures that the first time this is run, the
+         * stack will be invalidated.  That covers the edge case of
+         * restarting a thread on a stack that had previously been run
+         * on one CPU, but then initialized on this one, and
+         * potentially run THERE and not HERE.
+         */
+        thread->arch.last_cpu = -1;
+
         /* We cheat and shave 16 bytes off, the top four words are the
          * A0-A3 spill area for the caller of the entry function,
          * which doesn't exist.  It will never be touched, so we

@@ -65,6 +65,8 @@ static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
                                              void *old_switch_handle,
                                              struct k_thread *new_thread)
 {
+        int32_t curr_cpu = _current_cpu->id;
+
         size_t ostack = old_thread->stack_info.start;
         size_t osz = old_thread->stack_info.size;
         size_t osp = (size_t) old_switch_handle;

@@ -84,10 +86,9 @@ static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
 
         /* The "live" area (the region between the switch handle,
          * which is the stack pointer, and the top of the stack
-         * memory) of the inbound stack needs to be invalidated: it
-         * may contain data that was modified on another CPU since the
-         * last time this CPU ran the thread, and our cache may be
-         * stale.
+         * memory) of the inbound stack needs to be invalidated if we
+         * last ran on another cpu: it may contain data that was
+         * modified there, and our cache may be stale.
          *
          * The corresponding "dead area" of the inbound stack can be
          * ignored.  We may have cached data in that region, but by

@@ -96,7 +97,10 @@ static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
          * uninitialized data error) so our stale cache will be
          * automatically overwritten as needed.
          */
-        z_xtensa_cache_inv((void *)nsp, (nstack + nsz) - nsp);
+        if (curr_cpu != new_thread->arch.last_cpu) {
+                z_xtensa_cache_inv((void *)nsp, (nstack + nsz) - nsp);
+        }
+        old_thread->arch.last_cpu = curr_cpu;
 
         /* Dummy threads appear at system initialization, but don't
          * have stack_info data and will never be saved.  Ignore.

@@ -21,7 +21,7 @@ struct _callee_saved {
 typedef struct _callee_saved _callee_saved_t;
 
 struct _thread_arch {
-        char dummy;
+        uint32_t last_cpu;
 };
 
 typedef struct _thread_arch _thread_arch_t;