xtensa: New asm layer to support SMP

SMP needs a new context switch primitive (to disentangle _Swap() from the scheduler) and new interrupt entry behavior (to be able to take a global spinlock on behalf of legacy drivers). The existing code is very obtuse, and working with it led me down a long path of "this would be so much better if..." So this is a new context and entry framework, intended to replace the code that exists now, at least on SMP platforms. New features: * The new context switch primitive is xtensa_switch(), which takes a "new" context handle as an argument instead of getting it from the scheduler, returns an "old" context handle through a pointer (e.g. to save it to the old thread context), and restores the lock state (PS register) exactly as it is at entry instead of taking it as an argument. * The register spill code understands wrap-around register windows and can avoid spilling A4-A15 registers when they are unused by the interrupted function, saving as much as 48 bytes of stack space on the interrupted stacks. * The "spill register windows" routine is entirely different, using a different mechanism, and is MUCH FASTER (to the tune of almost 200 cycles). See notes in comments. * Even better, interrupt entry can be done via a clever "cross stack call" I worked up, meaning that the interrupted thread's registers do not need to be spilled at all until they are naturally pushed out by the interrupt handler or until we return from the interrupt into a different thread. This is a big efficiency win for tiny interrupts (e.g. timers), and a big latency win for all interrupts. * Interrupt entry is 100% symmetric with respect to medium/high interrupts, avoiding the problems seen with hooking high priority interrupts with the current code (e.g. ESP-32's watchdog driver). * Much smaller code size. No cut and paste assembly. No use of HAL calls. * Assumes "XEA2" interrupt architecture, the register window extension (i.e. no CALL0 ABI), and the "high priority interrupts" extension.
Does not support the legacy processor variants for which we have no targets. The old code has some stuff in there to support this, but it seems bitrotten, untestable, and I'm all but certain it doesn't work. Note that this simply adds the primitives to the existing tree in a form where they can be unit tested. It does not replace the existing interrupt/exception handling or _Swap() implementation. Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
2017-12-07 15:01:33 -08:00 · 2017-12-07 15:01:33 -08:00 · a34f884f23
commit a34f884f23
parent 8dca7ae587
6 changed files with 717 additions and 0 deletions
--- a/arch/xtensa/core/xtensa-asm2-util.S
+++ b/arch/xtensa/core/xtensa-asm2-util.S
@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2017, Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <xtensa-asm2-s.h>
+
+/*
+ * xtensa_save_high_regs
+ *
+ * Call with CALL0; A2, A3 and SAR are clobbered.  Pushes the high
+ * A4-A15 GPRs to the stack in quads if needed (i.e. if those registers
+ * are not part of wrapped-around frames higher up the call stack),
+ * then pushes the entry-time A1.  Returns to the caller with the stack
+ * pointer HAVING BEEN MODIFIED: A1 points at that saved-A1 word.
+ */
+.global xtensa_save_high_regs
+.align 4
+xtensa_save_high_regs:
+	/* Generate a rotated (modulo NREGS/4 bits!) WINDOWSTART in A2
+	 * by duplicating the bits twice and shifting down by WINDOWBASE
+	 * bits.  Now the LSB is the register quad at WINDOWBASE.
+	 */
+	rsr.WINDOWSTART a2
+	slli a3, a2, (XCHAL_NUM_AREGS / 4)
+	or a2, a2, a3
+	rsr.WINDOWBASE a3
+	ssr a3 /* shift amount = WINDOWBASE; overwrites SAR */
+	srl a2, a2
+
+	mov a3, a1 /* Stash our original stack pointer */
+
+	/* For the next three bits in WINDOWSTART (which correspond to
+	 * the A4-A7, A8-A11 and A12-A15 quads), if we find a one,
+	 * that means that the quad is owned by a wrapped-around call
+	 * in the registers, so we don't need to spill it or any
+	 * further registers from the GPRs and can skip to the end.
+	 */
+	bbsi a2, 1, _high_gpr_spill_done /* bit 1: A4-A7 quad */
+	addi a1, a1, -16 /* allocate one quad (16 bytes) */
+	s32i a4, a1, 0
+	s32i a5, a1, 4
+	s32i a6, a1, 8
+	s32i a7, a1, 12
+
+	bbsi a2, 2, _high_gpr_spill_done /* bit 2: A8-A11 quad */
+	addi a1, a1, -16
+	s32i a8, a1, 0
+	s32i a9, a1, 4
+	s32i a10, a1, 8
+	s32i a11, a1, 12
+
+	bbsi a2, 3, _high_gpr_spill_done /* bit 3: A12-A15 quad */
+	addi a1, a1, -16
+	s32i a12, a1, 0
+	s32i a13, a1, 4
+	s32i a14, a1, 8
+	s32i a15, a1, 12
+
+_high_gpr_spill_done:
+	/* Push the original stack pointer (stashed in A3) so we know
+	 * at restore time how many registers were spilled, then
+	 * return, leaving the modified SP in A1.
+	 */
+	addi a1, a1, -4
+	s32i a3, a1, 0
+
+	ret
+
+/*
+ * xtensa_restore_high_regs
+ *
+ * Does the inverse of xtensa_save_high_regs: takes in A1 the stack
+ * pointer that call produced, reloads whichever A4-A15 quads were
+ * spilled, and sets A1 back to the value it had at the earlier call.
+ * Call with CALL0; A2/A3 are clobbered as scratch.
+ */
+.global xtensa_restore_high_regs
+.align 4
+xtensa_restore_high_regs:
+	/* pop our "original" stack pointer into a2, stash in a3 also */
+	l32i a2, a1, 0
+	addi a1, a1, 4 /* A1 = lowest spilled quad, or original SP */
+	mov a3, a2
+
+	beq a1, a2, _high_restore_done /* equal: no quads were spilled */
+	addi a2, a2, -16 /* A2 walks down from the original SP */
+	l32i a4, a2, 0
+	l32i a5, a2, 4
+	l32i a6, a2, 8
+	l32i a7, a2, 12
+
+	beq a1, a2, _high_restore_done /* equal: only A4-A7 spilled */
+	addi a2, a2, -16
+	l32i a8, a2, 0
+	l32i a9, a2, 4
+	l32i a10, a2, 8
+	l32i a11, a2, 12
+
+	beq a1, a2, _high_restore_done /* equal: only A4-A11 spilled */
+	addi a2, a2, -16
+	l32i a12, a2, 0
+	l32i a13, a2, 4
+	l32i a14, a2, 8
+	l32i a15, a2, 12
+
+_high_restore_done:
+	mov a1, a3 /* Original stack */
+	ret
+
+/*
+ * _restore_context
+ *
+ * Arrive here via a jump.  Enters into the restored context and does
+ * not return.  A1 should have a context pointer in it as received
+ * from switch or an interrupt exit.  Interrupts must be disabled,
+ * and register windows should have been spilled.  A0 is used as
+ * scratch (including as the CALL0 return address) until reloaded.
+ *
+ * Note that exit from the restore is done with the RFI instruction,
+ * using the EPCn/EPSn registers.  Those will have been saved already
+ * by any interrupt entry so they are safe to use.  Note that EPC1 and
+ * RFE are NOT usable (they can't preserve PS).  Per the ISA spec, all
+ * RFI levels do the same thing and differ only in the special
+ * registers used to hold PC/PS, but Qemu has been observed to behave
+ * strangely when RFI doesn't "return" to an INTLEVEL strictly lower
+ * than it started from.  So pick level 6 (the highest that works on
+ * Qemu, hardware doesn't care so it doesn't matter).  In theory we
+ * should test to be able to support hardware with less than 6 levels,
+ * though...
+ */
+.global _restore_context
+_restore_context:
+	call0 xtensa_restore_high_regs /* A1 now = base save area */
+
+	l32i a0, a1, BSA_PC_OFF
+	wsr.EPC6 a0 /* resume PC, consumed by the rfi below */
+	l32i a0, a1, BSA_PS_OFF
+	wsr.EPS6 a0 /* resume PS, consumed by the rfi below */
+
+	l32i a0, a1, BSA_SAR_OFF
+	wsr.SAR a0
+#if XCHAL_HAVE_LOOPS
+	l32i a0, a1, BSA_LBEG_OFF
+	wsr.LBEG a0
+	l32i a0, a1, BSA_LEND_OFF
+	wsr.LEND a0
+	l32i a0, a1, BSA_LCOUNT_OFF
+	wsr.LCOUNT a0
+#endif
+	rsync /* let the special register writes above settle */
+
+	l32i a0, a1, BSA_A0_OFF
+	l32i a2, a1, BSA_A2_OFF
+	l32i a3, a1, BSA_A3_OFF
+	addi a1, a1, BASE_SAVE_AREA_SIZE /* pop the base save area */
+
+	rfi 6
+
+/*
+ * void xtensa_switch(void *new, void **old_return);
+ *
+ * Context switches into the previously-saved "new" handle, placing
+ * the saved "old" handle into the address provided by old_return.
+ */
+.global xtensa_switch
+.align 4
+xtensa_switch:
+	entry a1, 16
+	SPILL_ALL_WINDOWS
+	addi a1, a1, -BASE_SAVE_AREA_SIZE /* A1 = base save area */
+
+	/* Stash our A0/2/3 and the shift/loop registers into the base
+	 * save area so they get restored as they are now.  A2/A3
+	 * don't actually get used post-restore, but they need to be
+	 * stashed across the xtensa_save_high_regs call and this is a
+	 * convenient place.
+	 */
+	s32i a0, a1, BSA_A0_OFF
+	s32i a2, a1, BSA_A2_OFF /* A2 = "new" argument */
+	s32i a3, a1, BSA_A3_OFF /* A3 = "old_return" argument */
+	ODD_REG_SAVE
+
+	/* Stash our PS register contents and a "restore" PC. */
+	rsr.PS a0
+	s32i a0, a1, BSA_PS_OFF
+	movi a0, _switch_restore_pc
+	s32i a0, a1, BSA_PC_OFF
+
+	/* Now the high registers */
+	call0 xtensa_save_high_regs /* clobbers A0, A2, A3; moves A1 */
+
+	/* Reload old_return (A3) from the base save area, found via the
+	 * pointer pushed last by xtensa_save_high_regs, and store the
+	 * current A1 through it as the "old" context handle.
+	 */
+	l32i a2, a1, 0 /* A2 = base save area pointer */
+	l32i a3, a2, BSA_A3_OFF
+	s32i a1, a3, 0 /* *old_return = A1 */
+
+	/* Switch stack pointer to the "new" handle (from the A2 spill
+	 * slot) and restore.  The jump to _restore_context does not
+	 * return as such, but we arrange for the restored "next"
+	 * address to be immediately after for sanity.
+	 */
+	l32i a1, a2, BSA_A2_OFF
+	j _restore_context
+_switch_restore_pc:
+	retw