diff --git a/arch/xtensa/core/CMakeLists.txt b/arch/xtensa/core/CMakeLists.txt index 38748f6fd30..af6fbbab4b5 100644 --- a/arch/xtensa/core/CMakeLists.txt +++ b/arch/xtensa/core/CMakeLists.txt @@ -11,6 +11,8 @@ zephyr_sources( xtensa_vectors.S xt_zephyr.S window_vectors.S + xtensa-asm2-util.S + xtensa-asm2.c ) zephyr_sources_ifndef(CONFIG_ATOMIC_OPERATIONS_C atomic.S) diff --git a/arch/xtensa/core/xtensa-asm2-util.S b/arch/xtensa/core/xtensa-asm2-util.S new file mode 100644 index 00000000000..189359978e6 --- /dev/null +++ b/arch/xtensa/core/xtensa-asm2-util.S @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include + +/* + * xtensa_save_high_regs + * + * Call with CALL0, with A2/A3 available as scratch. Pushes the high + * A4-A15 GPRs to the stack if needed (i.e. if those registers are not + * part of wrapped-around frames higher up the call stack), returning + * to the caller with the stack pointer HAVING BEEN MODIFIED to + * contain them. + */ +.global xtensa_save_high_regs +.align 4 +xtensa_save_high_regs: + /* Generate a rotated (modulo NREGS/4 bits!) WINDOWSTART in A2 + * by duplicating the bits twice and shifting down by WINDOWBASE + * bits. Now the LSB is the register quad at WINDOWBASE. + */ + rsr.WINDOWSTART a2 + slli a3, a2, (XCHAL_NUM_AREGS / 4) + or a2, a2, a3 + rsr.WINDOWBASE a3 + ssr a3 + srl a2, a2 + + mov a3, a1 /* Stash our original stack pointer */ + + /* For the next three bits in WINDOWSTART (which correspond to + * the A4-A7, A8-A11 and A12-A15 quads), if we find a one, + * that means that the quad is owned by a wrapped-around call + * in the registers, so we don't need to spill it or any + * further registers from the GPRs and can skip to the end. + */ + bbsi a2, 1, _high_gpr_spill_done + addi a1, a1, -16 + s32i a4, a1, 0 + s32i a5, a1, 4 + s32i a6, a1, 8 + s32i a7, a1, 12 + + bbsi a2, 2, _high_gpr_spill_done + addi a1, a1, -16 + s32i a8, a1, 0 + s32i a9, a1, 4 + s32i a10, a1, 8 + s32i a11, a1, 12 + + bbsi a2, 3, _high_gpr_spill_done + addi a1, a1, -16 + s32i a12, a1, 0 + s32i a13, a1, 4 + s32i a14, a1, 8 + s32i a15, a1, 12 + +_high_gpr_spill_done: + /* Push the original stack pointer so we know at restore + * time how many registers were spilled, then return, leaving the + * modified SP in A1. + */ + addi a1, a1, -4 + s32i a3, a1, 0 + + ret + +/* + * xtensa_restore_high_regs + * + * Does the inverse of xtensa_save_high_regs, taking a stack pointer + * in A1 that resulted and restoring the A4-A15 state (and the stack + * pointer) to the state they had at the earlier call. Call with + * CALL0, leaving A2/A3 available as scratch. + */ +.global xtensa_restore_high_regs +.align 4 +xtensa_restore_high_regs: + /* pop our "original" stack pointer into a2, stash in a3 also */ + l32i a2, a1, 0 + addi a1, a1, 4 + mov a3, a2 + + beq a1, a2, _high_restore_done + addi a2, a2, -16 + l32i a4, a2, 0 + l32i a5, a2, 4 + l32i a6, a2, 8 + l32i a7, a2, 12 + + beq a1, a2, _high_restore_done + addi a2, a2, -16 + l32i a8, a2, 0 + l32i a9, a2, 4 + l32i a10, a2, 8 + l32i a11, a2, 12 + + beq a1, a2, _high_restore_done + addi a2, a2, -16 + l32i a12, a2, 0 + l32i a13, a2, 4 + l32i a14, a2, 8 + l32i a15, a2, 12 + +_high_restore_done: + mov a1, a3 /* Original stack */ + ret + +/* + * _restore_context + * + * Arrive here via a jump. Enters into the restored context and does + * not return. A1 should have a context pointer in it as received + * from switch or an interrupt exit. 
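(Aside, not part of the patch: the WINDOWSTART bit manipulation at the top of xtensa_save_high_regs above, and the decision of how many high-register quads get pushed, can be modeled in C roughly as below. The helper names are hypothetical; only XCHAL_NUM_AREGS comes from the core configuration header.)

#include <xtensa/config/core-isa.h>	/* XCHAL_NUM_AREGS */

static unsigned int rotated_windowstart(unsigned int windowstart,
					unsigned int windowbase)
{
	const unsigned int nquads = XCHAL_NUM_AREGS / 4;

	/* Duplicate the WINDOWSTART bits and shift down by WINDOWBASE
	 * so that bit 0 is the quad the current frame occupies.
	 */
	return ((windowstart << nquads) | windowstart) >> windowbase;
}

static int high_quads_to_spill(unsigned int rotated)
{
	int quads = 0;

	/* Mirrors the three bbsi tests above: stop at the first quad
	 * already owned by a wrapped-around call frame.
	 */
	for (int bit = 1; bit <= 3; bit++) {
		if (rotated & (1u << bit)) {
			break;
		}
		quads++;	/* A4-A7, then A8-A11, then A12-A15 */
	}

	return quads;
}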
Interrupts must be disabled, + * and register windows should have been spilled. + * + + * Note that exit from the restore is done with the RFI instruction, + * using the EPCn/EPSn registers. Those will have been saved already + * by any interrupt entry so they are save to use. Note that EPC1 and + * RFE are NOT usable (they can't preserve PS). Per the ISA spec, all + * RFI levels do the same thing and differ only in the special + * registers used to hold PC/PS, but Qemu has been observed to behave + * strangely when RFI doesn't "return" to a INTLEVEL strictly lower + * than it started from. So pick level 6 (the highest that works on + * Qemu, hardware doesn't care so it doesn't matter). In theory we + * should test to be able to support hardware with less than 6 levels, + * though... + */ +.global _restore_context +_restore_context: + call0 xtensa_restore_high_regs + + l32i a0, a1, BSA_PC_OFF + wsr.EPC6 a0 + l32i a0, a1, BSA_PS_OFF + wsr.EPS6 a0 + + l32i a0, a1, BSA_SAR_OFF + wsr.SAR a0 +#if XCHAL_HAVE_LOOPS + l32i a0, a1, BSA_LBEG_OFF + wsr.LBEG a0 + l32i a0, a1, BSA_LEND_OFF + wsr.LEND a0 + l32i a0, a1, BSA_LCOUNT_OFF + wsr.LCOUNT a0 +#endif + rsync + + l32i a0, a1, BSA_A0_OFF + l32i a2, a1, BSA_A2_OFF + l32i a3, a1, BSA_A3_OFF + addi a1, a1, BASE_SAVE_AREA_SIZE + + rfi 6 + +/* + * void xtensa_switch(void *new, void **old_return); + * + * Context switches into the prevoiusly-saved "new" handle, placing + * the saved "old" handle into the address provided by old_return. + */ +.global xtensa_switch +.align 4 +xtensa_switch: + entry a1, 16 + SPILL_ALL_WINDOWS + addi a1, a1, -BASE_SAVE_AREA_SIZE + + /* Stash our A0/2/3 and the shift/loop registers into the base + * save area so they get restored as they are now. A2/A3 + * don't actually get used post-restore, but they need to be + * stashed across the xtensa_save_high_regs call and this is a + * convenient place. + */ + s32i a0, a1, BSA_A0_OFF + s32i a2, a1, BSA_A2_OFF + s32i a3, a1, BSA_A3_OFF + ODD_REG_SAVE + + /* Stash our PS register contents and a "restore" PC. */ + rsr.PS a0 + s32i a0, a1, BSA_PS_OFF + movi a0, _switch_restore_pc + s32i a0, a1, BSA_PC_OFF + + /* Now the high registers */ + call0 xtensa_save_high_regs + + /* Restore the A3 argument we spilled earlier (via the base + * save pointer pushed at the bottom of the stack) and set the + * stack to the "new" context out of the A2 spill slot. + */ + l32i a2, a1, 0 + l32i a3, a2, BSA_A3_OFF + s32i a1, a3, 0 + + /* Switch stack pointer and restore. The jump to + * _restore_context does not return as such, but we arrange + * for the restored "next" address to be immediately after for + * sanity. + */ + l32i a1, a2, BSA_A2_OFF + j _restore_context +_switch_restore_pc: + retw diff --git a/arch/xtensa/core/xtensa-asm2.c b/arch/xtensa/core/xtensa-asm2.c new file mode 100644 index 00000000000..6fe0221dcf9 --- /dev/null +++ b/arch/xtensa/core/xtensa-asm2.c @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#include +#include +#include +#include + +void *xtensa_init_stack(int *stack_top, + void (*entry)(void *, void *, void *), + void *arg1, void *arg2, void *arg3) +{ + /* We cheat and shave 16 bytes off, the top four words are the + * A0-A3 spill area for the caller of the entry function, + * which doesn't exist. It will never be touched, so we + * arrange to enter the function with a CALLINC of 1 and a + * stack pointer 16 bytes above the top, so its ENTRY at the + * start will decrement the stack pointer by 16. 
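(Aside, not part of the patch: a minimal sketch of how the xtensa_switch() primitive defined above might be driven from C. The switch_to_thread() wrapper and the switch_handle field name are assumptions for illustration only.)

#include <kernel.h>

extern void xtensa_switch(void *switch_to, void **switched_from);

static void switch_to_thread(struct k_thread *next, struct k_thread *old)
{
	/* Saves the outgoing context, stores its handle through the
	 * second argument, and resumes "next"; this call returns only
	 * when the old context is eventually switched back in.
	 */
	xtensa_switch(next->switch_handle, &old->switch_handle);
}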
+ */ + const int bsasz = BASE_SAVE_AREA_SIZE - 16; + void **bsa = (void **) (((char *) stack_top) - bsasz); + + memset(bsa, 0, bsasz); + + bsa[BSA_PC_OFF/4] = entry; + bsa[BSA_PS_OFF/4] = (void *)(PS_WOE | PS_UM | PS_CALLINC(1)); + + /* Arguments. Remember these start at A6, which will be + * rotated into A2 by the ENTRY instruction that begins the + * entry function. And A4-A7 and A8-A11 are optional quads + * that live below the BSA! + */ + bsa[-1] = arg2; /* a7 */ + bsa[-2] = arg1; /* a6 */ + bsa[-3] = 0; /* a5 */ + bsa[-4] = 0; /* a4 */ + + bsa[-5] = 0; /* a11 */ + bsa[-6] = 0; /* a10 */ + bsa[-7] = 0; /* a9 */ + bsa[-8] = arg3; /* a8 */ + + /* Finally push the BSA pointer and return the stack pointer + * as the handle + */ + bsa[-9] = bsa; + return &bsa[-9]; +} diff --git a/arch/xtensa/include/xtensa-asm2-context.h b/arch/xtensa/include/xtensa-asm2-context.h new file mode 100644 index 00000000000..53fe99b0ba0 --- /dev/null +++ b/arch/xtensa/include/xtensa-asm2-context.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _XTENSA_ASM2_CONTEXT_H +#define _XTENSA_ASM2_CONTEXT_H + +#include +#include + +/* + * Stack frame layout for a saved processor context, in memory order, + * high to low address: + * + * SP-0 <-- Interrupted stack pointer points here + * + * SP-4 Caller A3 spill slot \ + * SP-8 Caller A2 spill slot | + * SP-12 Caller A1 spill slot + (Part of ABI standard) + * SP-16 Caller A0 spill slot / + * + * SP-20 Saved A3 + * SP-24 Saved A2 + * SP-28 Unused (not "Saved A1" because the SP is saved externally as a handle) + * SP-32 Saved A0 + * + * SP-36 Saved PC (address to jump to following restore) + * SP-40 Saved/interrupted PS special register + * + * SP-44 Saved SAR special register + * + * SP-48 Saved LBEG special register (if loops enabled) + * SP-52 Saved LEND special register (if loops enabled) + * SP-56 Saved LCOUNT special register (if loops enabled) + * + * (The above fixed-size region is known as the "base save area" in the + * code below) + * + * - Saved A7 \ + * - Saved A6 | + * - Saved A5 +- If not in-use by another frame + * - Saved A4 / + * + * - Saved A11 \ + * - Saved A10 | + * - Saved A9 +- If not in-use by another frame + * - Saved A8 / + * + * - Saved A15 \ + * - Saved A14 | + * - Saved A13 +- If not in-use by another frame + * - Saved A12 / + * + * - Saved intermediate stack pointer (points to low word of base save + * area, i.e. the saved LCOUNT or SAR). The pointer to this value + * (i.e. the final stack pointer) is stored externally as the + * "restore handle" in the thread context. + * + * Essentially, you can recover a pointer to the BSA by loading *SP. + * Adding the fixed BSA size to that gets you back to the + * original/interrupted stack pointer. 
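(Aside, not part of the patch: the two recovery steps just described look like this in C. The helper names are hypothetical; BASE_SAVE_AREA_SIZE is the constant defined below.)

#include "xtensa-asm2-context.h"	/* BASE_SAVE_AREA_SIZE */

static inline void *context_bsa(void *handle)
{
	/* The handle is the final saved stack pointer; the word it
	 * points at is the pointer to the low word of the BSA.
	 */
	return *(void **)handle;
}

static inline void *context_interrupted_sp(void *handle)
{
	return (char *)context_bsa(handle) + BASE_SAVE_AREA_SIZE;
}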
+ */ + +#if XCHAL_HAVE_LOOPS +#define BASE_SAVE_AREA_SIZE 56 +#else +#define BASE_SAVE_AREA_SIZE 44 +#endif + +#define BSA_A3_OFF (BASE_SAVE_AREA_SIZE - 20) +#define BSA_A2_OFF (BASE_SAVE_AREA_SIZE - 24) +#define BSA_SCRATCH_OFF (BASE_SAVE_AREA_SIZE - 28) +#define BSA_A0_OFF (BASE_SAVE_AREA_SIZE - 32) +#define BSA_PC_OFF (BASE_SAVE_AREA_SIZE - 36) +#define BSA_PS_OFF (BASE_SAVE_AREA_SIZE - 40) +#define BSA_SAR_OFF (BASE_SAVE_AREA_SIZE - 44) +#define BSA_LBEG_OFF (BASE_SAVE_AREA_SIZE - 48) +#define BSA_LEND_OFF (BASE_SAVE_AREA_SIZE - 52) +#define BSA_LCOUNT_OFF (BASE_SAVE_AREA_SIZE - 56) + +#endif /* _XTENSA_ASM2_CONTEXT_H */ diff --git a/arch/xtensa/include/xtensa-asm2-s.h b/arch/xtensa/include/xtensa-asm2-s.h new file mode 100644 index 00000000000..c37d359fd0c --- /dev/null +++ b/arch/xtensa/include/xtensa-asm2-s.h @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "xtensa-asm2-context.h" + +/* Assembler header! This file contains macros designed to be included + * only by the assembler. + */ + +/* + * SPILL_ALL_WINDOWS + * + * Spills all windowed registers (i.e. registers not visible as + * A0-A15) to their ABI-defined spill regions on the stack. + * + * Unlike the Xtensa HAL implementation, this code requires that the + * EXCM and WOE bit be enabled in PS, and relies on repeated hardware + * exception handling to do the register spills. The trick is to do a + * noop write to the high registers, which the hardware will trap + * (into an overflow exception) in the case where those registers are + * already used by an existing call frame. Then it rotates the window + * and repeats until all but the A0-A3 registers of the original frame + * are guaranteed to be spilled, eventually rotating back around into + * the original frame. Advantages: + * + * - Vastly smaller code size + * + * - More easily maintained if changes are needed to window over/underflow + * exception handling. + * + * - Requires no scratch registers to do its work, so can be used safely in any + * context. + * + * - If the WOE bit is not enabled (for example, in code written for + * the CALL0 ABI), this becomes a silent noop and operates compatbily. + * + * - In memory protection situations, this relies on the existing + * exception handlers (and thus their use of the L/S32E + * instructions) to execute stores in the protected space. AFAICT, + * the HAL routine does not handle this situation and isn't safe: it + * will happily write through the "stack pointers" found in + * registers regardless of where they might point. + * + * - Hilariously it's ACTUALLY FASTER than the HAL routine. And not + * just a little bit, it's MUCH faster. With a mostly full register + * file on an LX6 core (ESP-32) I'm measuring 145 cycles to spill + * registers with this vs. 279 (!) to do it with + * xthal_spill_windows(). Apparently Xtensa exception handling is + * really fast, and no one told their software people. + * + * Note that as with the Xtensa HAL spill routine, and unlike context + * switching code on most sane architectures, the intermediate states + * here will have an invalid stack pointer. That means that this code + * must not be preempted in any context (i.e. all Zephyr situations) + * where the interrupt code will need to use the stack to save the + * context. But unlike the HAL, which runs with exceptions masked via + * EXCM, this will not: hit needs the overflow handlers unmasked. 
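(Aside, not part of the patch: for readers, the BSA_*_OFF offsets defined in xtensa-asm2-context.h above correspond to the struct view below, low address first. The struct name is illustrative and nothing in the patch uses it; the asm code works from the raw offsets.)

#include <xtensa/config/core-isa.h>	/* XCHAL_HAVE_LOOPS */

struct bsa_view {			/* low address first */
#if XCHAL_HAVE_LOOPS
	unsigned int lcount;		/* BSA_LCOUNT_OFF */
	unsigned int lend;		/* BSA_LEND_OFF */
	unsigned int lbeg;		/* BSA_LBEG_OFF */
#endif
	unsigned int sar;		/* BSA_SAR_OFF */
	unsigned int ps;		/* BSA_PS_OFF */
	unsigned int pc;		/* BSA_PC_OFF */
	unsigned int a0;		/* BSA_A0_OFF */
	unsigned int scratch;		/* BSA_SCRATCH_OFF (unused "A1" slot) */
	unsigned int a2;		/* BSA_A2_OFF */
	unsigned int a3;		/* BSA_A3_OFF */
	unsigned int caller_spill[4];	/* caller's A0-A3 ABI spill region */
};

/* sizeof(struct bsa_view) == BASE_SAVE_AREA_SIZE in both configurations */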
Use + * INTLEVEL instead (which, happily, is what Zephyr's locking does + * anyway). + */ +.macro SPILL_ALL_WINDOWS +#if XCHAL_NUM_AREGS == 64 + and a12, a12, a12 + rotw 3 + and a12, a12, a12 + rotw 3 + and a12, a12, a12 + rotw 3 + and a12, a12, a12 + rotw 3 + and a12, a12, a12 + rotw 4 +#elif XCHAL_NUM_AREGS == 32 + and a12, a12, a12 + rotw 3 + and a12, a12, a12 + rotw 3 + and a4, a4, a4 + rotw 2 +#else +#error Unrecognized XCHAL_NUM_AREGS +#endif +.endm + +/* + * ODD_REG_SAVE + * + * Stashes the oddball shift/loop context registers in the base save + * area pointed to by the current stack pointer. On exit, A0 will + * have been modified but A2/A3 have not, and the shift/loop + * instructions can be used freely (though note loops don't work in + * exceptions for other reasons!). + * + * Does not populate or modify the PS/PC save locations. + */ +.macro ODD_REG_SAVE + rsr.SAR a0 + s32i a0, a1, BSA_SAR_OFF +#if XCHAL_HAVE_LOOPS + rsr.LBEG a0 + s32i a0, a1, BSA_LBEG_OFF + rsr.LEND a0 + s32i a0, a1, BSA_LEND_OFF + rsr.LCOUNT a0 + s32i a0, a1, BSA_LCOUNT_OFF +#endif +.endm + +/* + * CROSS_STACK_CALL + * + * Sets the stack up carefully such that a "cross stack" call can spill + * correctly, then invokes an immediate handler. Note that: + * + * 0. When spilling a frame, functions find their callEE's stack pointer + * (to save A0-A3) from registers. But they find their + * already-spilled callER's stack pointer (to save higher GPRs) from + * their own stack memory. + * + * 1. The function that was interrupted ("interruptee") does not need to + * be spilled, because it already has been as part of the context + * save. So it doesn't need registers allocated for it anywhere. + * + * 2. Interruptee's caller needs to spill into the space below the + * interrupted stack frame, which means that the A1 register it finds + * below it needs to contain the old/interrupted stack and not the + * context saved one. + * + * 3. The ISR dispatcher (called "underneath" interruptee) needs to spill + * high registers into the space immediately above its own stack frame, + * so it needs to find a caller with the "new" stack pointer instead. + * + * We make this work by inserting TWO 4-register frames between + * "interruptee's caller" and "ISR dispatcher". The top one (which + * occupies the slot formerly held by "interruptee", whose registers + * were saved via external means) holds the "interrupted A1" and the + * bottom has the "top of the interrupt stack" which can be either the + * word above a new memory area (when handling an interrupt from user + * mode) OR the existing "post-context-save" stack pointer (when + * handling a nested interrupt). The code works either way. Because + * these are both only 4-registers, neither needs its own caller for + * spilling. + * + * The net cost is 32 wasted bytes on the interrupt stack frame to + * spill our two "phantom frames" (actually not quite, as we'd need a + * few of those words used somewhere for tracking the stack pointers + * anyway). But the benefit is that NO REGISTER FRAMES NEED TO BE + * SPILLED on interrupt entry. And if we return back into the same + * context we interrupted (a common case) no windows need to be + * explicitly spilled at all. And in fact in the case where the ISR + * uses significant depth on its own stack, the interrupted frames + * will be spilled naturally as a standard cost of a function call, + * giving register windows something like "zero cost interrupts". 
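(Aside, not part of the patch: in C terms, the "don't get preempted while windows are being spilled" rule from the SPILL_ALL_WINDOWS notes above amounts to holding Zephyr's existing INTLEVEL-based lock around any such spill. The xtensa_spill_reg_windows() wrapper here is hypothetical.)

#include <kernel.h>

extern void xtensa_spill_reg_windows(void);	/* hypothetical asm wrapper */

static void spill_windows_safely(void)
{
	/* irq_lock() raises INTLEVEL but leaves EXCM clear, so the
	 * window overflow exceptions the spill relies on still fire.
	 */
	unsigned int key = irq_lock();

	xtensa_spill_reg_windows();

	irq_unlock(key);
}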
+ * + * FIXME: a terrible awful really nifty idea to fix the stack waste + * problem would be to use a SINGLE frame between the two stacks, + * pre-spill it with one stack pointer for the "lower" call to see and + * leave the register SP in place for the "upper" frame to use. + * Would require modifying the Window{Over|Under}flow4 exceptions to + * know not to spill/fill these special frames, but that's not too + * hard, maybe... + * + * Enter this macro with a valid "context saved" pointer (i.e. SP + * should point to a stored pointer which points to one BSA below the + * interrupted/old stack) in A1, a handler function in A2, and a "new" + * stack pointer (i.e. a pointer to the word ABOVE the allocated stack + * area) in A3. On return A0/1 will be unchanged, A2 has the return + * value of the called function, and A3 is clobbered. A4-A15 become + * part of called frames and MUST NOT BE IN USE by the code that + * expands this macro. The called function gets the context save + * handle in A1 as it's first argument. + */ +.macro CROSS_STACK_CALL + mov a6, a3 /* place "new sp" in the next frame's A2 */ + mov a10, a1 /* pass "context handle" in 2nd frame's A2 */ + mov a3, a1 /* stash it locally in A3 too */ + mov a11, a2 /* handler in 2nd frame's A3, next frame's A7 */ + + /* Recover the interrupted SP from the BSA */ + l32i a1, a1, 0 + addi a1, a1, BASE_SAVE_AREA_SIZE + + call4 _xstack_call0_\@ + mov a1, a3 /* restore original SP */ + mov a2, a6 /* copy return value */ + j _xstack_returned_\@ +.align 4 +_xstack_call0_\@: + /* We want an ENTRY to set a bit in windowstart and do the + * rotation, but we want our own SP + */ + entry a1, 16 + mov a1, a2 + call4 _xstack_call1_\@ + mov a2, a6 /* copy return value */ + retw +.align 4 +_xstack_call1_\@: + /* Remember the handler is going to do our ENTRY, so the + * handler pointer is still in A6 (not A2) even though this is + * after the second CALL4. + */ + jx a7 +_xstack_returned_\@: +.endm + +/* Entry setup for all exceptions and interrupts. Arrive here with + * the stack pointer decremented across a base save area, A0-A3 and + * PS/PC already spilled to the stack in the BSA, and A2 containing a + * level-specific C handler function. + * + * This is a macro (to allow for unit testing) that expands to a + * handler body to which the vectors can jump. It takes two static + * (!) arguments: a special register name (which should be set up to + * point to some kind of per-CPU record struct) and offsets within + * that struct which contains an interrupt stack top and a "nest + * count" word. + */ +.macro EXCINT_HANDLER SR, NEST_OFF, INTSTACK_OFF + /* A2 contains our handler function which will get clobbered + * by the save. Stash it into the unused "a1" slot in the + * BSA and recover it immediately after. Kind of a hack. + */ + s32i a2, a1, BSA_SCRATCH_OFF + + call0 xtensa_save_high_regs + + l32i a2, a1, 0 + l32i a2, a2, BSA_SCRATCH_OFF + + /* Unmask EXCM bit so C code can spill/fill in window + * exceptions. Note interrupts are already fully masked by + * INTLEVEL, so this is safe. + */ + rsr.PS a0 + movi a3, ~16 + and a0, a0, a3 + wsr.PS a0 + rsync + + /* A1 already contains our saved stack, and A2 our handler. + * So all that's needed for CROSS_STACK_CALL is to put the + * "new" stack into A3. This can be either a copy of A1 or an + * entirely new area depending on whether we find a 1 in our + * SR[off] macro argument. 
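(Aside, not part of the patch: the per-CPU record that the SR/NEST_OFF/INTSTACK_OFF macro arguments describe might look like the struct below; the chosen special register would be loaded with a pointer to one of these at early init, and the two macro arguments are simply the byte offsets of its fields. Field names and layout are assumptions.)

struct xtensa_cpu_state {
	unsigned int nest_count;	/* passed as NEST_OFF; 0 = not in an ISR */
	char *irq_stack_top;		/* passed as INTSTACK_OFF; the word
					 * above the interrupt stack area
					 */
};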
+ */ + rsr.\SR a3 + l32i a0, a3, \NEST_OFF + beqz a0, _switch_stacks_\@ + + /* Use the same stack, just copy A1 to A3 after incrementing NEST */ + addi a0, a0, 1 + s32i a0, a3, \NEST_OFF + mov a3, a1 + j _do_call_\@ + +_switch_stacks_\@: + addi a0, a0, 1 + s32i a0, a3, \NEST_OFF + l32i a3, a3, \INTSTACK_OFF + +_do_call_\@: + CROSS_STACK_CALL + + /* Decrement nest count */ + rsr.\SR a3 + l32i a0, a3, \NEST_OFF + addi a0, a0, -1 + s32i a0, a3, \NEST_OFF + + /* Last trick: the called function returned the "next" handle + * to restore to in A6 (the call4'd function's A2). If this + * is not the same handle as we started with, we need to do a + * register spill before restoring, for obvious reasons. + * Remember to mask interrupts (which have been unmasked + * during the handler execution) while we muck with the + * windows. The restore will unmask them as needed. + */ + beq a6, a1, _restore_\@ + rsil a0, XCHAL_NMILEVEL + SPILL_ALL_WINDOWS + mov a1, a6 + +_restore_\@: + j _restore_context +.endm + +/* Defines an exception/interrupt vector for a specified level. Saves + * off the interrupted A0-A3 registers and the per-level PS/PC + * registers to the stack before jumping to a handler (defined with + * EXCINT_HANDLER) to do the rest of the work. + * + * Arguments are a numeric interrupt level and symbol names for the + * entry code (defined via EXCINT_HANDLER) and a C handler for this + * particular level. + * + * FIXME: needs special handling for exceptions (level 1): it's "EPC" + * and not "EPC1" (though IIRC the assembler makes this work). + * And there is no EPS: instead PS is simply the interrupted PS + * with EXCM flipped from 0 to 1. + * + * FIXME: needs better locking. The hardware will NOT mask out "high + * priority" exceptions on arrival here, so we have to do it ourselves + * with RSIL. + */ +.macro DEF_EXCINT LVL, ENTRY_SYM, C_HANDLER_SYM +.pushsection .Level\LVL\()InterruptVector.text, "ax" +.global _Level\LVL\()Vector +_Level\LVL\()Vector: + addi a1, a1, -BASE_SAVE_AREA_SIZE + s32i a0, a1, BSA_A0_OFF + s32i a2, a1, BSA_A2_OFF + s32i a3, a1, BSA_A3_OFF + + rsr.EPS\LVL a0 + s32i a0, a1, BSA_PS_OFF + rsr.EPC\LVL a0 + s32i a0, a1, BSA_PC_OFF + + /* What's happening with this jump is that the L32R + * instruction to load a full 32 bit immediate must use an + * offset that is negative from PC. Normally the assembler + * fixes this up for you by putting the "literal pool" + * somewhere at the start of the section. But vectors start + * at a fixed address in their own section, and don't (in our + * current linker setup) have anywhere "definitely before + * vectors" to place immediates. Some platforms and apps will + * link by dumb luck, others won't. We add an extra jump just + * to clear space we know to be legal. + * + * The right way to fix this would be to use a "literal_prefix" + * to put the literals into a per-vector section, then link + * that section into the PREVIOUS vector's area right after + * the vector code. Requires touching a lot of linker scripts + * though. 
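(Aside, not part of the patch: the C handler reached through CROSS_STACK_CALL has the shape sketched below. It receives the saved-context handle and returns the handle to restore; returning a different handle is what triggers the spill-and-switch exit path above. The function name is illustrative.)

void *xtensa_excint_c_handler(void *interrupted)
{
	void *next = interrupted;

	/* ... dispatch the pending interrupt/exception here; a
	 * scheduling decision may pick a different thread's handle ...
	 */

	return next;	/* != interrupted => spill windows and switch */
}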
+ */ + j _after_imms\LVL\() +.align 4 +_handle_excint_imm\LVL: + .word \ENTRY_SYM +_c_handler_imm\LVL: + .word \C_HANDLER_SYM +_after_imms\LVL: + l32r a2, _c_handler_imm\LVL + l32r a0, _handle_excint_imm\LVL + jx a0 +.popsection +.endm diff --git a/arch/xtensa/include/xtensa-asm2.h b/arch/xtensa/include/xtensa-asm2.h new file mode 100644 index 00000000000..69d62f2d2c3 --- /dev/null +++ b/arch/xtensa/include/xtensa-asm2.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef _XTENSA_ASM2_H +#define _XTENSA_ASM2_H + +#include "xtensa-asm2-context.h" + +/** + * Initializes a stack area such that it can be "restored" later and + * begin running with the specified function and three arguments. The + * entry function takes three arguments to match the signature of + * Zephyr's k_thread_entry_t. Thread will start with EXCM clear and + * INTLEVEL set to zero (i.e. it's a user thread, we don't start with + * anything masked, so don't assume that!). + */ +void *xtensa_init_stack(int *stack_top, + void (*entry)(void *, void *, void *), + void *arg1, void *arg2, void *arg3); + +#endif /* _XTENSA_ASM2_H */
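Aside, not part of the patch: a hedged usage sketch of xtensa_init_stack() as declared above. The prepare_new_thread() wrapper, the entry function, and the argument values are illustrative only.

#include "xtensa-asm2.h"

static void thread_entry(void *p1, void *p2, void *p3)
{
	/* ... thread body; starts with EXCM clear and INTLEVEL == 0 ... */
}

void *prepare_new_thread(int *stack_top)
{
	/* The returned handle is what xtensa_switch()/_restore_context
	 * consume to run the thread for the first time.
	 */
	return xtensa_init_stack(stack_top, thread_entry,
				 (void *)1, (void *)2, (void *)3);
}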