diff --git a/arch/Kconfig b/arch/Kconfig
index eb7d98334eb..1f8d84e4e77 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -420,6 +420,13 @@ config ARCH_HAS_EXTRA_EXCEPTION_INFO
 config ARCH_HAS_GDBSTUB
         bool
 
+config ARCH_HAS_COHERENCE
+        bool
+        help
+          When selected, the architecture supports the
+          arch_mem_coherent() API and can link into incoherent/cached
+          memory using the ".cached" linker section.
+
 #
 # Other architecture related options
 #
diff --git a/include/linker/section_tags.h b/include/linker/section_tags.h
index 1b9693ec165..d794a3c6701 100644
--- a/include/linker/section_tags.h
+++ b/include/linker/section_tags.h
@@ -38,6 +38,14 @@
 #define __nocache
 #endif /* CONFIG_NOCACHE_MEMORY */
 
+#if defined(CONFIG_KERNEL_COHERENCE)
+#define __incoherent __in_section_unique(cached)
+#define __stackmem __incoherent
+#else
+#define __incoherent
+#define __stackmem Z_GENERIC_SECTION(.user_stacks)
+#endif /* CONFIG_KERNEL_COHERENCE */
+
 #endif /* !_ASMLANGUAGE */
 
 #endif /* ZEPHYR_INCLUDE_LINKER_SECTION_TAGS_H_ */
diff --git a/include/spinlock.h b/include/spinlock.h
index c5c21d8ef33..44d4c6abdad 100644
--- a/include/spinlock.h
+++ b/include/spinlock.h
@@ -118,6 +118,9 @@ static ALWAYS_INLINE k_spinlock_key_t k_spin_lock(struct k_spinlock *l)
 
 #ifdef CONFIG_SPIN_VALIDATE
         __ASSERT(z_spin_lock_valid(l), "Recursive spinlock %p", l);
+# ifdef CONFIG_KERNEL_COHERENCE
+        __ASSERT_NO_MSG(arch_mem_coherent(l));
+# endif
 #endif
 
 #ifdef CONFIG_SMP
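
The section tags and the spinlock assertion above are meant to be used together: under CONFIG_KERNEL_COHERENCE, ordinary kernel and application data is linked into the coherent (uncached) image, so a spinlock declared there passes the new arch_mem_coherent() check in k_spin_lock(), while large single-CPU buffers can be tagged __incoherent to stay in fast cached memory. A minimal sketch (the identifiers below are hypothetical, not part of this patch):

#include <kernel.h>
#include <spinlock.h>
#include <linker/section_tags.h>

/* Ordinary kernel data: placed in coherent (uncached) memory on a
 * CONFIG_KERNEL_COHERENCE build, so it is safe to hand to the kernel and
 * satisfies the arch_mem_coherent() assertion in k_spin_lock().
 */
static struct k_spinlock my_lock;
static int my_shared_count;

/* Large scratch area only touched by one CPU at a time: tag it __incoherent
 * so it stays in the cached section (the tag is a no-op when coherence
 * support is disabled).
 */
static __incoherent uint8_t my_scratch[4096];

void my_bump(void)
{
        k_spinlock_key_t key = k_spin_lock(&my_lock);

        my_shared_count++;
        k_spin_unlock(&my_lock, key);

        /* CPU-local work on the cached buffer happens outside the lock */
        my_scratch[0] = (uint8_t)my_shared_count;
}
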
diff --git a/include/sys/arch_interface.h b/include/sys/arch_interface.h
index 5104fa95985..8f96a96d954 100644
--- a/include/sys/arch_interface.h
+++ b/include/sys/arch_interface.h
@@ -691,6 +691,79 @@ FUNC_NORETURN void arch_syscall_oops(void *ssf);
 size_t arch_user_string_nlen(const char *s, size_t maxsize, int *err);
 #endif /* CONFIG_USERSPACE */
 
+/**
+ * @brief Detect memory coherence type
+ *
+ * Required when ARCH_HAS_COHERENCE is true. This function returns
+ * true if the byte pointed to lies within an architecture-defined
+ * "coherence region" (typically implemented with uncached memory) and
+ * can safely be used in multiprocessor code without explicit flush or
+ * invalidate operations.
+ *
+ * @note The result is for only the single byte at the specified
+ * address; this API is not required to check region boundaries or to
+ * expect aligned pointers. The expectation is that the calling code
+ * will have queried the appropriate address(es).
+ */
+#ifndef CONFIG_ARCH_HAS_COHERENCE
+static inline bool arch_mem_coherent(void *ptr)
+{
+        ARG_UNUSED(ptr);
+        return true;
+}
+#endif
+
+/**
+ * @brief Ensure cache coherence prior to context switch
+ *
+ * Required when ARCH_HAS_COHERENCE is true. On cache-incoherent
+ * multiprocessor architectures, thread stacks are cached by default
+ * for performance reasons. They must therefore be flushed
+ * appropriately on context switch. The rules are:
+ *
+ * 1. The region containing live data in the old stack (generally the
+ *    bytes between the current stack pointer and the top of the stack
+ *    memory) must be flushed to underlying storage so a new CPU that
+ *    runs the same thread sees the correct data. This must happen
+ *    before the assignment of the switch_handle field in the thread
+ *    struct, which signals the completion of the context switch.
+ *
+ * 2. Any data areas to be read from the new stack (generally the same
+ *    as the live region when it was saved) should be invalidated (and
+ *    NOT flushed!) in the data cache. This is because another CPU
+ *    may have run or re-initialized the thread since this CPU
+ *    suspended it, and any data present in cache will be stale.
+ *
+ * @note The kernel will call this function during interrupt exit when
+ * a new thread has been chosen to run, and also immediately before
+ * entering arch_switch() to effect a code-driven context switch. In
+ * the latter case, it is very likely that more data will be written
+ * to the old_thread stack region after this function returns but
+ * before the completion of the switch. Simply flushing naively here
+ * is not sufficient on many architectures and coordination with the
+ * arch_switch() implementation is likely required.
+ *
+ * @param old_thread The old thread to be flushed before being allowed
+ *                   to run on other CPUs.
+ * @param old_switch_handle The switch handle to be stored into
+ *                          old_thread (it will not be valid until the
+ *                          cache is flushed, so it is not present yet).
+ *                          This will be NULL if inside z_swap()
+ *                          (because arch_switch() has not saved it
+ *                          yet).
+ * @param new_thread The new thread to be invalidated before it runs locally.
+ */
+#ifndef CONFIG_KERNEL_COHERENCE
+static inline void arch_cohere_stacks(struct k_thread *old_thread,
+                                      void *old_switch_handle,
+                                      struct k_thread *new_thread)
+{
+        ARG_UNUSED(old_thread);
+        ARG_UNUSED(old_switch_handle);
+        ARG_UNUSED(new_thread);
+}
+#endif
+
 /** @} */
 
 /**
diff --git a/include/sys/thread_stack.h b/include/sys/thread_stack.h
index 8c14fb993b0..87e7c74c54f 100644
--- a/include/sys/thread_stack.h
+++ b/include/sys/thread_stack.h
@@ -319,7 +319,7 @@ static inline char *Z_KERNEL_STACK_BUFFER(k_thread_stack_t *sym)
  * @param size Size of the stack memory region
  */
 #define K_THREAD_STACK_DEFINE(sym, size) \
-        struct z_thread_stack_element Z_GENERIC_SECTION(.user_stacks) \
+        struct z_thread_stack_element __stackmem \
                 __aligned(Z_THREAD_STACK_OBJ_ALIGN(size)) \
                 sym[Z_THREAD_STACK_SIZE_ADJUST(size)]
 
@@ -354,7 +354,7 @@ static inline char *Z_KERNEL_STACK_BUFFER(k_thread_stack_t *sym)
  * @param size Size of the stack memory region
  */
 #define K_THREAD_STACK_ARRAY_DEFINE(sym, nmemb, size) \
-        struct z_thread_stack_element Z_GENERIC_SECTION(.user_stacks) \
+        struct z_thread_stack_element __stackmem \
                 __aligned(Z_THREAD_STACK_OBJ_ALIGN(size)) \
                 sym[nmemb][K_THREAD_STACK_LEN(size)]
 
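
To make the two rules in the arch_cohere_stacks() documentation concrete, here is a rough sketch of what a port might do. It assumes the switch handle is the saved stack pointer (as on Xtensa), relies on CONFIG_THREAD_STACK_INFO (which KERNEL_COHERENCE selects) for the stack bounds, and uses hypothetical cache_flush_range()/cache_invalidate_range() helpers standing in for the port's real primitives; an actual implementation must also coordinate with arch_switch() for data written after this function returns.

#include <kernel.h>

/* Stand-ins for the port's actual data cache maintenance primitives */
extern void cache_flush_range(void *addr, size_t size);
extern void cache_invalidate_range(void *addr, size_t size);

void arch_cohere_stacks(struct k_thread *old_thread,
                        void *old_switch_handle,
                        struct k_thread *new_thread)
{
        uintptr_t ostack = old_thread->stack_info.start;
        uintptr_t oend = ostack + old_thread->stack_info.size;
        uintptr_t osp = (uintptr_t)old_switch_handle; /* NULL inside z_swap() */

        uintptr_t nstack = new_thread->stack_info.start;
        size_t nsz = new_thread->stack_info.size;

        /* Rule 1: write back the old thread's live stack region (stack
         * pointer up to the top of the stack object) before its
         * switch_handle becomes visible to other CPUs.
         */
        if (osp != 0) {
                cache_flush_range((void *)osp, oend - osp);
        } else {
                cache_flush_range((void *)ostack, oend - ostack);
        }

        /* Rule 2: drop (do NOT flush) cached lines covering the incoming
         * thread's stack; anything cached here is by definition stale.
         */
        cache_invalidate_range((void *)nstack, nsz);
}
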
diff --git a/kernel/Kconfig b/kernel/Kconfig
index 9c940c1c65d..c45bdaa8438 100644
--- a/kernel/Kconfig
+++ b/kernel/Kconfig
@@ -806,6 +806,23 @@ config TRACE_SCHED_IPI
         depends on SCHED_IPI_SUPPORTED
         depends on MP_NUM_CPUS>1
 
+config KERNEL_COHERENCE
+        bool "Place all shared data into coherent memory"
+        default y if ARCH_HAS_COHERENCE && SMP && MP_NUM_CPUS > 1
+        select THREAD_STACK_INFO
+        help
+          When available and selected, the kernel will build in a mode
+          where all shared data is placed in multiprocessor-coherent
+          (generally "uncached") memory. Thread stacks will remain
+          cached, as will application memory declared with
+          __incoherent. This is intended for Zephyr SMP kernels
+          running on cache-incoherent architectures only. Note that
+          when this is selected, there is an implicit API change that
+          assumes cache coherence for any memory passed to the kernel.
+          Code that creates kernel data structures in cached regions
+          may fail strangely. Some assertions exist to catch these
+          mistakes, but not all circumstances can be tested.
+
 endmenu
 
 config TICKLESS_IDLE
diff --git a/kernel/include/kswap.h b/kernel/include/kswap.h
index 9b6cab32177..76c634b25f2 100644
--- a/kernel/include/kswap.h
+++ b/kernel/include/kswap.h
@@ -100,7 +100,6 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
 
 #ifdef CONFIG_SMP
         _current_cpu->swap_ok = 0;
-
         new_thread->base.cpu = arch_curr_cpu()->id;
 
         if (!is_spinlock) {
@@ -108,8 +107,10 @@ static ALWAYS_INLINE unsigned int do_swap(unsigned int key,
         }
 #endif
         sys_trace_thread_switched_out();
-        _current_cpu->current = new_thread;
         wait_for_switch(new_thread);
+        arch_cohere_stacks(old_thread, NULL, new_thread);
+        _current_cpu->current = new_thread;
+
         arch_switch(new_thread->switch_handle,
                     &old_thread->switch_handle);
 
diff --git a/kernel/init.c b/kernel/init.c
index 85157815e4c..27c52aae120 100644
--- a/kernel/init.c
+++ b/kernel/init.c
@@ -247,6 +247,10 @@ static void bg_thread_main(void *unused1, void *unused2, void *unused3)
 
         z_init_static_threads();
 
+#ifdef CONFIG_KERNEL_COHERENCE
+        __ASSERT_NO_MSG(arch_mem_coherent(&_kernel));
+#endif
+
 #ifdef CONFIG_SMP
         z_smp_init();
         z_sys_init_run_level(_SYS_INIT_LEVEL_SMP);
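
The boot-time assertion above relies on the architecture supplying arch_mem_coherent(). A minimal sketch for a hypothetical SoC whose coherent (uncached) RAM alias is a single fixed window; the addresses are made up for illustration, and a real port would define this in its architecture headers:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical memory map: RAM is aliased uncached at this window.
 * Anything inside the window is coherent; everything else is cached.
 */
#define UNCACHED_BASE 0x80000000UL
#define UNCACHED_SIZE 0x00400000UL

static inline bool arch_mem_coherent(void *ptr)
{
        uintptr_t addr = (uintptr_t)ptr;

        return (addr >= UNCACHED_BASE) &&
               (addr < (UNCACHED_BASE + UNCACHED_SIZE));
}
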
diff --git a/kernel/sched.c b/kernel/sched.c
index a46432c99ed..9dfb7cae394 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -413,6 +413,10 @@ static void update_cache(int preempt_ok)
 
 static void ready_thread(struct k_thread *thread)
 {
+#ifdef CONFIG_KERNEL_COHERENCE
+        __ASSERT_NO_MSG(arch_mem_coherent(thread));
+#endif
+
         if (z_is_thread_ready(thread)) {
                 sys_trace_thread_ready(thread);
                 _priq_run_add(&_kernel.ready_q.runq, thread);
@@ -662,6 +666,10 @@ static void add_thread_timeout(struct k_thread *thread, k_timeout_t timeout)
 static void pend(struct k_thread *thread, _wait_q_t *wait_q,
                  k_timeout_t timeout)
 {
+#ifdef CONFIG_KERNEL_COHERENCE
+        __ASSERT_NO_MSG(arch_mem_coherent(wait_q));
+#endif
+
         LOCKED(&sched_spinlock) {
                 add_to_waitq_locked(thread, wait_q);
         }
@@ -903,22 +911,26 @@ static inline void set_current(struct k_thread *new_thread)
 #ifdef CONFIG_USE_SWITCH
 void *z_get_next_switch_handle(void *interrupted)
 {
-        _current->switch_handle = interrupted;
-
         z_check_stack_sentinel();
 
 #ifdef CONFIG_SMP
         LOCKED(&sched_spinlock) {
-                struct k_thread *thread = next_up();
+                struct k_thread *old_thread = _current, *new_thread;
 
-                if (_current != thread) {
-                        update_metairq_preempt(thread);
+                old_thread->switch_handle = NULL;
+                new_thread = next_up();
+
+                if (old_thread != new_thread) {
+                        update_metairq_preempt(new_thread);
+                        wait_for_switch(new_thread);
+                        arch_cohere_stacks(old_thread, interrupted, new_thread);
 
 #ifdef CONFIG_TIMESLICING
                         z_reset_time_slice();
 #endif
                         _current_cpu->swap_ok = 0;
-                        set_current(thread);
+                        set_current(new_thread);
+
 #ifdef CONFIG_SPIN_VALIDATE
                         /* Changed _current! Update the spinlock
                          * bookkeeping so the validation doesn't get
@@ -928,15 +940,12 @@ void *z_get_next_switch_handle(void *interrupted)
                         z_spin_lock_set_owner(&sched_spinlock);
 #endif
                 }
+                old_thread->switch_handle = interrupted;
         }
 #else
-        struct k_thread *thread = z_get_next_ready_thread();
-        if (_current != thread) {
-                set_current(thread);
-        }
+        _current->switch_handle = interrupted;
+        set_current(z_get_next_ready_thread());
 #endif
-
-        wait_for_switch(_current);
         return _current->switch_handle;
 }
 #endif
diff --git a/kernel/thread.c b/kernel/thread.c
index 43093b2271a..e1ef8ce72a1 100644
--- a/kernel/thread.c
+++ b/kernel/thread.c
@@ -565,6 +565,13 @@ char *z_setup_new_thread(struct k_thread *new_thread,
         z_init_thread_base(&new_thread->base, prio, _THREAD_PRESTART, options);
         stack_ptr = setup_thread_stack(new_thread, stack, stack_size);
 
+#ifdef CONFIG_KERNEL_COHERENCE
+        /* Check that the thread object is safe, but that the stack is
+         * still cached! */
+        __ASSERT_NO_MSG(arch_mem_coherent(new_thread));
+        __ASSERT_NO_MSG(!arch_mem_coherent(stack));
+#endif
+
         arch_new_thread(new_thread, stack, stack_ptr, entry, p1, p2, p3);
 
         /* static threads overwrite it afterwards with real value */
diff --git a/kernel/timeout.c b/kernel/timeout.c
index 4bc031f74c3..fcd13855749 100644
--- a/kernel/timeout.c
+++ b/kernel/timeout.c
@@ -91,6 +91,10 @@ void z_add_timeout(struct _timeout *to, _timeout_func_t fn,
                 return;
         }
 
+#ifdef CONFIG_KERNEL_COHERENCE
+        __ASSERT_NO_MSG(arch_mem_coherent(to));
+#endif
+
 #ifdef CONFIG_LEGACY_TIMEOUT_API
         k_ticks_t ticks = timeout;
 #else
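
The practical consequence of the assertions in pend(), z_setup_new_thread() and z_add_timeout() is that kernel objects must not live in a thread's (cached) stack on a KERNEL_COHERENCE build; they belong in static or otherwise coherent storage. A short illustration with hypothetical names:

#include <kernel.h>

K_THREAD_STACK_DEFINE(worker_stack, 1024);   /* cached, via __stackmem */
static struct k_thread worker_thread;        /* coherent kernel data */
static struct k_sem worker_sem;              /* coherent: safe to pend on */

static void worker(void *a, void *b, void *c)
{
        ARG_UNUSED(a);
        ARG_UNUSED(b);
        ARG_UNUSED(c);

        /* A "struct k_sem tmp;" declared here would sit in the cached stack
         * and trip the arch_mem_coherent(wait_q) assertion in pend() as soon
         * as something pends on it; an on-stack struct k_timer would
         * similarly trip the z_add_timeout() assertion.
         */
        while (1) {
                k_sem_take(&worker_sem, K_FOREVER);
        }
}

void start_worker(void)
{
        k_sem_init(&worker_sem, 0, 1);
        k_thread_create(&worker_thread, worker_stack,
                        K_THREAD_STACK_SIZEOF(worker_stack),
                        worker, NULL, NULL, NULL,
                        7, 0, K_NO_WAIT);
}
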