diff --git a/include/kernel_structs.h b/include/kernel_structs.h
index 84140d7809b..5199f82b92b 100644
--- a/include/kernel_structs.h
+++ b/include/kernel_structs.h
@@ -109,6 +109,10 @@ struct _cpu {
 	/* one assigned idle thread per CPU */
 	struct k_thread *idle_thread;
 
+#ifdef CONFIG_SCHED_CPU_MASK_PIN_ONLY
+	struct _ready_q ready_q;
+#endif
+
 #if (CONFIG_NUM_METAIRQ_PRIORITIES > 0) && (CONFIG_NUM_COOP_PRIORITIES > 0)
 	/* Coop thread preempted by current metairq, or NULL */
 	struct k_thread *metairq_preempted;
@@ -143,7 +147,9 @@ struct z_kernel {
 	 * ready queue: can be big, keep after small fields, since some
 	 * assembly (e.g. ARC) are limited in the encoding of the offset
 	 */
+#ifndef CONFIG_SCHED_CPU_MASK_PIN_ONLY
 	struct _ready_q ready_q;
+#endif
 
 #ifdef CONFIG_FPU_SHARING
 	/*
diff --git a/kernel/Kconfig b/kernel/Kconfig
index cb5c7379a9c..9bf3d6d0105 100644
--- a/kernel/Kconfig
+++ b/kernel/Kconfig
@@ -136,6 +136,21 @@ config SCHED_CPU_MASK
 	  CPU. With one CPU, it's just a higher overhead version of
 	  k_thread_start/stop().
 
+config SCHED_CPU_MASK_PIN_ONLY
+	bool "CPU mask variant with single-CPU pinning only"
+	depends on SMP && SCHED_CPU_MASK
+	help
+	  When true, enables a variant of SCHED_CPU_MASK where only
+	  one CPU may be specified for every thread. Effectively, all
+	  threads have a single "assigned" CPU and they will never be
+	  scheduled symmetrically. In general this is not helpful,
+	  but some applications have a carefully designed threading
+	  architecture and want to make their own decisions about how
+	  to assign work to CPUs. In that circumstance, some moderate
+	  optimizations can be made (e.g. having a separate run queue
+	  per CPU, keeping the list length shorter). Most
+	  applications don't want this.
+
 config MAIN_STACK_SIZE
 	int "Size of stack for initialization and main thread"
 	default 2048 if COVERAGE_GCOV
diff --git a/kernel/include/kernel_offsets.h b/kernel/include/kernel_offsets.h
index 8b864fad1f7..e27b84288f0 100644
--- a/kernel/include/kernel_offsets.h
+++ b/kernel/include/kernel_offsets.h
@@ -40,7 +40,9 @@ GEN_OFFSET_SYM(_kernel_t, threads);
 GEN_OFFSET_SYM(_kernel_t, idle);
 #endif
 
+#ifndef CONFIG_SCHED_CPU_MASK_PIN_ONLY
 GEN_OFFSET_SYM(_kernel_t, ready_q);
+#endif
 
 #ifndef CONFIG_SMP
 GEN_OFFSET_SYM(_ready_q_t, cache);
diff --git a/kernel/sched.c b/kernel/sched.c
index 5c51bb73b4f..5eeb192268d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -16,6 +16,7 @@
 #include <kernel_internal.h>
 #include <logging/log.h>
 #include <sys/atomic.h>
+#include <sys/math_extras.h>
 LOG_MODULE_DECLARE(os, CONFIG_KERNEL_LOG_LEVEL);
 
 #if defined(CONFIG_SCHED_DUMB)
@@ -187,19 +188,47 @@ ALWAYS_INLINE void z_priq_dumb_add(sys_dlist_t *pq, struct k_thread *thread)
 	sys_dlist_append(pq, &thread->base.qnode_dlist);
 }
 
-ALWAYS_INLINE void runq_add(struct k_thread *thread)
+static ALWAYS_INLINE void *thread_runq(struct k_thread *thread)
 {
-	_priq_run_add(&_kernel.ready_q.runq, thread);
+#ifdef CONFIG_SCHED_CPU_MASK_PIN_ONLY
+	int cpu, m = thread->base.cpu_mask;
+
+	/* Edge case: it's legal per the API to "make runnable" a
+	 * thread with all CPUs masked off (i.e. one that isn't
+	 * actually runnable!). Sort of a wart in the API and maybe
+	 * we should address this in docs/assertions instead to avoid
+	 * the extra test.
+	 */
+	cpu = m == 0 ? 0 : u32_count_trailing_zeros(m);
+
+	return &_kernel.cpus[cpu].ready_q.runq;
+#else
+	return &_kernel.ready_q.runq;
+#endif
 }
 
-ALWAYS_INLINE void runq_remove(struct k_thread *thread)
+static ALWAYS_INLINE void *curr_cpu_runq(void)
 {
-	_priq_run_remove(&_kernel.ready_q.runq, thread);
+#ifdef CONFIG_SCHED_CPU_MASK_PIN_ONLY
+	return &arch_curr_cpu()->ready_q.runq;
+#else
+	return &_kernel.ready_q.runq;
+#endif
 }
 
-ALWAYS_INLINE struct k_thread *runq_best(void)
+static ALWAYS_INLINE void runq_add(struct k_thread *thread)
 {
-	return _priq_run_best(&_kernel.ready_q.runq);
+	_priq_run_add(thread_runq(thread), thread);
+}
+
+static ALWAYS_INLINE void runq_remove(struct k_thread *thread)
+{
+	_priq_run_remove(thread_runq(thread), thread);
+}
+
+static ALWAYS_INLINE struct k_thread *runq_best(void)
+{
+	return _priq_run_best(curr_cpu_runq());
 }
 
 /* _current is never in the run queue until context switch on
@@ -1110,7 +1139,13 @@ void init_ready_q(struct _ready_q *rq)
 
 void z_sched_init(void)
 {
+#ifdef CONFIG_SCHED_CPU_MASK_PIN_ONLY
+	for (int i = 0; i < CONFIG_MP_NUM_CPUS; i++) {
+		init_ready_q(&_kernel.cpus[i].ready_q);
+	}
+#else
 	init_ready_q(&_kernel.ready_q);
+#endif
 
 #ifdef CONFIG_TIMESLICING
 	k_sched_time_slice_set(CONFIG_TIMESLICE_SIZE,
@@ -1433,6 +1468,14 @@ static int cpu_mask_mod(k_tid_t thread, uint32_t enable_mask, uint32_t disable_m
 			ret = -EINVAL;
 		}
 	}
+
+#if defined(CONFIG_ASSERT) && defined(CONFIG_SCHED_CPU_MASK_PIN_ONLY)
+	int m = thread->base.cpu_mask;
+
+	__ASSERT((m == 0) || ((m & (m - 1)) == 0),
+		 "Only one CPU allowed in mask when PIN_ONLY");
+#endif
+
 	return ret;
 }
 
diff --git a/kernel/thread.c b/kernel/thread.c
index 9a49907e800..0ea5198d6ea 100644
--- a/kernel/thread.c
+++ b/kernel/thread.c
@@ -582,7 +582,11 @@ char *z_setup_new_thread(struct k_thread *new_thread,
 	}
 #endif
 #ifdef CONFIG_SCHED_CPU_MASK
-	new_thread->base.cpu_mask = -1;
+	if (IS_ENABLED(CONFIG_SCHED_CPU_MASK_PIN_ONLY)) {
+		new_thread->base.cpu_mask = 1; /* must specify only one cpu */
+	} else {
+		new_thread->base.cpu_mask = -1; /* allow all cpus */
+	}
 #endif
 #ifdef CONFIG_ARCH_HAS_CUSTOM_SWAP_TO_MAIN
 	/* _current may be null if the dummy thread is not used */
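
For reference, this is roughly how an application would be expected to use the option: build with CONFIG_SMP=y, CONFIG_SCHED_CPU_MASK=y and CONFIG_SCHED_CPU_MASK_PIN_ONLY=y, create threads without starting them, pin each one to a single CPU, then start them. The sketch below is illustrative only and not part of this patch; the names (worker_fn, spawn_pinned_workers, NUM_WORKERS, STACK_SIZE) are made up, and it assumes NUM_WORKERS does not exceed CONFIG_MP_NUM_CPUS. It relies only on the existing k_thread_cpu_mask_*() APIs, which cpu_mask_mod() above only allows on threads that are not yet runnable (hence the K_FOREVER delay).

#include <zephyr.h>

#define STACK_SIZE 1024
#define NUM_WORKERS 2	/* assumed <= CONFIG_MP_NUM_CPUS */

K_THREAD_STACK_ARRAY_DEFINE(worker_stacks, NUM_WORKERS, STACK_SIZE);
static struct k_thread worker_threads[NUM_WORKERS];

static void worker_fn(void *p1, void *p2, void *p3)
{
	/* application-specific per-CPU work loop */
}

void spawn_pinned_workers(void)
{
	for (int i = 0; i < NUM_WORKERS; i++) {
		/* K_FOREVER: create the thread but don't make it runnable
		 * yet, since mask changes are rejected on runnable threads.
		 */
		k_tid_t tid = k_thread_create(&worker_threads[i],
					      worker_stacks[i],
					      K_THREAD_STACK_SIZEOF(worker_stacks[i]),
					      worker_fn, NULL, NULL, NULL,
					      K_PRIO_PREEMPT(1), 0, K_FOREVER);

		/* With PIN_ONLY, at most one bit may be set in the mask:
		 * clear the default (CPU 0) and assign exactly one CPU.
		 */
		k_thread_cpu_mask_clear(tid);
		k_thread_cpu_mask_enable(tid, i);

		k_thread_start(tid);
	}
}

With the option enabled, each such thread is queued on the per-CPU ready queue of its assigned CPU (see thread_runq() above) and is only ever selected by that CPU's scheduler, which is what keeps each run queue's list short.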