diff --git a/samples/index.rst b/samples/index.rst
index 90a6bf7b000..ce1d6f08f97 100644
--- a/samples/index.rst
+++ b/samples/index.rst
@@ -24,6 +24,7 @@ Samples and Demos
    portability/*
    posix/*
    video/*
+   scheduler/*
    smp/*

 .. comment
diff --git a/samples/scheduler/index.rst b/samples/scheduler/index.rst
new file mode 100644
index 00000000000..19d0a740352
--- /dev/null
+++ b/samples/scheduler/index.rst
@@ -0,0 +1,10 @@
+.. _scheduler-samples:
+
+Various Kernel and Scheduler Samples
+####################################
+
+.. toctree::
+   :maxdepth: 1
+   :glob:
+
+   **/*
diff --git a/samples/scheduler/metairq_dispatch/CMakeLists.txt b/samples/scheduler/metairq_dispatch/CMakeLists.txt
new file mode 100644
index 00000000000..ade46f25ee7
--- /dev/null
+++ b/samples/scheduler/metairq_dispatch/CMakeLists.txt
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.13.1)
+
+include($ENV{ZEPHYR_BASE}/cmake/app/boilerplate.cmake NO_POLICY_SCOPE)
+project(metairq_dispatch)
+
+target_sources(app PRIVATE src/main.c src/msgdev.c)
diff --git a/samples/scheduler/metairq_dispatch/README.rst b/samples/scheduler/metairq_dispatch/README.rst
new file mode 100644
index 00000000000..c302474de6e
--- /dev/null
+++ b/samples/scheduler/metairq_dispatch/README.rst
@@ -0,0 +1,63 @@
+MetaIRQ Thread Priority Demonstration
+#####################################
+
+Overview
+********
+
+This sample demonstrates the use of a thread running at a MetaIRQ
+priority level to implement "bottom half" style processing
+synchronously with the end of a hardware ISR. It implements a
+simulated "device" that produces messages that need to be dispatched
+to asynchronous queues feeding several worker threads, each running at
+a different priority. The dispatch is handled by a MetaIRQ thread fed
+via a queue from the device ISR (really just a timer interrupt).
+
+Each message has a random (and non-trivial) amount of processing that
+must happen in the worker thread. This implements a "bursty load"
+environment where occasional spikes in load require preemption of
+running threads and delay scheduling of lower priority threads.
+Messages are accompanied by a timestamp that allows per-message
+latencies to be computed at several points:
+
+* The cycle time between message creation in the ISR and receipt by
+  the MetaIRQ thread for dispatch.
+
+* The time between the ISR and receipt by the worker thread.
+
+* The real time spent processing the message in the worker thread, for
+  comparison with the required processing time. This provides a way
+  to measure preemption overhead where the thread is not scheduled.
+
+Aspects to note in the results:
+
+* On average, higher priority (lower numbered) threads have better
+  latencies and lower processing delays, as expected.
+
+* Cooperatively scheduled threads have significantly better processing
+  delay behavior than preemptible ones, as they can only be preempted
+  by the MetaIRQ thread.
+
+* Because of queueing and the bursty load, all worker threads of any
+  priority will experience some load-dependent delays, as the CPU
+  occasionally has more work to do than time available.
+
+* But, no matter the system load or thread configuration, the MetaIRQ
+  thread always runs immediately after the ISR. It shows reliable,
+  constant latency under all circumstances because it can preempt all
+  other threads, including cooperative ones that cannot normally be
+  preempted.
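+
+For reference, the dispatcher itself needs very little setup. The
+sketch below (using the same kernel APIs as this sample's
+``src/main.c``) shows the essential part: with
+``CONFIG_NUM_METAIRQ_PRIORITIES=1``, the highest thread priority
+becomes a MetaIRQ priority, and a thread defined there acts as the
+dispatcher:
+
+.. code-block:: c
+
+   /* Runs ahead of all other threads, cooperative or preemptible */
+   static void metairq_fn(void *p1, void *p2, void *p3);
+   K_THREAD_DEFINE(metairq_thread, STACK_SIZE, metairq_fn,
+                   NULL, NULL, NULL, K_HIGHEST_THREAD_PRIO, 0, K_NO_WAIT);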
+
+Requirements
+************
+
+This sample should run well on any Zephyr platform that provides
+preemption of running threads by interrupts, a working timer driver,
+and working log output. For precision reasons, it produces better
+(and more) data on systems with a high timer tick rate (ideally 10+
+kHz).
+
+Note that because the test is fundamentally measuring thread
+preemption behavior, it does not run without modification on
+native_posix platforms. In that emulation environment, threads will
+not be preempted except at specific instrumentation points (e.g. in
+k_busy_wait()) where they will voluntarily release the CPU.
diff --git a/samples/scheduler/metairq_dispatch/prj.conf b/samples/scheduler/metairq_dispatch/prj.conf
new file mode 100644
index 00000000000..55577692f94
--- /dev/null
+++ b/samples/scheduler/metairq_dispatch/prj.conf
@@ -0,0 +1,9 @@
+CONFIG_ASSERT=y
+CONFIG_NUM_METAIRQ_PRIORITIES=1
+CONFIG_TEST_RANDOM_GENERATOR=y
+CONFIG_LOG=y
+CONFIG_LOG_MINIMAL=y
+
+# We're testing delivery latency to threads on a single CPU; MP
+# dispatch would mess up the statistics.
+CONFIG_MP_NUM_CPUS=1
diff --git a/samples/scheduler/metairq_dispatch/sample.yaml b/samples/scheduler/metairq_dispatch/sample.yaml
new file mode 100644
index 00000000000..84f50dcfdc1
--- /dev/null
+++ b/samples/scheduler/metairq_dispatch/sample.yaml
@@ -0,0 +1,19 @@
+sample:
+  description: Sample demonstrating dispatch of hardware events from a
+    MetaIRQ thread
+  name: MetaIRQ Dispatch
+common:
+  harness: console
+  harness_config:
+    type: one_line
+    regex:
+      - "MetaIRQ Test Complete"
+
+# Note that native_posix architectures are filtered out: they require
+# instrumentation (e.g. a k_busy_wait()) inside the worker threads'
+# "busy" loops in order for the interrupts to fire on time, and the
+# sample is designed to demonstrate completely arbitrary CPU work.
+tests:
+  sample.metairq_dispatch:
+    tags: introduction
+    filter: not CONFIG_ARCH_POSIX
diff --git a/samples/scheduler/metairq_dispatch/src/main.c b/samples/scheduler/metairq_dispatch/src/main.c
new file mode 100644
index 00000000000..32c47844e7d
--- /dev/null
+++ b/samples/scheduler/metairq_dispatch/src/main.c
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) 2020 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <zephyr.h>
+#include "main.h"
+
+#include <logging/log.h>
+LOG_MODULE_REGISTER(main, LOG_LEVEL_INF);
+
+#define STACK_SIZE 2048
+
+/* How many messages can be queued for a single thread */
+#define QUEUE_DEPTH 16
+
+/* Array of worker threads, and their stacks */
+static struct thread_rec {
+	struct k_thread thread;
+	struct k_msgq msgq;
+	struct msg msgq_buf[QUEUE_DEPTH];
+} threads[NUM_THREADS];
+
+K_THREAD_STACK_ARRAY_DEFINE(thread_stacks, NUM_THREADS, STACK_SIZE);
+
+/* The static metairq thread we'll use for dispatch */
+static void metairq_fn(void *p1, void *p2, void *p3);
+K_THREAD_DEFINE(metairq_thread, STACK_SIZE, metairq_fn,
+		NULL, NULL, NULL, K_HIGHEST_THREAD_PRIO, 0, K_NO_WAIT);
+
+/* Accumulated list of latencies, for a naive variance computation at
+ * the end.
+ */
+struct {
+	atomic_t num_mirq;
+	u32_t mirq_latencies[MAX_EVENTS];
+	struct {
+		u32_t nevt;
+		u32_t latencies[MAX_EVENTS * 2 / NUM_THREADS];
+	} threads[NUM_THREADS];
+} stats;
+
+/* A semaphore with an initial count, used to allow only one thread to
+ * log the final report.
+ */
+K_SEM_DEFINE(report_cookie, 1, 1);
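+
+/* Message flow through the sample, for reference (all pieces are
+ * defined below or in msgdev.c):
+ *
+ *   timer "ISR" (msgdev.c) -> hw_msgs queue -> metairq_fn()
+ *     -> threads[target].msgq -> thread_fn() worker
+ *
+ * The MetaIRQ thread does nothing but dispatch; all simulated
+ * "processing" happens in the worker threads.
+ */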
+
+static void metairq_fn(void *p1, void *p2, void *p3)
+{
+	ARG_UNUSED(p1);
+	ARG_UNUSED(p2);
+	ARG_UNUSED(p3);
+
+	while (true) {
+		/* Receive a message, immediately check a timestamp
+		 * and compute a latency value, then dispatch it to
+		 * the queue for its target thread
+		 */
+		struct msg m;
+
+		message_dev_fetch(&m);
+		m.metairq_latency = k_cycle_get_32() - m.timestamp;
+
+		int ret = k_msgq_put(&threads[m.target].msgq, &m, K_NO_WAIT);
+
+		if (ret) {
+			LOG_INF("Thread %d queue full, message %d dropped",
+				m.target, m.seq);
+		}
+	}
+}
+
+/* Simple recursive implementation of an integer square root, cribbed
+ * from wikipedia
+ */
+static u32_t isqrt(u64_t n)
+{
+	if (n > 1) {
+		u64_t lo = isqrt(n >> 2) << 1;
+		u64_t hi = lo + 1;
+
+		return (u32_t)(((hi * hi) > n) ? lo : hi);
+	}
+	return (u32_t) n;
+}
+
+static void calc_stats(const u32_t *array, u32_t n,
+		       u32_t *lo, u32_t *hi, u32_t *mean, u32_t *stdev)
+{
+	u64_t tot = 0, totsq = 0;
+
+	*lo = INT_MAX;
+	*hi = 0;
+	for (int i = 0; i < n; i++) {
+		*lo = MIN(*lo, array[i]);
+		*hi = MAX(*hi, array[i]);
+		tot += array[i];
+	}
+
+	*mean = (u32_t)((tot + (n / 2)) / n);
+
+	for (int i = 0; i < n; i++) {
+		s64_t d = (s32_t) (array[i] - *mean);
+
+		totsq += d * d;
+	}
+
+	*stdev = isqrt((totsq + (n / 2)) / n);
+}
+
+static void record_latencies(struct msg *m, u32_t latency)
+{
+	/* Workaround: qemu emulation shows an erroneously high
+	 * metairq latency for the very first event of 7-8us. Maybe
+	 * it needs to fault in our code pages on the host?
+	 */
+	if (IS_ENABLED(CONFIG_QEMU_TARGET) && m->seq == 0) {
+		return;
+	}
+
+	int t = m->target;
+	int lidx = stats.threads[t].nevt++;
+
+	if (lidx < ARRAY_SIZE(stats.threads[t].latencies)) {
+		stats.threads[t].latencies[lidx] = latency;
+	}
+
+	stats.mirq_latencies[atomic_inc(&stats.num_mirq)] = m->metairq_latency;
+
+	/* Once we've logged our final event, print a report. We use
+	 * a semaphore with an initial count of 1 to ensure that only
+	 * one thread gets to do this. Also events can be processed
+	 * out of order, so add a small sleep to let the queues
+	 * finish.
+	 */
+	if (m->seq == MAX_EVENTS - 1) {
+		u32_t hi, lo, mean, stdev, ret;
+
+		ret = k_sem_take(&report_cookie, K_FOREVER);
+		__ASSERT_NO_MSG(ret == 0);
+		k_sleep(100);
+
+		calc_stats(stats.mirq_latencies, stats.num_mirq,
+			   &lo, &hi, &mean, &stdev);
+
+		LOG_INF(" ---------- Latency (cyc) ----------");
+		LOG_INF("            Best    Worst     Mean    Stdev");
+		LOG_INF("MetaIRQ %8d %8d %8d %8d", lo, hi, mean, stdev);
+
+		for (int i = 0; i < NUM_THREADS; i++) {
+			if (stats.threads[i].nevt == 0) {
+				LOG_WRN("No events for thread %d", i);
+				continue;
+			}
+
+			calc_stats(stats.threads[i].latencies,
+				   stats.threads[i].nevt,
+				   &lo, &hi, &mean, &stdev);
+
+			LOG_INF("Thread%d %8d %8d %8d %8d",
+				i, lo, hi, mean, stdev);
+		}
+
+		LOG_INF("MetaIRQ Test Complete");
+	}
+}
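+
+/* Worked example of calc_stats(): for samples {10, 12, 20} it yields
+ * lo=10, hi=20, and a rounded mean of (42 + 1) / 3 = 14. The
+ * deviations {-4, -2, 6} square-sum to 56, so the reported stdev is
+ * isqrt((56 + 1) / 3) = isqrt(19) = 4.
+ */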
+
+static void thread_fn(void *p1, void *p2, void *p3)
+{
+	ARG_UNUSED(p2);
+	ARG_UNUSED(p3);
+	int id = (long)p1;
+	struct msg m;
+
+	LOG_INF("Starting Thread%d at priority %d", id,
+		k_thread_priority_get(k_current_get()));
+
+	while (true) {
+		int ret = k_msgq_get(&threads[id].msgq, &m, K_FOREVER);
+		u32_t start = k_cycle_get_32();
+
+		__ASSERT_NO_MSG(ret == 0);
+
+		/* Spin on the CPU for the requested number of cycles
+		 * doing the "work" required to "process" the event.
+		 * Note the inner loop: hammering on k_cycle_get_32()
+		 * on some platforms requires locking around the timer
+		 * driver internals and can affect interrupt latency.
+		 * Obviously we may be preempted as new events arrive
+		 * and get queued.
+		 */
+		while (k_cycle_get_32() - start < m.proc_cyc) {
+			for (volatile int i = 0; i < 100; i++) {
+			}
+		}
+
+		u32_t dur = k_cycle_get_32() - start;
+
+#ifdef LOG_EVERY_EVENT
+		/* Log the message, its thread, and the following cycle values:
+		 * 1. Receive it from the driver in the MetaIRQ thread
+		 * 2. Begin processing it out of the queue in the worker thread
+		 * 3. The requested processing time in the message
+		 * 4. The actual time taken to process the message
+		 *    (may be higher if the thread was preempted)
+		 */
+		LOG_INF("M%d T%d mirq %d disp %d proc %d real %d",
+			m.seq, id, m.metairq_latency,
+			start - m.timestamp, m.proc_cyc, dur);
+#endif
+
+		/* Collect the latency values in a big statistics array */
+		record_latencies(&m, start - m.timestamp);
+	}
+}
+
+void main(void)
+{
+	for (long i = 0; i < NUM_THREADS; i++) {
+		/* Each thread gets a different priority. Half should
+		 * be at (negative) cooperative priorities. Lower
+		 * thread numbers get numerically lower (i.e. higher)
+		 * priorities, e.g. thread 0 will be preempted only
+		 * by the metairq.
+		 */
+		int prio = (-NUM_THREADS/2) + i;
+
+		k_msgq_init(&threads[i].msgq, (char *)threads[i].msgq_buf,
+			    sizeof(struct msg), QUEUE_DEPTH);
+
+		k_thread_create(&threads[i].thread,
+				thread_stacks[i], STACK_SIZE,
+				thread_fn, (void *)i, NULL, NULL,
+				prio, 0, K_NO_WAIT);
+	}
+
+	message_dev_init();
+}
diff --git a/samples/scheduler/metairq_dispatch/src/main.h b/samples/scheduler/metairq_dispatch/src/main.h
new file mode 100644
index 00000000000..6239220bd7b
--- /dev/null
+++ b/samples/scheduler/metairq_dispatch/src/main.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2020 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#ifndef SAMPLE_METAIRQ_MAIN_H
+#define SAMPLE_METAIRQ_MAIN_H
+
+/* Define this to enable logging of every event as it is processed to
+ * carefully inspect behavior. Note that at high event rates (see
+ * MAX_EVENT_DELAY_TICKS) the log backend won't be able to keep up and
+ * that the log processing will add CPU load that is not accounted for
+ * in AVERAGE_LOAD_TARGET_PCT. Disable for more accurate statistics
+ * measurement. Note that if you don't want the log subsystem
+ * throttling to drop messages, you'll probably want to increase
+ * MAX_EVENT_DELAY_TICKS too, to slow down the reporting.
+ */
+#define LOG_EVERY_EVENT 1
+
+/* Number of worker threads, each at a separate priority, split evenly
+ * between cooperative and preemptible priorities.
+ */
+#define NUM_THREADS 4
+
+/* The proc_cyc duty cycle parameters are chosen to use approximately
+ * this fraction of one CPU's available cycles. Default 60%
+ */
+#define AVERAGE_LOAD_TARGET_PCT 60
+
+/* "Hardware interrupts" for our fake device will arrive after a
+ * random delay of between zero and this number of ticks. The event
+ * rate should be high enough to collect enough data but low enough
+ * that it is not regular.
+ */
+#define MAX_EVENT_DELAY_TICKS 8
+
+/* How many events will be sent before the test completes? Note that
+ * we keep a naive array of latencies to compute variance, so this
+ * value has memory costs.
+ */
+#define MAX_EVENTS 40
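+
+/* With the defaults above, the NUM_THREADS=4 workers get priorities
+ * {-2, -1, 0, 1} (two cooperative, two preemptible), and MAX_EVENTS=40
+ * sizes the statistics arrays at MAX_EVENTS MetaIRQ samples plus
+ * MAX_EVENTS * 2 / NUM_THREADS = 20 samples per worker thread.
+ */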
+
+/* The "messages" dispatched by our MetaIRQ thread */
+struct msg {
+	/* Sequence number */
+	u32_t seq;
+
+	/* Cycle time when the message was "received" */
+	u32_t timestamp;
+
+	/* Thread to which the message is targeted */
+	u32_t target;
+
+	/* Cycles of processing the message requires */
+	u32_t proc_cyc;
+
+	/* Cycles of latency before the MetaIRQ thread received the message */
+	u32_t metairq_latency;
+};
+
+/* Initialize the fake "message" device; after this, messages will
+ * begin arriving via message_dev_fetch().
+ */
+void message_dev_init(void);
+
+/* Retrieve the next message from the "device"; blocks until
+ * delivery
+ */
+void message_dev_fetch(struct msg *m);
+
+#endif
diff --git a/samples/scheduler/metairq_dispatch/src/msgdev.c b/samples/scheduler/metairq_dispatch/src/msgdev.c
new file mode 100644
index 00000000000..6236686772b
--- /dev/null
+++ b/samples/scheduler/metairq_dispatch/src/msgdev.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <zephyr.h>
+#include "main.h"
+
+/* This file implements a fake device that creates and enqueues
+ * "struct msg" messages for handling by the rest of the test. It's
+ * based on Zephyr kernel timeouts only.
+ */
+
+/* Note: we use the internal timeout API to get tick precision;
+ * k_timer limits us to milliseconds.
+ */
+static struct _timeout timeout;
+
+/* The "proc_cyc" parameter in the message, indicating how many cycles
+ * the target thread should delay while "processing" the message, will
+ * be a random number between zero and this value.
+ */
+u32_t max_duty_cyc;
+
+u32_t msg_seq;
+
+K_MSGQ_DEFINE(hw_msgs, sizeof(struct msg), 2, sizeof(u32_t));
+
+static void timeout_reset(void);
+
+/* Use a custom RNG for good statistics; sys_rand32_get() is just a
+ * timer counter on some platforms. Note that this is used only
+ * inside the ISR and needs no locking for the otherwise non-atomic
+ * state.
+ */
+static u32_t rand32(void)
+{
+	static u64_t state;
+
+	if (!state) {
+		state = ((u64_t)k_cycle_get_32()) << 16;
+	}
+
+	/* MMIX LCRNG parameters */
+	state = state * 6364136223846793005ULL + 1442695040888963407ULL;
+	return (u32_t)(state >> 32);
+}
+
+/* This acts as the "ISR" for our fake device. It "reads from the
+ * hardware" a single timestamped message which needs to be dispatched
+ * (by the MetaIRQ) to a random thread, with a random argument
+ * indicating how long the thread should "process" the message.
+ */
+static void dev_timer_expired(struct _timeout *t)
+{
+	__ASSERT_NO_MSG(t == &timeout);
+	u32_t timestamp = k_cycle_get_32();
+	struct msg m;
+
+	m.seq = msg_seq++;
+	m.timestamp = timestamp;
+	m.target = rand32() % NUM_THREADS;
+	m.proc_cyc = rand32() % max_duty_cyc;
+
+	int ret = k_msgq_put(&hw_msgs, &m, K_NO_WAIT);
+
+	if (ret != 0) {
+		printk("ERROR: Queue full, event dropped!\n");
+	}
+
+	if (m.seq < MAX_EVENTS) {
+		timeout_reset();
+	}
+}
+
+static void timeout_reset(void)
+{
+	u32_t ticks = rand32() % MAX_EVENT_DELAY_TICKS;
+
+	z_add_timeout(&timeout, dev_timer_expired, ticks);
+}
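+
+/* Note on rand32() above: with a power-of-two modulus, the low bits
+ * of an LCG have very short periods, which is why the generator
+ * returns the high 32 bits of the 64-bit state. The multiplier and
+ * increment are Knuth's MMIX LCG constants.
+ */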
+
+void message_dev_init(void)
+{
+	/* Compute a bound for the proc_cyc message parameter such
+	 * that on average we request a known percent of available
+	 * CPU. We want the load to sometimes back up and require
+	 * queueing, but to be achievable over time.
+	 */
+	u64_t cyc_per_tick = k_ticks_to_cyc_near64(1);
+	u64_t avg_ticks_per_event = MAX_EVENT_DELAY_TICKS / 2;
+	u64_t avg_cyc_per_event = cyc_per_tick * avg_ticks_per_event;
+
+	max_duty_cyc = (2 * avg_cyc_per_event * AVERAGE_LOAD_TARGET_PCT) / 100;
+
+	z_add_timeout(&timeout, dev_timer_expired, K_NO_WAIT);
+}
+
+void message_dev_fetch(struct msg *m)
+{
+	int ret = k_msgq_get(&hw_msgs, m, K_FOREVER);
+
+	__ASSERT_NO_MSG(ret == 0);
+}
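+
+/* Sanity check on the arithmetic in message_dev_init() above: the
+ * code assumes an average inter-event delay of
+ * MAX_EVENT_DELAY_TICKS / 2 = 4 ticks, and proc_cyc is uniform on
+ * [0, max_duty_cyc), so its mean is max_duty_cyc / 2 =
+ * avg_cyc_per_event * AVERAGE_LOAD_TARGET_PCT / 100. Requested work
+ * per event divided by available cycles per event is therefore
+ * AVERAGE_LOAD_TARGET_PCT / 100, i.e. the intended 60% average load.
+ */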