diff --git a/boards/xtensa/intel_adsp_cavs15/tools/adsplog.py b/boards/xtensa/intel_adsp_cavs15/tools/adsplog.py
index 8df555cf8e5..d099aa0dc48 100755
--- a/boards/xtensa/intel_adsp_cavs15/tools/adsplog.py
+++ b/boards/xtensa/intel_adsp_cavs15/tools/adsplog.py
@@ -3,83 +3,212 @@
 # Copyright (c) 2020 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+import os
 import sys
 import time
-import struct
 import subprocess
 import mmap
 
+# Log reader for the trace output buffer on a ADSP device.
+#
+# The trace buffer is inside a shared memory region exposed by the
+# audio PCI device as a BAR at index 4.  The hardware provides 4 128k
+# "windows" starting at 512kb in the BAR which the DSP firmware can
+# map to 4k-aligned locations within its own address space.  By
+# protocol convention log output is an 8k region at window index 3.
+#
+# The 8k window is treated as an array of 64-byte "slots", each of
+# which is prefixed by a magic number, which should be 0x55aa for log
+# data, followed a 16 bit "ID" number, followed by a null-terminated
+# string in the final 60 bytes (or 60 non-null bytes of log data).
+# The DSP firmware will write sequential IDs into the buffer starting
+# from an ID of zero in the first slot, and wrapping at the end.
+
 MAP_SIZE = 8192
 SLOT_SIZE = 64
-
-# Location of the log output window within the DSP BAR on the PCI
-# device.  The hardware provides 4x 128k "windows" starting at 512kb
-# in the BAR which the DSP software can map to 4k-aligned locations
-# within its own address space.  By convention log output is an 8k
-# region at window index 3.
+NUM_SLOTS = int(MAP_SIZE / SLOT_SIZE)
+SLOT_MAGIC = 0x55aa
 WIN_OFFSET = 0x80000
-WIN_ID = 3
+WIN_IDX = 3
 WIN_SIZE = 0x20000
-LOG_OFFSET = WIN_OFFSET + WIN_ID * WIN_SIZE
+LOG_OFFSET = WIN_OFFSET + WIN_IDX * WIN_SIZE
 
-# Find me a way to do this detection as cleanly in python as shell, I
-# dare you.
-barfile = subprocess.Popen(["sh", "-c",
-                            "echo -n "
-                            "$(dirname "
-                            "  $(fgrep PCI_ID=8086:5A98 "
-                            "    /sys/bus/pci/devices/*/uevent))"
-                            "/resource4"],
-                           stdout=subprocess.PIPE).stdout.read()
-fd = open(barfile)
-mem = mmap.mmap(fd.fileno(), MAP_SIZE, offset=LOG_OFFSET,
-                prot=mmap.PROT_READ)
+# List of known ADSP devices by their PCI IDs
+DEVICES = ["8086:5a98"]
 
-# The mapping is an array of 64-byte "slots", each of which is
-# prefixed by a magic number, which should be 0x55aa for log data,
-# followed a 16 bit "ID" number, followed by a null-terminated string
-# in the final 60 bytes.  The DSP firmware will write sequential IDs
-# into the buffer starting from an ID of zero in the first slot, and
-# wrapping at the end.  So the algorithm here is to find the smallest
-# valid slot, print its data, and then enter a polling loop waiting
-# for the next slot to be valid and have the correct next ID before
-# printing that too.
+mem = None
 
-# NOTE: unfortunately there's no easy way to detect a warm reset of
-# the device, it will just jump back to the beginning and start
-# writing there, where we aren't looking.  Really that level of
-# robustness needs to be handled in the kernel.
+for dev in DEVICES:
+    # Find me a way to do this detection as cleanly in python as shell, I
+    # dare you.
+    barfile = subprocess.Popen(["sh", "-c",
+                                "echo -n "
+                                "$(dirname "
+                                f"  $(fgrep PCI_ID={dev.upper()} "
+                                "    /sys/bus/pci/devices/*/uevent))"
+                                "/resource4"],
+                               stdout=subprocess.PIPE).stdout.read()
+    if not os.path.exists(barfile):
+        continue
 
-next_slot = 0
-next_id = 0xffff
+    if not os.access(barfile, os.R_OK):
+        sys.stderr.write(f"ERROR: Cannot open {barfile} for reading.")
+        sys.exit(1)
 
-for slot in range(int(MAP_SIZE / SLOT_SIZE)):
+    fd = open(barfile)
+    mem = mmap.mmap(fd.fileno(), MAP_SIZE, offset=LOG_OFFSET,
+                    prot=mmap.PROT_READ)
+
+if mem is None:
+    sys.stderr.write("ERROR: No ADSP device found.")
+    sys.exit(1)
+
+# Returns a tuple of (id, msg) if the slot is valid, or (-1, "") if
+# the slot does not contain firmware trace data
+def read_slot(slot, mem):
     off = slot * SLOT_SIZE
-    (magic, sid) = struct.unpack("HH", mem[off:off+4])
-    if magic == 0x55aa:
-        if sid < next_id:
-            next_slot = slot
-            next_id = sid
 
-while True:
-    off = next_slot * SLOT_SIZE
-    (magic, sid) = struct.unpack("HH", mem[off:off+4])
-    if magic == 0x55aa and sid == next_id:
-        # This dance because indexing large variable-length slices of
-        # the mmap() array seems to produce garbage....
-        msgbytes = []
-        for i in range(4, SLOT_SIZE):
-            b = mem[off+i]
-            if b == 0:
-                break
-            msgbytes.append(b)
-        msg = bytearray(len(msgbytes))
-        for i, elem in enumerate(msgbytes):
-            msg[i] = elem
+    magic = (mem[off + 1] << 8) | mem[off]
+    sid = (mem[off + 3] << 8) | mem[off + 2]
 
-        sys.stdout.write(msg.decode(encoding="utf-8", errors="ignore"))
-        next_slot = int((next_slot + 1) % (MAP_SIZE / SLOT_SIZE))
-        next_id += 1
-    else:
+    if magic != SLOT_MAGIC:
+        return (-1, "")
+
+    # This dance because indexing large variable-length slices of
+    # the mmap() array seems to produce garbage....
+    msgbytes = []
+    for i in range(4, SLOT_SIZE):
+        b = mem[off+i]
+        if b == 0:
+            break
+        msgbytes.append(b)
+    msg = bytearray(len(msgbytes))
+    for i, elem in enumerate(msgbytes):
+        msg[i] = elem
+
+    return (sid, msg.decode(encoding="utf-8", errors="ignore"))
+
+def read_hist(start_slot):
+    id0, msg = read_slot(start_slot, mem)
+
+    # An invalid slot zero means no data has ever been placed in the
+    # trace buffer, which is likely a system reset condition.  Back
+    # off for one second, because continuing to read the buffer has
+    # been observed to hang the flash process (which I think can only
+    # be a hardware bug).
+    if start_slot == 0 and id0 < 0:
+        sys.stdout.write("===\n=== [ADSP Device Reset]\n===\n")
         sys.stdout.flush()
-        time.sleep(0.25)
+        time.sleep(1)
+        return (0, 0, "")
+
+    # Start at zero and read forward to get the last data in the
+    # buffer.  We are always guaranteed that slot zero will contain
+    # valid data if any slot contains valid data.
+    last_id = id0
+    final_slot = start_slot
+    for i in range(start_slot + 1, NUM_SLOTS):
+        id, s = read_slot(i, mem)
+        if id != ((last_id + 1) & 0xffff):
+            break
+        msg += s
+        final_slot = i
+        last_id = id
+
+    final_id = last_id
+
+    # Now read backwards from the end to get the prefix blocks from
+    # the last wraparound
+    last_id = id0
+    for i in range(NUM_SLOTS - 1, final_slot, -1):
+        id, s = read_slot(i, mem)
+        if id < 0:
+            break
+
+        # Race protection: the other side might have clobbered the
+        # data after we read the ID, make sure it hasn't changed.
+        id_check = read_slot(i, mem)[0]
+        if id_check != id:
+            break
+
+        if ((id + 1) & 0xffff) == last_id:
+            msg = s + msg
+        last_id = id
+
+    # If we were unable to read forward from slot zero, but could read
+    # backward, then this is a wrapped buffer being currently updated
+    # into slot zero.  See comment below.
+    if final_slot == start_slot and last_id != id0:
+        return None
+
+    return ((final_slot + 1) % NUM_SLOTS, (final_id + 1) & 0xffff, msg)
+
+# Returns a tuple containing the next slot to expect data in, the ID
+# that slot should hold, and the full string history of trace data
+# from the buffer.  Start with slot zero (which is always part of the
+# current string if there is any data at all) and scan forward and
+# back to find the maximum extent.
+def trace_history():
+    # This loop is a race protection for the situation where the
+    # buffer has wrapped and new data is currently being placed into
+    # slot zero.  In those circumstances, slot zero will have a valid
+    # magic number but its sequence ID will not correlate with the
+    # previous and next slots.
+    ret = None
+    while ret is None:
+        ret = read_hist(0)
+        if ret is None:
+            ret = read_hist(1)
+    return ret
+
+
+# Loop, reading the next slot if it has new data.  Otherwise check the
+# full buffer and see if history is discontiguous (i.e. what is in the
+# buffer should be a proper suffix of what we have stored).  If it
+# doesn't match, then just print it (it's a reboot or a ring buffer
+# overrun).  If nothing has changed, then sleep a bit and keep
+# polling.
+def main():
+    next_slot, next_id, last_hist = trace_history()
+    sys.stdout.write(last_hist)
+    while True:
+        id, smsg = read_slot(next_slot, mem)
+
+        if id == next_id:
+            next_slot = int((next_slot + 1) % NUM_SLOTS)
+            next_id = (id + 1) & 0xffff
+            last_hist += smsg
+            sys.stdout.write(smsg)
+        else:
+            slot2, id2, msg2 = trace_history()
+
+            # Device reset:
+            if slot2 == 0 and id2 == 0 and msg2 == "":
+                next_id = 1
+                next_slot = slot2
+                last_hist = ""
+
+            if not last_hist.endswith(msg2):
+                # On a mismatch, go back and check one last time to
+                # address the race where a new slot wasn't present
+                # just JUST THEN but is NOW.
+                id3, s3 = read_slot(next_slot, mem)
+                if id3 == next_id:
+                    next_slot = int((next_slot + 1) % NUM_SLOTS)
+                    next_id = (next_id + 1) & 0xffff
+                    last_hist += s3
+                    sys.stdout.write(s3)
+                    continue
+
+                # Otherwise it represents discontiguous data, either a
+                # reset of an overrun, just dump what we have and
+                # start over.
+                next_slot = slot2
+                last_hist = msg2
+                sys.stdout.write(msg2)
+            else:
+                sys.stdout.flush()
+                time.sleep(0.10)
+
+if __name__ == "__main__":
+    main()
diff --git a/soc/xtensa/intel_adsp/common/trace_out.c b/soc/xtensa/intel_adsp/common/trace_out.c
index 3760268ce0b..4edc18cacb6 100644
--- a/soc/xtensa/intel_adsp/common/trace_out.c
+++ b/soc/xtensa/intel_adsp/common/trace_out.c
@@ -56,7 +56,7 @@ static __aligned(64) union {
 	uint32_t cache_pad[16];
 } data_rec;
 
-#define data ((struct metadata *)UNCACHED_PTR(&data_rec.meta))
+#define data ((volatile struct metadata *)UNCACHED_PTR(&data_rec.meta))
 
 static inline struct slot *slot(int i)
 {
@@ -65,9 +65,18 @@ static inline struct slot *slot(int i)
 	return &slots[i];
 }
 
+static int slot_incr(int s)
+{
+	return (s + 1) % NSLOTS;
+}
+
 void intel_adsp_trace_out(int8_t *str, size_t len)
 {
-	k_spinlock_key_t key = k_spin_lock(&data->lock);
+	if (len == 0) {
+		return;
+	}
+
+	k_spinlock_key_t key = k_spin_lock((void *)&data->lock);
 
 	if (!data->initialized) {
 		slot(0)->hdr.magic = 0;
@@ -77,8 +86,8 @@ void intel_adsp_trace_out(int8_t *str, size_t len)
 	}
 
 	/* We work with a local copy of the global data for
-	 * performance reasons (*data is uncached!) and put it back at
-	 * the end.
+	 * performance reasons (The memory behind the "data" pointer
+	 * is uncached and volatile!) and put it back at the end.
 	 */
 	uint32_t curr_slot = data->curr_slot;
 	uint32_t n_bytes = data->n_bytes;
@@ -89,22 +98,40 @@ void intel_adsp_trace_out(int8_t *str, size_t len)
 
 		s->msg[n_bytes++] = c;
 
+		/* Are we done with this slot?  Terminate it and flag
+		 * it for consumption on the other side
+		 */
 		if (c == '\n' || n_bytes >= MSGSZ) {
-			curr_slot = (curr_slot + 1) % NSLOTS;
-			n_bytes = 0;
-			slot(curr_slot)->hdr.magic = 0;
-			slot(curr_slot)->hdr.id = s->hdr.id + 1;
-			s->hdr.magic = SLOT_MAGIC;
-		}
-	}
+			if (n_bytes < MSGSZ) {
+				s->msg[n_bytes] = 0;
+			}
 
-	if (n_bytes < MSGSZ) {
-		slot(curr_slot)->msg[n_bytes] = 0;
+			/* Make sure the next slot has a magic number
+			 * (so the reader can distinguish between
+			 * no-new-data and system-reset), but does NOT
+			 * have the correct successor ID (so can never
+			 * be picked up as valid data).  We'll
+			 * increment it later when we terminate that
+			 * slot.
+			 */
+			int next_slot = slot_incr(curr_slot);
+			uint16_t new_id = s->hdr.id + 1;
+
+			slot(next_slot)->hdr.id = new_id;
+			slot(next_slot)->hdr.magic = SLOT_MAGIC;
+			slot(next_slot)->msg[0] = 0;
+
+			s->hdr.id = new_id;
+			s->hdr.magic = SLOT_MAGIC;
+
+			curr_slot = next_slot;
+			n_bytes = 0;
+		}
 	}
 
 	data->curr_slot = curr_slot;
 	data->n_bytes = n_bytes;
-	k_spin_unlock(&data->lock, key);
+	k_spin_unlock((void *)&data->lock, key);
 }
 
 int arch_printk_char_out(int c)