From a3848cb8615174f5c2e4f9a6272890a58437e22f Mon Sep 17 00:00:00 2001
From: Andy Ross <andrew.j.ross@intel.com>
Date: Fri, 1 May 2020 09:21:09 -0700
Subject: [PATCH] soc/xtensa/intel_adsp: Add arch_printk_char_out hook

Add a printk default hook that works in very early boot and doesn't
depend on the logging subsystem (which can still be used if desired,
of course).  It speaks the same protocol, is somewhat smaller (MUCH
smaller if the app doesn't otherwise need the logging and ring buffer
dependencies), and more efficiently uses the output slot space by
doing line buffering and flushing only when needed.

Most importantly this one is MP-safe via both locking and cache
coherence management, and can work reliably when SMP is enabled.
(Note that "reliable" means that all output appears without corruption
-- simultaneous logging by two CPUs can still interleave bytes, of
course).

Longer term, if we keep this protocol it would be good to unify the
two backends to reduce duplicated code.

Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
---
 .../xtensa/up_squared_adsp/tools/adsplog.py   | 10 +-
 .../cavs_v15/Kconfig.defconfig.series         |  3 -
 soc/xtensa/intel_adsp/common/printk_out.c     | 96 +++++++++++++++++++
 3 files changed, 101 insertions(+), 8 deletions(-)
 create mode 100644 soc/xtensa/intel_adsp/common/printk_out.c

diff --git a/boards/xtensa/up_squared_adsp/tools/adsplog.py b/boards/xtensa/up_squared_adsp/tools/adsplog.py
index 0a2a18280a1..f2084ebf13e 100755
--- a/boards/xtensa/up_squared_adsp/tools/adsplog.py
+++ b/boards/xtensa/up_squared_adsp/tools/adsplog.py
@@ -11,11 +11,11 @@ import mmap
 MAP_SIZE = 8192
 SLOT_SIZE = 64
 
-# Location of the log output window within the mapping of the SRAM
-# (BAR4) on the PCI device.  These numbers are cribbed from existing
-# scripting, I don't know what they really mean or where the spec for
-# these protocols is.  The driver on the DSP just hard codes an
-# address.
+# Location of the log output window within the DSP BAR on the PCI
+# device.  The hardware provides 4x 128 KiB "windows" starting at 512 KiB
+# in the BAR which the DSP software can map to 4 KiB-aligned locations
+# within its own address space.  By convention log output is an 8 KiB
+# region at window index 3.
 WIN_OFFSET = 0x80000
 WIN_ID = 3
 WIN_SIZE = 0x20000
diff --git a/soc/xtensa/intel_adsp/cavs_v15/Kconfig.defconfig.series b/soc/xtensa/intel_adsp/cavs_v15/Kconfig.defconfig.series
index 8cfbb05d887..8d3dfe11832 100644
--- a/soc/xtensa/intel_adsp/cavs_v15/Kconfig.defconfig.series
+++ b/soc/xtensa/intel_adsp/cavs_v15/Kconfig.defconfig.series
@@ -47,9 +47,6 @@ config 2ND_LEVEL_INTERRUPTS
 config DYNAMIC_INTERRUPTS
 	default y
 
-config LOG
-	default y
-
 # To prevent test uses TEST_LOGGING_MINIMAL
 config TEST_LOGGING_DEFAULTS
 	default n
diff --git a/soc/xtensa/intel_adsp/common/printk_out.c b/soc/xtensa/intel_adsp/common/printk_out.c
new file mode 100644
index 00000000000..4ce6e9a6430
--- /dev/null
+++ b/soc/xtensa/intel_adsp/common/printk_out.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+#include <zephyr.h>
+#include <adsp/cache.h>
+#include <soc/shim.h>
+
+/* Simple char-at-a-time output rig to the host kernel from an ADSP
+ * device.  The protocol uses an array of "slots" in shared memory,
+ * each of which has a 16 bit magic number to validate and a
+ * sequential ID number.  The remaining bytes are a (potentially
+ * nul-terminated) string containing output data.
+ *
+ * IMPORTANT NOTE on cache coherence: the shared memory window is in
+ * HP-SRAM.  Each DSP core has an L1 cache that is incoherent (!) from
+ * the perspective of the other cores.  To handle this, we take care
+ * to access all memory through the uncached window into HP-SRAM at
+ * 0x9xxxxxxx and not the L1-cached mapping of the same memory at
+ * 0xBxxxxxxx.
+ */
+
+#define SLOT_SIZE 64 /* Bytes per slot: header plus message text */
+#define SLOT_MAGIC 0x55aa /* Header magic written once a slot is complete */
+
+#define NSLOTS (SRAM_TRACE_SIZE / SLOT_SIZE) /* Slots in SRAM_TRACE_SIZE bytes */
+#define MSGSZ (SLOT_SIZE - sizeof(struct slot_hdr)) /* Text bytes per slot */
+
+/* Translates an SRAM pointer into the address of the same memory in the
+ * uncached region (0x80000000-0x9fffffff) by clearing address bit 29.
+ */
+#define UNCACHED_PTR(p) ((void *)(((uintptr_t)(p)) & ~(uintptr_t)0x20000000))
+
+struct slot_hdr {
+	uint16_t magic; /* SLOT_MAGIC once the slot is complete, 0 while filling */
+	uint16_t id;    /* Sequential ID: previous slot's id + 1 */
+};
+
+struct slot {
+	struct slot_hdr hdr;
+	char msg[MSGSZ]; /* Output text; nul-terminated only if shorter than MSGSZ */
+};
+
+struct metadata {
+	struct k_spinlock lock; /* Held for the whole of arch_printk_char_out() */
+	int initialized; /* Nonzero once the first call has reset slot 0 */
+	int curr_slot;   /* To which slot are we writing? */
+	int n_bytes;     /* How many bytes buffered in curr_slot */
+};
+
+/* Give it a cache line all its own (64-byte aligned, at least 64 bytes) */
+static __aligned(64) union {
+	struct metadata meta;
+	uint32_t cache_pad[16]; /* 16 x 4 bytes: sizes the union to >= 64 bytes */
+} data_rec;
+
+#define data ((struct metadata *)UNCACHED_PTR(&data_rec.meta)) /* uncached alias */
+
+/* Return slot i of the trace window, addressed through the uncached alias */
+static inline struct slot *slot(int i)
+{
+	struct slot *first = UNCACHED_PTR(SRAM_TRACE_BASE);
+	return first + i;
+}
+
+/* Append c to the current slot; publish the slot on '\n' or when it fills */
+int arch_printk_char_out(int c)
+{
+	k_spinlock_key_t key = k_spin_lock(&data->lock);
+
+	if (data->initialized == 0) {
+		slot(0)->hdr.magic = 0;
+		slot(0)->hdr.id = 0;
+		data->curr_slot = data->n_bytes = 0;
+		data->initialized = 1;
+	}
+
+	struct slot *cur = slot(data->curr_slot);
+	int used = ++data->n_bytes; /* byte count including c */
+	cur->msg[used - 1] = c;
+	if (used < MSGSZ) {
+		cur->msg[used] = 0; /* keep the partial text nul-terminated */
+	}
+
+	if (c == '\n' || used >= MSGSZ) {
+		int next = (data->curr_slot + 1) % NSLOTS;
+		slot(next)->hdr.magic = 0;
+		slot(next)->hdr.id = cur->hdr.id + 1;
+		cur->hdr.magic = SLOT_MAGIC; /* written last: marks cur complete */
+		data->curr_slot = next;
+		data->n_bytes = 0;
+	}
+	k_spin_unlock(&data->lock, key);
+	return 0;
+}