From 97ada8bc04c1c38d395903f516f94a19afec0525 Mon Sep 17 00:00:00 2001
From: Andy Ross
Date: Fri, 7 Jan 2022 05:09:39 -0800
Subject: [PATCH] arch/xtensa: Promote adsp RPO/cache utilities to an arch API

This trick (mapping RAM twice so you can use alternate Region
Protection Option addresses to control cacheability) is something any
Xtensa hardware designer might productively choose to do.  And as it
works really well, we should encourage that by making this a generic
architecture feature for Zephyr.

Now everything works by setting two kconfig values at the soc level
defining the cached and uncached regions.  As long as these are
correct, you can then use the new arch_xtensa_un/cached_ptr() APIs to
convert between them, and an ARCH_XTENSA_SET_RPO_TLB() macro that
provides much smaller initialization code (in C!) than the HAL
assembly macros.  The conversion routines have been generalized to
support conversion between any two regions.

Note that full KERNEL_COHERENCE still requires support from the
platform linker script, which can't be made generic given the way
Zephyr does linkage.

Signed-off-by: Andy Ross
---
 arch/xtensa/Kconfig                           |  17 +++
 arch/xtensa/include/kernel_arch_func.h        |  12 --
 include/arch/xtensa/cache.h                   | 136 +++++++++++++++++-
 soc/xtensa/intel_adsp/Kconfig.defconfig       |   8 ++
 soc/xtensa/intel_adsp/common/CMakeLists.txt   |   1 +
 soc/xtensa/intel_adsp/common/fix_elf_addrs.py |  22 +--
 .../intel_adsp/common/include/cavs-link.ld    |  28 ++--
 .../intel_adsp/common/include/cpu_init.h      |  44 +-----
 soc/xtensa/intel_adsp/common/include/soc.h    |  48 +------
 9 files changed, 191 insertions(+), 125 deletions(-)

diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 8b51ec570b5..8b3ee35a670 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -89,4 +89,21 @@ config XTENSA_SMALL_VECTOR_TABLE_ENTRY
 	  handlers to the end of vector table, renaming them to
 	  _Level\LVL\()VectorHelper.

+config XTENSA_CACHED_REGION
+	int "Cached RPO mapping"
+	range 0 7
+	help
+	  A design trick on multi-core hardware is to map memory twice
+	  so that it can be seen in both (incoherent) cached mappings
+	  and a coherent "shared" area.  This specifies which 512M
+	  region (0-7, as defined by the Xtensa Region Protection
+	  Option) contains the "cached" mapping.
+
+config XTENSA_UNCACHED_REGION
+	int "Uncached RPO mapping"
+	range 0 7
+	help
+	  As with XTENSA_CACHED_REGION, this specifies which 512M
+	  region (0-7) contains the "uncached" mapping.
+
 endmenu

diff --git a/arch/xtensa/include/kernel_arch_func.h b/arch/xtensa/include/kernel_arch_func.h
index b69f951adce..82a4f986c5b 100644
--- a/arch/xtensa/include/kernel_arch_func.h
+++ b/arch/xtensa/include/kernel_arch_func.h
@@ -58,18 +58,6 @@ static inline void arch_switch(void *switch_to, void **switched_from)
 	return xtensa_switch(switch_to, switched_from);
 }

-/* FIXME: we don't have a framework for including this from the SoC
- * layer, so we define it in the arch code here.
- */
-#if defined(CONFIG_SOC_FAMILY_INTEL_ADSP) && defined(CONFIG_KERNEL_COHERENCE)
-static inline bool arch_mem_coherent(void *ptr)
-{
-	size_t addr = (size_t) ptr;
-
-	return addr >= 0x80000000 && addr < 0xa0000000;
-}
-#endif
-
 #ifdef CONFIG_KERNEL_COHERENCE
 static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
 					     void *old_switch_handle,

diff --git a/include/arch/xtensa/cache.h b/include/arch/xtensa/cache.h
index 12dc2c60065..bfd8608a4b8 100644
--- a/include/arch/xtensa/cache.h
+++ b/include/arch/xtensa/cache.h
@@ -15,8 +15,9 @@ extern "C" {

 #define Z_DCACHE_MAX (XCHAL_DCACHE_SIZE / XCHAL_DCACHE_WAYS)

-#if XCHAL_DCACHE_SIZE
 #define Z_IS_POW2(x) (((x) != 0) && (((x) & ((x)-1)) == 0))
+
+#if XCHAL_DCACHE_SIZE
 BUILD_ASSERT(Z_IS_POW2(XCHAL_DCACHE_LINESIZE));
 BUILD_ASSERT(Z_IS_POW2(Z_DCACHE_MAX));
 #endif
@@ -78,6 +79,139 @@ static ALWAYS_INLINE void z_xtensa_cache_flush_inv_all(void)
 	z_xtensa_cache_flush_inv(NULL, Z_DCACHE_MAX);
 }

+#ifdef CONFIG_ARCH_HAS_COHERENCE
+static inline bool arch_mem_coherent(void *ptr)
+{
+	size_t addr = (size_t) ptr;
+
+	return (addr >> 29) == CONFIG_XTENSA_UNCACHED_REGION;
+}
+#endif
+
+static ALWAYS_INLINE uint32_t z_xtrpoflip(uint32_t addr, uint32_t rto, uint32_t rfrom)
+{
+	/* The math here is all compile-time: when the two region
+	 * indices differ in exactly one bit, we can convert between
+	 * them by setting or clearing just that bit.  Otherwise it
+	 * needs two operations.
+	 */
+	uint32_t rxor = (rto ^ rfrom) << 29;
+
+	rto <<= 29;
+	if (Z_IS_POW2(rxor)) {
+		if ((rxor & rto) == 0) {
+			return addr & ~rxor;
+		} else {
+			return addr | rxor;
+		}
+	} else {
+		return (addr & ~(7U << 29)) | rto;
+	}
+}
+
+/**
+ * @brief Return cached pointer to a RAM address
+ *
+ * The Xtensa coherence architecture maps addressable RAM twice, in
+ * two different 512MB regions whose L1 cache settings can be
+ * controlled independently.  So for any given pointer, it is possible
+ * to convert it to and from a cached version.
+ *
+ * This function takes a pointer to any addressable object (either in
+ * cacheable memory or not) and returns a pointer that can be used to
+ * refer to the same memory through the L1 data cache.  Data read
+ * through the resulting pointer will reflect locally cached values on
+ * the current CPU if they exist, and writes will go first into the
+ * cache and be written back later.
+ *
+ * @see arch_xtensa_uncached_ptr()
+ *
+ * @param ptr A pointer to a valid C object
+ * @return A pointer to the same object via the L1 dcache
+ */
+static inline void *arch_xtensa_cached_ptr(void *ptr)
+{
+	return (void *)z_xtrpoflip((uint32_t) ptr,
+				   CONFIG_XTENSA_CACHED_REGION,
+				   CONFIG_XTENSA_UNCACHED_REGION);
+}
+
+/**
+ * @brief Return uncached pointer to a RAM address
+ *
+ * The Xtensa coherence architecture maps addressable RAM twice, in
+ * two different 512MB regions whose L1 cache settings can be
+ * controlled independently.  So for any given pointer, it is possible
+ * to convert it to and from a cached version.
+ *
+ * This function takes a pointer to any addressable object (either in
+ * cacheable memory or not) and returns a pointer that can be used to
+ * refer to the same memory while bypassing the L1 data cache.  Data
+ * in the L1 cache will not be inspected nor modified by the access.
+ * + * @see arch_xtensa_cached_ptr() + * + * @param ptr A pointer to a valid C object + * @return A pointer to the same object bypassing the L1 dcache + */ +static inline void *arch_xtensa_uncached_ptr(void *ptr) +{ + return (void *)z_xtrpoflip((uint32_t) ptr, + CONFIG_XTENSA_UNCACHED_REGION, + CONFIG_XTENSA_CACHED_REGION); +} + +/* Utility to generate an unrolled and optimal[1] code sequence to set + * the RPO TLB registers (contra the HAL cacheattr macros, which + * generate larger code and can't be called from C), based on the + * KERNEL_COHERENCE configuration in use. Selects RPO attribute "2" + * for regions (including MMIO registers in region zero) which want to + * bypass L1, "4" for the cached region which wants writeback, and + * "15" (invalid) elsewhere. + * + * Note that on cores that have the "translation" option set, we need + * to put an identity mapping in the high bits. Also per spec + * changing the current code region (by definition cached) requires + * that WITLB be followed by an ISYNC and that both instructions live + * in the same cache line (two 3-byte instructions fit in an 8-byte + * aligned region, so that's guaranteed not to cross a cache line + * boundary). + * + * [1] With the sole exception of gcc's infuriating insistence on + * emitting a precomputed literal for addr + addrincr instead of + * computing it with a single ADD instruction from values it already + * has in registers. Explicitly assigning the variables to registers + * via an attribute works, but then emits needless MOV instructions + * instead. I tell myself it's just 32 bytes of .text, but... Sigh. + */ +#define _REGION_ATTR(r) \ + ((r) == 0 ? 2 : \ + ((r) == CONFIG_XTENSA_CACHED_REGION ? 4 : \ + ((r) == CONFIG_XTENSA_UNCACHED_REGION ? 2 : 15))) + +#define _SET_ONE_TLB(region) do { \ + uint32_t attr = _REGION_ATTR(region); \ + if (XCHAL_HAVE_XLT_CACHEATTR) { \ + attr |= addr; /* RPO with translation */ \ + } \ + if (region != CONFIG_XTENSA_CACHED_REGION) { \ + __asm__ volatile("wdtlb %0, %1; witlb %0, %1" \ + :: "r"(attr), "r"(addr)); \ + } else { \ + __asm__ volatile("wdtlb %0, %1" \ + :: "r"(attr), "r"(addr)); \ + __asm__ volatile("j 1f; .align 8; 1:"); \ + __asm__ volatile("witlb %0, %1; isync" \ + :: "r"(attr), "r"(addr)); \ + } \ + addr += addrincr; \ +} while (0) + +#define ARCH_XTENSA_SET_RPO_TLB() do { \ + register uint32_t addr = 0, addrincr = 0x20000000; \ + FOR_EACH(_SET_ONE_TLB, (;), 0, 1, 2, 3, 4, 5, 6, 7); \ +} while (0) + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/soc/xtensa/intel_adsp/Kconfig.defconfig b/soc/xtensa/intel_adsp/Kconfig.defconfig index 76b060560f9..1e6889389a7 100644 --- a/soc/xtensa/intel_adsp/Kconfig.defconfig +++ b/soc/xtensa/intel_adsp/Kconfig.defconfig @@ -4,3 +4,11 @@ # SPDX-License-Identifier: Apache-2.0 source "soc/xtensa/intel_adsp/*/Kconfig.defconfig.series" + +# Lower priority defaults come AFTER the series-specific ones set above + +config XTENSA_CACHED_REGION + default 5 + +config XTENSA_UNCACHED_REGION + default 4 diff --git a/soc/xtensa/intel_adsp/common/CMakeLists.txt b/soc/xtensa/intel_adsp/common/CMakeLists.txt index 435e07482d8..ede0063f235 100644 --- a/soc/xtensa/intel_adsp/common/CMakeLists.txt +++ b/soc/xtensa/intel_adsp/common/CMakeLists.txt @@ -59,6 +59,7 @@ add_custom_target( copy ${CMAKE_BINARY_DIR}/zephyr/${KERNEL_NAME}.elf ${KERNEL_REMAPPED} COMMAND ${ELF_FIX} ${CMAKE_OBJCOPY} ${KERNEL_REMAPPED} + ${CONFIG_XTENSA_CACHED_REGION} ${CONFIG_XTENSA_UNCACHED_REGION} # Extract modules for rimage COMMAND ${CMAKE_OBJCOPY} diff --git 
diff --git a/soc/xtensa/intel_adsp/common/fix_elf_addrs.py b/soc/xtensa/intel_adsp/common/fix_elf_addrs.py
index 3f60fd386b4..8b0c7c2a21c 100755
--- a/soc/xtensa/intel_adsp/common/fix_elf_addrs.py
+++ b/soc/xtensa/intel_adsp/common/fix_elf_addrs.py
@@ -3,13 +3,11 @@
 # Copyright (c) 2020 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0

-# ADSP devices have their RAM regions mapped twice, once in the 512MB
-# region from 0x80000000-0x9fffffff and again from
-# 0xa0000000-0xbfffffff.  The first mapping is set in the CPU to
-# bypass the L1 cache, and so access through pointers in that region
-# is coherent between CPUs (but slow).  The second region accesses the
-# same memory through the L1 cache and requires careful flushing when
-# used with shared data.
+# ADSP devices have their RAM regions mapped twice.  The first mapping
+# is set in the CPU to bypass the L1 cache, and so access through
+# pointers in that region is coherent between CPUs (but slow).  The
+# second region accesses the same memory through the L1 cache and
+# requires careful flushing when used with shared data.
 #
 # This distinction is exposed in the linker script, where some symbols
 # (e.g. stack regions) are linked into cached memory, but others
@@ -26,13 +24,19 @@ from elftools.elf.elffile import ELFFile

 objcopy_bin = sys.argv[1]
 elffile = sys.argv[2]
+cached_reg = int(sys.argv[3])
+uncached_reg = int(sys.argv[4])
+
+uc_min = uncached_reg << 29
+uc_max = uc_min | 0x1fffffff
+cache_off = "0x%x" % ((cached_reg - uncached_reg) << 29)

 fixup =[]
 with open(elffile, "rb") as fd:
     elf = ELFFile(fd)
     for s in elf.iter_sections():
         addr = s.header.sh_addr
-        if 0x80000000 <= addr < 0xa0000000:
+        if uc_min <= addr <= uc_max:
             print(f"fix_elf_addrs.py: Moving section {s.name} to cached SRAM region")
             fixup.append(s.name)

@@ -43,5 +47,5 @@ for s in fixup:
     # error (no --quiet option, no -Werror=no-whatever, nothing).
     # Just swallow the error stream for now pending rework to the
     # linker framework.
-    cmd = f"{objcopy_bin} --change-section-address {s}+0x20000000 {elffile} 2>/dev/null"
+    cmd = f"{objcopy_bin} --change-section-address {s}+{cache_off} {elffile} 2>/dev/null"
     os.system(cmd)

diff --git a/soc/xtensa/intel_adsp/common/include/cavs-link.ld b/soc/xtensa/intel_adsp/common/include/cavs-link.ld
index 2978594b25a..85804185960 100644
--- a/soc/xtensa/intel_adsp/common/include/cavs-link.ld
+++ b/soc/xtensa/intel_adsp/common/include/cavs-link.ld
@@ -25,31 +25,25 @@
 OUTPUT_ARCH(xtensa)
 ENTRY(rom_entry);

-/* DSP RAM regions (all of them) are mapped twice on the DSP: once in
- * a 512MB region from 0x80000000-0x9fffffff and again from
- * 0xa0000000-0xbfffffff.  The first mapping is set up to bypass the
- * L1 cache, so it must be used when multiprocessor coherence is
- * desired, where the latter mapping is best used for processor-local
- * data (e.g. stacks) or shared data that is managed with explicit
- * cache flush/invalidate operations.
+/* DSP RAM regions (all of them) are mapped twice on the DSP.  One
+ * mapping is set up to bypass the L1 cache, so it must be used when
+ * multiprocessor coherence is desired, while the other mapping is
+ * best used for processor-local data (e.g. stacks) or shared data
+ * that is managed with explicit cache flush/invalidate operations.
 *
 * These macros will set up a segment start address correctly,
 * including alignment to a cache line.  Be sure to also emit the
- * section to ">ram :ram_phdr" or ">ucram :ucram_phdr" as
- * appropriate.  (Forgetting the correct PHDR will actually work, as
- * the output tooling ignores it, but it will cause the linker to emit
- * 512MB of unused data into the output file!)
- *
- * (Note clumsy syntax because XCC doesn't understand the "~" operator)
+ * section to ">ram" or ">ucram" as appropriate, to prevent the linker
+ * from filling in 512MB of sparse zeros.
 */
 #ifdef CONFIG_KERNEL_COHERENCE
-#define SEGSTART_CACHED   (ALIGN(64) | 0x20000000)
-#define SEGSTART_UNCACHED (ALIGN(64) & 0xdfffffff) /* == ~0x20000000 */
+#define RPO_SET(addr, reg) ((addr & 0x1fffffff) | (reg << 29))
+#define SEGSTART_CACHED   RPO_SET(ALIGN(64), CONFIG_XTENSA_CACHED_REGION)
+#define SEGSTART_UNCACHED RPO_SET(ALIGN(64), CONFIG_XTENSA_UNCACHED_REGION)
 #else
 #define SEGSTART_CACHED   .
 #define SEGSTART_UNCACHED .
 #define ucram ram
-#define ucram_phdr ram_phdr
 #endif

 /* intlist.ld needs an IDT_LIST memory region */
@@ -129,7 +123,7 @@ MEMORY {
 	len = RAM_SIZE
 #ifdef CONFIG_KERNEL_COHERENCE
   ucram :
-	org = RAM_BASE - 0x20000000,
+	org = RPO_SET(RAM_BASE, CONFIG_XTENSA_UNCACHED_REGION),
 	len = RAM_SIZE
 #endif
 #ifdef CONFIG_GEN_ISR_TABLES

diff --git a/soc/xtensa/intel_adsp/common/include/cpu_init.h b/soc/xtensa/intel_adsp/common/include/cpu_init.h
index 6ba83cda0c6..b0ce7de9ab4 100644
--- a/soc/xtensa/intel_adsp/common/include/cpu_init.h
+++ b/soc/xtensa/intel_adsp/common/include/cpu_init.h
@@ -4,6 +4,7 @@
 #ifndef __INTEL_ADSP_CPU_INIT_H
 #define __INTEL_ADSP_CPU_INIT_H

+#include <arch/xtensa/cache.h>
 #include <xtensa/config/core-isa.h>

 #define CxL1CCAP (*(volatile uint32_t *)0x9F080080)
@@ -14,39 +15,6 @@
 #define CxL1CCAP_DCMWC ((CxL1CCAP >> 16) & 7)
 #define CxL1CCAP_ICMWC ((CxL1CCAP >> 20) & 7)

-/* Utilities to generate an unwrapped code sequence to set the RPO TLB
- * registers.  Pass the 8 region attributes as arguments, e.g.:
- *
- *     SET_RPO_TLB(2, 15, 15, 15, 2, 4, 15, 15);
- *
- * Note that cAVS 1.5 has the "translation" option that we don't use,
- * but still need to put an identity mapping in the high bits.  Also
- * per spec changing the current code region requires that WITLB be
- * followed by an ISYNC and that both instructions live in the same
- * cache line (two 3-byte instructions fit in an 8-byte aligned
- * region, so that's guaranteed not to cross a caceh line boundary).
- */
-#define SET_ONE_TLB(region, att) do {				\
-	uint32_t addr = region * 0x20000000U, attr = att;	\
-	if (XCHAL_HAVE_XLT_CACHEATTR) {				\
-		attr |= addr; /* RPO with translation */	\
-	}							\
-	if (region != (L2_SRAM_BASE >> 29)) {			\
-		__asm__ volatile("wdtlb %0, %1; witlb %0, %1"	\
-				 :: "r"(attr), "r"(addr));	\
-	} else {						\
-		__asm__ volatile("wdtlb %0, %1"			\
-				 :: "r"(attr), "r"(addr));	\
-		__asm__ volatile("j 1f; .align 8; 1:");		\
-		__asm__ volatile("witlb %0, %1; isync"		\
-				 :: "r"(attr), "r"(addr));	\
-	}							\
-} while (0)
-
-#define SET_RPO_TLB(...) do { \
-	FOR_EACH_IDX(SET_ONE_TLB, (;), __VA_ARGS__); \
-} while (0)
-
 /* Low-level CPU initialization.  Call this immediately after entering
 * C code to initialize the cache, protection and synchronization
 * features.
@@ -98,15 +66,9 @@ static ALWAYS_INLINE void cpu_early_init(void)

 	/* Finally we need to enable the cache in the Region
 	 * Protection Option "TLB" entries.  The hardware defaults
-	 * have this set to RW/uncached (2) everywhere.  We want
-	 * writeback caching (4) in the sixth mapping (the second of
-	 * two RAM mappings) and to mark all unused regions
-	 * inaccessible (15) for safety.  Note that there is a HAL
-	 * routine that does this (by emulating the older "cacheattr"
-	 * hardware register), but it generates significantly larger
-	 * code.
+	 * have this set to RW/uncached everywhere.
 	 */
-	SET_RPO_TLB(2, 15, 15, 15, 2, 4, 15, 15);
+	ARCH_XTENSA_SET_RPO_TLB();

 	/* Initialize ATOMCTL: Hardware defaults for S32C1I use
 	 * "internal" operations, meaning they are atomic only WRT the

diff --git a/soc/xtensa/intel_adsp/common/include/soc.h b/soc/xtensa/intel_adsp/common/include/soc.h
index 26bcbede9ee..129df3f5453 100644
--- a/soc/xtensa/intel_adsp/common/include/soc.h
+++ b/soc/xtensa/intel_adsp/common/include/soc.h
@@ -80,54 +80,12 @@ extern void soc_start_core(int cpu_num);

 extern bool soc_cpus_active[CONFIG_MP_NUM_CPUS];

-/* Legacy SOC-level API still used in a few drivers */
+/* Legacy cache APIs still used in a few places */
 #define SOC_DCACHE_FLUSH(addr, size)		\
 	z_xtensa_cache_flush((addr), (size))
 #define SOC_DCACHE_INVALIDATE(addr, size)	\
 	z_xtensa_cache_inv((addr), (size))
-
-/**
- * @brief Return uncached pointer to a RAM address
- *
- * The Intel ADSP architecture maps all addressable RAM (of all types)
- * twice, in two different 512MB segments regions whose L1 cache
- * settings can be controlled independently.  So for any given
- * pointer, it is possible to convert it to and from a cached version.
- *
- * This function takes a pointer to any addressible object (either in
- * cacheable memory or not) and returns a pointer that can be used to
- * refer to the same memory while bypassing the L1 data cache.  Data
- * in the L1 cache will not be inspected nor modified by the access.
- *
- * @see z_soc_cached_ptr()
- *
- * @param p A pointer to a valid C object
- * @return A pointer to the same object bypassing the L1 dcache
- */
-static inline void *z_soc_uncached_ptr(void *p)
-{
-	return ((void *)(((size_t)p) & ~0x20000000));
-}
-
-/**
- * @brief Return cached pointer to a RAM address
- *
- * This function takes a pointer to any addressible object (either in
- * cacheable memory or not) and returns a pointer that can be used to
- * refer to the same memory through the L1 data cache.  Data read
- * through the resulting pointer will reflect locally cached values on
- * the current CPU if they exist, and writes will go first into the
- * cache and be written back later.
- *
- * @see z_soc_uncached_ptr()
- *
- * @param p A pointer to a valid C object
- * @return A pointer to the same object via the L1 dcache
- *
- */
-static inline void *z_soc_cached_ptr(void *p)
-{
-	return ((void *)(((size_t)p) | 0x20000000));
-}
+#define z_soc_cached_ptr(p) arch_xtensa_cached_ptr(p)
+#define z_soc_uncached_ptr(p) arch_xtensa_uncached_ptr(p)

 #endif /* __INC_SOC_H */
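
A note on the conversion math: z_xtrpoflip() is fully evaluated at
compile time, and with the intel_adsp defaults chosen above (cached
region 5, uncached region 4) the two region indices differ in exactly
one bit, so each pointer conversion costs a single AND or OR of bit
29.  The host-compilable sketch below mirrors that logic so the
arithmetic is easy to check; rpoflip() and the two region constants
are local stand-ins for illustration, not part of the patch.

    #include <stdint.h>
    #include <stdio.h>

    #define CACHED_REGION   5  /* intel_adsp defaults from this patch */
    #define UNCACHED_REGION 4

    /* Host-side restatement of the z_xtrpoflip() logic */
    static uint32_t rpoflip(uint32_t addr, uint32_t rto, uint32_t rfrom)
    {
            uint32_t rxor = (rto ^ rfrom) << 29;

            rto <<= 29;
            if (rxor != 0 && (rxor & (rxor - 1)) == 0) {
                    /* Indices differ in one bit: set or clear it */
                    return (rxor & rto) != 0 ? (addr | rxor) : (addr & ~rxor);
            }
            /* General case: mask out the region bits, OR in the target */
            return (addr & ~(7U << 29)) | rto;
    }

    int main(void)
    {
            uint32_t uncached = 0x80001234;  /* region 4 */
            uint32_t cached = rpoflip(uncached, CACHED_REGION, UNCACHED_REGION);

            printf("cached:   0x%08x\n", (unsigned)cached);  /* 0xa0001234 */
            printf("uncached: 0x%08x\n",
                   (unsigned)rpoflip(cached, UNCACHED_REGION, CACHED_REGION));
            return 0;                                        /* 0x80001234 */
    }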
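Likewise, the TLB entries that ARCH_XTENSA_SET_RPO_TLB() writes can be
tabulated offline.  The sketch below assumes the same region defaults
and a core with the translation option (XCHAL_HAVE_XLT_CACHEATTR), in
which case the attribute word also carries the identity mapping in the
high bits; region_attr() restates _REGION_ATTR() as a function and is
illustrative only.

    #include <stdint.h>
    #include <stdio.h>

    #define CACHED_REGION   5
    #define UNCACHED_REGION 4

    /* _REGION_ATTR() restated: bypass (2) for MMIO region 0 and the
     * uncached region, writeback (4) for the cached region, and
     * invalid (15) everywhere else.
     */
    static uint32_t region_attr(uint32_t r)
    {
            if (r == 0 || r == UNCACHED_REGION) {
                    return 2;
            }
            return r == CACHED_REGION ? 4 : 15;
    }

    int main(void)
    {
            for (uint32_t r = 0; r < 8; r++) {
                    uint32_t addr = r << 29;
                    /* With translation, _SET_ONE_TLB() ORs the identity
                     * mapping into the attribute word.
                     */
                    uint32_t attr = region_attr(r) | addr;

                    printf("wdtlb/witlb attr=0x%08x addr=0x%08x\n",
                           (unsigned)attr, (unsigned)addr);
            }
            return 0;
    }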
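Finally, a usage sketch of the new pointer APIs in a shared-data
pattern.  This is illustrative only (my_shared_buf, producer() and
consumer() are hypothetical names, not from the patch): the writer
works through the fast cached view and flushes, while a reader on
another CPU uses the coherent uncached view.

    #include <stdint.h>
    #include <arch/xtensa/cache.h>

    static uint32_t my_shared_buf[64];  /* hypothetical shared buffer */

    void producer(void)
    {
            uint32_t *c = arch_xtensa_cached_ptr(my_shared_buf);

            for (int i = 0; i < 64; i++) {
                    c[i] = i;  /* writes land in the L1 dcache first */
            }
            /* Write the dirty lines back so other CPUs can see them */
            z_xtensa_cache_flush(c, sizeof(my_shared_buf));
    }

    uint32_t consumer(void)  /* e.g. running on another CPU */
    {
            uint32_t *u = arch_xtensa_uncached_ptr(my_shared_buf);
            uint32_t sum = 0;

            /* Uncached reads bypass L1 and observe the flushed data
             * directly; arch_mem_coherent(u) is true for this pointer.
             */
            for (int i = 0; i < 64; i++) {
                    sum += u[i];
            }
            return sum;
    }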