arch/xtensa: Promote adsp RPO/cache utilities to an arch API
This trick (mapping RAM twice so you can use alternate Region Protection Option addresses to control cacheability) is something any Xtensa hardware designer might productively choose to do. And as it works really well, we should encourage that by making it a generic architecture feature for Zephyr.

Now everything works by setting two Kconfig values at the SoC level defining the cached and uncached regions. As long as these are correct, you can then use the new arch_xtensa_un/cached_ptr() APIs to convert between them, and an ARCH_XTENSA_SET_RPO_TLB() macro that provides much smaller initialization code (in C!) than the HAL assembly macros. The conversion routines have been generalized to support conversion between any two regions.

Note that full KERNEL_COHERENCE still requires support from the platform linker script, which can't be made generic given the way Zephyr does linkage.

Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
parent 6aa3d0c72f
commit 97ada8bc04
9 changed files with 191 additions and 125 deletions
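For readers new to the API, a minimal hypothetical usage sketch follows (not part of the patch; the variable and function names are illustrative, and it assumes the SoC Kconfig has set the two region symbols correctly):

/* Hypothetical caller: publish a flag to another CPU through the
 * uncached alias so no explicit cache flush is needed.
 */
#include <arch/xtensa/cache.h>

static uint32_t shared_flag;

void signal_other_cpu(void)
{
	uint32_t *uc = arch_xtensa_uncached_ptr(&shared_flag);

	*uc = 1;	/* bypasses L1, visible to other CPUs */
}

SoC early initialization then needs only the single ARCH_XTENSA_SET_RPO_TLB() call visible in the cpu_early_init() hunk below.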
@@ -89,4 +89,21 @@ config XTENSA_SMALL_VECTOR_TABLE_ENTRY
 	  handlers to the end of vector table, renaming them to
 	  _Level\LVL\()VectorHelper.
 
+config XTENSA_CACHED_REGION
+	int "Cached RPO mapping"
+	range 0 7
+	help
+	  A design trick on multi-core hardware is to map memory twice
+	  so that it can be seen in both (incoherent) cached mappings
+	  and a coherent "shared" area.  This specifies which 512M
+	  region (0-7, as defined by the Xtensa Region Protection
+	  Option) contains the "cached" mapping.
+
+config XTENSA_UNCACHED_REGION
+	int "Uncached RPO mapping"
+	range 0 7
+	help
+	  As for XTENSA_CACHED_REGION, this specifies which 512M
+	  region (0-7) contains the "uncached" mapping.
+
 endmenu
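For reference when choosing these values, each RPO region number selects one 512MB window of the address space; a purely illustrative sketch (this macro is not part of the patch):

/* Illustrative only: region N spans [N << 29, (N << 29) | 0x1fffffff],
 * e.g. region 4 starts at 0x80000000 and region 5 at 0xa0000000.
 */
#define RPO_REGION_BASE(n) ((uint32_t)(n) << 29)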
@@ -58,18 +58,6 @@ static inline void arch_switch(void *switch_to, void **switched_from)
 	return xtensa_switch(switch_to, switched_from);
 }
 
-/* FIXME: we don't have a framework for including this from the SoC
- * layer, so we define it in the arch code here.
- */
-#if defined(CONFIG_SOC_FAMILY_INTEL_ADSP) && defined(CONFIG_KERNEL_COHERENCE)
-static inline bool arch_mem_coherent(void *ptr)
-{
-	size_t addr = (size_t) ptr;
-
-	return addr >= 0x80000000 && addr < 0xa0000000;
-}
-#endif
-
 #ifdef CONFIG_KERNEL_COHERENCE
 static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
 					     void *old_switch_handle,
@@ -15,8 +15,9 @@ extern "C" {
 
 #define Z_DCACHE_MAX (XCHAL_DCACHE_SIZE / XCHAL_DCACHE_WAYS)
 
-#if XCHAL_DCACHE_SIZE
 #define Z_IS_POW2(x) (((x) != 0) && (((x) & ((x)-1)) == 0))
+
+#if XCHAL_DCACHE_SIZE
 BUILD_ASSERT(Z_IS_POW2(XCHAL_DCACHE_LINESIZE));
 BUILD_ASSERT(Z_IS_POW2(Z_DCACHE_MAX));
 #endif
@@ -78,6 +79,139 @@ static ALWAYS_INLINE void z_xtensa_cache_flush_inv_all(void)
 	z_xtensa_cache_flush_inv(NULL, Z_DCACHE_MAX);
 }
 
+#ifdef CONFIG_ARCH_HAS_COHERENCE
+static inline bool arch_mem_coherent(void *ptr)
+{
+	size_t addr = (size_t) ptr;
+
+	return (addr >> 29) == CONFIG_XTENSA_UNCACHED_REGION;
+}
+#endif
+
+static ALWAYS_INLINE uint32_t z_xtrpoflip(uint32_t addr, uint32_t rto, uint32_t rfrom)
+{
+	/* The math here is all compile-time: when the two regions
+	 * differ by a power of two, we can convert between them by
+	 * setting or clearing just one bit.  Otherwise it needs two
+	 * operations.
+	 */
+	uint32_t rxor = (rto ^ rfrom) << 29;
+
+	rto <<= 29;
+	if (Z_IS_POW2(rxor)) {
+		if ((rxor & rto) == 0) {
+			return addr & ~rxor;
+		} else {
+			return addr | rxor;
+		}
+	} else {
+		return (addr & ~(7U << 29)) | rto;
+	}
+}
+
+/**
+ * @brief Return cached pointer to a RAM address
+ *
+ * The Xtensa coherence architecture maps addressable RAM twice, in
+ * two different 512MB regions whose L1 cache settings can be
+ * controlled independently.  So for any given pointer, it is possible
+ * to convert it to and from a cached version.
+ *
+ * This function takes a pointer to any addressable object (either in
+ * cacheable memory or not) and returns a pointer that can be used to
+ * refer to the same memory through the L1 data cache.  Data read
+ * through the resulting pointer will reflect locally cached values on
+ * the current CPU if they exist, and writes will go first into the
+ * cache and be written back later.
+ *
+ * @see arch_xtensa_uncached_ptr()
+ *
+ * @param ptr A pointer to a valid C object
+ * @return A pointer to the same object via the L1 dcache
+ */
+static inline void *arch_xtensa_cached_ptr(void *ptr)
+{
+	return (void *)z_xtrpoflip((uint32_t) ptr,
+				   CONFIG_XTENSA_CACHED_REGION,
+				   CONFIG_XTENSA_UNCACHED_REGION);
+}
+
+/**
+ * @brief Return uncached pointer to a RAM address
+ *
+ * The Xtensa coherence architecture maps addressable RAM twice, in
+ * two different 512MB regions whose L1 cache settings can be
+ * controlled independently.  So for any given pointer, it is possible
+ * to convert it to and from a cached version.
+ *
+ * This function takes a pointer to any addressable object (either in
+ * cacheable memory or not) and returns a pointer that can be used to
+ * refer to the same memory while bypassing the L1 data cache.  Data
+ * in the L1 cache will not be inspected nor modified by the access.
+ *
+ * @see arch_xtensa_cached_ptr()
+ *
+ * @param ptr A pointer to a valid C object
+ * @return A pointer to the same object bypassing the L1 dcache
+ */
+static inline void *arch_xtensa_uncached_ptr(void *ptr)
+{
+	return (void *)z_xtrpoflip((uint32_t) ptr,
+				   CONFIG_XTENSA_UNCACHED_REGION,
+				   CONFIG_XTENSA_CACHED_REGION);
+}
+
+/* Utility to generate an unrolled and optimal[1] code sequence to set
+ * the RPO TLB registers (contra the HAL cacheattr macros, which
+ * generate larger code and can't be called from C), based on the
+ * KERNEL_COHERENCE configuration in use.  Selects RPO attribute "2"
+ * for regions (including MMIO registers in region zero) which want to
+ * bypass L1, "4" for the cached region which wants writeback, and
+ * "15" (invalid) elsewhere.
+ *
+ * Note that on cores that have the "translation" option set, we need
+ * to put an identity mapping in the high bits.  Also per spec
+ * changing the current code region (by definition cached) requires
+ * that WITLB be followed by an ISYNC and that both instructions live
+ * in the same cache line (two 3-byte instructions fit in an 8-byte
+ * aligned region, so that's guaranteed not to cross a cache line
+ * boundary).
+ *
+ * [1] With the sole exception of gcc's infuriating insistence on
+ * emitting a precomputed literal for addr + addrincr instead of
+ * computing it with a single ADD instruction from values it already
+ * has in registers.  Explicitly assigning the variables to registers
+ * via an attribute works, but then emits needless MOV instructions
+ * instead.  I tell myself it's just 32 bytes of .text, but... Sigh.
+ */
+#define _REGION_ATTR(r) \
+	((r) == 0 ? 2 : \
+	 ((r) == CONFIG_XTENSA_CACHED_REGION ? 4 : \
+	  ((r) == CONFIG_XTENSA_UNCACHED_REGION ? 2 : 15)))
+
+#define _SET_ONE_TLB(region) do { \
+	uint32_t attr = _REGION_ATTR(region); \
+	if (XCHAL_HAVE_XLT_CACHEATTR) { \
+		attr |= addr; /* RPO with translation */ \
+	} \
+	if (region != CONFIG_XTENSA_CACHED_REGION) { \
+		__asm__ volatile("wdtlb %0, %1; witlb %0, %1" \
+				 :: "r"(attr), "r"(addr)); \
+	} else { \
+		__asm__ volatile("wdtlb %0, %1" \
+				 :: "r"(attr), "r"(addr)); \
+		__asm__ volatile("j 1f; .align 8; 1:"); \
+		__asm__ volatile("witlb %0, %1; isync" \
+				 :: "r"(attr), "r"(addr)); \
+	} \
+	addr += addrincr; \
+} while (0)
+
+#define ARCH_XTENSA_SET_RPO_TLB() do { \
+	register uint32_t addr = 0, addrincr = 0x20000000; \
+	FOR_EACH(_SET_ONE_TLB, (;), 0, 1, 2, 3, 4, 5, 6, 7); \
+} while (0)
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
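A worked example with the intel_adsp defaults set below (cached region 5, uncached region 4): the two region numbers differ in a single bit, so z_xtrpoflip() degenerates to setting or clearing bit 29 of the address. The concrete addresses here are illustrative only:

/* Illustrative only, assuming CONFIG_XTENSA_CACHED_REGION=5 and
 * CONFIG_XTENSA_UNCACHED_REGION=4, i.e. rxor = (5 ^ 4) << 29 = 0x20000000.
 */
void *p  = (void *)0x9e001000;            /* region 4: uncached alias  */
void *cp = arch_xtensa_cached_ptr(p);     /* 0xbe001000: bit 29 set    */
void *up = arch_xtensa_uncached_ptr(cp);  /* 0x9e001000: bit 29 clear  */

When the two regions do not differ by a power of two, the fallback path masks off bits 31:29 and ORs in the destination region, costing one extra operation.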
@@ -4,3 +4,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 source "soc/xtensa/intel_adsp/*/Kconfig.defconfig.series"
+
+# Lower priority defaults come AFTER the series-specific ones set above
+
+config XTENSA_CACHED_REGION
+	default 5
+
+config XTENSA_UNCACHED_REGION
+	default 4
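With these defaults the uncached alias is region 4 (0x80000000-0x9fffffff) and the cached alias is region 5 (0xa0000000-0xbfffffff), the same windows that were previously hard-coded in the arch header, the ELF fixup script, and the SoC code below.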
@@ -59,6 +59,7 @@ add_custom_target(
 	copy ${CMAKE_BINARY_DIR}/zephyr/${KERNEL_NAME}.elf ${KERNEL_REMAPPED}
 
 	COMMAND ${ELF_FIX} ${CMAKE_OBJCOPY} ${KERNEL_REMAPPED}
+		${CONFIG_XTENSA_CACHED_REGION} ${CONFIG_XTENSA_UNCACHED_REGION}
 
 	# Extract modules for rimage
 	COMMAND ${CMAKE_OBJCOPY}
@@ -3,13 +3,11 @@
 # Copyright (c) 2020 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-# ADSP devices have their RAM regions mapped twice, once in the 512MB
-# region from 0x80000000-0x9fffffff and again from
-# 0xa0000000-0xbfffffff.  The first mapping is set in the CPU to
-# bypass the L1 cache, and so access through pointers in that region
-# is coherent between CPUs (but slow).  The second region accesses the
-# same memory through the L1 cache and requires careful flushing when
-# used with shared data.
+# ADSP devices have their RAM regions mapped twice.  The first mapping
+# is set in the CPU to bypass the L1 cache, and so access through
+# pointers in that region is coherent between CPUs (but slow).  The
+# second region accesses the same memory through the L1 cache and
+# requires careful flushing when used with shared data.
 #
 # This distinction is exposed in the linker script, where some symbols
 # (e.g. stack regions) are linked into cached memory, but others
@@ -26,13 +24,19 @@ from elftools.elf.elffile import ELFFile
 
 objcopy_bin = sys.argv[1]
 elffile = sys.argv[2]
+cached_reg = int(sys.argv[3])
+uncached_reg = int(sys.argv[4])
+
+uc_min = uncached_reg << 29
+uc_max = uc_min | 0x1fffffff
+cache_off = "0x%x" % ((cached_reg - uncached_reg) << 29)
 
 fixup =[]
 with open(elffile, "rb") as fd:
     elf = ELFFile(fd)
     for s in elf.iter_sections():
         addr = s.header.sh_addr
-        if 0x80000000 <= addr < 0xa0000000:
+        if uc_min <= addr <= uc_max:
             print(f"fix_elf_addrs.py: Moving section {s.name} to cached SRAM region")
             fixup.append(s.name)
 
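For example, invoking the script with the default regions ("5 4") yields uc_min = 0x80000000, uc_max = 0x9fffffff and cache_off = "0x20000000", reproducing the previously hard-coded section move below.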
@@ -43,5 +47,5 @@ for s in fixup:
     # error (no --quiet option, no -Werror=no-whatever, nothing).
     # Just swallow the error stream for now pending rework to the
     # linker framework.
-    cmd = f"{objcopy_bin} --change-section-address {s}+0x20000000 {elffile} 2>/dev/null"
+    cmd = f"{objcopy_bin} --change-section-address {s}+{cache_off} {elffile} 2>/dev/null"
     os.system(cmd)
@@ -25,31 +25,25 @@ OUTPUT_ARCH(xtensa)
 
 ENTRY(rom_entry);
 
-/* DSP RAM regions (all of them) are mapped twice on the DSP: once in
- * a 512MB region from 0x80000000-0x9fffffff and again from
- * 0xa0000000-0xbfffffff.  The first mapping is set up to bypass the
- * L1 cache, so it must be used when multiprocessor coherence is
- * desired, where the latter mapping is best used for processor-local
- * data (e.g. stacks) or shared data that is managed with explicit
- * cache flush/invalidate operations.
+/* DSP RAM regions (all of them) are mapped twice on the DSP.  One
+ * mapping is set up to bypass the L1 cache, so it must be used when
+ * multiprocessor coherence is desired, where the latter mapping is
+ * best used for processor-local data (e.g. stacks) or shared data
+ * that is managed with explicit cache flush/invalidate operations.
  *
  * These macros will set up a segment start address correctly,
  * including alignment to a cache line.  Be sure to also emit the
- * section to ">ram :ram_phdr" or ">ucram :ucram_phdr" as
- * appropriate.  (Forgetting the correct PHDR will actually work, as
- * the output tooling ignores it, but it will cause the linker to emit
- * 512MB of unused data into the output file!)
- *
- * (Note clumsy syntax because XCC doesn't understand the "~" operator)
+ * section to ">ram" or ">ucram" as appropriate, to prevent the linker
+ * from filling in 512MB of sparse zeros.
  */
 #ifdef CONFIG_KERNEL_COHERENCE
-#define SEGSTART_CACHED   (ALIGN(64) | 0x20000000)
-#define SEGSTART_UNCACHED (ALIGN(64) & 0xdfffffff) /* == ~0x20000000 */
+#define RPO_SET(addr, reg) ((addr & 0x1fffffff) | (reg << 29))
+#define SEGSTART_CACHED   RPO_SET(ALIGN(64), CONFIG_XTENSA_CACHED_REGION)
+#define SEGSTART_UNCACHED RPO_SET(ALIGN(64), CONFIG_XTENSA_UNCACHED_REGION)
 #else
 #define SEGSTART_CACHED   .
 #define SEGSTART_UNCACHED .
 #define ucram ram
-#define ucram_phdr ram_phdr
 #endif
 
 /* intlist.ld needs an IDT_LIST memory region */
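RPO_SET() keeps the low 29 address bits and substitutes the region number into bits 31:29, so with the default regions SEGSTART_CACHED resolves into the 0xa0000000 alias and SEGSTART_UNCACHED into the 0x80000000 alias, exactly what the old "| 0x20000000" and "& 0xdfffffff" expressions produced.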
@@ -129,7 +123,7 @@ MEMORY {
 	len = RAM_SIZE
 #ifdef CONFIG_KERNEL_COHERENCE
 	ucram :
-	org = RAM_BASE - 0x20000000,
+	org = RPO_SET(RAM_BASE, CONFIG_XTENSA_UNCACHED_REGION),
 	len = RAM_SIZE
 #endif
 #ifdef CONFIG_GEN_ISR_TABLES
@@ -4,6 +4,7 @@
 #ifndef __INTEL_ADSP_CPU_INIT_H
 #define __INTEL_ADSP_CPU_INIT_H
 
+#include <arch/xtensa/cache.h>
 #include <xtensa/config/core-isa.h>
 
 #define CxL1CCAP (*(volatile uint32_t *)0x9F080080)
@@ -14,39 +15,6 @@
 #define CxL1CCAP_DCMWC ((CxL1CCAP >> 16) & 7)
 #define CxL1CCAP_ICMWC ((CxL1CCAP >> 20) & 7)
 
-/* Utilities to generate an unwrapped code sequence to set the RPO TLB
- * registers.  Pass the 8 region attributes as arguments, e.g.:
- *
- *     SET_RPO_TLB(2, 15, 15, 15, 2, 4, 15, 15);
- *
- * Note that cAVS 1.5 has the "translation" option that we don't use,
- * but still need to put an identity mapping in the high bits.  Also
- * per spec changing the current code region requires that WITLB be
- * followed by an ISYNC and that both instructions live in the same
- * cache line (two 3-byte instructions fit in an 8-byte aligned
- * region, so that's guaranteed not to cross a caceh line boundary).
- */
-#define SET_ONE_TLB(region, att) do { \
-	uint32_t addr = region * 0x20000000U, attr = att; \
-	if (XCHAL_HAVE_XLT_CACHEATTR) { \
-		attr |= addr; /* RPO with translation */ \
-	} \
-	if (region != (L2_SRAM_BASE >> 29)) { \
-		__asm__ volatile("wdtlb %0, %1; witlb %0, %1" \
-				 :: "r"(attr), "r"(addr)); \
-	} else { \
-		__asm__ volatile("wdtlb %0, %1" \
-				 :: "r"(attr), "r"(addr)); \
-		__asm__ volatile("j 1f; .align 8; 1:"); \
-		__asm__ volatile("witlb %0, %1; isync" \
-				 :: "r"(attr), "r"(addr)); \
-	} \
-} while (0)
-
-#define SET_RPO_TLB(...) do { \
-	FOR_EACH_IDX(SET_ONE_TLB, (;), __VA_ARGS__); \
-} while (0)
-
 /* Low-level CPU initialization.  Call this immediately after entering
  * C code to initialize the cache, protection and synchronization
  * features.
@@ -98,15 +66,9 @@ static ALWAYS_INLINE void cpu_early_init(void)
 
 	/* Finally we need to enable the cache in the Region
 	 * Protection Option "TLB" entries.  The hardware defaults
-	 * have this set to RW/uncached (2) everywhere.  We want
-	 * writeback caching (4) in the sixth mapping (the second of
-	 * two RAM mappings) and to mark all unused regions
-	 * inaccessible (15) for safety.  Note that there is a HAL
-	 * routine that does this (by emulating the older "cacheattr"
-	 * hardware register), but it generates significantly larger
-	 * code.
+	 * have this set to RW/uncached everywhere.
 	 */
-	SET_RPO_TLB(2, 15, 15, 15, 2, 4, 15, 15);
+	ARCH_XTENSA_SET_RPO_TLB();
 
 	/* Initialize ATOMCTL: Hardware defaults for S32C1I use
 	 * "internal" operations, meaning they are atomic only WRT the
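With the intel_adsp defaults, ARCH_XTENSA_SET_RPO_TLB() expands to the same attribute pattern the old hand-written call spelled out: attribute 2 (bypass) for region 0 (MMIO) and the uncached region 4, attribute 4 (writeback) for the cached region 5, and 15 (invalid) everywhere else, i.e. the equivalent of SET_RPO_TLB(2, 15, 15, 15, 2, 4, 15, 15).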
@@ -80,54 +80,12 @@ extern void soc_start_core(int cpu_num);
 
 extern bool soc_cpus_active[CONFIG_MP_NUM_CPUS];
 
-/* Legacy SOC-level API still used in a few drivers */
+/* Legacy cache APIs still used in a few places */
 #define SOC_DCACHE_FLUSH(addr, size) \
 	z_xtensa_cache_flush((addr), (size))
 #define SOC_DCACHE_INVALIDATE(addr, size) \
 	z_xtensa_cache_inv((addr), (size))
+#define z_soc_cached_ptr(p) arch_xtensa_cached_ptr(p)
+#define z_soc_uncached_ptr(p) arch_xtensa_uncached_ptr(p)
 
-/**
- * @brief Return uncached pointer to a RAM address
- *
- * The Intel ADSP architecture maps all addressable RAM (of all types)
- * twice, in two different 512MB segments regions whose L1 cache
- * settings can be controlled independently.  So for any given
- * pointer, it is possible to convert it to and from a cached version.
- *
- * This function takes a pointer to any addressible object (either in
- * cacheable memory or not) and returns a pointer that can be used to
- * refer to the same memory while bypassing the L1 data cache.  Data
- * in the L1 cache will not be inspected nor modified by the access.
- *
- * @see z_soc_cached_ptr()
- *
- * @param p A pointer to a valid C object
- * @return A pointer to the same object bypassing the L1 dcache
- */
-static inline void *z_soc_uncached_ptr(void *p)
-{
-	return ((void *)(((size_t)p) & ~0x20000000));
-}
-
-/**
- * @brief Return cached pointer to a RAM address
- *
- * This function takes a pointer to any addressible object (either in
- * cacheable memory or not) and returns a pointer that can be used to
- * refer to the same memory through the L1 data cache.  Data read
- * through the resulting pointer will reflect locally cached values on
- * the current CPU if they exist, and writes will go first into the
- * cache and be written back later.
- *
- * @see z_soc_uncached_ptr()
- *
- * @param p A pointer to a valid C object
- * @return A pointer to the same object via the L1 dcache
- *
- */
-static inline void *z_soc_cached_ptr(void *p)
-{
-	return ((void *)(((size_t)p) | 0x20000000));
-}
-
 #endif /* __INC_SOC_H */