arch/xtensa: Promote adsp RPO/cache utilities to an arch API

This trick (mapping RAM twice so you can use alternate Region Protection Option addresses to control cacheability) is something any Xtensa hardware designer might productively choose to do. And as it works really well, we should encourage that by making this a generic architecture feature for Zephyr. Now everything works by setting two kconfig values at the soc level defining the cached and uncached regions. As long as these are correct, you can then use the new arch_xtensa_un/cached_ptr() APIs to convert between them and an ARCH_XTENSA_SET_RPO_TLB() macro that provides much smaller initialization code (in C!) than the HAL assembly macros. The conversion routines have been generalized to support conversion between any two regions. Note that full KERNEL_COHERENCE still requires support from the platform linker script, which can't be made generic given the way Zephyr does linkage. Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
2022-01-07 05:09:39 -08:00 · 2022-01-07 05:09:39 -08:00 · 97ada8bc04
commit 97ada8bc04
parent 6aa3d0c72f
9 changed files with 191 additions and 125 deletions
--- a/include/arch/xtensa/cache.h
+++ b/include/arch/xtensa/cache.h
@ -15,8 +15,9 @@ extern "C" {

 /* XCHAL_DCACHE_SIZE divided by XCHAL_DCACHE_WAYS; used as the length
  * argument when flushing/invalidating the entire data cache.
  */
 #define Z_DCACHE_MAX (XCHAL_DCACHE_SIZE / XCHAL_DCACHE_WAYS)

-#if XCHAL_DCACHE_SIZE
 /* Evaluates to true when x is non-zero and has exactly one bit set
  * (i.e. is a power of two).  The argument is expanded more than once,
  * so it must be free of side effects.
  */
 #define Z_IS_POW2(x) (((x) != 0) && (((x) & ((x)-1)) == 0))
+
+#if XCHAL_DCACHE_SIZE /* checks apply only when XCHAL_DCACHE_SIZE is non-zero */
 BUILD_ASSERT(Z_IS_POW2(XCHAL_DCACHE_LINESIZE)); /* line size must be a power of two */
 BUILD_ASSERT(Z_IS_POW2(Z_DCACHE_MAX)); /* and so must Z_DCACHE_MAX */
 #endif /* XCHAL_DCACHE_SIZE */
@ -78,6 +79,139 @@ static ALWAYS_INLINE void z_xtensa_cache_flush_inv_all(void)
 	z_xtensa_cache_flush_inv(NULL, Z_DCACHE_MAX);
 }

+#ifdef CONFIG_ARCH_HAS_COHERENCE
+/**
+ * @brief Test whether a pointer refers to the uncached region
+ *
+ * The address is shifted right by 29 bits to obtain its 512MB region
+ * index, which is then compared with the configured uncached region.
+ *
+ * @param ptr Pointer to test
+ * @return true if the region index of ptr equals
+ *         CONFIG_XTENSA_UNCACHED_REGION, false otherwise
+ */
+static inline bool arch_mem_coherent(void *ptr)
+{
+	const size_t region = ((size_t) ptr) >> 29;
+
+	return region == CONFIG_XTENSA_UNCACHED_REGION;
+}
+#endif
+
+/* Convert an address from RPO region rfrom to RPO region rto.
+ *
+ * The region number lives in the top three bits of the 32-bit
+ * address.  With constant region arguments the choice below is
+ * resolved at compile time: when the two region numbers differ in
+ * exactly one bit, setting or clearing that single bit performs the
+ * conversion; otherwise the whole region field is cleared and replaced
+ * with rto, which takes two operations.
+ */
+static ALWAYS_INLINE uint32_t z_xtrpoflip(uint32_t addr, uint32_t rto, uint32_t rfrom)
+{
+	const uint32_t to_bits = rto << 29;
+	const uint32_t diff_bits = (rto ^ rfrom) << 29;
+
+	if (!Z_IS_POW2(diff_bits)) {
+		/* Zero or several differing bits: rewrite the region field */
+		return (addr & ~(7U << 29)) | to_bits;
+	}
+
+	/* One differing bit: set it if the target has it, else clear it */
+	return ((diff_bits & to_bits) != 0) ? (addr | diff_bits)
+					    : (addr & ~diff_bits);
+}
+
+/**
+ * @brief Return cached pointer to a RAM address
+ *
+ * Under the Xtensa coherence scheme, addressable RAM is mapped twice,
+ * into two distinct 512MB regions whose L1 cache behavior is
+ * configured independently.  Any pointer can therefore be translated
+ * between its cached and uncached alias.
+ *
+ * Given a pointer to any addressable object (whether or not it is
+ * already in cacheable memory), this returns a pointer to the same
+ * memory that goes through the L1 data cache.  Reads via the result
+ * see values cached locally on the current CPU when present; writes
+ * land in the cache first and are written back later.
+ *
+ * @see arch_xtensa_uncached_ptr()
+ *
+ * @param ptr A pointer to a valid C object
+ * @return A pointer to the same object via the L1 dcache
+ */
+static inline void *arch_xtensa_cached_ptr(void *ptr)
+{
+	uint32_t cached_addr = z_xtrpoflip((uint32_t) ptr,
+					   CONFIG_XTENSA_CACHED_REGION,
+					   CONFIG_XTENSA_UNCACHED_REGION);
+
+	return (void *) cached_addr;
+}
+
+/**
+ * @brief Return uncached pointer to a RAM address
+ *
+ * Under the Xtensa coherence scheme, addressable RAM is mapped twice,
+ * into two distinct 512MB regions whose L1 cache behavior is
+ * configured independently.  Any pointer can therefore be translated
+ * between its cached and uncached alias.
+ *
+ * Given a pointer to any addressable object (whether or not it is
+ * in cacheable memory), this returns a pointer to the same memory
+ * that bypasses the L1 data cache.  Accesses through the result
+ * neither inspect nor modify data held in the L1 cache.
+ *
+ * @see arch_xtensa_cached_ptr()
+ *
+ * @param ptr A pointer to a valid C object
+ * @return A pointer to the same object bypassing the L1 dcache
+ */
+static inline void *arch_xtensa_uncached_ptr(void *ptr)
+{
+	uint32_t uncached_addr = z_xtrpoflip((uint32_t) ptr,
+					     CONFIG_XTENSA_UNCACHED_REGION,
+					     CONFIG_XTENSA_CACHED_REGION);
+
+	return (void *) uncached_addr;
+}
+
+/* Utility to generate an unrolled and optimal[1] code sequence to set
+ * the RPO TLB registers (contra the HAL cacheattr macros, which
+ * generate larger code and can't be called from C), based on the
+ * KERNEL_COHERENCE configuration in use.  Selects RPO attribute "2"
+ * for regions (including MMIO registers in region zero) which want to
+ * bypass L1, "4" for the cached region which wants writeback, and
+ * "15" (invalid) elsewhere.
+ *
+ * Note that on cores that have the "translation" option set, we need
+ * to put an identity mapping in the high bits.  Also per spec
+ * changing the current code region (by definition cached) requires
+ * that WITLB be followed by an ISYNC and that both instructions live
+ * in the same cache line (two 3-byte instructions fit in an 8-byte
+ * aligned region, so that's guaranteed not to cross a cache line
+ * boundary).
+ *
+ * _REGION_ATTR(r) yields the attribute value for region number r
+ * according to the rules above.  Region zero is tested first, so it
+ * receives "2" even if it is also named as the cached region.
+ *
+ * [1] With the sole exception of gcc's infuriating insistence on
+ * emitting a precomputed literal for addr + addrincr instead of
+ * computing it with a single ADD instruction from values it already
+ * has in registers.  Explicitly assigning the variables to registers
+ * via an attribute works, but then emits needless MOV instructions
+ * instead.  I tell myself it's just 32 bytes of .text, but... Sigh.
+ */
+#define _REGION_ATTR(r)						\
+	((r) == 0 ? 2 :						\
+	 ((r) == CONFIG_XTENSA_CACHED_REGION ? 4 :		\
+	  ((r) == CONFIG_XTENSA_UNCACHED_REGION ? 2 : 15)))
+/* Write the data and instruction TLB entries for one 512MB region.
+ * Relies on "addr" and "addrincr" variables in the enclosing scope
+ * (ARCH_XTENSA_SET_RPO_TLB() declares them) and advances addr by
+ * addrincr at the end, so consecutive expansions walk the regions in
+ * order.  The region argument is used unparenthesized and more than
+ * once; pass a plain integer constant.
+ */
+#define _SET_ONE_TLB(region) do {				\
+	uint32_t attr = _REGION_ATTR(region);			\
+	if (XCHAL_HAVE_XLT_CACHEATTR) {				\
+		attr |= addr; /* RPO with translation */	\
+	}							\
+	if (region != CONFIG_XTENSA_CACHED_REGION) {		\
+		__asm__ volatile("wdtlb %0, %1; witlb %0, %1"	\
+				 :: "r"(attr), "r"(addr));	\
+	} else { /* cached region: ITLB write is aligned and followed by isync */ \
+		__asm__ volatile("wdtlb %0, %1"			\
+				 :: "r"(attr), "r"(addr));	\
+		__asm__ volatile("j 1f; .align 8; 1:"); /* jump over the alignment padding */ \
+		__asm__ volatile("witlb %0, %1; isync"		\
+				 :: "r"(attr), "r"(addr));	\
+	}							\
+	addr += addrincr; /* step to the next region's base address */ \
+} while (0)
+/* Set the RPO TLB entries for regions 0 through 7 by applying
+ * _SET_ONE_TLB to each region number in turn, starting at address 0
+ * and stepping by 0x20000000 (512MB) per region.  Takes no arguments
+ * and is usable as a single statement from C code.
+ * NOTE(review): FOR_EACH is defined elsewhere; it is presumed to
+ * expand _SET_ONE_TLB once per listed value, separated by ';'.
+ */
+#define ARCH_XTENSA_SET_RPO_TLB() do {				\
+	register uint32_t addr = 0, addrincr = 0x20000000;	\
+	FOR_EACH(_SET_ONE_TLB, (;), 0, 1, 2, 3, 4, 5, 6, 7);	\
+} while (0)
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif