arch/xtensa: Promote adsp RPO/cache utilities to an arch API

This is trick (mapping RAM twice so you can use alternate Region
Protection Option addresses to control cacheability) is something any
Xtensa hardware designer might productively choose to do.  And as it
works really well, we should encourage that by making this a generic
architecture feature for Zephyr.

Now everything works by setting two kconfig values at the soc level
defining the cached and uncached regions.  As long as these are
correct, you can then use the new arch_xtensa_un/cached_ptr() APIs to
convert between them and a ARCH_XTENSA_SET_RPO_TLB() macro that
provides much smaller initialization code (in C!) than the HAL
assembly macros.  The conversion routines have been generalized to
support conversion between any two regions.

Note that full KERNEL_COHERENCE still requires support from the
platform linker script, that can't be made generic given the way
Zephyr does linkage.

Signed-off-by: Andy Ross <andrew.j.ross@intel.com>
This commit is contained in:
Andy Ross 2022-01-07 05:09:39 -08:00 committed by Carles Cufí
commit 97ada8bc04
9 changed files with 191 additions and 125 deletions

View file

@ -15,8 +15,9 @@ extern "C" {
#define Z_DCACHE_MAX (XCHAL_DCACHE_SIZE / XCHAL_DCACHE_WAYS)
#if XCHAL_DCACHE_SIZE
#define Z_IS_POW2(x) (((x) != 0) && (((x) & ((x)-1)) == 0))
#if XCHAL_DCACHE_SIZE
BUILD_ASSERT(Z_IS_POW2(XCHAL_DCACHE_LINESIZE));
BUILD_ASSERT(Z_IS_POW2(Z_DCACHE_MAX));
#endif
@ -78,6 +79,139 @@ static ALWAYS_INLINE void z_xtensa_cache_flush_inv_all(void)
z_xtensa_cache_flush_inv(NULL, Z_DCACHE_MAX);
}
#ifdef CONFIG_ARCH_HAS_COHERENCE
static inline bool arch_mem_coherent(void *ptr)
{
size_t addr = (size_t) ptr;
return (addr >> 29) == CONFIG_XTENSA_UNCACHED_REGION;
}
#endif
static ALWAYS_INLINE uint32_t z_xtrpoflip(uint32_t addr, uint32_t rto, uint32_t rfrom)
{
/* The math here is all compile-time: when the two regions
* differ by a power of two, we can convert between them by
* setting or clearing just one bit. Otherwise it needs two
* operations.
*/
uint32_t rxor = (rto ^ rfrom) << 29;
rto <<= 29;
if (Z_IS_POW2(rxor)) {
if ((rxor & rto) == 0) {
return addr & ~rxor;
} else {
return addr | rxor;
}
} else {
return (addr & ~(7U << 29)) | rto;
}
}
/**
* @brief Return cached pointer to a RAM address
*
* The Xtensa coherence architecture maps addressable RAM twice, in
* two different 512MB regions whose L1 cache settings can be
* controlled independently. So for any given pointer, it is possible
* to convert it to and from a cached version.
*
* This function takes a pointer to any addressible object (either in
* cacheable memory or not) and returns a pointer that can be used to
* refer to the same memory through the L1 data cache. Data read
* through the resulting pointer will reflect locally cached values on
* the current CPU if they exist, and writes will go first into the
* cache and be written back later.
*
* @see arch_xtensa_uncached_ptr()
*
* @param ptr A pointer to a valid C object
* @return A pointer to the same object via the L1 dcache
*/
static inline void *arch_xtensa_cached_ptr(void *ptr)
{
return (void *)z_xtrpoflip((uint32_t) ptr,
CONFIG_XTENSA_CACHED_REGION,
CONFIG_XTENSA_UNCACHED_REGION);
}
/**
* @brief Return uncached pointer to a RAM address
*
* The Xtensa coherence architecture maps addressable RAM twice, in
* two different 512MB regions whose L1 cache settings can be
* controlled independently. So for any given pointer, it is possible
* to convert it to and from a cached version.
*
* This function takes a pointer to any addressible object (either in
* cacheable memory or not) and returns a pointer that can be used to
* refer to the same memory while bypassing the L1 data cache. Data
* in the L1 cache will not be inspected nor modified by the access.
*
* @see arch_xtensa_cached_ptr()
*
* @param ptr A pointer to a valid C object
* @return A pointer to the same object bypassing the L1 dcache
*/
static inline void *arch_xtensa_uncached_ptr(void *ptr)
{
return (void *)z_xtrpoflip((uint32_t) ptr,
CONFIG_XTENSA_UNCACHED_REGION,
CONFIG_XTENSA_CACHED_REGION);
}
/* Utility to generate an unrolled and optimal[1] code sequence to set
* the RPO TLB registers (contra the HAL cacheattr macros, which
* generate larger code and can't be called from C), based on the
* KERNEL_COHERENCE configuration in use. Selects RPO attribute "2"
* for regions (including MMIO registers in region zero) which want to
* bypass L1, "4" for the cached region which wants writeback, and
* "15" (invalid) elsewhere.
*
* Note that on cores that have the "translation" option set, we need
* to put an identity mapping in the high bits. Also per spec
* changing the current code region (by definition cached) requires
* that WITLB be followed by an ISYNC and that both instructions live
* in the same cache line (two 3-byte instructions fit in an 8-byte
* aligned region, so that's guaranteed not to cross a cache line
* boundary).
*
* [1] With the sole exception of gcc's infuriating insistence on
* emitting a precomputed literal for addr + addrincr instead of
* computing it with a single ADD instruction from values it already
* has in registers. Explicitly assigning the variables to registers
* via an attribute works, but then emits needless MOV instructions
* instead. I tell myself it's just 32 bytes of .text, but... Sigh.
*/
#define _REGION_ATTR(r) \
((r) == 0 ? 2 : \
((r) == CONFIG_XTENSA_CACHED_REGION ? 4 : \
((r) == CONFIG_XTENSA_UNCACHED_REGION ? 2 : 15)))
#define _SET_ONE_TLB(region) do { \
uint32_t attr = _REGION_ATTR(region); \
if (XCHAL_HAVE_XLT_CACHEATTR) { \
attr |= addr; /* RPO with translation */ \
} \
if (region != CONFIG_XTENSA_CACHED_REGION) { \
__asm__ volatile("wdtlb %0, %1; witlb %0, %1" \
:: "r"(attr), "r"(addr)); \
} else { \
__asm__ volatile("wdtlb %0, %1" \
:: "r"(attr), "r"(addr)); \
__asm__ volatile("j 1f; .align 8; 1:"); \
__asm__ volatile("witlb %0, %1; isync" \
:: "r"(attr), "r"(addr)); \
} \
addr += addrincr; \
} while (0)
#define ARCH_XTENSA_SET_RPO_TLB() do { \
register uint32_t addr = 0, addrincr = 0x20000000; \
FOR_EACH(_SET_ONE_TLB, (;), 0, 1, 2, 3, 4, 5, 6, 7); \
} while (0)
#ifdef __cplusplus
} /* extern "C" */
#endif