diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig
index 8b51ec570b5..8b3ee35a670 100644
--- a/arch/xtensa/Kconfig
+++ b/arch/xtensa/Kconfig
@@ -89,4 +89,21 @@ config XTENSA_SMALL_VECTOR_TABLE_ENTRY
 	  handlers to the end of vector table, renaming them to
 	  _Level\LVL\()VectorHelper.
 
+config XTENSA_CACHED_REGION
+	int "Cached RPO mapping"
+	range 0 7
+	help
+	  A design trick on multi-core hardware is to map memory twice
+	  so that it can be seen in both (incoherent) cached mappings
+	  and a coherent "shared" area.  This specifies which 512M
+	  region (0-7, as defined by the Xtensa Region Protection
+	  Option) contains the "cached" mapping.
+
+config XTENSA_UNCACHED_REGION
+	int "Uncached RPO mapping"
+	range 0 7
+	help
+	  As for XTENSA_CACHED_REGION, this specifies which 512M
+	  region (0-7) contains the "uncached" mapping.
+
 endmenu
diff --git a/arch/xtensa/include/kernel_arch_func.h b/arch/xtensa/include/kernel_arch_func.h
index b69f951adce..82a4f986c5b 100644
--- a/arch/xtensa/include/kernel_arch_func.h
+++ b/arch/xtensa/include/kernel_arch_func.h
@@ -58,18 +58,6 @@ static inline void arch_switch(void *switch_to, void **switched_from)
 	return xtensa_switch(switch_to, switched_from);
 }
 
-/* FIXME: we don't have a framework for including this from the SoC
- * layer, so we define it in the arch code here.
- */
-#if defined(CONFIG_SOC_FAMILY_INTEL_ADSP) && defined(CONFIG_KERNEL_COHERENCE)
-static inline bool arch_mem_coherent(void *ptr)
-{
-	size_t addr = (size_t) ptr;
-
-	return addr >= 0x80000000 && addr < 0xa0000000;
-}
-#endif
-
 #ifdef CONFIG_KERNEL_COHERENCE
 static ALWAYS_INLINE void arch_cohere_stacks(struct k_thread *old_thread,
 					     void *old_switch_handle,
diff --git a/include/arch/xtensa/cache.h b/include/arch/xtensa/cache.h
index 12dc2c60065..bfd8608a4b8 100644
--- a/include/arch/xtensa/cache.h
+++ b/include/arch/xtensa/cache.h
@@ -15,8 +15,9 @@ extern "C" {
 
 #define Z_DCACHE_MAX (XCHAL_DCACHE_SIZE / XCHAL_DCACHE_WAYS)
 
-#if XCHAL_DCACHE_SIZE
 #define Z_IS_POW2(x) (((x) != 0) && (((x) & ((x)-1)) == 0))
+
+#if XCHAL_DCACHE_SIZE
 BUILD_ASSERT(Z_IS_POW2(XCHAL_DCACHE_LINESIZE));
 BUILD_ASSERT(Z_IS_POW2(Z_DCACHE_MAX));
 #endif
@@ -78,6 +79,139 @@ static ALWAYS_INLINE void z_xtensa_cache_flush_inv_all(void)
 	z_xtensa_cache_flush_inv(NULL, Z_DCACHE_MAX);
 }
 
+#ifdef CONFIG_ARCH_HAS_COHERENCE
+static inline bool arch_mem_coherent(void *ptr)
+{
+	size_t addr = (size_t) ptr;
+
+	return (addr >> 29) == CONFIG_XTENSA_UNCACHED_REGION;
+}
+#endif
+
+static ALWAYS_INLINE uint32_t z_xtrpoflip(uint32_t addr, uint32_t rto, uint32_t rfrom)
+{
+	/* The math here is all compile-time: when the two regions
+	 * differ by a power of two, we can convert between them by
+	 * setting or clearing just one bit.  Otherwise it needs two
+	 * operations.
+	 */
+	uint32_t rxor = (rto ^ rfrom) << 29;
+
+	rto <<= 29;
+	if (Z_IS_POW2(rxor)) {
+		if ((rxor & rto) == 0) {
+			return addr & ~rxor;
+		} else {
+			return addr | rxor;
+		}
+	} else {
+		return (addr & ~(7U << 29)) | rto;
+	}
+}
+
+/**
+ * @brief Return cached pointer to a RAM address
+ *
+ * The Xtensa coherence architecture maps addressable RAM twice, in
+ * two different 512MB regions whose L1 cache settings can be
+ * controlled independently.  So for any given pointer, it is possible
+ * to convert it to and from a cached version.
+ *
+ * This function takes a pointer to any addressable object (either in
+ * cacheable memory or not) and returns a pointer that can be used to
+ * refer to the same memory through the L1 data cache.  Data read
+ * through the resulting pointer will reflect locally cached values on
+ * the current CPU if they exist, and writes will go first into the
+ * cache and be written back later.
+ *
+ * @see arch_xtensa_uncached_ptr()
+ *
+ * @param ptr A pointer to a valid C object
+ * @return A pointer to the same object via the L1 dcache
+ */
+static inline void *arch_xtensa_cached_ptr(void *ptr)
+{
+	return (void *)z_xtrpoflip((uint32_t) ptr,
+				   CONFIG_XTENSA_CACHED_REGION,
+				   CONFIG_XTENSA_UNCACHED_REGION);
+}
+
+/**
+ * @brief Return uncached pointer to a RAM address
+ *
+ * The Xtensa coherence architecture maps addressable RAM twice, in
+ * two different 512MB regions whose L1 cache settings can be
+ * controlled independently.  So for any given pointer, it is possible
+ * to convert it to and from a cached version.
+ *
+ * This function takes a pointer to any addressable object (either in
+ * cacheable memory or not) and returns a pointer that can be used to
+ * refer to the same memory while bypassing the L1 data cache.  Data
+ * in the L1 cache will not be inspected nor modified by the access.
+ *
+ * @see arch_xtensa_cached_ptr()
+ *
+ * @param ptr A pointer to a valid C object
+ * @return A pointer to the same object bypassing the L1 dcache
+ */
+static inline void *arch_xtensa_uncached_ptr(void *ptr)
+{
+	return (void *)z_xtrpoflip((uint32_t) ptr,
+				   CONFIG_XTENSA_UNCACHED_REGION,
+				   CONFIG_XTENSA_CACHED_REGION);
+}
+
+/* Utility to generate an unrolled and optimal[1] code sequence to set
+ * the RPO TLB registers (contra the HAL cacheattr macros, which
+ * generate larger code and can't be called from C), based on the
+ * KERNEL_COHERENCE configuration in use.  Selects RPO attribute "2"
+ * for regions (including MMIO registers in region zero) which want to
+ * bypass L1, "4" for the cached region which wants writeback, and
+ * "15" (invalid) elsewhere.
+ *
+ * Note that on cores that have the "translation" option set, we need
+ * to put an identity mapping in the high bits.  Also per spec
+ * changing the current code region (by definition cached) requires
+ * that WITLB be followed by an ISYNC and that both instructions live
+ * in the same cache line (two 3-byte instructions fit in an 8-byte
+ * aligned region, so that's guaranteed not to cross a cache line
+ * boundary).
+ *
+ * [1] With the sole exception of gcc's infuriating insistence on
+ * emitting a precomputed literal for addr + addrincr instead of
+ * computing it with a single ADD instruction from values it already
+ * has in registers.  Explicitly assigning the variables to registers
+ * via an attribute works, but then emits needless MOV instructions
+ * instead.  I tell myself it's just 32 bytes of .text, but... Sigh.
+ */
+#define _REGION_ATTR(r)						\
+	((r) == 0 ? 2 :						\
+	 ((r) == CONFIG_XTENSA_CACHED_REGION ? 4 :		\
+	  ((r) == CONFIG_XTENSA_UNCACHED_REGION ? 2 : 15)))
+
+#define _SET_ONE_TLB(region) do {				\
+	uint32_t attr = _REGION_ATTR(region);			\
+	if (XCHAL_HAVE_XLT_CACHEATTR) {				\
+		attr |= addr; /* RPO with translation */	\
+	}							\
+	if (region != CONFIG_XTENSA_CACHED_REGION) {		\
+		__asm__ volatile("wdtlb %0, %1; witlb %0, %1"	\
+				 :: "r"(attr), "r"(addr));	\
+	} else {						\
+		__asm__ volatile("wdtlb %0, %1"			\
+				 :: "r"(attr), "r"(addr));	\
+		__asm__ volatile("j 1f; .align 8; 1:");		\
+		__asm__ volatile("witlb %0, %1; isync"		\
+				 :: "r"(attr), "r"(addr));	\
+	}							\
+	addr += addrincr;					\
+} while (0)
+
+#define ARCH_XTENSA_SET_RPO_TLB() do {				\
+	register uint32_t addr = 0, addrincr = 0x20000000;	\
+	FOR_EACH(_SET_ONE_TLB, (;), 0, 1, 2, 3, 4, 5, 6, 7);	\
+} while (0)
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/soc/xtensa/intel_adsp/Kconfig.defconfig b/soc/xtensa/intel_adsp/Kconfig.defconfig
index 76b060560f9..1e6889389a7 100644
--- a/soc/xtensa/intel_adsp/Kconfig.defconfig
+++ b/soc/xtensa/intel_adsp/Kconfig.defconfig
@@ -4,3 +4,11 @@
 # SPDX-License-Identifier: Apache-2.0
 
 source "soc/xtensa/intel_adsp/*/Kconfig.defconfig.series"
+
+# Lower priority defaults come AFTER the series-specific ones set above
+
+config XTENSA_CACHED_REGION
+	default 5
+
+config XTENSA_UNCACHED_REGION
+	default 4
diff --git a/soc/xtensa/intel_adsp/common/CMakeLists.txt b/soc/xtensa/intel_adsp/common/CMakeLists.txt
index 435e07482d8..ede0063f235 100644
--- a/soc/xtensa/intel_adsp/common/CMakeLists.txt
+++ b/soc/xtensa/intel_adsp/common/CMakeLists.txt
@@ -59,6 +59,7 @@ add_custom_target(
     copy ${CMAKE_BINARY_DIR}/zephyr/${KERNEL_NAME}.elf ${KERNEL_REMAPPED}
   COMMAND ${ELF_FIX} ${CMAKE_OBJCOPY} ${KERNEL_REMAPPED}
+    ${CONFIG_XTENSA_CACHED_REGION} ${CONFIG_XTENSA_UNCACHED_REGION}
 
   # Extract modules for rimage
   COMMAND ${CMAKE_OBJCOPY}
diff --git a/soc/xtensa/intel_adsp/common/fix_elf_addrs.py b/soc/xtensa/intel_adsp/common/fix_elf_addrs.py
index 3f60fd386b4..8b0c7c2a21c 100755
--- a/soc/xtensa/intel_adsp/common/fix_elf_addrs.py
+++ b/soc/xtensa/intel_adsp/common/fix_elf_addrs.py
@@ -3,13 +3,11 @@
 # Copyright (c) 2020 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-# ADSP devices have their RAM regions mapped twice, once in the 512MB
-# region from 0x80000000-0x9fffffff and again from
-# 0xa0000000-0xbfffffff.  The first mapping is set in the CPU to
-# bypass the L1 cache, and so access through pointers in that region
-# is coherent between CPUs (but slow).  The second region accesses the
-# same memory through the L1 cache and requires careful flushing when
-# used with shared data.
+# ADSP devices have their RAM regions mapped twice.  The first mapping
+# is set in the CPU to bypass the L1 cache, and so access through
+# pointers in that region is coherent between CPUs (but slow).  The
+# second region accesses the same memory through the L1 cache and
+# requires careful flushing when used with shared data.
 #
 # This distinction is exposed in the linker script, where some symbols
 # (e.g. stack regions) are linked into cached memory, but others
@@ -26,13 +24,19 @@ from elftools.elf.elffile import ELFFile
 
 objcopy_bin = sys.argv[1]
 elffile = sys.argv[2]
+cached_reg = int(sys.argv[3])
+uncached_reg = int(sys.argv[4])
+
+uc_min = uncached_reg << 29
+uc_max = uc_min | 0x1fffffff
+cache_off = "0x%x" % ((cached_reg - uncached_reg) << 29)
 
 fixup =[]
 with open(elffile, "rb") as fd:
     elf = ELFFile(fd)
     for s in elf.iter_sections():
         addr = s.header.sh_addr
-        if 0x80000000 <= addr < 0xa0000000:
+        if uc_min <= addr <= uc_max:
             print(f"fix_elf_addrs.py: Moving section {s.name} to cached SRAM region")
             fixup.append(s.name)
 
@@ -43,5 +47,5 @@ for s in fixup:
     # error (no --quiet option, no -Werror=no-whatever, nothing).
     # Just swallow the error stream for now pending rework to the
    # linker framework.
-    cmd = f"{objcopy_bin} --change-section-address {s}+0x20000000 {elffile} 2>/dev/null"
+    cmd = f"{objcopy_bin} --change-section-address {s}+{cache_off} {elffile} 2>/dev/null"
     os.system(cmd)
diff --git a/soc/xtensa/intel_adsp/common/include/cavs-link.ld b/soc/xtensa/intel_adsp/common/include/cavs-link.ld
index 2978594b25a..85804185960 100644
--- a/soc/xtensa/intel_adsp/common/include/cavs-link.ld
+++ b/soc/xtensa/intel_adsp/common/include/cavs-link.ld
@@ -25,31 +25,25 @@ OUTPUT_ARCH(xtensa)
 
 ENTRY(rom_entry);
 
-/* DSP RAM regions (all of them) are mapped twice on the DSP: once in
- * a 512MB region from 0x80000000-0x9fffffff and again from
- * 0xa0000000-0xbfffffff.  The first mapping is set up to bypass the
- * L1 cache, so it must be used when multiprocessor coherence is
- * desired, where the latter mapping is best used for processor-local
- * data (e.g. stacks) or shared data that is managed with explicit
- * cache flush/invalidate operations.
+/* DSP RAM regions (all of them) are mapped twice on the DSP.  One
+ * mapping is set up to bypass the L1 cache, so it must be used when
+ * multiprocessor coherence is desired, while the other mapping is
+ * best used for processor-local data (e.g. stacks) or shared data
+ * that is managed with explicit cache flush/invalidate operations.
  *
  * These macros will set up a segment start address correctly,
  * including alignment to a cache line.  Be sure to also emit the
- * section to ">ram :ram_phdr" or ">ucram :ucram_phdr" as
- * appropriate.  (Forgetting the correct PHDR will actually work, as
- * the output tooling ignores it, but it will cause the linker to emit
- * 512MB of unused data into the output file!)
- *
- * (Note clumsy syntax because XCC doesn't understand the "~" operator)
+ * section to ">ram" or ">ucram" as appropriate, to prevent the linker
+ * from filling in 512MB of sparse zeros.
  */
 #ifdef CONFIG_KERNEL_COHERENCE
-#define SEGSTART_CACHED   (ALIGN(64) | 0x20000000)
-#define SEGSTART_UNCACHED (ALIGN(64) & 0xdfffffff) /* == ~0x20000000 */
+#define RPO_SET(addr, reg) ((addr & 0x1fffffff) | (reg << 29))
+#define SEGSTART_CACHED   RPO_SET(ALIGN(64), CONFIG_XTENSA_CACHED_REGION)
+#define SEGSTART_UNCACHED RPO_SET(ALIGN(64), CONFIG_XTENSA_UNCACHED_REGION)
 #else
 #define SEGSTART_CACHED   .
 #define SEGSTART_UNCACHED .
 #define ucram ram
-#define ucram_phdr ram_phdr
 #endif
 
 /* intlist.ld needs an IDT_LIST memory region */
@@ -129,7 +123,7 @@ MEMORY {
     len = RAM_SIZE
 #ifdef CONFIG_KERNEL_COHERENCE
   ucram :
-    org = RAM_BASE - 0x20000000,
+    org = RPO_SET(RAM_BASE, CONFIG_XTENSA_UNCACHED_REGION),
     len = RAM_SIZE
 #endif
 #ifdef CONFIG_GEN_ISR_TABLES
diff --git a/soc/xtensa/intel_adsp/common/include/cpu_init.h b/soc/xtensa/intel_adsp/common/include/cpu_init.h
index 6ba83cda0c6..b0ce7de9ab4 100644
--- a/soc/xtensa/intel_adsp/common/include/cpu_init.h
+++ b/soc/xtensa/intel_adsp/common/include/cpu_init.h
@@ -4,6 +4,7 @@
 #ifndef __INTEL_ADSP_CPU_INIT_H
 #define __INTEL_ADSP_CPU_INIT_H
 
+#include <arch/xtensa/cache.h>
 #include <xtensa/config/core-isa.h>
 
 #define CxL1CCAP (*(volatile uint32_t *)0x9F080080)
@@ -14,39 +15,6 @@
 #define CxL1CCAP_DCMWC ((CxL1CCAP >> 16) & 7)
 #define CxL1CCAP_ICMWC ((CxL1CCAP >> 20) & 7)
 
-/* Utilities to generate an unwrapped code sequence to set the RPO TLB
- * registers.  Pass the 8 region attributes as arguments, e.g.:
- *
- *     SET_RPO_TLB(2, 15, 15, 15, 2, 4, 15, 15);
- *
- * Note that cAVS 1.5 has the "translation" option that we don't use,
- * but still need to put an identity mapping in the high bits.  Also
- * per spec changing the current code region requires that WITLB be
- * followed by an ISYNC and that both instructions live in the same
- * cache line (two 3-byte instructions fit in an 8-byte aligned
- * region, so that's guaranteed not to cross a caceh line boundary).
- */
-#define SET_ONE_TLB(region, att) do {				\
-	uint32_t addr = region * 0x20000000U, attr = att;	\
-	if (XCHAL_HAVE_XLT_CACHEATTR) {				\
-		attr |= addr; /* RPO with translation */	\
-	}							\
-	if (region != (L2_SRAM_BASE >> 29)) {			\
-		__asm__ volatile("wdtlb %0, %1; witlb %0, %1"	\
-				 :: "r"(attr), "r"(addr));	\
-	} else {						\
-		__asm__ volatile("wdtlb %0, %1"			\
-				 :: "r"(attr), "r"(addr));	\
-		__asm__ volatile("j 1f; .align 8; 1:");		\
-		__asm__ volatile("witlb %0, %1; isync"		\
-				 :: "r"(attr), "r"(addr));	\
-	}							\
-} while (0)
-
-#define SET_RPO_TLB(...) do { \
-	FOR_EACH_IDX(SET_ONE_TLB, (;), __VA_ARGS__); \
-} while (0)
-
 /* Low-level CPU initialization.  Call this immediately after entering
  * C code to initialize the cache, protection and synchronization
  * features.
@@ -98,15 +66,9 @@ static ALWAYS_INLINE void cpu_early_init(void)
 
 	/* Finally we need to enable the cache in the Region
 	 * Protection Option "TLB" entries.  The hardware defaults
-	 * have this set to RW/uncached (2) everywhere.  We want
-	 * writeback caching (4) in the sixth mapping (the second of
-	 * two RAM mappings) and to mark all unused regions
-	 * inaccessible (15) for safety.  Note that there is a HAL
-	 * routine that does this (by emulating the older "cacheattr"
-	 * hardware register), but it generates significantly larger
-	 * code.
+	 * have this set to RW/uncached everywhere.
 	 */
-	SET_RPO_TLB(2, 15, 15, 15, 2, 4, 15, 15);
+	ARCH_XTENSA_SET_RPO_TLB();
 
 	/* Initialize ATOMCTL: Hardware defaults for S32C1I use
 	 * "internal" operations, meaning they are atomic only WRT the
diff --git a/soc/xtensa/intel_adsp/common/include/soc.h b/soc/xtensa/intel_adsp/common/include/soc.h
index 26bcbede9ee..129df3f5453 100644
--- a/soc/xtensa/intel_adsp/common/include/soc.h
+++ b/soc/xtensa/intel_adsp/common/include/soc.h
@@ -80,54 +80,12 @@ extern void soc_start_core(int cpu_num);
 
 extern bool soc_cpus_active[CONFIG_MP_NUM_CPUS];
 
-/* Legacy SOC-level API still used in a few drivers */
+/* Legacy cache APIs still used in a few places */
 #define SOC_DCACHE_FLUSH(addr, size)		\
 	z_xtensa_cache_flush((addr), (size))
 #define SOC_DCACHE_INVALIDATE(addr, size)	\
 	z_xtensa_cache_inv((addr), (size))
-
-/**
- * @brief Return uncached pointer to a RAM address
- *
- * The Intel ADSP architecture maps all addressable RAM (of all types)
- * twice, in two different 512MB segments regions whose L1 cache
- * settings can be controlled independently.  So for any given
- * pointer, it is possible to convert it to and from a cached version.
- *
- * This function takes a pointer to any addressible object (either in
- * cacheable memory or not) and returns a pointer that can be used to
- * refer to the same memory while bypassing the L1 data cache.  Data
- * in the L1 cache will not be inspected nor modified by the access.
- *
- * @see z_soc_cached_ptr()
- *
- * @param p A pointer to a valid C object
- * @return A pointer to the same object bypassing the L1 dcache
- */
-static inline void *z_soc_uncached_ptr(void *p)
-{
-	return ((void *)(((size_t)p) & ~0x20000000));
-}
-
-/**
- * @brief Return cached pointer to a RAM address
- *
- * This function takes a pointer to any addressible object (either in
- * cacheable memory or not) and returns a pointer that can be used to
- * refer to the same memory through the L1 data cache.  Data read
- * through the resulting pointer will reflect locally cached values on
- * the current CPU if they exist, and writes will go first into the
- * cache and be written back later.
- *
- * @see z_soc_uncached_ptr()
- *
- * @param p A pointer to a valid C object
- * @return A pointer to the same object via the L1 dcache
-
- */
-static inline void *z_soc_cached_ptr(void *p)
-{
-	return ((void *)(((size_t)p) | 0x20000000));
-}
+#define z_soc_cached_ptr(p) arch_xtensa_cached_ptr(p)
+#define z_soc_uncached_ptr(p) arch_xtensa_uncached_ptr(p)
 
 #endif /* __INC_SOC_H */
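
Illustrative usage sketch (not part of the patch): the snippet below shows how the new arch_xtensa_cached_ptr()/arch_xtensa_uncached_ptr() helpers from cache.h compose with the existing z_xtensa_cache_flush() call when a buffer is shared between CPUs under KERNEL_COHERENCE. The shared_msg structure and the producer/consumer functions are hypothetical names invented for the example; only the three helpers come from this tree.

/* Sketch only: shared_msg, producer_publish() and consumer_peek_seq() are
 * hypothetical; the pointer-conversion and flush helpers are the real ones.
 */
#include <stdint.h>
#include <string.h>
#include <arch/xtensa/cache.h>

struct shared_msg {
	uint32_t seq;
	char payload[64];
};

static struct shared_msg msg;

/* Producer CPU: write through its own L1 cache for speed, then flush the
 * lines so the data reaches shared SRAM where other CPUs can see it.
 */
void producer_publish(const char *text, uint32_t seq)
{
	struct shared_msg *m = arch_xtensa_cached_ptr(&msg);

	strncpy(m->payload, text, sizeof(m->payload) - 1);
	m->payload[sizeof(m->payload) - 1] = '\0';
	m->seq = seq;
	z_xtensa_cache_flush(m, sizeof(*m));
}

/* Consumer CPU: read through the uncached alias so any stale lines in its
 * own L1 cache are bypassed entirely (no invalidate needed for this read).
 */
uint32_t consumer_peek_seq(void)
{
	volatile struct shared_msg *m = arch_xtensa_uncached_ptr(&msg);

	return m->seq;
}

Reading via the uncached alias trades latency for simplicity; a hot path would more likely invalidate and read through the cached mapping instead.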
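
A second, host-compilable sketch of the region-flip arithmetic behind z_xtrpoflip() in the cache.h hunk, using the Intel ADSP defaults set in Kconfig.defconfig (cached region 5, uncached region 4). rpoflip() below is a local copy of the logic for illustration, not a Zephyr API; the assertions show that regions differing by one bit reduce to the single AND/OR with 0x20000000 that the old hard-coded soc.h and fix_elf_addrs.py logic performed.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define IS_POW2(x) (((x) != 0) && (((x) & ((x) - 1)) == 0))

/* Host-side copy of the patch's z_xtrpoflip() logic for inspection. */
static uint32_t rpoflip(uint32_t addr, uint32_t rto, uint32_t rfrom)
{
	uint32_t rxor = (rto ^ rfrom) << 29;

	rto <<= 29;
	if (IS_POW2(rxor)) {
		return ((rxor & rto) == 0) ? (addr & ~rxor) : (addr | rxor);
	} else {
		return (addr & ~(7U << 29)) | rto;
	}
}

int main(void)
{
	/* cAVS defaults: cached=5, uncached=4.  The regions differ by one
	 * bit, so conversion is a single OR/AND with 0x20000000.
	 */
	assert(rpoflip(0x9e001000u, 5, 4) == 0xbe001000u); /* to cached */
	assert(rpoflip(0xbe001000u, 4, 5) == 0x9e001000u); /* to uncached */

	/* Regions whose XOR is not a power of two (e.g. 3 and 4) fall back
	 * to replacing all three top address bits.
	 */
	assert(rpoflip(0x61234567u, 4, 3) == 0x81234567u);

	printf("region-flip checks passed\n");
	return 0;
}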