diff --git a/tests/benchmarks/wait_queues/CMakeLists.txt b/tests/benchmarks/wait_queues/CMakeLists.txt
new file mode 100644
index 00000000000..ec1dceae561
--- /dev/null
+++ b/tests/benchmarks/wait_queues/CMakeLists.txt
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+
+cmake_minimum_required(VERSION 3.20.0)
+find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE})
+project(wait_queues)
+
+FILE(GLOB app_sources src/*.c)
+target_sources(app PRIVATE ${app_sources})
+target_include_directories(app PRIVATE
+  ${ZEPHYR_BASE}/kernel/include
+  ${ZEPHYR_BASE}/arch/${ARCH}/include
+  )
diff --git a/tests/benchmarks/wait_queues/Kconfig b/tests/benchmarks/wait_queues/Kconfig
new file mode 100644
index 00000000000..e7bc86e8655
--- /dev/null
+++ b/tests/benchmarks/wait_queues/Kconfig
@@ -0,0 +1,30 @@
+# Copyright (c) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+mainmenu "Wait Queue Benchmark"
+
+source "Kconfig.zephyr"
+
+config BENCHMARK_NUM_ITERATIONS
+	int "Number of iterations to gather data"
+	default 1000
+	help
+	  This option specifies the number of times each test will be executed
+	  before calculating the average times for reporting.
+
+config BENCHMARK_NUM_THREADS
+	int "Number of threads"
+	default 100
+	help
+	  This option specifies the maximum number of threads that the test
+	  will add to a wait queue. Increasing this value will place greater
+	  stress on the wait queues and better highlight the performance
+	  differences as the number of threads in the wait queue changes.
+
+config BENCHMARK_VERBOSE
+	bool "Display detailed results"
+	default y
+	help
+	  This option displays the average time of all the iterations done for
+	  each thread in the tests. This generates large amounts of output. To
+	  analyze it, it is recommended to redirect or copy the data to a file.
diff --git a/tests/benchmarks/wait_queues/README.rst b/tests/benchmarks/wait_queues/README.rst
new file mode 100644
index 00000000000..6bfa004d15a
--- /dev/null
+++ b/tests/benchmarks/wait_queues/README.rst
@@ -0,0 +1,19 @@
+Wait Queue Measurements
+#######################
+
+A Zephyr application developer may choose between two different wait queue
+implementations--dumb and scalable. These two queue implementations perform
+differently under different loads. This benchmark can be used to show how
+the performance of these two implementations varies under different conditions.
+
+These conditions include:
+
+* Time to add threads of increasing priority to a wait queue
+* Time to add threads of decreasing priority to a wait queue
+* Time to remove the highest priority thread from a wait queue
+* Time to remove the lowest priority thread from a wait queue
+
+By default, these tests show the minimum, maximum, and average of the measured
+times. However, if the verbose option is enabled then the average times for
+each number of waiting threads will also be displayed.
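+
+The wait queue implementation is selected with the kernel options
+``CONFIG_WAITQ_DUMB`` and ``CONFIG_WAITQ_SCALABLE`` (see ``testcase.yaml``).
+For example, either variant can be selected directly on the build command
+line (illustrative invocation; substitute the board name)::
+
+    west build -p -b <board> tests/benchmarks/wait_queues -- -DCONFIG_WAITQ_SCALABLE=y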
+
+The following will build this project with verbose support::
+
+    EXTRA_CONF_FILE="prj.verbose.conf" west build -p -b <board> tests/benchmarks/wait_queues
diff --git a/tests/benchmarks/wait_queues/prj.conf b/tests/benchmarks/wait_queues/prj.conf
new file mode 100644
index 00000000000..9c2898bd4ec
--- /dev/null
+++ b/tests/benchmarks/wait_queues/prj.conf
@@ -0,0 +1,29 @@
+# Default base configuration file
+
+CONFIG_TEST=y
+
+# Eliminate timer interrupts during the benchmark
+CONFIG_SYS_CLOCK_TICKS_PER_SEC=1
+
+# We use irq_offload(), enable it
+CONFIG_IRQ_OFFLOAD=y
+
+# Reduce memory/code footprint
+CONFIG_BT=n
+CONFIG_FORCE_NO_ASSERT=y
+
+CONFIG_TEST_HW_STACK_PROTECTION=n
+# Disable HW Stack Protection (see #28664)
+CONFIG_HW_STACK_PROTECTION=n
+CONFIG_COVERAGE=n
+
+# Disable system power management
+CONFIG_PM=n
+
+CONFIG_TIMING_FUNCTIONS=y
+
+CONFIG_HEAP_MEM_POOL_SIZE=2048
+CONFIG_APPLICATION_DEFINED_SYSCALL=y
+
+# Disable time slicing
+CONFIG_TIMESLICING=n
diff --git a/tests/benchmarks/wait_queues/prj.verbose.conf b/tests/benchmarks/wait_queues/prj.verbose.conf
new file mode 100644
index 00000000000..b6204397cea
--- /dev/null
+++ b/tests/benchmarks/wait_queues/prj.verbose.conf
@@ -0,0 +1,4 @@
+# Extra configuration file to enable verbose reporting
+# Use with EXTRA_CONF_FILE
+
+CONFIG_BENCHMARK_VERBOSE=y
diff --git a/tests/benchmarks/wait_queues/src/main.c b/tests/benchmarks/wait_queues/src/main.c
new file mode 100644
index 00000000000..56fe031d48b
--- /dev/null
+++ b/tests/benchmarks/wait_queues/src/main.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2024 Intel Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/*
+ * @file
+ * This file contains tests that measure the length of time required to add
+ * threads to, and remove threads from, a wait queue that holds a varying
+ * number of threads. Each thread added to (and removed from) the wait queue
+ * is a dummy thread. As these dummy threads are inherently non-executable,
+ * this helps prevent the addition/removal of threads to/from the ready queue
+ * from being included in these measurements. Furthermore, the use of dummy
+ * threads helps reduce the memory footprint as not only are thread stacks
+ * not required, but we also do not need the full k_thread structure for each
+ * of these dummy threads.
+ */
+
+#include <zephyr/kernel.h>
+#include <zephyr/timing/timing.h>
+#include <zephyr/timestamp.h>
+#include "utils.h"
+#include <zephyr/tc_util.h>
+#include <stdio.h>
+#include <ksched.h>
+#include <wait_q.h>
+
+uint32_t tm_off;
+
+static struct _thread_base dummy_thread[CONFIG_BENCHMARK_NUM_THREADS];
+static _wait_q_t wait_q;
+
+uint64_t add_cycles[CONFIG_BENCHMARK_NUM_THREADS];
+uint64_t remove_cycles[CONFIG_BENCHMARK_NUM_THREADS];
+
+/**
+ * Initialize each dummy thread.
+ */
+static void dummy_threads_init(unsigned int num_threads)
+{
+        unsigned int i;
+        unsigned int bucket_size;
+
+        bucket_size = (num_threads / CONFIG_NUM_PREEMPT_PRIORITIES) + 1;
+
+        for (i = 0; i < num_threads; i++) {
+                z_init_thread_base(&dummy_thread[i], i / bucket_size,
+                                   _THREAD_DUMMY, 0);
+        }
+}
+
+static void cycles_reset(unsigned int num_threads)
+{
+        unsigned int i;
+
+        for (i = 0; i < num_threads; i++) {
+                add_cycles[i] = 0ULL;
+                remove_cycles[i] = 0ULL;
+        }
+}
+
+/**
+ * Each successive dummy thread added to the wait queue is of the same or
+ * lower priority. Each dummy thread removed from the wait queue is of the
+ * same or lower priority than the previous one.
+ */
+static void test_decreasing_priority(_wait_q_t *q, unsigned int num_threads)
+{
+        unsigned int i;
+        timing_t start;
+        timing_t finish;
+
+        /* Add to tail of wait queue */
+
+        for (i = 0; i < num_threads; i++) {
+                start = timing_counter_get();
+                z_pend_thread((struct k_thread *)&dummy_thread[i],
+                              q, K_FOREVER);
+                finish = timing_counter_get();
+
+                add_cycles[i] += timing_cycles_get(&start, &finish);
+        }
+
+        /* Remove from head of wait queue */
+
+        for (i = 0; i < num_threads; i++) {
+                start = timing_counter_get();
+                z_unpend_thread((struct k_thread *)&dummy_thread[i]);
+                finish = timing_counter_get();
+
+                remove_cycles[i] += timing_cycles_get(&start, &finish);
+        }
+}
+
+static void test_increasing_priority(_wait_q_t *q, unsigned int num_threads)
+{
+        unsigned int i;
+        timing_t start;
+        timing_t finish;
+        struct k_thread *thread;
+
+        /* Add to head of wait queue */
+
+        for (i = 0; i < num_threads; i++) {
+                start = timing_counter_get();
+                thread = (struct k_thread *)
+                         &dummy_thread[num_threads - i - 1];
+                z_pend_thread(thread, q, K_FOREVER);
+                finish = timing_counter_get();
+
+                add_cycles[i] += timing_cycles_get(&start, &finish);
+        }
+
+        /* Remove from tail of wait queue */
+
+        for (i = 0; i < num_threads; i++) {
+                start = timing_counter_get();
+                thread = (struct k_thread *)
+                         &dummy_thread[num_threads - i - 1];
+                z_unpend_thread(thread);
+                finish = timing_counter_get();
+
+                remove_cycles[i] += timing_cycles_get(&start, &finish);
+        }
+}
+
+
+static uint64_t sqrt_u64(uint64_t square)
+{
+        if (square > 1) {
+                uint64_t lo = sqrt_u64(square >> 2) << 1;
+                uint64_t hi = lo + 1;
+
+                return ((hi * hi) > square) ? lo : hi;
+        }
+
+        return square;
+}
+
+
+static void compute_and_report_stats(unsigned int num_threads,
+                                     unsigned int num_iterations,
+                                     uint64_t *cycles,
+                                     const char *str)
+{
+        uint64_t minimum = cycles[0];
+        uint64_t maximum = cycles[0];
+        uint64_t total = cycles[0];
+        uint64_t average;
+        uint64_t std_dev = 0;
+        uint64_t tmp;
+        uint64_t diff;
+        unsigned int i;
+
+        for (i = 1; i < num_threads; i++) {
+                if (cycles[i] > maximum) {
+                        maximum = cycles[i];
+                }
+
+                if (cycles[i] < minimum) {
+                        minimum = cycles[i];
+                }
+
+                total += cycles[i];
+        }
+
+        minimum /= (uint64_t)num_iterations;
+        maximum /= (uint64_t)num_iterations;
+        average = total / (num_threads * num_iterations);
+
+        /* Calculate standard deviation */
+
+        for (i = 0; i < num_threads; i++) {
+                tmp = cycles[i] / num_iterations;
+                diff = (average > tmp) ? (average - tmp) : (tmp - average);
+
+                std_dev += (diff * diff);
+        }
+        std_dev /= num_threads;
+        std_dev = sqrt_u64(std_dev);
+
+        printk("%s\n", str);
+
+        printk("    Minimum      : %7llu cycles (%7u nsec)\n",
+               minimum, (uint32_t)timing_cycles_to_ns(minimum));
+        printk("    Maximum      : %7llu cycles (%7u nsec)\n",
+               maximum, (uint32_t)timing_cycles_to_ns(maximum));
+        printk("    Average      : %7llu cycles (%7u nsec)\n",
+               average, (uint32_t)timing_cycles_to_ns(average));
+        printk("    Std Deviation: %7llu cycles (%7u nsec)\n",
+               std_dev, (uint32_t)timing_cycles_to_ns(std_dev));
+
+}
+
+int main(void)
+{
+        unsigned int i;
+        unsigned int freq;
+#ifdef CONFIG_BENCHMARK_VERBOSE
+        char description[120];
+        char tag[50];
+        struct k_thread *thread;
+#endif
+
+        timing_init();
+
+        bench_test_init();
+
+        freq = timing_freq_get_mhz();
+
+        printk("Time Measurements for %s wait queues\n",
+               IS_ENABLED(CONFIG_WAITQ_DUMB) ? "dumb" : "scalable");
"dumb" : "scalable"); + printk("Timing results: Clock frequency: %u MHz\n", freq); + + z_waitq_init(&wait_q); + + dummy_threads_init(CONFIG_BENCHMARK_NUM_THREADS); + + timing_start(); + + cycles_reset(CONFIG_BENCHMARK_NUM_THREADS); + + for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) { + test_decreasing_priority(&wait_q, CONFIG_BENCHMARK_NUM_THREADS); + } + + compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS, + CONFIG_BENCHMARK_NUM_ITERATIONS, + add_cycles, + "Add threads of decreasing priority"); + +#ifdef CONFIG_BENCHMARK_VERBOSE + for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) { + snprintf(tag, sizeof(tag), + "WaitQ.add.to.tail.%04u.waiters", i); + snprintf(description, sizeof(description), + "%-40s - Add thread of priority %u", + tag, dummy_thread[i].prio); + PRINT_STATS_AVG(description, (uint32_t)add_cycles[i], + CONFIG_BENCHMARK_NUM_ITERATIONS); + } +#endif + + printk("------------------------------------\n"); + + compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS, + CONFIG_BENCHMARK_NUM_ITERATIONS, + remove_cycles, + "Remove threads of decreasing priority"); + +#ifdef CONFIG_BENCHMARK_VERBOSE + for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) { + snprintf(tag, sizeof(tag), + "WaitQ.remove.from.head.%04u.waiters", + CONFIG_BENCHMARK_NUM_THREADS - i); + snprintf(description, sizeof(description), + "%-40s - Remove thread of priority %u", + tag, dummy_thread[i].prio); + PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i], + CONFIG_BENCHMARK_NUM_ITERATIONS); + } +#endif + + printk("------------------------------------\n"); + + cycles_reset(CONFIG_BENCHMARK_NUM_THREADS); + + for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) { + test_increasing_priority(&wait_q, CONFIG_BENCHMARK_NUM_THREADS); + } + + compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS, + CONFIG_BENCHMARK_NUM_ITERATIONS, + add_cycles, + "Add threads of increasing priority"); + +#ifdef CONFIG_BENCHMARK_VERBOSE + for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) { + snprintf(tag, sizeof(tag), + "WaitQ.add.to.head.%04u.waiters", i); + thread = (struct k_thread *) + &dummy_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1]; + snprintf(description, sizeof(description), + "%-40s - Add priority %u to waitq", + tag, thread->base.prio); + PRINT_STATS_AVG(description, (uint32_t)add_cycles[i], + CONFIG_BENCHMARK_NUM_ITERATIONS); + } +#endif + + printk("------------------------------------\n"); + + compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS, + CONFIG_BENCHMARK_NUM_ITERATIONS, + remove_cycles, + "Remove threads of increasing priority"); + +#ifdef CONFIG_BENCHMARK_VERBOSE + for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) { + snprintf(tag, sizeof(tag), + "WaitQ.remove.from.tail.%04u.waiters", + CONFIG_BENCHMARK_NUM_THREADS - i); + thread = (struct k_thread *) + &dummy_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1]; + snprintf(description, sizeof(description), + "%-40s - Remove priority %u from waitq", + tag, thread->base.prio); + PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i], + CONFIG_BENCHMARK_NUM_ITERATIONS); + } +#endif + + timing_stop(); + + TC_END_REPORT(0); + + return 0; +} diff --git a/tests/benchmarks/wait_queues/src/utils.h b/tests/benchmarks/wait_queues/src/utils.h new file mode 100644 index 00000000000..5e95ae1e7b6 --- /dev/null +++ b/tests/benchmarks/wait_queues/src/utils.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2024 Intel Corporation + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __BENCHMARK_WAITQ_UTILS_H +#define __BENCHMARK_WAITQ_UTILS_H +/* + * @brief This file 
+ */
+
+#include <zephyr/timing/timing.h>
+
+#ifdef CSV_FORMAT_OUTPUT
+#define FORMAT_STR   "%-74s,%s,%s\n"
+#define CYCLE_FORMAT "%8u"
+#define NSEC_FORMAT  "%8u"
+#else
+#define FORMAT_STR   "%-74s:%s , %s\n"
+#define CYCLE_FORMAT "%8u cycles"
+#define NSEC_FORMAT  "%8u ns"
+#endif
+
+/**
+ * @brief Display a line of statistics
+ *
+ * This macro displays the following:
+ *  1. Test description summary
+ *  2. Number of cycles
+ *  3. Number of nanoseconds
+ */
+#define PRINT_F(summary, cycles, nsec)                                  \
+        do {                                                            \
+                char cycle_str[32];                                     \
+                char nsec_str[32];                                      \
+                                                                        \
+                snprintk(cycle_str, 30, CYCLE_FORMAT, cycles);          \
+                snprintk(nsec_str, 30, NSEC_FORMAT, nsec);              \
+                printk(FORMAT_STR, summary, cycle_str, nsec_str);       \
+        } while (0)
+
+#define PRINT_STATS_AVG(summary, value, counter)                        \
+        PRINT_F(summary, value / counter,                               \
+                (uint32_t)timing_cycles_to_ns_avg(value, counter))
+
+#endif
diff --git a/tests/benchmarks/wait_queues/testcase.yaml b/tests/benchmarks/wait_queues/testcase.yaml
new file mode 100644
index 00000000000..a6fd1440b73
--- /dev/null
+++ b/tests/benchmarks/wait_queues/testcase.yaml
@@ -0,0 +1,21 @@
+common:
+  tags:
+    - kernel
+    - benchmark
+  integration_platforms:
+    - qemu_x86
+    - qemu_cortex_a53
+  harness: console
+  harness_config:
+    type: one_line
+    regex:
+      - "PROJECT EXECUTION SUCCESSFUL"
+
+tests:
+  benchmark.wait_queues.dumb:
+    extra_configs:
+      - CONFIG_WAITQ_DUMB=y
+
+  benchmark.wait_queues.scalable:
+    extra_configs:
+      - CONFIG_WAITQ_SCALABLE=y
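+
+# Example invocation (assumes a Zephyr workspace checkout; adjust the platform
+# as needed). Both variants defined above can be run with twister:
+#
+#   ./scripts/twister -p qemu_x86 -T tests/benchmarks/wait_queues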