tests: wait queue benchmarks

Implements a set of tests designed to show how the performance of the two
wait queue implementations (DUMB and SCALABLE) change as the number of
threads in the wait queue varies.

Signed-off-by: Peter Mitsis <peter.mitsis@intel.com>
This commit is contained in:
Peter Mitsis 2024-09-11 23:11:15 +00:00 committed by Anas Nashif
commit 2221ca82d4
8 changed files with 481 additions and 0 deletions

View file

@ -0,0 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
cmake_minimum_required(VERSION 3.20.0)
find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE})
project(wait_queues)
FILE(GLOB app_sources src/*.c)
target_sources(app PRIVATE ${app_sources})
target_include_directories(app PRIVATE
${ZEPHYR_BASE}/kernel/include
${ZEPHYR_BASE}/arch/${ARCH}/include
)

View file

@ -0,0 +1,30 @@
# Copyright (c) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
mainmenu "Wait Queue Benchmark"
source "Kconfig.zephyr"
config BENCHMARK_NUM_ITERATIONS
int "Number of iterations to gather data"
default 1000
help
This option specifies the number of times each test will be executed
before calculating the average times for reporting.
config BENCHMARK_NUM_THREADS
int "Number of threads"
default 100
help
This option specifies the maximum number of threads that the test
will add to a wait queue. Increasing this value will place greater
stress on the wait queues and better highlight the performance
differences as the number of threads in the wait queue changes.
config BENCHMARK_VERBOSE
bool "Display detailed results"
default y
help
This option displays the average time of all the iterations done for
each thread in the tests. This generates large amounts of output. To
analyze it, it is recommended to redirect or copy the data to a file.

View file

@ -0,0 +1,19 @@
Wait Queue Measurements
#######################
A Zephyr application developer may choose between two different wait queue
implementations--dumb and scalable. These two queue implementations perform
differently under different loads. This benchmark can be used to showcase how
the performance of these two implementations vary under varying conditions.
These conditions include:
* Time to add threads of increasing priority to a wait queue
* Time to add threads of decreasing priority to a wait queue
* Time to remove highest priority thread from a wait queue
* Time to remove lowest priority thread from a wait queue
By default, these tests show the minimum, maximum, and averages of the measured
times. However, if the verbose option is enabled then the raw timings will also
be displayed. The following will build this project with verbose support:
EXTRA_CONF_FILE="prj.verbose.conf" west build -p -b <board> <path to project>

View file

@ -0,0 +1,29 @@
# Default base configuration file
CONFIG_TEST=y
# eliminate timer interrupts during the benchmark
CONFIG_SYS_CLOCK_TICKS_PER_SEC=1
# We use irq_offload(), enable it
CONFIG_IRQ_OFFLOAD=y
# Reduce memory/code footprint
CONFIG_BT=n
CONFIG_FORCE_NO_ASSERT=y
CONFIG_TEST_HW_STACK_PROTECTION=n
# Disable HW Stack Protection (see #28664)
CONFIG_HW_STACK_PROTECTION=n
CONFIG_COVERAGE=n
# Disable system power management
CONFIG_PM=n
CONFIG_TIMING_FUNCTIONS=y
CONFIG_HEAP_MEM_POOL_SIZE=2048
CONFIG_APPLICATION_DEFINED_SYSCALL=y
# Disable time slicing
CONFIG_TIMESLICING=n

View file

@ -0,0 +1,4 @@
# Extra configuration file to enable verbose reporting
# Use with EXTRA_CONF_FILE
CONFIG_BENCHMARK_VERBOSE=y

View file

@ -0,0 +1,319 @@
/*
* Copyright (c) 2024 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* @file
* This file contains tests that will measure the length of time required
* to add and remove threads from a wait queue that holds a varying number of
* threads. Each thread added to (and removed from) the wait queue is a dummy
* thread. As these dummy threads are inherently non-executable, this helps
* prevent the addition/removal of threads to/from the ready queue from being
* included in these measurements. Furthermore, the use of dummy threads helps
* reduce the memory footprint as not only are thread stacks not required,
* but we also do not need the full k_thread structure for each of these
* dummy threads.
*/
#include <zephyr/kernel.h>
#include <zephyr/timestamp.h>
#include <zephyr/timing/timing.h>
#include "utils.h"
#include <zephyr/tc_util.h>
#include <wait_q.h>
#include <ksched.h>
#include <stdio.h>
uint32_t tm_off;
static struct _thread_base dummy_thread[CONFIG_BENCHMARK_NUM_THREADS];
static _wait_q_t wait_q;
uint64_t add_cycles[CONFIG_BENCHMARK_NUM_THREADS];
uint64_t remove_cycles[CONFIG_BENCHMARK_NUM_THREADS];
/**
 * Initialize the set of dummy threads, distributing them evenly across
 * the available preemptible priority levels (lowest index = highest
 * priority).
 */
static void dummy_threads_init(unsigned int num_threads)
{
	unsigned int idx;
	unsigned int per_priority;

	/* Number of threads assigned to each priority level */
	per_priority = (num_threads / CONFIG_NUM_PREEMPT_PRIORITIES) + 1;

	for (idx = 0; idx < num_threads; idx++) {
		z_init_thread_base(&dummy_thread[idx], idx / per_priority,
				   _THREAD_DUMMY, 0);
	}
}
/* Zero the per-thread accumulated cycle counters before a new scenario. */
static void cycles_reset(unsigned int num_threads)
{
	for (unsigned int idx = 0; idx < num_threads; idx++) {
		add_cycles[idx] = 0ULL;
		remove_cycles[idx] = 0ULL;
	}
}
/**
 * Pend the dummy threads in index order (each successive thread is of the
 * same or lower priority, so insertions land at the tail of the wait
 * queue), then unpend them in the same order (removals come from the
 * head). The elapsed cycles of each operation are accumulated into the
 * add_cycles[] and remove_cycles[] arrays.
 */
static void test_decreasing_priority(_wait_q_t *q, unsigned int num_threads)
{
	timing_t  t_begin;
	timing_t  t_end;
	unsigned int idx;

	/* Pend phase: tail insertions */
	for (idx = 0; idx < num_threads; idx++) {
		t_begin = timing_counter_get();
		z_pend_thread((struct k_thread *)&dummy_thread[idx],
			      q, K_FOREVER);
		t_end = timing_counter_get();
		add_cycles[idx] += timing_cycles_get(&t_begin, &t_end);
	}

	/* Unpend phase: head removals */
	for (idx = 0; idx < num_threads; idx++) {
		t_begin = timing_counter_get();
		z_unpend_thread((struct k_thread *)&dummy_thread[idx]);
		t_end = timing_counter_get();
		remove_cycles[idx] += timing_cycles_get(&t_begin, &t_end);
	}
}
/**
 * Pend the dummy threads in reverse index order (each successive thread is
 * of the same or higher priority, so insertions land at the head of the
 * wait queue), then unpend them in the same reverse order (removals come
 * from the tail). Elapsed cycles accumulate into add_cycles[] and
 * remove_cycles[]. The pointer computation is kept inside the timed
 * region to mirror the decreasing-priority test's measurement window.
 */
static void test_increasing_priority(_wait_q_t *q, unsigned int num_threads)
{
	timing_t  t_begin;
	timing_t  t_end;
	unsigned int idx;
	struct k_thread *target;

	/* Pend phase: head insertions */
	for (idx = 0; idx < num_threads; idx++) {
		t_begin = timing_counter_get();
		target = (struct k_thread *)
			 &dummy_thread[num_threads - idx - 1];
		z_pend_thread(target, q, K_FOREVER);
		t_end = timing_counter_get();
		add_cycles[idx] += timing_cycles_get(&t_begin, &t_end);
	}

	/* Unpend phase: tail removals */
	for (idx = 0; idx < num_threads; idx++) {
		t_begin = timing_counter_get();
		target = (struct k_thread *)
			 &dummy_thread[num_threads - idx - 1];
		z_unpend_thread(target);
		t_end = timing_counter_get();
		remove_cycles[idx] += timing_cycles_get(&t_begin, &t_end);
	}
}
/**
 * Compute the integer square root (floor) of a 64-bit value using the
 * recursive bit-pair method. Recursion depth is bounded at 32 levels.
 *
 * Note: the candidate check is done with division ('hi > square / hi')
 * rather than 'hi * hi > square' because the multiplication overflows
 * when hi == 2^32 (i.e. for inputs near UINT64_MAX), which would make
 * the original form return 2^32 instead of the correct 2^32 - 1.
 */
static uint64_t sqrt_u64(uint64_t square)
{
	if (square > 1) {
		uint64_t lo = sqrt_u64(square >> 2) << 1;
		uint64_t hi = lo + 1;

		/* hi >= 1 here, so the division is safe */
		return (hi > square / hi) ? lo : hi;
	}
	return square;
}
/**
 * Compute and print the minimum, maximum, average and standard deviation
 * of the accumulated per-thread cycle counts.
 *
 * @param num_threads    Number of entries in @a cycles
 * @param num_iterations Iterations over which each entry was accumulated;
 *                       used to convert totals into per-iteration averages
 * @param cycles         Array of accumulated cycle counts (one per thread)
 * @param str            Heading printed before the statistics
 */
static void compute_and_report_stats(unsigned int num_threads,
				     unsigned int num_iterations,
				     uint64_t *cycles,
				     const char *str)
{
	uint64_t minimum = cycles[0];
	uint64_t maximum = cycles[0];
	uint64_t total = cycles[0];
	uint64_t average;
	uint64_t std_dev = 0;
	uint64_t tmp;
	uint64_t diff;
	unsigned int i;

	for (i = 1; i < num_threads; i++) {
		if (cycles[i] > maximum) {
			maximum = cycles[i];
		}

		if (cycles[i] < minimum) {
			minimum = cycles[i];
		}

		total += cycles[i];
	}

	minimum /= (uint64_t)num_iterations;
	maximum /= (uint64_t)num_iterations;

	/*
	 * Promote to 64 bits before multiplying: 'num_threads *
	 * num_iterations' would otherwise be evaluated in 'unsigned int'
	 * and could overflow for large configurations.
	 */
	average = total / ((uint64_t)num_threads * num_iterations);

	/* Standard deviation of the per-thread per-iteration averages */
	for (i = 0; i < num_threads; i++) {
		tmp = cycles[i] / num_iterations;
		diff = (average > tmp) ? (average - tmp) : (tmp - average);

		std_dev += (diff * diff);
	}
	std_dev /= num_threads;
	std_dev = sqrt_u64(std_dev);

	printk("%s\n", str);
	printk("   Minimum      : %7llu cycles (%7u nsec)\n",
	       minimum, (uint32_t)timing_cycles_to_ns(minimum));
	printk("   Maximum      : %7llu cycles (%7u nsec)\n",
	       maximum, (uint32_t)timing_cycles_to_ns(maximum));
	printk("   Average      : %7llu cycles (%7u nsec)\n",
	       average, (uint32_t)timing_cycles_to_ns(average));
	printk("   Std Deviation: %7llu cycles (%7u nsec)\n",
	       std_dev, (uint32_t)timing_cycles_to_ns(std_dev));
}
/**
 * Benchmark entry point. Runs two scenarios over the shared wait queue —
 * pending/unpending threads in decreasing then increasing priority order —
 * each repeated CONFIG_BENCHMARK_NUM_ITERATIONS times, and reports
 * aggregate statistics (plus optional per-thread verbose lines).
 */
int main(void)
{
	unsigned int i;
	unsigned int freq;
#ifdef CONFIG_BENCHMARK_VERBOSE
	char description[120];
	char tag[50];
	struct k_thread *thread;
#endif

	timing_init();
	bench_test_init();

	freq = timing_freq_get_mhz();

	printk("Time Measurements for %s wait queues\n",
	       IS_ENABLED(CONFIG_WAITQ_DUMB) ? "dumb" : "scalable");
	printk("Timing results: Clock frequency: %u MHz\n", freq);

	z_waitq_init(&wait_q);

	dummy_threads_init(CONFIG_BENCHMARK_NUM_THREADS);

	timing_start();

	/* Scenario 1: decreasing priority (tail inserts / head removals) */
	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);

	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
		test_decreasing_priority(&wait_q, CONFIG_BENCHMARK_NUM_THREADS);
	}

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 add_cycles,
				 "Add threads of decreasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	/* Per-thread averages: thread i is added when i waiters are queued */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "WaitQ.add.to.tail.%04u.waiters", i);
		snprintf(description, sizeof(description),
			 "%-40s - Add thread of priority %u",
			 tag, dummy_thread[i].prio);
		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 remove_cycles,
				 "Remove threads of decreasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	/* Thread i is removed while (NUM_THREADS - i) waiters remain queued */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "WaitQ.remove.from.head.%04u.waiters",
			 CONFIG_BENCHMARK_NUM_THREADS - i);
		snprintf(description, sizeof(description),
			 "%-40s - Remove thread of priority %u",
			 tag, dummy_thread[i].prio);
		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	/* Scenario 2: increasing priority (head inserts / tail removals) */
	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);

	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
		test_increasing_priority(&wait_q, CONFIG_BENCHMARK_NUM_THREADS);
	}

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 add_cycles,
				 "Add threads of increasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	/* Threads are added in reverse index order in this scenario */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "WaitQ.add.to.head.%04u.waiters", i);
		thread = (struct k_thread *)
			 &dummy_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
		snprintf(description, sizeof(description),
			 "%-40s - Add priority %u to waitq",
			 tag, thread->base.prio);
		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 remove_cycles,
				 "Remove threads of increasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	/* Removal i happens with (NUM_THREADS - i) waiters still queued */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "WaitQ.remove.from.tail.%04u.waiters",
			 CONFIG_BENCHMARK_NUM_THREADS - i);
		thread = (struct k_thread *)
			 &dummy_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
		snprintf(description, sizeof(description),
			 "%-40s - Remove priority %u from waitq",
			 tag, thread->base.prio);
		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	timing_stop();

	TC_END_REPORT(0);

	return 0;
}

View file

@ -0,0 +1,47 @@
/*
* Copyright (c) 2024 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef __BENCHMARK_WAITQ_UTILS_H
#define __BENCHMARK_WAITQ_UTILS_H
/*
* @brief This file contains macros used in the wait queue benchmarking.
*/
#include <zephyr/sys/printk.h>
#ifdef CSV_FORMAT_OUTPUT
#define FORMAT_STR "%-74s,%s,%s\n"
#define CYCLE_FORMAT "%8u"
#define NSEC_FORMAT "%8u"
#else
#define FORMAT_STR "%-74s:%s , %s\n"
#define CYCLE_FORMAT "%8u cycles"
#define NSEC_FORMAT "%8u ns"
#endif
/**
 * @brief Display a line of statistics
 *
 * This macro displays the following:
 * 1. Test description summary
 * 2. Number of cycles
 * 3. Number of nanoseconds
 *
 * Buffer bounds use sizeof() rather than a hard-coded constant so the
 * limit stays in sync with the buffer size; arguments are parenthesized
 * to keep expansion safe with complex expressions.
 */
#define PRINT_F(summary, cycles, nsec)                                  \
	do {                                                            \
		char cycle_str[32];                                     \
		char nsec_str[32];                                      \
									\
		snprintk(cycle_str, sizeof(cycle_str), CYCLE_FORMAT,    \
			 (cycles));                                     \
		snprintk(nsec_str, sizeof(nsec_str), NSEC_FORMAT,       \
			 (nsec));                                       \
		printk(FORMAT_STR, (summary), cycle_str, nsec_str);     \
	} while (0)
/**
 * @brief Display the per-iteration average of a measurement
 *
 * Divides the accumulated cycle count @a value by @a counter iterations
 * and prints it (via PRINT_F) in both cycles and nanoseconds.
 */
#define PRINT_STATS_AVG(summary, value, counter) \
	PRINT_F(summary, value / counter, \
		(uint32_t)timing_cycles_to_ns_avg(value, counter))
#endif

View file

@ -0,0 +1,21 @@
common:
tags:
- kernel
- benchmark
integration_platforms:
- qemu_x86
- qemu_cortex_a53
harness: console
harness_config:
type: one_line
regex:
- "PROJECT EXECUTION SUCCESSFUL"
tests:
benchmark.wait_queues.dumb:
extra_configs:
- CONFIG_WAITQ_DUMB=y
benchmark.wait_queues.scalable:
extra_configs:
- CONFIG_WAITQ_SCALABLE=y