tests: wait queue benchmarks

Implements a set of tests designed to show how the performance of the two
wait queue implementations (DUMB and SCALABLE) change as the number of
threads in the wait queue varies.

Signed-off-by: Peter Mitsis <peter.mitsis@intel.com>
This commit is contained in:
Peter Mitsis 2024-09-11 23:11:15 +00:00 committed by Anas Nashif
commit 2221ca82d4
8 changed files with 481 additions and 0 deletions

View file

@ -0,0 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
cmake_minimum_required(VERSION 3.20.0)
find_package(Zephyr REQUIRED HINTS $ENV{ZEPHYR_BASE})
project(wait_queues)
FILE(GLOB app_sources src/*.c)
target_sources(app PRIVATE ${app_sources})
target_include_directories(app PRIVATE
${ZEPHYR_BASE}/kernel/include
${ZEPHYR_BASE}/arch/${ARCH}/include
)

View file

@ -0,0 +1,30 @@
# Copyright (c) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
mainmenu "Wait Queue Benchmark"
source "Kconfig.zephyr"
config BENCHMARK_NUM_ITERATIONS
int "Number of iterations to gather data"
default 1000
help
This option specifies the number of times each test will be executed
before calculating the average times for reporting.
config BENCHMARK_NUM_THREADS
int "Number of threads"
default 100
help
This option specifies the maximum number of threads that the test
will add to a wait queue. Increasing this value will place greater
stress on the wait queues and better highlight the performance
differences as the number of threads in the wait queue changes.
config BENCHMARK_VERBOSE
bool "Display detailed results"
default y
help
This option displays the average time of all the iterations done for
each thread in the tests. This generates large amounts of output. To
analyze it, it is recommended to redirect or copy the data to a file.

View file

@ -0,0 +1,19 @@
Wait Queue Measurements
#######################
A Zephyr application developer may choose between two different wait queue
implementations--dumb and scalable. These two queue implementations perform
differently under different loads. This benchmark can be used to showcase how
the performance of these two implementations vary under varying conditions.
These conditions include:
* Time to add threads of increasing priority to a wait queue
* Time to add threads of decreasing priority to a wait queue
* Time to remove highest priority thread from a wait queue
* Time to remove lowest priority thread from a wait queue
By default, these tests show the minimum, maximum, and averages of the measured
times. However, if the verbose option is enabled then the raw timings will also
be displayed. The following will build this project with verbose support:
EXTRA_CONF_FILE="prj.verbose.conf" west build -p -b <board> <path to project>

View file

@ -0,0 +1,29 @@
# Default base configuration file
CONFIG_TEST=y
# eliminate timer interrupts during the benchmark
CONFIG_SYS_CLOCK_TICKS_PER_SEC=1
# We use irq_offload(), enable it
CONFIG_IRQ_OFFLOAD=y
# Reduce memory/code footprint
CONFIG_BT=n
CONFIG_FORCE_NO_ASSERT=y
CONFIG_TEST_HW_STACK_PROTECTION=n
# Disable HW Stack Protection (see #28664)
CONFIG_HW_STACK_PROTECTION=n
CONFIG_COVERAGE=n
# Disable system power management
CONFIG_PM=n
CONFIG_TIMING_FUNCTIONS=y
CONFIG_HEAP_MEM_POOL_SIZE=2048
CONFIG_APPLICATION_DEFINED_SYSCALL=y
# Disable time slicing
CONFIG_TIMESLICING=n

View file

@ -0,0 +1,4 @@
# Extra configuration file to enable verbose reporting
# Use with EXTRA_CONF_FILE
CONFIG_BENCHMARK_VERBOSE=y

View file

@ -0,0 +1,319 @@
/*
* Copyright (c) 2024 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* @file
* This file contains tests that will measure the length of time required
* to add and remove threads from a wait queue that holds a varying number of
* threads. Each thread added to (and removed from) the wait queue is a dummy
* thread. As these dummy threads are inherently non-executable, this helps
* prevent the addition/removal of threads to/from the ready queue from being
* included in these measurements. Furthermore, the use of dummy threads helps
* reduce the memory footprint as not only are thread stacks not required,
* but we also do not need the full k_thread structure for each of these
* dummy threads.
*/
#include <zephyr/kernel.h>
#include <zephyr/timestamp.h>
#include <zephyr/timing/timing.h>
#include "utils.h"
#include <zephyr/tc_util.h>
#include <wait_q.h>
#include <ksched.h>
#include <stdio.h>
uint32_t tm_off;
static struct _thread_base dummy_thread[CONFIG_BENCHMARK_NUM_THREADS];
static _wait_q_t wait_q;
uint64_t add_cycles[CONFIG_BENCHMARK_NUM_THREADS];
uint64_t remove_cycles[CONFIG_BENCHMARK_NUM_THREADS];
/**
 * Initialize the set of dummy threads, distributing them evenly across
 * the available preemptible priority levels (lowest index = highest
 * priority).
 */
static void dummy_threads_init(unsigned int num_threads)
{
	unsigned int idx;
	unsigned int per_priority;

	/* Number of threads assigned to each priority level */
	per_priority = (num_threads / CONFIG_NUM_PREEMPT_PRIORITIES) + 1;

	for (idx = 0; idx < num_threads; idx++) {
		z_init_thread_base(&dummy_thread[idx], idx / per_priority,
				   _THREAD_DUMMY, 0);
	}
}
/* Zero the per-thread accumulated cycle counters before a new scenario. */
static void cycles_reset(unsigned int num_threads)
{
	for (unsigned int idx = 0; idx < num_threads; idx++) {
		add_cycles[idx] = 0ULL;
		remove_cycles[idx] = 0ULL;
	}
}
/**
 * Pend the dummy threads in index order (each successive thread is of the
 * same or lower priority, so insertions land at the tail of the wait
 * queue), then unpend them in the same order (removals come from the
 * head). The elapsed cycles of each operation are accumulated into the
 * add_cycles[] and remove_cycles[] arrays.
 */
static void test_decreasing_priority(_wait_q_t *q, unsigned int num_threads)
{
	timing_t  t_begin;
	timing_t  t_end;
	unsigned int idx;

	/* Pend phase: tail insertions */
	for (idx = 0; idx < num_threads; idx++) {
		t_begin = timing_counter_get();
		z_pend_thread((struct k_thread *)&dummy_thread[idx],
			      q, K_FOREVER);
		t_end = timing_counter_get();
		add_cycles[idx] += timing_cycles_get(&t_begin, &t_end);
	}

	/* Unpend phase: head removals */
	for (idx = 0; idx < num_threads; idx++) {
		t_begin = timing_counter_get();
		z_unpend_thread((struct k_thread *)&dummy_thread[idx]);
		t_end = timing_counter_get();
		remove_cycles[idx] += timing_cycles_get(&t_begin, &t_end);
	}
}
/**
 * Pend the dummy threads in reverse index order (each successive thread is
 * of the same or higher priority, so insertions land at the head of the
 * wait queue), then unpend them in the same reverse order (removals come
 * from the tail). Elapsed cycles accumulate into add_cycles[] and
 * remove_cycles[]. The pointer computation is kept inside the timed
 * region to mirror the decreasing-priority test's measurement window.
 */
static void test_increasing_priority(_wait_q_t *q, unsigned int num_threads)
{
	timing_t  t_begin;
	timing_t  t_end;
	unsigned int idx;
	struct k_thread *target;

	/* Pend phase: head insertions */
	for (idx = 0; idx < num_threads; idx++) {
		t_begin = timing_counter_get();
		target = (struct k_thread *)
			 &dummy_thread[num_threads - idx - 1];
		z_pend_thread(target, q, K_FOREVER);
		t_end = timing_counter_get();
		add_cycles[idx] += timing_cycles_get(&t_begin, &t_end);
	}

	/* Unpend phase: tail removals */
	for (idx = 0; idx < num_threads; idx++) {
		t_begin = timing_counter_get();
		target = (struct k_thread *)
			 &dummy_thread[num_threads - idx - 1];
		z_unpend_thread(target);
		t_end = timing_counter_get();
		remove_cycles[idx] += timing_cycles_get(&t_begin, &t_end);
	}
}
/**
 * Compute the integer square root (floor) of a 64-bit value using the
 * recursive bit-pair method. Recursion depth is bounded at 32 levels.
 *
 * Note: the candidate check is done with division ('hi > square / hi')
 * rather than 'hi * hi > square' because the multiplication overflows
 * when hi == 2^32 (i.e. for inputs near UINT64_MAX), which would make
 * the original form return 2^32 instead of the correct 2^32 - 1.
 */
static uint64_t sqrt_u64(uint64_t square)
{
	if (square > 1) {
		uint64_t lo = sqrt_u64(square >> 2) << 1;
		uint64_t hi = lo + 1;

		/* hi >= 1 here, so the division is safe */
		return (hi > square / hi) ? lo : hi;
	}
	return square;
}
/**
 * Compute and print the minimum, maximum, average and standard deviation
 * of the accumulated per-thread cycle counts.
 *
 * @param num_threads    Number of entries in @a cycles
 * @param num_iterations Iterations over which each entry was accumulated;
 *                       used to convert totals into per-iteration averages
 * @param cycles         Array of accumulated cycle counts (one per thread)
 * @param str            Heading printed before the statistics
 */
static void compute_and_report_stats(unsigned int num_threads,
				     unsigned int num_iterations,
				     uint64_t *cycles,
				     const char *str)
{
	uint64_t minimum = cycles[0];
	uint64_t maximum = cycles[0];
	uint64_t total = cycles[0];
	uint64_t average;
	uint64_t std_dev = 0;
	uint64_t tmp;
	uint64_t diff;
	unsigned int i;

	for (i = 1; i < num_threads; i++) {
		if (cycles[i] > maximum) {
			maximum = cycles[i];
		}

		if (cycles[i] < minimum) {
			minimum = cycles[i];
		}

		total += cycles[i];
	}

	minimum /= (uint64_t)num_iterations;
	maximum /= (uint64_t)num_iterations;

	/*
	 * Promote to 64 bits before multiplying: 'num_threads *
	 * num_iterations' would otherwise be evaluated in 'unsigned int'
	 * and could overflow for large configurations.
	 */
	average = total / ((uint64_t)num_threads * num_iterations);

	/* Standard deviation of the per-thread per-iteration averages */
	for (i = 0; i < num_threads; i++) {
		tmp = cycles[i] / num_iterations;
		diff = (average > tmp) ? (average - tmp) : (tmp - average);

		std_dev += (diff * diff);
	}
	std_dev /= num_threads;
	std_dev = sqrt_u64(std_dev);

	printk("%s\n", str);
	printk("   Minimum      : %7llu cycles (%7u nsec)\n",
	       minimum, (uint32_t)timing_cycles_to_ns(minimum));
	printk("   Maximum      : %7llu cycles (%7u nsec)\n",
	       maximum, (uint32_t)timing_cycles_to_ns(maximum));
	printk("   Average      : %7llu cycles (%7u nsec)\n",
	       average, (uint32_t)timing_cycles_to_ns(average));
	printk("   Std Deviation: %7llu cycles (%7u nsec)\n",
	       std_dev, (uint32_t)timing_cycles_to_ns(std_dev));
}
/**
 * Benchmark entry point. Runs two scenarios over the shared wait queue —
 * pending/unpending threads in decreasing then increasing priority order —
 * each repeated CONFIG_BENCHMARK_NUM_ITERATIONS times, and reports
 * aggregate statistics (plus optional per-thread verbose lines).
 */
int main(void)
{
	unsigned int i;
	unsigned int freq;
#ifdef CONFIG_BENCHMARK_VERBOSE
	char description[120];
	char tag[50];
	struct k_thread *thread;
#endif

	timing_init();
	bench_test_init();

	freq = timing_freq_get_mhz();

	printk("Time Measurements for %s wait queues\n",
	       IS_ENABLED(CONFIG_WAITQ_DUMB) ? "dumb" : "scalable");
	printk("Timing results: Clock frequency: %u MHz\n", freq);

	z_waitq_init(&wait_q);

	dummy_threads_init(CONFIG_BENCHMARK_NUM_THREADS);

	timing_start();

	/* Scenario 1: decreasing priority (tail inserts / head removals) */
	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);

	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
		test_decreasing_priority(&wait_q, CONFIG_BENCHMARK_NUM_THREADS);
	}

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 add_cycles,
				 "Add threads of decreasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	/* Per-thread averages: thread i is added when i waiters are queued */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "WaitQ.add.to.tail.%04u.waiters", i);
		snprintf(description, sizeof(description),
			 "%-40s - Add thread of priority %u",
			 tag, dummy_thread[i].prio);
		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 remove_cycles,
				 "Remove threads of decreasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	/* Thread i is removed while (NUM_THREADS - i) waiters remain queued */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "WaitQ.remove.from.head.%04u.waiters",
			 CONFIG_BENCHMARK_NUM_THREADS - i);
		snprintf(description, sizeof(description),
			 "%-40s - Remove thread of priority %u",
			 tag, dummy_thread[i].prio);
		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	/* Scenario 2: increasing priority (head inserts / tail removals) */
	cycles_reset(CONFIG_BENCHMARK_NUM_THREADS);

	for (i = 0; i < CONFIG_BENCHMARK_NUM_ITERATIONS; i++) {
		test_increasing_priority(&wait_q, CONFIG_BENCHMARK_NUM_THREADS);
	}

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 add_cycles,
				 "Add threads of increasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	/* Threads are added in reverse index order in this scenario */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "WaitQ.add.to.head.%04u.waiters", i);
		thread = (struct k_thread *)
			 &dummy_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
		snprintf(description, sizeof(description),
			 "%-40s - Add priority %u to waitq",
			 tag, thread->base.prio);
		PRINT_STATS_AVG(description, (uint32_t)add_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	printk("------------------------------------\n");

	compute_and_report_stats(CONFIG_BENCHMARK_NUM_THREADS,
				 CONFIG_BENCHMARK_NUM_ITERATIONS,
				 remove_cycles,
				 "Remove threads of increasing priority");

#ifdef CONFIG_BENCHMARK_VERBOSE
	/* Removal i happens with (NUM_THREADS - i) waiters still queued */
	for (i = 0; i < CONFIG_BENCHMARK_NUM_THREADS; i++) {
		snprintf(tag, sizeof(tag),
			 "WaitQ.remove.from.tail.%04u.waiters",
			 CONFIG_BENCHMARK_NUM_THREADS - i);
		thread = (struct k_thread *)
			 &dummy_thread[CONFIG_BENCHMARK_NUM_THREADS - i - 1];
		snprintf(description, sizeof(description),
			 "%-40s - Remove priority %u from waitq",
			 tag, thread->base.prio);
		PRINT_STATS_AVG(description, (uint32_t)remove_cycles[i],
				CONFIG_BENCHMARK_NUM_ITERATIONS);
	}
#endif

	timing_stop();

	TC_END_REPORT(0);

	return 0;
}

View file

@ -0,0 +1,47 @@
/*
* Copyright (c) 2024 Intel Corporation
*
* SPDX-License-Identifier: Apache-2.0
*/
#ifndef __BENCHMARK_WAITQ_UTILS_H
#define __BENCHMARK_WAITQ_UTILS_H
/*
* @brief This file contains macros used in the wait queue benchmarking.
*/
#include <zephyr/sys/printk.h>
#ifdef CSV_FORMAT_OUTPUT
#define FORMAT_STR "%-74s,%s,%s\n"
#define CYCLE_FORMAT "%8u"
#define NSEC_FORMAT "%8u"
#else
#define FORMAT_STR "%-74s:%s , %s\n"
#define CYCLE_FORMAT "%8u cycles"
#define NSEC_FORMAT "%8u ns"
#endif
/**
 * @brief Display a line of statistics
 *
 * This macro displays the following:
 * 1. Test description summary
 * 2. Number of cycles
 * 3. Number of nanoseconds
 *
 * Buffer bounds use sizeof() rather than a hard-coded constant so the
 * limit stays in sync with the buffer size; arguments are parenthesized
 * to keep expansion safe with complex expressions.
 */
#define PRINT_F(summary, cycles, nsec)                                  \
	do {                                                            \
		char cycle_str[32];                                     \
		char nsec_str[32];                                      \
									\
		snprintk(cycle_str, sizeof(cycle_str), CYCLE_FORMAT,    \
			 (cycles));                                     \
		snprintk(nsec_str, sizeof(nsec_str), NSEC_FORMAT,       \
			 (nsec));                                       \
		printk(FORMAT_STR, (summary), cycle_str, nsec_str);     \
	} while (0)
/**
 * @brief Display the per-iteration average of a measurement
 *
 * Divides the accumulated cycle count @a value by @a counter iterations
 * and prints it (via PRINT_F) in both cycles and nanoseconds.
 */
#define PRINT_STATS_AVG(summary, value, counter) \
	PRINT_F(summary, value / counter, \
		(uint32_t)timing_cycles_to_ns_avg(value, counter))
#endif

View file

@ -0,0 +1,21 @@
common:
tags:
- kernel
- benchmark
integration_platforms:
- qemu_x86
- qemu_cortex_a53
harness: console
harness_config:
type: one_line
regex:
- "PROJECT EXECUTION SUCCESSFUL"
tests:
benchmark.wait_queues.dumb:
extra_configs:
- CONFIG_WAITQ_DUMB=y
benchmark.wait_queues.scalable:
extra_configs:
- CONFIG_WAITQ_SCALABLE=y