tests: benchmarks: move pthread_pressure to benchmarks/posix

The pthread_pressure test was not a typical test per se; it was
a benchmark in search of a proper home.

Let's move it to the correct place in the Zephyr tree, add a
doc, and provide some reporting.

Currently, k_threads out-perform pthreads by almost a factor of
2. The theoretical maximum performance of pthreads would be
parity with k_threads, since pthreads are a wrapper around
kernel threads. It would be great to reduce that gap.

Signed-off-by: Chris Friedt <cfriedt@tenstorrent.com>
Commit: 3f60489fae
Author: Chris Friedt, 2024-12-03 14:10:13 -05:00
Committer: Benjamin Cabé
10 changed files with 327 additions and 268 deletions


@@ -1,15 +1,11 @@
# Copyright (c) 2023, Meta
# Copyright (c) 2024, Tenstorrent AI ULC
#
# SPDX-License-Identifier: Apache-2.0
source "Kconfig.zephyr"
mainmenu "POSIX Threads Benchmark"
config TEST_NUM_CPUS
int "Number of CPUs to use in parallel"
range 1 MP_MAX_NUM_CPUS
default MP_MAX_NUM_CPUS
help
The number of parallel threads to run during the test.
source "Kconfig.zephyr"
config TEST_DURATION_S
int "Number of seconds to run the test"
@@ -44,8 +40,7 @@ config TEST_PTHREADS
help
Run tests for pthreads
config TEST_EXTRA_ASSERTIONS
bool "Add extra assertions into the hot path"
default y
config TEST_PERIODIC_STATS
bool "Print statistics periodically"
help
This can be disabled for benchmarking.
Print statistics periodically throughout the benchmark.


@@ -0,0 +1,45 @@
POSIX Thread Benchmark
######################
Overview
********
This benchmark creates and joins as many threads as possible within a configurable time window.
It provides a rough comparison of Zephyr's POSIX threads (pthreads) with Zephyr's kernel
threads (k_threads) API, highlighting the overhead of the POSIX layer. Ideally, this
overhead would shrink over time.
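
As a rough, self-contained illustration of the measured unit of work, the
hypothetical sketch below (plain POSIX, so it also builds with a host
toolchain) counts create/join cycles within a fixed window, much as the
benchmark's hot loop does for both APIs::

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    /* minimal thread body: no payload, so only create/join overhead is measured */
    static void *fn(void *arg) { (void)arg; return NULL; }

    int main(void)
    {
        unsigned long cycles = 0;
        const time_t end = time(NULL) + 5; /* 5 s window, like TEST_DURATION_S */

        while (time(NULL) < end) {
            pthread_t t;

            if (pthread_create(&t, NULL, fn, NULL) == 0) {
                pthread_join(t, NULL);
                ++cycles; /* one completed create/join cycle */
            }
        }
        printf("%lu cycles in 5 s\n", cycles);
        return 0;
    }
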
Sample output of the benchmark::
*** Booting Zephyr OS build v4.0.0-1410-gfca33facee37 ***
ASSERT: y
BOARD: qemu_riscv64
NUM_CPUS: 1
TEST_DELAY_US: 0
TEST_DURATION_S: 5
SMP: n
API, Thread ID, time(s), threads, cores, rate (threads/s/core)
k_thread, ALL, 5, 47663, 1, 9532
pthread, ALL, 5, 28180, 1, 5636
PROJECT EXECUTION SUCCESSFUL
To observe periodic statistics on a per-thread basis in addition to the summary of statistics
printed at the end of execution, use CONFIG_TEST_PERIODIC_STATS.
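
With that option enabled, per-thread lines in the same CSV format are also
printed at each update interval, for example (values illustrative only)::

    k_thread, 0, 1, 9483, 1, 9483
    k_thread, 0, 1, 19012, 1, 9529
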
Several other options can be tuned on an as-needed basis:
- CONFIG_MP_MAX_NUM_CPUS - Number of CPUs to use in parallel.
- CONFIG_TEST_DURATION_S - Number of seconds to run the test.
- CONFIG_TEST_DELAY_US - Microseconds to delay between pthread join and create.
- CONFIG_TEST_KTHREADS - Exercise k_threads in the test app.
- CONFIG_TEST_PTHREADS - Exercise pthreads in the test app.
- CONFIG_TEST_STACK_SIZE - Size of each thread stack in this test.
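
Any of these can be overridden at build time, for example (the application
path shown here is assumed)::

    west build -p -b qemu_riscv64 tests/benchmarks/posix/threads -- \
        -DCONFIG_TEST_DURATION_S=60 -DCONFIG_TEST_KTHREADS=n
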
The following table summarizes the purposes of the extra configuration
files that can be used with this benchmark. A tester may mix and match
them, allowing different scenarios to be easily compared against the
default.
+-----------------+----------------------------------------+
| prj-assert.conf | Enable assertions for API verification |
+-----------------+----------------------------------------+
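
An extra configuration file is typically applied via ``EXTRA_CONF_FILE``,
for example::

    west build -p -b qemu_riscv64 tests/benchmarks/posix/threads -- \
        -DEXTRA_CONF_FILE=prj-assert.conf
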


@@ -0,0 +1,6 @@
CONFIG_FORCE_NO_ASSERT=n
CONFIG_ASSERT=y
# May be enabled for GitHub CI to reduce host scheduling noise while running
# several concurrent Qemu processes each under stressful SMP load.
# CONFIG_PTHREAD_CREATE_BARRIER=y


@@ -0,0 +1,6 @@
CONFIG_TEST=y
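# Keep __ASSERT() checks out of the benchmark hot path by default; the assert
# overlay re-enables them for API verification.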
CONFIG_FORCE_NO_ASSERT=y
CONFIG_POSIX_API=y
CONFIG_POSIX_AEP_CHOICE_BASE=y
CONFIG_POSIX_PRIORITY_SCHEDULING=y


@@ -0,0 +1,243 @@
/*
* Copyright (c) 2023, Meta
* Copyright (c) 2024, Tenstorrent AI ULC
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <pthread.h>
#include <stdio.h>
#include <zephyr/sys/__assert.h>
#include <zephyr/sys/util.h>
#define STACK_SIZE K_THREAD_STACK_LEN(CONFIG_TEST_STACK_SIZE)
/* update interval for printing stats */
#if CONFIG_TEST_DURATION_S >= 60
#define UPDATE_INTERVAL_S 10
#elif CONFIG_TEST_DURATION_S >= 30
#define UPDATE_INTERVAL_S 5
#else
#define UPDATE_INTERVAL_S 1
#endif
/* 32 threads is mainly a limitation of find_lsb_set() */
#define NUM_CPUS MIN(32, MIN(CONFIG_MP_MAX_NUM_CPUS, CONFIG_POSIX_THREAD_THREADS_MAX))
typedef int (*create_fn)(int i);
typedef int (*join_fn)(int i);
static void before(void);
/* per-thread flags, set by each thread once it starts running */
static bool alive[NUM_CPUS];
/* array of thread stacks */
static K_THREAD_STACK_ARRAY_DEFINE(thread_stacks, NUM_CPUS, STACK_SIZE);
static struct k_thread k_threads[NUM_CPUS];
static uint64_t counters[NUM_CPUS];
static uint64_t prev_counters[NUM_CPUS];
static void print_stats(const char *tag, uint64_t now, uint64_t end)
{
for (int i = 0; i < NUM_CPUS; ++i) {
printf("%s, %d, %u, %llu, 1, %llu\n", tag, i, UPDATE_INTERVAL_S, counters[i],
(counters[i] - prev_counters[i]) / UPDATE_INTERVAL_S);
prev_counters[i] = counters[i];
}
}
static void print_group_stats(const char *tag)
{
uint64_t count = 0;
for (int i = 0; i < NUM_CPUS; ++i) {
count += counters[i];
}
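/* aggregate rate = total create/join cycles / seconds / cores */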
printf("%s, ALL, %u, %llu, %u, %llu\n", tag, CONFIG_TEST_DURATION_S, count, NUM_CPUS,
count / CONFIG_TEST_DURATION_S / NUM_CPUS);
}
static void create_join_common(const char *tag, create_fn create, join_fn join)
{
int i;
int __maybe_unused ret;
uint64_t now_ms = k_uptime_get();
const uint64_t end_ms = now_ms + MSEC_PER_SEC * CONFIG_TEST_DURATION_S;
uint64_t update_ms = now_ms + MSEC_PER_SEC * UPDATE_INTERVAL_S;
for (i = 0; i < NUM_CPUS; ++i) {
/* spawn thread i */
prev_counters[i] = 0;
ret = create(i);
__ASSERT(ret == 0, "%s_create(%d)[%llu] failed: %d", tag, i, counters[i], ret);
}
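/*
 * Measurement loop: join each thread once it has marked itself alive,
 * count the completed create/join cycle, and immediately re-spawn it,
 * repeating until the configured test duration elapses.
 */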
do {
if (!IS_ENABLED(CONFIG_SMP)) {
/* allow the test thread to be swapped-out */
k_yield();
}
for (i = 0; i < NUM_CPUS; ++i) {
if (alive[i]) {
ret = join(i);
__ASSERT(ret == 0, "%s_join(%d)[%llu] failed: %d", tag, i, counters[i],
ret);
alive[i] = false;
/* update counter i after each (create,join) pair */
++counters[i];
if (IS_ENABLED(CONFIG_TEST_DELAY_US)) {
/* success with 0 delay means we are ~raceless */
k_busy_wait(CONFIG_TEST_DELAY_US);
}
/* re-spawn thread i */
ret = create(i);
__ASSERT(ret == 0, "%s_create(%d)[%llu] failed: %d", tag, i,
counters[i], ret);
}
}
/* are we there yet? */
now_ms = k_uptime_get();
/* dump some stats periodically */
if (now_ms > update_ms) {
update_ms += MSEC_PER_SEC * UPDATE_INTERVAL_S;
/* at this point, we should have seen many context switches */
for (i = 0; IS_ENABLED(CONFIG_ASSERT) && i < NUM_CPUS; ++i) {
__ASSERT(counters[i] > 0, "%s %d was never scheduled", tag, i);
}
if (IS_ENABLED(CONFIG_TEST_PERIODIC_STATS)) {
print_stats(tag, now_ms, end_ms);
}
}
Z_SPIN_DELAY(100);
} while (end_ms > now_ms);
print_group_stats(tag);
}
/*
* Wrappers for k_threads
*/
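/* The thread body only marks its slot alive, so each cycle measures
 * create/join overhead rather than any payload work.
 */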
static void k_thread_fun(void *arg1, void *arg2, void *arg3)
{
int i = POINTER_TO_INT(arg1);
alive[i] = true;
}
static int k_thread_create_wrapper(int i)
{
k_thread_create(&k_threads[i], thread_stacks[i], STACK_SIZE, k_thread_fun,
INT_TO_POINTER(i), NULL, NULL, K_HIGHEST_APPLICATION_THREAD_PRIO, 0,
K_NO_WAIT);
return 0;
}
static int k_thread_join_wrapper(int i)
{
return k_thread_join(&k_threads[i], K_FOREVER);
}
static void create_join_kthread(void)
{
if (IS_ENABLED(CONFIG_TEST_KTHREADS)) {
before();
create_join_common("k_thread", k_thread_create_wrapper, k_thread_join_wrapper);
}
}
/*
* Wrappers for pthreads
*/
static pthread_t pthreads[NUM_CPUS];
static pthread_attr_t pthread_attrs[NUM_CPUS];
static void *pthread_fun(void *arg)
{
k_thread_fun(arg, NULL, NULL);
return NULL;
}
static int pthread_create_wrapper(int i)
{
return pthread_create(&pthreads[i], &pthread_attrs[i], pthread_fun, INT_TO_POINTER(i));
}
static int pthread_join_wrapper(int i)
{
return pthread_join(pthreads[i], NULL);
}
static void create_join_pthread(void)
{
if (IS_ENABLED(CONFIG_TEST_PTHREADS)) {
before();
create_join_common("pthread", pthread_create_wrapper, pthread_join_wrapper);
}
}
static void setup(void)
{
printf("ASSERT: %c\n", IS_ENABLED(CONFIG_ASSERT) ? 'y' : 'n');
printf("BOARD: %s\n", CONFIG_BOARD);
printf("NUM_CPUS: %u\n", NUM_CPUS);
printf("TEST_DELAY_US: %u\n", CONFIG_TEST_DELAY_US);
printf("TEST_DURATION_S: %u\n", CONFIG_TEST_DURATION_S);
printf("SMP: %c\n", IS_ENABLED(CONFIG_SMP) ? 'y' : 'n');
printf("API, Thread ID, time(s), threads, cores, rate (threads/s/core)\n");
if (IS_ENABLED(CONFIG_TEST_PTHREADS)) {
int __maybe_unused ret;
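/* Run pthreads at maximum SCHED_FIFO priority, mirroring the
 * K_HIGHEST_APPLICATION_THREAD_PRIO used for the k_thread variant.
 */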
const struct sched_param param = {
.sched_priority = sched_get_priority_max(SCHED_FIFO),
};
/* setup pthread stacks */
for (int i = 0; i < NUM_CPUS; ++i) {
ret = pthread_attr_init(&pthread_attrs[i]);
__ASSERT(ret == 0, "pthread_attr_init[%d] failed: %d", i, ret);
ret = pthread_attr_setstack(&pthread_attrs[i], thread_stacks[i],
STACK_SIZE);
__ASSERT(ret == 0, "pthread_attr_setstack[%d] failed: %d", i, ret);
ret = pthread_attr_setschedpolicy(&pthread_attrs[i], SCHED_FIFO);
__ASSERT(ret == 0, "pthread_attr_setschedpolicy[%d] failed: %d", i, ret);
ret = pthread_attr_setschedparam(&pthread_attrs[i], &param);
__ASSERT(ret == 0, "pthread_attr_setschedparam[%d] failed: %d", i, ret);
}
}
}
static void before(void)
{
for (int i = 0; i < NUM_CPUS; ++i) {
counters[i] = 0;
}
}
int main(void)
{
setup();
create_join_kthread();
create_join_pthread();
printf("PROJECT EXECUTION SUCCESSFUL\n");
}


@@ -0,0 +1,21 @@
common:
  tags:
    - posix
    - benchmark
  min_ram: 64
  arch_exclude:
    - posix
  integration_platforms:
    - qemu_cortex_a53/qemu_cortex_a53/smp
    - qemu_riscv64/qemu_virt_riscv64/smp
    - qemu_riscv32/qemu_virt_riscv32/smp
    - qemu_x86_64
  harness: console
  harness_config:
    type: one_line
    record:
      regex: "(?P<api>.*), ALL, (?P<time>.*), (?P<threads>.*), (?P<cores>.*), (?P<rate>.*)"
    regex:
      - "PROJECT EXECUTION SUCCESSFUL"
tests:
  benchmark.posix.threads: {}


@@ -1,11 +0,0 @@
CONFIG_ZTEST=y
CONFIG_POSIX_API=y
CONFIG_POSIX_PRIORITY_SCHEDULING=y
## Note: for benchmarking purposes, uncomment the Kconfig below
# CONFIG_TEST_DURATION_S=60
# CONFIG_TEST_EXTRA_ASSERTIONS=n
# CONFIG_ASSERT=n
## Optionally, uncomment this to only test pthreads:
# CONFIG_TEST_KTHREADS=n


@@ -1,230 +0,0 @@
/*
* Copyright (c) 2023, Meta
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <pthread.h>
#include <zephyr/sys/util.h>
#include <zephyr/ztest.h>
#define STACK_SIZE K_THREAD_STACK_LEN(CONFIG_TEST_STACK_SIZE)
/* update interval for printing stats */
#if CONFIG_TEST_DURATION_S >= 60
#define UPDATE_INTERVAL_S 10
#elif CONFIG_TEST_DURATION_S >= 30
#define UPDATE_INTERVAL_S 5
#else
#define UPDATE_INTERVAL_S 1
#endif
/* 32 threads is mainly a limitation of find_lsb_set() */
#define NUM_THREADS MIN(32, MIN(CONFIG_TEST_NUM_CPUS, CONFIG_POSIX_THREAD_THREADS_MAX))
typedef int (*create_fn)(int i);
typedef int (*join_fn)(int i);
static void *setup(void);
static void before(void *fixture);
/* bitmask of available threads */
static bool alive[NUM_THREADS];
/* array of thread stacks */
static K_THREAD_STACK_ARRAY_DEFINE(thread_stacks, NUM_THREADS, STACK_SIZE);
static struct k_thread k_threads[NUM_THREADS];
static uint64_t counters[NUM_THREADS];
static uint64_t prev_counters[NUM_THREADS];
static void print_stats(uint64_t now, uint64_t end)
{
printk("now (ms): %llu end (ms): %llu\n", now, end);
for (int i = 0; i < NUM_THREADS; ++i) {
printk("Thread %d created and joined %llu times (%llu joins/s)\n", i, counters[i],
(counters[i] - prev_counters[i]) / UPDATE_INTERVAL_S);
prev_counters[i] = counters[i];
}
}
static void test_create_join_common(const char *tag, create_fn create, join_fn join)
{
int i;
int ret;
uint64_t now_ms = k_uptime_get();
const uint64_t end_ms = now_ms + MSEC_PER_SEC * CONFIG_TEST_DURATION_S;
uint64_t update_ms = now_ms + MSEC_PER_SEC * UPDATE_INTERVAL_S;
printk("BOARD: %s\n", CONFIG_BOARD);
printk("CONFIG_SMP: %s\n", IS_ENABLED(CONFIG_SMP) ? "y" : "n");
printk("NUM_THREADS: %u\n", NUM_THREADS);
printk("TEST_NUM_CPUS: %u\n", CONFIG_TEST_NUM_CPUS);
printk("TEST_DURATION_S: %u\n", CONFIG_TEST_DURATION_S);
printk("TEST_DELAY_US: %u\n", CONFIG_TEST_DELAY_US);
for (i = 0; i < NUM_THREADS; ++i) {
/* spawn thread i */
prev_counters[i] = 0;
ret = create(i);
if (IS_ENABLED(CONFIG_TEST_EXTRA_ASSERTIONS)) {
zassert_ok(ret, "%s_create(%d)[%zu] failed: %d", tag, i, counters[i], ret);
}
}
do {
if (!IS_ENABLED(CONFIG_SMP)) {
/* allow the test thread to be swapped-out */
k_yield();
}
for (i = 0; i < NUM_THREADS; ++i) {
if (alive[i]) {
ret = join(i);
if (IS_ENABLED(CONFIG_TEST_EXTRA_ASSERTIONS)) {
zassert_ok(ret, "%s_join(%d)[%zu] failed: %d", tag, i,
counters[i], ret);
}
alive[i] = false;
/* update counter i after each (create,join) pair */
++counters[i];
if (IS_ENABLED(CONFIG_TEST_DELAY_US)) {
/* success with 0 delay means we are ~raceless */
k_busy_wait(CONFIG_TEST_DELAY_US);
}
/* re-spawn thread i */
ret = create(i);
if (IS_ENABLED(CONFIG_TEST_EXTRA_ASSERTIONS)) {
zassert_ok(ret, "%s_create(%d)[%zu] failed: %d", tag, i,
counters[i], ret);
}
}
}
/* are we there yet? */
now_ms = k_uptime_get();
/* dump some stats periodically */
if (now_ms > update_ms) {
update_ms += MSEC_PER_SEC * UPDATE_INTERVAL_S;
/* at this point, we should have seen many context switches */
for (i = 0; i < NUM_THREADS; ++i) {
if (IS_ENABLED(CONFIG_TEST_EXTRA_ASSERTIONS)) {
zassert_true(counters[i] > 0, "%s %d was never scheduled",
tag, i);
}
}
print_stats(now_ms, end_ms);
}
Z_SPIN_DELAY(100);
} while (end_ms > now_ms);
print_stats(now_ms, end_ms);
}
/*
* Wrappers for k_threads
*/
static void k_thread_fun(void *arg1, void *arg2, void *arg3)
{
int i = POINTER_TO_INT(arg1);
alive[i] = true;
}
static int k_thread_create_wrapper(int i)
{
k_thread_create(&k_threads[i], thread_stacks[i], STACK_SIZE, k_thread_fun,
INT_TO_POINTER(i), NULL, NULL, K_HIGHEST_APPLICATION_THREAD_PRIO, 0,
K_NO_WAIT);
return 0;
}
static int k_thread_join_wrapper(int i)
{
return k_thread_join(&k_threads[i], K_FOREVER);
}
ZTEST(pthread_pressure, test_k_thread_create_join)
{
if (IS_ENABLED(CONFIG_TEST_KTHREADS)) {
test_create_join_common("k_thread", k_thread_create_wrapper, k_thread_join_wrapper);
} else {
ztest_test_skip();
}
}
/*
* Wrappers for pthreads
*/
static pthread_t pthreads[NUM_THREADS];
static pthread_attr_t pthread_attrs[NUM_THREADS];
static void *pthread_fun(void *arg)
{
k_thread_fun(arg, NULL, NULL);
return NULL;
}
static int pthread_create_wrapper(int i)
{
return pthread_create(&pthreads[i], &pthread_attrs[i], pthread_fun, INT_TO_POINTER(i));
}
static int pthread_join_wrapper(int i)
{
return pthread_join(pthreads[i], NULL);
}
ZTEST(pthread_pressure, test_pthread_create_join)
{
if (IS_ENABLED(CONFIG_TEST_PTHREADS)) {
test_create_join_common("pthread", pthread_create_wrapper, pthread_join_wrapper);
} else {
ztest_test_skip();
}
}
/*
* Test suite / fixture
*/
ZTEST_SUITE(pthread_pressure, NULL, setup, before, NULL, NULL);
static void *setup(void)
{
if (IS_ENABLED(CONFIG_TEST_PTHREADS)) {
const struct sched_param param = {
.sched_priority = sched_get_priority_max(SCHED_FIFO),
};
/* setup pthread stacks */
for (int i = 0; i < NUM_THREADS; ++i) {
zassert_ok(pthread_attr_init(&pthread_attrs[i]));
zassert_ok(pthread_attr_setstack(&pthread_attrs[i], thread_stacks[i],
STACK_SIZE));
zassert_ok(pthread_attr_setschedpolicy(&pthread_attrs[i], SCHED_FIFO));
zassert_ok(pthread_attr_setschedparam(&pthread_attrs[i], &param));
}
}
return NULL;
}
static void before(void *fixture)
{
ARG_UNUSED(before);
for (int i = 0; i < NUM_THREADS; ++i) {
counters[i] = 0;
}
}


@@ -1,16 +0,0 @@
common:
  tags: posix
  min_ram: 64
  arch_exclude:
    - posix
  integration_platforms:
    - qemu_cortex_a53/qemu_cortex_a53/smp
    - qemu_riscv64/qemu_virt_riscv64/smp
    - qemu_riscv32/qemu_virt_riscv32/smp
    - qemu_x86_64
tests:
  portability.posix.pthread_pressure:
    extra_configs:
      # Enabled for GitHub CI to reduce host scheduling noise while running
      # several concurrent Qemu processes each under stressful SMP load.
      - CONFIG_PTHREAD_CREATE_BARRIER=y