tests: benchmarks: move pthread_pressure to benchmarks/posix

The pthread_pressure test was not a typical test per se; it was
a benchmark in search of a proper home.

Let's move it to the correct place in the Zephyr tree, add a
doc, and provide some reporting.

Currently, k_threads out-perform pthreads by almost a factor of
2. The theoretical maximum performance of pthreads would be
parity with k_threads, since pthreads are a wrapper around
kernel threads. It would be great to reduce that gap.

Signed-off-by: Chris Friedt <cfriedt@tenstorrent.com>
Commit: 3f60489fae
Author: Chris Friedt, 2024-12-03 14:10:13 -05:00
Committer: Benjamin Cabé
10 changed files with 327 additions and 268 deletions


@@ -1,15 +1,11 @@
# Copyright (c) 2023, Meta
# Copyright (c) 2024, Tenstorrent AI ULC
#
# SPDX-License-Identifier: Apache-2.0
source "Kconfig.zephyr"
mainmenu "POSIX Threads Benchmark"
config TEST_NUM_CPUS
int "Number of CPUs to use in parallel"
range 1 MP_MAX_NUM_CPUS
default MP_MAX_NUM_CPUS
help
The number of parallel threads to run during the test.
source "Kconfig.zephyr"
config TEST_DURATION_S
int "Number of seconds to run the test"
@@ -44,8 +40,7 @@ config TEST_PTHREADS
help
Run tests for pthreads
config TEST_EXTRA_ASSERTIONS
bool "Add extra assertions into the hot path"
default y
config TEST_PERIODIC_STATS
bool "Print statistics periodically"
help
This can be disabled for benchmarking.
Print statistics periodically throughout the benchmark.


@@ -0,0 +1,45 @@
POSIX Thread Benchmark
######################
Overview
********
This benchmark creates and joins as many threads as possible within a configurable time window.
It provides a rough comparison of Zephyr's POSIX threads (pthreads) with Zephyr's kernel
threads (k_threads) API, highlighting the overhead of the POSIX layer. Ideally, this
overhead would shrink over time.
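
As a rough, self-contained illustration of the measured unit of work, the
hypothetical sketch below (plain POSIX, so it also builds with a host
toolchain) counts create/join cycles within a fixed window, much as the
benchmark's hot loop does for both APIs::

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    /* minimal thread body: no payload, so only create/join overhead is measured */
    static void *fn(void *arg) { (void)arg; return NULL; }

    int main(void)
    {
        unsigned long cycles = 0;
        const time_t end = time(NULL) + 5; /* 5 s window, like TEST_DURATION_S */

        while (time(NULL) < end) {
            pthread_t t;

            if (pthread_create(&t, NULL, fn, NULL) == 0) {
                pthread_join(t, NULL);
                ++cycles; /* one completed create/join cycle */
            }
        }
        printf("%lu cycles in 5 s\n", cycles);
        return 0;
    }
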
Sample output of the benchmark::
*** Booting Zephyr OS build v4.0.0-1410-gfca33facee37 ***
ASSERT: y
BOARD: qemu_riscv64
NUM_CPUS: 1
TEST_DELAY_US: 0
TEST_DURATION_S: 5
SMP: n
API, Thread ID, time(s), threads, cores, rate (threads/s/core)
k_thread, ALL, 5, 47663, 1, 9532
pthread, ALL, 5, 28180, 1, 5636
PROJECT EXECUTION SUCCESSFUL
To observe periodic statistics on a per-thread basis in addition to the summary of statistics
printed at the end of execution, use CONFIG_TEST_PERIODIC_STATS.
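
With that option enabled, per-thread lines in the same CSV format are also
printed at each update interval, for example (values illustrative only)::

    k_thread, 0, 1, 9483, 1, 9483
    k_thread, 0, 1, 19012, 1, 9529
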
Several other options can be tuned on an as-needed basis:
- CONFIG_MP_MAX_NUM_CPUS - Number of CPUs to use in parallel.
- CONFIG_TEST_DURATION_S - Number of seconds to run the test.
- CONFIG_TEST_DELAY_US - Microseconds to delay between pthread join and create.
- CONFIG_TEST_KTHREADS - Exercise k_threads in the test app.
- CONFIG_TEST_PTHREADS - Exercise pthreads in the test app.
- CONFIG_TEST_STACK_SIZE - Size of each thread stack in this test.
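
Any of these can be overridden at build time, for example (the application
path shown here is assumed)::

    west build -p -b qemu_riscv64 tests/benchmarks/posix/threads -- \
        -DCONFIG_TEST_DURATION_S=60 -DCONFIG_TEST_KTHREADS=n
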
The following table summarizes the purposes of the extra configuration
files that can be used with this benchmark. A tester may mix and match
them, allowing different scenarios to be easily compared against the
default.
+-----------------+----------------------------------------+
| prj-assert.conf | Enable assertions for API verification |
+-----------------+----------------------------------------+
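
An extra configuration file is typically applied via ``EXTRA_CONF_FILE``,
for example::

    west build -p -b qemu_riscv64 tests/benchmarks/posix/threads -- \
        -DEXTRA_CONF_FILE=prj-assert.conf
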


@@ -0,0 +1,6 @@
CONFIG_FORCE_NO_ASSERT=n
CONFIG_ASSERT=y
# May be enabled for GitHub CI to reduce host scheduling noise while running
# several concurrent Qemu processes each under stressful SMP load.
# CONFIG_PTHREAD_CREATE_BARRIER=y


@@ -0,0 +1,6 @@
CONFIG_TEST=y
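# Keep __ASSERT() checks out of the benchmark hot path by default; the assert
# overlay re-enables them for API verification.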
CONFIG_FORCE_NO_ASSERT=y
CONFIG_POSIX_API=y
CONFIG_POSIX_AEP_CHOICE_BASE=y
CONFIG_POSIX_PRIORITY_SCHEDULING=y


@@ -0,0 +1,243 @@
/*
* Copyright (c) 2023, Meta
* Copyright (c) 2024, Tenstorrent AI ULC
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <pthread.h>
#include <stdio.h>
#include <zephyr/sys/__assert.h>
#include <zephyr/sys/util.h>
#define STACK_SIZE K_THREAD_STACK_LEN(CONFIG_TEST_STACK_SIZE)
/* update interval for printing stats */
#if CONFIG_TEST_DURATION_S >= 60
#define UPDATE_INTERVAL_S 10
#elif CONFIG_TEST_DURATION_S >= 30
#define UPDATE_INTERVAL_S 5
#else
#define UPDATE_INTERVAL_S 1
#endif
/* 32 threads is mainly a limitation of find_lsb_set() */
#define NUM_CPUS MIN(32, MIN(CONFIG_MP_MAX_NUM_CPUS, CONFIG_POSIX_THREAD_THREADS_MAX))
typedef int (*create_fn)(int i);
typedef int (*join_fn)(int i);
static void before(void);
/* per-thread flags, set by each thread once it starts running */
static bool alive[NUM_CPUS];
/* array of thread stacks */
static K_THREAD_STACK_ARRAY_DEFINE(thread_stacks, NUM_CPUS, STACK_SIZE);
static struct k_thread k_threads[NUM_CPUS];
static uint64_t counters[NUM_CPUS];
static uint64_t prev_counters[NUM_CPUS];
static void print_stats(const char *tag, uint64_t now, uint64_t end)
{
for (int i = 0; i < NUM_CPUS; ++i) {
printf("%s, %d, %u, %llu, 1, %llu\n", tag, i, UPDATE_INTERVAL_S, counters[i],
(counters[i] - prev_counters[i]) / UPDATE_INTERVAL_S);
prev_counters[i] = counters[i];
}
}
static void print_group_stats(const char *tag)
{
uint64_t count = 0;
for (int i = 0; i < NUM_CPUS; ++i) {
count += counters[i];
}
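/* aggregate rate = total create/join cycles / seconds / cores */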
printf("%s, ALL, %u, %llu, %u, %llu\n", tag, CONFIG_TEST_DURATION_S, count, NUM_CPUS,
count / CONFIG_TEST_DURATION_S / NUM_CPUS);
}
static void create_join_common(const char *tag, create_fn create, join_fn join)
{
int i;
int __maybe_unused ret;
uint64_t now_ms = k_uptime_get();
const uint64_t end_ms = now_ms + MSEC_PER_SEC * CONFIG_TEST_DURATION_S;
uint64_t update_ms = now_ms + MSEC_PER_SEC * UPDATE_INTERVAL_S;
for (i = 0; i < NUM_CPUS; ++i) {
/* spawn thread i */
prev_counters[i] = 0;
ret = create(i);
__ASSERT(ret == 0, "%s_create(%d)[%llu] failed: %d", tag, i, counters[i], ret);
}
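/*
 * Measurement loop: join each thread once it has marked itself alive,
 * count the completed create/join cycle, and immediately re-spawn it,
 * repeating until the configured test duration elapses.
 */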
do {
if (!IS_ENABLED(CONFIG_SMP)) {
/* allow the test thread to be swapped-out */
k_yield();
}
for (i = 0; i < NUM_CPUS; ++i) {
if (alive[i]) {
ret = join(i);
__ASSERT(ret == 0, "%s_join(%d)[%llu] failed: %d", tag, i, counters[i],
ret);
alive[i] = false;
/* update counter i after each (create,join) pair */
++counters[i];
if (IS_ENABLED(CONFIG_TEST_DELAY_US)) {
/* success with 0 delay means we are ~raceless */
k_busy_wait(CONFIG_TEST_DELAY_US);
}
/* re-spawn thread i */
ret = create(i);
__ASSERT(ret == 0, "%s_create(%d)[%llu] failed: %d", tag, i,
counters[i], ret);
}
}
/* are we there yet? */
now_ms = k_uptime_get();
/* dump some stats periodically */
if (now_ms > update_ms) {
update_ms += MSEC_PER_SEC * UPDATE_INTERVAL_S;
/* at this point, we should have seen many context switches */
for (i = 0; IS_ENABLED(CONFIG_ASSERT) && i < NUM_CPUS; ++i) {
__ASSERT(counters[i] > 0, "%s %d was never scheduled", tag, i);
}
if (IS_ENABLED(CONFIG_TEST_PERIODIC_STATS)) {
print_stats(tag, now_ms, end_ms);
}
}
Z_SPIN_DELAY(100);
} while (end_ms > now_ms);
print_group_stats(tag);
}
/*
* Wrappers for k_threads
*/
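/* The thread body only marks its slot alive, so each cycle measures
 * create/join overhead rather than any payload work.
 */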
static void k_thread_fun(void *arg1, void *arg2, void *arg3)
{
int i = POINTER_TO_INT(arg1);
alive[i] = true;
}
static int k_thread_create_wrapper(int i)
{
k_thread_create(&k_threads[i], thread_stacks[i], STACK_SIZE, k_thread_fun,
INT_TO_POINTER(i), NULL, NULL, K_HIGHEST_APPLICATION_THREAD_PRIO, 0,
K_NO_WAIT);
return 0;
}
static int k_thread_join_wrapper(int i)
{
return k_thread_join(&k_threads[i], K_FOREVER);
}
static void create_join_kthread(void)
{
if (IS_ENABLED(CONFIG_TEST_KTHREADS)) {
before();
create_join_common("k_thread", k_thread_create_wrapper, k_thread_join_wrapper);
}
}
/*
* Wrappers for pthreads
*/
static pthread_t pthreads[NUM_CPUS];
static pthread_attr_t pthread_attrs[NUM_CPUS];
static void *pthread_fun(void *arg)
{
k_thread_fun(arg, NULL, NULL);
return NULL;
}
static int pthread_create_wrapper(int i)
{
return pthread_create(&pthreads[i], &pthread_attrs[i], pthread_fun, INT_TO_POINTER(i));
}
static int pthread_join_wrapper(int i)
{
return pthread_join(pthreads[i], NULL);
}
static void create_join_pthread(void)
{
if (IS_ENABLED(CONFIG_TEST_PTHREADS)) {
before();
create_join_common("pthread", pthread_create_wrapper, pthread_join_wrapper);
}
}
static void setup(void)
{
printf("ASSERT: %c\n", IS_ENABLED(CONFIG_ASSERT) ? 'y' : 'n');
printf("BOARD: %s\n", CONFIG_BOARD);
printf("NUM_CPUS: %u\n", NUM_CPUS);
printf("TEST_DELAY_US: %u\n", CONFIG_TEST_DELAY_US);
printf("TEST_DURATION_S: %u\n", CONFIG_TEST_DURATION_S);
printf("SMP: %c\n", IS_ENABLED(CONFIG_SMP) ? 'y' : 'n');
printf("API, Thread ID, time(s), threads, cores, rate (threads/s/core)\n");
if (IS_ENABLED(CONFIG_TEST_PTHREADS)) {
int __maybe_unused ret;
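/* Run pthreads at maximum SCHED_FIFO priority, mirroring the
 * K_HIGHEST_APPLICATION_THREAD_PRIO used for the k_thread variant.
 */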
const struct sched_param param = {
.sched_priority = sched_get_priority_max(SCHED_FIFO),
};
/* setup pthread stacks */
for (int i = 0; i < NUM_CPUS; ++i) {
ret = pthread_attr_init(&pthread_attrs[i]);
__ASSERT(ret == 0, "pthread_attr_init[%d] failed: %d", i, ret);
ret = pthread_attr_setstack(&pthread_attrs[i], thread_stacks[i],
STACK_SIZE);
__ASSERT(ret == 0, "pthread_attr_setstack[%d] failed: %d", i, ret);
ret = pthread_attr_setschedpolicy(&pthread_attrs[i], SCHED_FIFO);
__ASSERT(ret == 0, "pthread_attr_setschedpolicy[%d] failed: %d", i, ret);
ret = pthread_attr_setschedparam(&pthread_attrs[i], &param);
__ASSERT(ret == 0, "pthread_attr_setschedparam[%d] failed: %d", i, ret);
}
}
}
static void before(void)
{
for (int i = 0; i < NUM_CPUS; ++i) {
counters[i] = 0;
}
}
int main(void)
{
setup();
create_join_kthread();
create_join_pthread();
printf("PROJECT EXECUTION SUCCESSFUL\n");
}


@@ -0,0 +1,21 @@
common:
  tags:
    - posix
    - benchmark
  min_ram: 64
  arch_exclude:
    - posix
  integration_platforms:
    - qemu_cortex_a53/qemu_cortex_a53/smp
    - qemu_riscv64/qemu_virt_riscv64/smp
    - qemu_riscv32/qemu_virt_riscv32/smp
    - qemu_x86_64
  harness: console
  harness_config:
    type: one_line
    record:
      regex: "(?P<api>.*), ALL, (?P<time>.*), (?P<threads>.*), (?P<cores>.*), (?P<rate>.*)"
    regex:
      - "PROJECT EXECUTION SUCCESSFUL"
tests:
  benchmark.posix.threads: {}


@@ -1,11 +0,0 @@
CONFIG_ZTEST=y
CONFIG_POSIX_API=y
CONFIG_POSIX_PRIORITY_SCHEDULING=y
## Note: for benchmarking purposes, uncomment the Kconfig below
# CONFIG_TEST_DURATION_S=60
# CONFIG_TEST_EXTRA_ASSERTIONS=n
# CONFIG_ASSERT=n
## Optionally, uncomment this to only test pthreads:
# CONFIG_TEST_KTHREADS=n


@@ -1,230 +0,0 @@
/*
* Copyright (c) 2023, Meta
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <pthread.h>
#include <zephyr/sys/util.h>
#include <zephyr/ztest.h>
#define STACK_SIZE K_THREAD_STACK_LEN(CONFIG_TEST_STACK_SIZE)
/* update interval for printing stats */
#if CONFIG_TEST_DURATION_S >= 60
#define UPDATE_INTERVAL_S 10
#elif CONFIG_TEST_DURATION_S >= 30
#define UPDATE_INTERVAL_S 5
#else
#define UPDATE_INTERVAL_S 1
#endif
/* 32 threads is mainly a limitation of find_lsb_set() */
#define NUM_THREADS MIN(32, MIN(CONFIG_TEST_NUM_CPUS, CONFIG_POSIX_THREAD_THREADS_MAX))
typedef int (*create_fn)(int i);
typedef int (*join_fn)(int i);
static void *setup(void);
static void before(void *fixture);
/* bitmask of available threads */
static bool alive[NUM_THREADS];
/* array of thread stacks */
static K_THREAD_STACK_ARRAY_DEFINE(thread_stacks, NUM_THREADS, STACK_SIZE);
static struct k_thread k_threads[NUM_THREADS];
static uint64_t counters[NUM_THREADS];
static uint64_t prev_counters[NUM_THREADS];
static void print_stats(uint64_t now, uint64_t end)
{
printk("now (ms): %llu end (ms): %llu\n", now, end);
for (int i = 0; i < NUM_THREADS; ++i) {
printk("Thread %d created and joined %llu times (%llu joins/s)\n", i, counters[i],
(counters[i] - prev_counters[i]) / UPDATE_INTERVAL_S);
prev_counters[i] = counters[i];
}
}
static void test_create_join_common(const char *tag, create_fn create, join_fn join)
{
int i;
int ret;
uint64_t now_ms = k_uptime_get();
const uint64_t end_ms = now_ms + MSEC_PER_SEC * CONFIG_TEST_DURATION_S;
uint64_t update_ms = now_ms + MSEC_PER_SEC * UPDATE_INTERVAL_S;
printk("BOARD: %s\n", CONFIG_BOARD);
printk("CONFIG_SMP: %s\n", IS_ENABLED(CONFIG_SMP) ? "y" : "n");
printk("NUM_THREADS: %u\n", NUM_THREADS);
printk("TEST_NUM_CPUS: %u\n", CONFIG_TEST_NUM_CPUS);
printk("TEST_DURATION_S: %u\n", CONFIG_TEST_DURATION_S);
printk("TEST_DELAY_US: %u\n", CONFIG_TEST_DELAY_US);
for (i = 0; i < NUM_THREADS; ++i) {
/* spawn thread i */
prev_counters[i] = 0;
ret = create(i);
if (IS_ENABLED(CONFIG_TEST_EXTRA_ASSERTIONS)) {
zassert_ok(ret, "%s_create(%d)[%zu] failed: %d", tag, i, counters[i], ret);
}
}
do {
if (!IS_ENABLED(CONFIG_SMP)) {
/* allow the test thread to be swapped-out */
k_yield();
}
for (i = 0; i < NUM_THREADS; ++i) {
if (alive[i]) {
ret = join(i);
if (IS_ENABLED(CONFIG_TEST_EXTRA_ASSERTIONS)) {
zassert_ok(ret, "%s_join(%d)[%zu] failed: %d", tag, i,
counters[i], ret);
}
alive[i] = false;
/* update counter i after each (create,join) pair */
++counters[i];
if (IS_ENABLED(CONFIG_TEST_DELAY_US)) {
/* success with 0 delay means we are ~raceless */
k_busy_wait(CONFIG_TEST_DELAY_US);
}
/* re-spawn thread i */
ret = create(i);
if (IS_ENABLED(CONFIG_TEST_EXTRA_ASSERTIONS)) {
zassert_ok(ret, "%s_create(%d)[%zu] failed: %d", tag, i,
counters[i], ret);
}
}
}
/* are we there yet? */
now_ms = k_uptime_get();
/* dump some stats periodically */
if (now_ms > update_ms) {
update_ms += MSEC_PER_SEC * UPDATE_INTERVAL_S;
/* at this point, we should have seen many context switches */
for (i = 0; i < NUM_THREADS; ++i) {
if (IS_ENABLED(CONFIG_TEST_EXTRA_ASSERTIONS)) {
zassert_true(counters[i] > 0, "%s %d was never scheduled",
tag, i);
}
}
print_stats(now_ms, end_ms);
}
Z_SPIN_DELAY(100);
} while (end_ms > now_ms);
print_stats(now_ms, end_ms);
}
/*
* Wrappers for k_threads
*/
static void k_thread_fun(void *arg1, void *arg2, void *arg3)
{
int i = POINTER_TO_INT(arg1);
alive[i] = true;
}
static int k_thread_create_wrapper(int i)
{
k_thread_create(&k_threads[i], thread_stacks[i], STACK_SIZE, k_thread_fun,
INT_TO_POINTER(i), NULL, NULL, K_HIGHEST_APPLICATION_THREAD_PRIO, 0,
K_NO_WAIT);
return 0;
}
static int k_thread_join_wrapper(int i)
{
return k_thread_join(&k_threads[i], K_FOREVER);
}
ZTEST(pthread_pressure, test_k_thread_create_join)
{
if (IS_ENABLED(CONFIG_TEST_KTHREADS)) {
test_create_join_common("k_thread", k_thread_create_wrapper, k_thread_join_wrapper);
} else {
ztest_test_skip();
}
}
/*
* Wrappers for pthreads
*/
static pthread_t pthreads[NUM_THREADS];
static pthread_attr_t pthread_attrs[NUM_THREADS];
static void *pthread_fun(void *arg)
{
k_thread_fun(arg, NULL, NULL);
return NULL;
}
static int pthread_create_wrapper(int i)
{
return pthread_create(&pthreads[i], &pthread_attrs[i], pthread_fun, INT_TO_POINTER(i));
}
static int pthread_join_wrapper(int i)
{
return pthread_join(pthreads[i], NULL);
}
ZTEST(pthread_pressure, test_pthread_create_join)
{
if (IS_ENABLED(CONFIG_TEST_PTHREADS)) {
test_create_join_common("pthread", pthread_create_wrapper, pthread_join_wrapper);
} else {
ztest_test_skip();
}
}
/*
* Test suite / fixture
*/
ZTEST_SUITE(pthread_pressure, NULL, setup, before, NULL, NULL);
static void *setup(void)
{
if (IS_ENABLED(CONFIG_TEST_PTHREADS)) {
const struct sched_param param = {
.sched_priority = sched_get_priority_max(SCHED_FIFO),
};
/* setup pthread stacks */
for (int i = 0; i < NUM_THREADS; ++i) {
zassert_ok(pthread_attr_init(&pthread_attrs[i]));
zassert_ok(pthread_attr_setstack(&pthread_attrs[i], thread_stacks[i],
STACK_SIZE));
zassert_ok(pthread_attr_setschedpolicy(&pthread_attrs[i], SCHED_FIFO));
zassert_ok(pthread_attr_setschedparam(&pthread_attrs[i], &param));
}
}
return NULL;
}
static void before(void *fixture)
{
ARG_UNUSED(before);
for (int i = 0; i < NUM_THREADS; ++i) {
counters[i] = 0;
}
}


@@ -1,16 +0,0 @@
common:
  tags: posix
  min_ram: 64
  arch_exclude:
    - posix
  integration_platforms:
    - qemu_cortex_a53/qemu_cortex_a53/smp
    - qemu_riscv64/qemu_virt_riscv64/smp
    - qemu_riscv32/qemu_virt_riscv32/smp
    - qemu_x86_64
tests:
  portability.posix.pthread_pressure:
    extra_configs:
      # Enabled for GitHub CI to reduce host scheduling noise while running
      # several concurrent Qemu processes each under stressful SMP load.
      - CONFIG_PTHREAD_CREATE_BARRIER=y