summary | refs | log | tree | commit | diff
path: root/tools/sched_ext
diff options
context:
space:
mode:
author: Tejun Heo <tj@kernel.org> 2024-06-18 10:09:18 -1000
committer: Tejun Heo <tj@kernel.org> 2024-06-18 10:09:18 -1000
commit: 07814a9439a3b03d79a1001614b5bc1cab69bcec (patch)
tree: 045150e5c235dbf6754efb6b15d47e94d0354499 /tools/sched_ext
parent: 1538e33995eaf3f315cbb5506019b9f913ed8555 (diff)
sched_ext: Print debug dump after an error exit
If a BPF scheduler triggers an error, the scheduler is aborted and the system is reverted to the built-in scheduler. In the process, a lot of information which may be useful for figuring out what happened can be lost. This patch adds debug dump which captures information which may be useful for debugging including runqueue and runnable thread states at the time of failure. The following shows a debug dump after triggering the watchdog: root@test ~# os/work/tools/sched_ext/build/bin/scx_qmap -t 100 stats : enq=1 dsp=0 delta=1 deq=0 stats : enq=90 dsp=90 delta=0 deq=0 stats : enq=156 dsp=156 delta=0 deq=0 stats : enq=218 dsp=218 delta=0 deq=0 stats : enq=255 dsp=255 delta=0 deq=0 stats : enq=271 dsp=271 delta=0 deq=0 stats : enq=284 dsp=284 delta=0 deq=0 stats : enq=293 dsp=293 delta=0 deq=0 DEBUG DUMP ================================================================================ kworker/u32:12[320] triggered exit kind 1026: runnable task stall (stress[1530] failed to run for 6.841s) Backtrace: scx_watchdog_workfn+0x136/0x1c0 process_scheduled_works+0x2b5/0x600 worker_thread+0x269/0x360 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 QMAP FIFO[0]: QMAP FIFO[1]: QMAP FIFO[2]: 1436 QMAP FIFO[3]: QMAP FIFO[4]: CPU states ---------- CPU 0 : nr_run=1 ops_qseq=244 curr=swapper/0[0] class=idle_sched_class QMAP: dsp_idx=1 dsp_cnt=0 R stress[1530] -6841ms scx_state/flags=3/0x1 ops_state/qseq=2/20 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=0 asm_sysvec_apic_timer_interrupt+0x16/0x20 CPU 2 : nr_run=2 ops_qseq=142 curr=swapper/2[0] class=idle_sched_class QMAP: dsp_idx=1 dsp_cnt=0 R sshd[1703] -5905ms scx_state/flags=3/0x9 ops_state/qseq=2/88 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=1 __x64_sys_ppoll+0xf6/0x120 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e R fish[1539] -4141ms scx_state/flags=3/0x9 ops_state/qseq=2/124 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=1 
futex_wait+0x60/0xe0 do_futex+0x109/0x180 __x64_sys_futex+0x117/0x190 do_syscall_64+0x7b/0x150 entry_SYSCALL_64_after_hwframe+0x76/0x7e CPU 3 : nr_run=2 ops_qseq=162 curr=kworker/u32:12[320] class=ext_sched_class QMAP: dsp_idx=1 dsp_cnt=0 *R kworker/u32:12[320] +0ms scx_state/flags=3/0xd ops_state/qseq=0/0 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=ff QMAP: force_local=0 scx_dump_state+0x613/0x6f0 scx_ops_error_irq_workfn+0x1f/0x40 irq_work_run_list+0x82/0xd0 irq_work_run+0x14/0x30 __sysvec_irq_work+0x40/0x140 sysvec_irq_work+0x60/0x70 asm_sysvec_irq_work+0x16/0x20 scx_watchdog_workfn+0x15f/0x1c0 process_scheduled_works+0x2b5/0x600 worker_thread+0x269/0x360 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 R kworker/3:2[1436] +0ms scx_state/flags=3/0x9 ops_state/qseq=2/160 sticky/holding_cpu=-1/-1 dsq_id=(n/a) cpus=08 QMAP: force_local=0 kthread+0xeb/0x110 ret_from_fork+0x36/0x40 ret_from_fork_asm+0x1a/0x30 CPU 7 : nr_run=0 ops_qseq=76 curr=swapper/7[0] class=idle_sched_class ================================================================================ EXIT: runnable task stall (stress[1530] failed to run for 6.841s) It shows that CPU 3 was running the watchdog when it triggered the error condition and the scx_qmap thread has been queued on CPU 0 for over 5 seconds but failed to run. It also prints out scx_qmap specific information - e.g. which tasks are queued on each FIFO and so on using the dump_*() ops. This dump has proved pretty useful for developing and debugging BPF schedulers. Debug dump is generated automatically when the BPF scheduler exits due to an error. The debug buffer used in such cases is determined by sched_ext_ops.exit_dump_len and defaults to 32k. If the debug dump overruns the available buffer, the output is truncated and marked accordingly. Debug dump output can also be read through the sched_ext_dump tracepoint. When read through the tracepoint, there is no length limit. 
SysRq-D can be used to trigger debug dump at any time while a BPF scheduler is loaded. This is non-destructive - the scheduler keeps running afterwards. The output can be read through the sched_ext_dump tracepoint. v2: - The size of exit debug dump buffer can now be customized using sched_ext_ops.exit_dump_len. - sched_ext_ops.dump*() added to enable dumping of BPF scheduler specific information. - Tracepoint output and SysRq-D triggering added. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: David Vernet <dvernet@meta.com>
Diffstat (limited to 'tools/sched_ext')
-rw-r--r-- tools/sched_ext/include/scx/common.bpf.h 12
-rw-r--r-- tools/sched_ext/include/scx/compat.h 9
-rw-r--r-- tools/sched_ext/include/scx/user_exit_info.h 19
-rw-r--r-- tools/sched_ext/scx_qmap.bpf.c 54
-rw-r--r-- tools/sched_ext/scx_qmap.c 14
-rw-r--r-- tools/sched_ext/scx_simple.c 2
6 files changed, 105 insertions, 5 deletions
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
index 833fe1bdccf9..3ea5cdf58bc7 100644
--- a/tools/sched_ext/include/scx/common.bpf.h
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -38,6 +38,7 @@ s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
+void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
@@ -97,6 +98,17 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
___scx_bpf_bstr_format_checker(fmt, ##args); \
})
+/*
+ * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments
+ * instead of an array of u64. To be used from ops.dump() and friends.
+ */
+#define scx_bpf_dump(fmt, args...) \
+({ \
+ scx_bpf_bstr_preamble(fmt, args) \
+ scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \
+ ___scx_bpf_bstr_format_checker(fmt, ##args); \
+})
+
#define BPF_STRUCT_OPS(name, args...) \
SEC("struct_ops/"#name) \
BPF_PROG(name, ##args)
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
index a7fdaf8a858e..c58024c980c8 100644
--- a/tools/sched_ext/include/scx/compat.h
+++ b/tools/sched_ext/include/scx/compat.h
@@ -111,16 +111,23 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field
* is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
* and attach it, backward compatibility is automatically maintained where
* reasonable.
+ *
+ * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
+ * the current minimum required kernel version.
*/
#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
struct __scx_name *__skel; \
\
+ SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \
+ "sched_ext_ops.dump() missing, kernel too old?"); \
+ \
__skel = __scx_name##__open(); \
SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
__skel; \
})
-#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({ \
+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
+ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \
})
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
index 8c3b7fac4d05..c2ef85c645e1 100644
--- a/tools/sched_ext/include/scx/user_exit_info.h
+++ b/tools/sched_ext/include/scx/user_exit_info.h
@@ -13,6 +13,7 @@
enum uei_sizes {
UEI_REASON_LEN = 128,
UEI_MSG_LEN = 1024,
+ UEI_DUMP_DFL_LEN = 32768,
};
struct user_exit_info {
@@ -28,6 +29,8 @@ struct user_exit_info {
#include <bpf/bpf_core_read.h>
#define UEI_DEFINE(__name) \
+ char RESIZABLE_ARRAY(data, __name##_dump); \
+ const volatile u32 __name##_dump_len; \
struct user_exit_info __name SEC(".data")
#define UEI_RECORD(__uei_name, __ei) ({ \
@@ -35,6 +38,8 @@ struct user_exit_info {
sizeof(__uei_name.reason), (__ei)->reason); \
bpf_probe_read_kernel_str(__uei_name.msg, \
sizeof(__uei_name.msg), (__ei)->msg); \
+ bpf_probe_read_kernel_str(__uei_name##_dump, \
+ __uei_name##_dump_len, (__ei)->dump); \
if (bpf_core_field_exists((__ei)->exit_code)) \
__uei_name.exit_code = (__ei)->exit_code; \
/* use __sync to force memory barrier */ \
@@ -47,6 +52,13 @@ struct user_exit_info {
#include <stdio.h>
#include <stdbool.h>
+/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
+#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \
+ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
+ (__skel)->rodata->__uei_name##_dump_len = __len; \
+ RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \
+})
+
#define UEI_EXITED(__skel, __uei_name) ({ \
/* use __sync to force memory barrier */ \
__sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \
@@ -54,6 +66,13 @@ struct user_exit_info {
#define UEI_REPORT(__skel, __uei_name) ({ \
struct user_exit_info *__uei = &(__skel)->data->__uei_name; \
+ char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \
+ if (__uei_dump[0] != '\0') { \
+ fputs("\nDEBUG DUMP\n", stderr); \
+ fputs("================================================================================\n\n", stderr); \
+ fputs(__uei_dump, stderr); \
+ fputs("\n================================================================================\n\n", stderr); \
+ } \
fprintf(stderr, "EXIT: %s", __uei->reason); \
if (__uei->msg[0] != '\0') \
fprintf(stderr, " (%s)", __uei->msg); \
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 5ff217c4bfa0..5b3da28bf042 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -33,6 +33,7 @@ const volatile u32 stall_user_nth;
const volatile u32 stall_kernel_nth;
const volatile u32 dsp_batch;
const volatile s32 disallow_tgid;
+const volatile bool suppress_dump;
u32 test_error_cnt;
@@ -258,6 +259,56 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
return -ENOMEM;
}
+void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+{
+ s32 i, pid;
+
+ if (suppress_dump)
+ return;
+
+ bpf_for(i, 0, 5) {
+ void *fifo;
+
+ if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
+ return;
+
+ scx_bpf_dump("QMAP FIFO[%d]:", i);
+ bpf_repeat(4096) {
+ if (bpf_map_pop_elem(fifo, &pid))
+ break;
+ scx_bpf_dump(" %d", pid);
+ }
+ scx_bpf_dump("\n");
+ }
+}
+
+void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
+{
+ u32 zero = 0;
+ struct cpu_ctx *cpuc;
+
+ if (suppress_dump || idle)
+ return;
+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
+ return;
+
+ scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu",
+ cpuc->dsp_idx, cpuc->dsp_cnt);
+}
+
+void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
+{
+ struct task_ctx *taskc;
+
+ if (suppress_dump)
+ return;
+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+ return;
+
+ scx_bpf_dump("QMAP: force_local=%d",
+ taskc->force_local);
+}
+
s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
{
return scx_bpf_create_dsq(SHARED_DSQ, -1);
@@ -274,6 +325,9 @@ SCX_OPS_DEFINE(qmap_ops,
.dequeue = (void *)qmap_dequeue,
.dispatch = (void *)qmap_dispatch,
.init_task = (void *)qmap_init_task,
+ .dump = (void *)qmap_dump,
+ .dump_cpu = (void *)qmap_dump_cpu,
+ .dump_task = (void *)qmap_dump_task,
.init = (void *)qmap_init,
.exit = (void *)qmap_exit,
.timeout_ms = 5000U,
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
index a2614994cfaa..a1123a17581b 100644
--- a/tools/sched_ext/scx_qmap.c
+++ b/tools/sched_ext/scx_qmap.c
@@ -20,7 +20,7 @@ const char help_fmt[] =
"See the top-level comment in .bpf.c for more details.\n"
"\n"
"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n"
-" [-d PID] [-p] [-v]\n"
+" [-d PID] [-D LEN] [-p] [-v]\n"
"\n"
" -s SLICE_US Override slice duration\n"
" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
@@ -28,6 +28,8 @@ const char help_fmt[] =
" -T COUNT Stall every COUNT'th kernel thread\n"
" -b COUNT Dispatch upto COUNT tasks together\n"
" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+" -D LEN Set scx_exit_info.dump buffer length\n"
+" -S Suppress qmap-specific debug dump\n"
" -p Switch only tasks on SCHED_EXT policy intead of all\n"
" -v Print libbpf debug messages\n"
" -h Display this help and exit\n";
@@ -59,7 +61,7 @@ int main(int argc, char **argv)
skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
- while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) {
+ while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:Spvh")) != -1) {
switch (opt) {
case 's':
skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
@@ -81,6 +83,12 @@ int main(int argc, char **argv)
if (skel->rodata->disallow_tgid < 0)
skel->rodata->disallow_tgid = getpid();
break;
+ case 'D':
+ skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
+ break;
+ case 'S':
+ skel->rodata->suppress_dump = true;
+ break;
case 'p':
skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
break;
@@ -93,7 +101,7 @@ int main(int argc, char **argv)
}
}
- SCX_OPS_LOAD(skel, qmap_ops, scx_qmap);
+ SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
while (!exit_req && !UEI_EXITED(skel, uei)) {
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
index 789ac62fea8e..7f500d1d56ac 100644
--- a/tools/sched_ext/scx_simple.c
+++ b/tools/sched_ext/scx_simple.c
@@ -80,7 +80,7 @@ int main(int argc, char **argv)
}
}
- SCX_OPS_LOAD(skel, simple_ops, scx_simple);
+ SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple);
while (!exit_req && !UEI_EXITED(skel, uei)) {