diff options
author | Tejun Heo <tj@kernel.org> | 2024-06-18 10:09:18 -1000 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2024-06-18 10:09:18 -1000 |
commit | 07814a9439a3b03d79a1001614b5bc1cab69bcec (patch) | |
tree | 045150e5c235dbf6754efb6b15d47e94d0354499 /tools/sched_ext | |
parent | 1538e33995eaf3f315cbb5506019b9f913ed8555 (diff) |
sched_ext: Print debug dump after an error exit
If a BPF scheduler triggers an error, the scheduler is aborted and the
system is reverted to the built-in scheduler. In the process, a lot of
information which may be useful for figuring out what happened can be lost.
This patch adds debug dump which captures information which may be useful
for debugging including runqueue and runnable thread states at the time of
failure. The following shows a debug dump after triggering the watchdog:
root@test ~# os/work/tools/sched_ext/build/bin/scx_qmap -t 100
stats : enq=1 dsp=0 delta=1 deq=0
stats : enq=90 dsp=90 delta=0 deq=0
stats : enq=156 dsp=156 delta=0 deq=0
stats : enq=218 dsp=218 delta=0 deq=0
stats : enq=255 dsp=255 delta=0 deq=0
stats : enq=271 dsp=271 delta=0 deq=0
stats : enq=284 dsp=284 delta=0 deq=0
stats : enq=293 dsp=293 delta=0 deq=0
DEBUG DUMP
================================================================================
kworker/u32:12[320] triggered exit kind 1026:
runnable task stall (stress[1530] failed to run for 6.841s)
Backtrace:
scx_watchdog_workfn+0x136/0x1c0
process_scheduled_works+0x2b5/0x600
worker_thread+0x269/0x360
kthread+0xeb/0x110
ret_from_fork+0x36/0x40
ret_from_fork_asm+0x1a/0x30
QMAP FIFO[0]:
QMAP FIFO[1]:
QMAP FIFO[2]: 1436
QMAP FIFO[3]:
QMAP FIFO[4]:
CPU states
----------
CPU 0 : nr_run=1 ops_qseq=244
curr=swapper/0[0] class=idle_sched_class
QMAP: dsp_idx=1 dsp_cnt=0
R stress[1530] -6841ms
scx_state/flags=3/0x1 ops_state/qseq=2/20
sticky/holding_cpu=-1/-1 dsq_id=(n/a)
cpus=ff
QMAP: force_local=0
asm_sysvec_apic_timer_interrupt+0x16/0x20
CPU 2 : nr_run=2 ops_qseq=142
curr=swapper/2[0] class=idle_sched_class
QMAP: dsp_idx=1 dsp_cnt=0
R sshd[1703] -5905ms
scx_state/flags=3/0x9 ops_state/qseq=2/88
sticky/holding_cpu=-1/-1 dsq_id=(n/a)
cpus=ff
QMAP: force_local=1
__x64_sys_ppoll+0xf6/0x120
do_syscall_64+0x7b/0x150
entry_SYSCALL_64_after_hwframe+0x76/0x7e
R fish[1539] -4141ms
scx_state/flags=3/0x9 ops_state/qseq=2/124
sticky/holding_cpu=-1/-1 dsq_id=(n/a)
cpus=ff
QMAP: force_local=1
futex_wait+0x60/0xe0
do_futex+0x109/0x180
__x64_sys_futex+0x117/0x190
do_syscall_64+0x7b/0x150
entry_SYSCALL_64_after_hwframe+0x76/0x7e
CPU 3 : nr_run=2 ops_qseq=162
curr=kworker/u32:12[320] class=ext_sched_class
QMAP: dsp_idx=1 dsp_cnt=0
*R kworker/u32:12[320] +0ms
scx_state/flags=3/0xd ops_state/qseq=0/0
sticky/holding_cpu=-1/-1 dsq_id=(n/a)
cpus=ff
QMAP: force_local=0
scx_dump_state+0x613/0x6f0
scx_ops_error_irq_workfn+0x1f/0x40
irq_work_run_list+0x82/0xd0
irq_work_run+0x14/0x30
__sysvec_irq_work+0x40/0x140
sysvec_irq_work+0x60/0x70
asm_sysvec_irq_work+0x16/0x20
scx_watchdog_workfn+0x15f/0x1c0
process_scheduled_works+0x2b5/0x600
worker_thread+0x269/0x360
kthread+0xeb/0x110
ret_from_fork+0x36/0x40
ret_from_fork_asm+0x1a/0x30
R kworker/3:2[1436] +0ms
scx_state/flags=3/0x9 ops_state/qseq=2/160
sticky/holding_cpu=-1/-1 dsq_id=(n/a)
cpus=08
QMAP: force_local=0
kthread+0xeb/0x110
ret_from_fork+0x36/0x40
ret_from_fork_asm+0x1a/0x30
CPU 7 : nr_run=0 ops_qseq=76
curr=swapper/7[0] class=idle_sched_class
================================================================================
EXIT: runnable task stall (stress[1530] failed to run for 6.841s)
It shows that CPU 3 was running the watchdog when it triggered the error
condition and the scx_qmap thread has been queued on CPU 0 for over 5
seconds but failed to run. It also prints out scx_qmap specific information
- e.g. which tasks are queued on each FIFO and so on using the dump_*() ops.
This dump has proved pretty useful for developing and debugging BPF
schedulers.
Debug dump is generated automatically when the BPF scheduler exits due to an
error. The debug buffer used in such cases is determined by
sched_ext_ops.exit_dump_len and defaults to 32k. If the debug dump overruns
the available buffer, the output is truncated and marked accordingly.
Debug dump output can also be read through the sched_ext_dump tracepoint.
When read through the tracepoint, there is no length limit.
SysRq-D can be used to trigger debug dump at any time while a BPF scheduler
is loaded. This is non-destructive - the scheduler keeps running afterwards.
The output can be read through the sched_ext_dump tracepoint.
v2: - The size of exit debug dump buffer can now be customized using
sched_ext_ops.exit_dump_len.
- sched_ext_ops.dump*() added to enable dumping of BPF scheduler
specific information.
- Tracpoint output and SysRq-D triggering added.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Diffstat (limited to 'tools/sched_ext')
-rw-r--r-- | tools/sched_ext/include/scx/common.bpf.h | 12 | ||||
-rw-r--r-- | tools/sched_ext/include/scx/compat.h | 9 | ||||
-rw-r--r-- | tools/sched_ext/include/scx/user_exit_info.h | 19 | ||||
-rw-r--r-- | tools/sched_ext/scx_qmap.bpf.c | 54 | ||||
-rw-r--r-- | tools/sched_ext/scx_qmap.c | 14 | ||||
-rw-r--r-- | tools/sched_ext/scx_simple.c | 2 |
6 files changed, 105 insertions, 5 deletions
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 833fe1bdccf9..3ea5cdf58bc7 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -38,6 +38,7 @@ s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; +void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; @@ -97,6 +98,17 @@ void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} ___scx_bpf_bstr_format_checker(fmt, ##args); \ }) +/* + * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments + * instead of an array of u64. To be used from ops.dump() and friends. + */ +#define scx_bpf_dump(fmt, args...) \ +({ \ + scx_bpf_bstr_preamble(fmt, args) \ + scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ + ___scx_bpf_bstr_format_checker(fmt, ##args); \ +}) + #define BPF_STRUCT_OPS(name, args...) \ SEC("struct_ops/"#name) \ BPF_PROG(name, ##args) diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h index a7fdaf8a858e..c58024c980c8 100644 --- a/tools/sched_ext/include/scx/compat.h +++ b/tools/sched_ext/include/scx/compat.h @@ -111,16 +111,23 @@ static inline bool __COMPAT_struct_has_field(const char *type, const char *field * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load * and attach it, backward compatibility is automatically maintained where * reasonable. + * + * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is + * the current minimum required kernel version. */ #define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ struct __scx_name *__skel; \ \ + SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \ + "sched_ext_ops.dump() missing, kernel too old?"); \ + \ __skel = __scx_name##__open(); \ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ __skel; \ }) -#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name) ({ \ +#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ + UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ }) diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h index 8c3b7fac4d05..c2ef85c645e1 100644 --- a/tools/sched_ext/include/scx/user_exit_info.h +++ b/tools/sched_ext/include/scx/user_exit_info.h @@ -13,6 +13,7 @@ enum uei_sizes { UEI_REASON_LEN = 128, UEI_MSG_LEN = 1024, + UEI_DUMP_DFL_LEN = 32768, }; struct user_exit_info { @@ -28,6 +29,8 @@ struct user_exit_info { #include <bpf/bpf_core_read.h> #define UEI_DEFINE(__name) \ + char RESIZABLE_ARRAY(data, __name##_dump); \ + const volatile u32 __name##_dump_len; \ struct user_exit_info __name SEC(".data") #define UEI_RECORD(__uei_name, __ei) ({ \ @@ -35,6 +38,8 @@ struct user_exit_info { sizeof(__uei_name.reason), (__ei)->reason); \ bpf_probe_read_kernel_str(__uei_name.msg, \ sizeof(__uei_name.msg), (__ei)->msg); \ + bpf_probe_read_kernel_str(__uei_name##_dump, \ + __uei_name##_dump_len, (__ei)->dump); \ if (bpf_core_field_exists((__ei)->exit_code)) \ __uei_name.exit_code = (__ei)->exit_code; \ /* use __sync to force memory barrier */ \ @@ -47,6 +52,13 @@ struct user_exit_info { #include <stdio.h> #include <stdbool.h> +/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ +#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ + u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ + (__skel)->rodata->__uei_name##_dump_len = __len; \ + RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \ +}) + #define UEI_EXITED(__skel, __uei_name) ({ \ /* use __sync to force memory barrier */ \ __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ @@ -54,6 +66,13 @@ struct user_exit_info { #define UEI_REPORT(__skel, __uei_name) ({ \ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ + char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ + if (__uei_dump[0] != '\0') { \ + fputs("\nDEBUG DUMP\n", stderr); \ + fputs("================================================================================\n\n", stderr); \ + fputs(__uei_dump, stderr); \ + fputs("\n================================================================================\n\n", stderr); \ + } \ fprintf(stderr, "EXIT: %s", __uei->reason); \ if (__uei->msg[0] != '\0') \ fprintf(stderr, " (%s)", __uei->msg); \ diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 5ff217c4bfa0..5b3da28bf042 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -33,6 +33,7 @@ const volatile u32 stall_user_nth; const volatile u32 stall_kernel_nth; const volatile u32 dsp_batch; const volatile s32 disallow_tgid; +const volatile bool suppress_dump; u32 test_error_cnt; @@ -258,6 +259,56 @@ s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, return -ENOMEM; } +void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) +{ + s32 i, pid; + + if (suppress_dump) + return; + + bpf_for(i, 0, 5) { + void *fifo; + + if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) + return; + + scx_bpf_dump("QMAP FIFO[%d]:", i); + bpf_repeat(4096) { + if (bpf_map_pop_elem(fifo, &pid)) + break; + scx_bpf_dump(" %d", pid); + } + scx_bpf_dump("\n"); + } +} + +void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) +{ + u32 zero = 0; + struct cpu_ctx *cpuc; + + if (suppress_dump || idle) + return; + if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) + return; + + scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu", + cpuc->dsp_idx, cpuc->dsp_cnt); +} + +void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) +{ + struct task_ctx *taskc; + + if (suppress_dump) + return; + if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) + return; + + scx_bpf_dump("QMAP: force_local=%d", + taskc->force_local); +} + s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) { return scx_bpf_create_dsq(SHARED_DSQ, -1); @@ -274,6 +325,9 @@ SCX_OPS_DEFINE(qmap_ops, .dequeue = (void *)qmap_dequeue, .dispatch = (void *)qmap_dispatch, .init_task = (void *)qmap_init_task, + .dump = (void *)qmap_dump, + .dump_cpu = (void *)qmap_dump_cpu, + .dump_task = (void *)qmap_dump_task, .init = (void *)qmap_init, .exit = (void *)qmap_exit, .timeout_ms = 5000U, diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c index a2614994cfaa..a1123a17581b 100644 --- a/tools/sched_ext/scx_qmap.c +++ b/tools/sched_ext/scx_qmap.c @@ -20,7 +20,7 @@ const char help_fmt[] = "See the top-level comment in .bpf.c for more details.\n" "\n" "Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-b COUNT]\n" -" [-d PID] [-p] [-v]\n" +" [-d PID] [-D LEN] [-p] [-v]\n" "\n" " -s SLICE_US Override slice duration\n" " -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" @@ -28,6 +28,8 @@ const char help_fmt[] = " -T COUNT Stall every COUNT'th kernel thread\n" " -b COUNT Dispatch upto COUNT tasks together\n" " -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" +" -D LEN Set scx_exit_info.dump buffer length\n" +" -S Suppress qmap-specific debug dump\n" " -p Switch only tasks on SCHED_EXT policy intead of all\n" " -v Print libbpf debug messages\n" " -h Display this help and exit\n"; @@ -59,7 +61,7 @@ int main(int argc, char **argv) skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); - while ((opt = getopt(argc, argv, "s:e:t:T:b:d:pvh")) != -1) { + while ((opt = getopt(argc, argv, "s:e:t:T:b:d:D:Spvh")) != -1) { switch (opt) { case 's': skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; @@ -81,6 +83,12 @@ int main(int argc, char **argv) if (skel->rodata->disallow_tgid < 0) skel->rodata->disallow_tgid = getpid(); break; + case 'D': + skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); + break; + case 'S': + skel->rodata->suppress_dump = true; + break; case 'p': skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL; break; @@ -93,7 +101,7 @@ int main(int argc, char **argv) } } - SCX_OPS_LOAD(skel, qmap_ops, scx_qmap); + SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); while (!exit_req && !UEI_EXITED(skel, uei)) { diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c index 789ac62fea8e..7f500d1d56ac 100644 --- a/tools/sched_ext/scx_simple.c +++ b/tools/sched_ext/scx_simple.c @@ -80,7 +80,7 @@ int main(int argc, char **argv) } } - SCX_OPS_LOAD(skel, simple_ops, scx_simple); + SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); while (!exit_req && !UEI_EXITED(skel, uei)) { |