summaryrefslogtreecommitdiff
path: root/tools/sched_ext
diff options
context:
space:
mode:
Diffstat (limited to 'tools/sched_ext')
-rw-r--r--tools/sched_ext/.gitignore2
-rw-r--r--tools/sched_ext/Makefile246
-rw-r--r--tools/sched_ext/README.md270
-rw-r--r--tools/sched_ext/include/bpf-compat/gnu/stubs.h11
-rw-r--r--tools/sched_ext/include/scx/common.bpf.h412
-rw-r--r--tools/sched_ext/include/scx/common.h75
-rw-r--r--tools/sched_ext/include/scx/compat.bpf.h28
-rw-r--r--tools/sched_ext/include/scx/compat.h186
-rw-r--r--tools/sched_ext/include/scx/user_exit_info.h111
-rw-r--r--tools/sched_ext/scx_central.bpf.c361
-rw-r--r--tools/sched_ext/scx_central.c135
-rw-r--r--tools/sched_ext/scx_flatcg.bpf.c949
-rw-r--r--tools/sched_ext/scx_flatcg.c233
-rw-r--r--tools/sched_ext/scx_flatcg.h51
-rw-r--r--tools/sched_ext/scx_qmap.bpf.c827
-rw-r--r--tools/sched_ext/scx_qmap.c153
-rw-r--r--tools/sched_ext/scx_show_state.py40
-rw-r--r--tools/sched_ext/scx_simple.bpf.c156
-rw-r--r--tools/sched_ext/scx_simple.c107
19 files changed, 4353 insertions, 0 deletions
diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore
new file mode 100644
index 000000000000..d6264fe1c8cd
--- /dev/null
+++ b/tools/sched_ext/.gitignore
@@ -0,0 +1,2 @@
+tools/
+build/
diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile
new file mode 100644
index 000000000000..ca3815e572d8
--- /dev/null
+++ b/tools/sched_ext/Makefile
@@ -0,0 +1,246 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+include ../build/Build.include
+include ../scripts/Makefile.arch
+include ../scripts/Makefile.include
+
+all: all_targets
+
+ifneq ($(LLVM),)
+ifneq ($(filter %/,$(LLVM)),)
+LLVM_PREFIX := $(LLVM)
+else ifneq ($(filter -%,$(LLVM)),)
+LLVM_SUFFIX := $(LLVM)
+endif
+
+CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi
+CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu
+CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl
+CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu
+CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu
+CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu
+CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu
+CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu
+CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu
+CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH))
+
+ifeq ($(CROSS_COMPILE),)
+ifeq ($(CLANG_TARGET_FLAGS),)
+$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk)
+else
+CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS)
+endif # CLANG_TARGET_FLAGS
+else
+CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%))
+endif # CROSS_COMPILE
+
+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as
+else
+CC := $(CROSS_COMPILE)gcc
+endif # LLVM
+
+CURDIR := $(abspath .)
+TOOLSDIR := $(abspath ..)
+LIBDIR := $(TOOLSDIR)/lib
+BPFDIR := $(LIBDIR)/bpf
+TOOLSINCDIR := $(TOOLSDIR)/include
+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool
+APIDIR := $(TOOLSINCDIR)/uapi
+GENDIR := $(abspath ../../include/generated)
+GENHDR := $(GENDIR)/autoconf.h
+
+ifeq ($(O),)
+OUTPUT_DIR := $(CURDIR)/build
+else
+OUTPUT_DIR := $(O)/build
+endif # O
+OBJ_DIR := $(OUTPUT_DIR)/obj
+INCLUDE_DIR := $(OUTPUT_DIR)/include
+BPFOBJ_DIR := $(OBJ_DIR)/libbpf
+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext
+BINDIR := $(OUTPUT_DIR)/bin
+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a
+ifneq ($(CROSS_COMPILE),)
+HOST_BUILD_DIR := $(OBJ_DIR)/host
+HOST_OUTPUT_DIR := host-tools
+HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include
+else
+HOST_BUILD_DIR := $(OBJ_DIR)
+HOST_OUTPUT_DIR := $(OUTPUT_DIR)
+HOST_INCLUDE_DIR := $(INCLUDE_DIR)
+endif
+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a
+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids
+DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool
+
+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \
+ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \
+ ../../vmlinux \
+ /sys/kernel/btf/vmlinux \
+ /boot/vmlinux-$(shell uname -r)
+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS))))
+ifeq ($(VMLINUX_BTF),)
+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)")
+endif
+
+BPFTOOL ?= $(DEFAULT_BPFTOOL)
+
+ifneq ($(wildcard $(GENHDR)),)
+ GENFLAGS := -DHAVE_GENHDR
+endif
+
+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \
+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \
+ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include
+
+# Silence some warnings when compiled with clang
+ifneq ($(LLVM),)
+CFLAGS += -Wno-unused-command-line-argument
+endif
+
+LDFLAGS = -lelf -lz -lpthread
+
+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - </dev/null | \
+ grep 'define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__')
+
+# Get Clang's default includes on this system, as opposed to those seen by
+# '-target bpf'. This fixes "missing" files on some architectures/distros,
+# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
+#
+# Use '-idirafter': Don't interfere with include mechanics except where the
+# build would have failed anyways.
+define get_sys_includes
+$(shell $(1) -v -E - </dev/null 2>&1 \
+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
+$(shell $(1) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
+endef
+
+BPF_CFLAGS = -g -D__TARGET_ARCH_$(SRCARCH) \
+ $(if $(IS_LITTLE_ENDIAN),-mlittle-endian,-mbig-endian) \
+ -I$(CURDIR)/include -I$(CURDIR)/include/bpf-compat \
+ -I$(INCLUDE_DIR) -I$(APIDIR) \
+ -I../../include \
+ $(call get_sys_includes,$(CLANG)) \
+ -Wall -Wno-compare-distinct-pointer-types \
+ -O2 -mcpu=v3
+
+# sort removes libbpf duplicates when not cross-building
+MAKE_DIRS := $(sort $(OBJ_DIR)/libbpf $(HOST_BUILD_DIR)/libbpf \
+ $(HOST_BUILD_DIR)/bpftool $(HOST_BUILD_DIR)/resolve_btfids \
+ $(INCLUDE_DIR) $(SCXOBJ_DIR) $(BINDIR))
+
+$(MAKE_DIRS):
+ $(call msg,MKDIR,,$@)
+ $(Q)mkdir -p $@
+
+$(BPFOBJ): $(wildcard $(BPFDIR)/*.[ch] $(BPFDIR)/Makefile) \
+ $(APIDIR)/linux/bpf.h \
+ | $(OBJ_DIR)/libbpf
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFDIR) OUTPUT=$(OBJ_DIR)/libbpf/ \
+ EXTRA_CFLAGS='-g -O0 -fPIC' \
+ DESTDIR=$(OUTPUT_DIR) prefix= all install_headers
+
+$(DEFAULT_BPFTOOL): $(wildcard $(BPFTOOLDIR)/*.[ch] $(BPFTOOLDIR)/Makefile) \
+ $(HOST_BPFOBJ) | $(HOST_BUILD_DIR)/bpftool
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOLDIR) \
+ ARCH= CROSS_COMPILE= CC=$(HOSTCC) LD=$(HOSTLD) \
+ EXTRA_CFLAGS='-g -O0' \
+ OUTPUT=$(HOST_BUILD_DIR)/bpftool/ \
+ LIBBPF_OUTPUT=$(HOST_BUILD_DIR)/libbpf/ \
+ LIBBPF_DESTDIR=$(HOST_OUTPUT_DIR)/ \
+ prefix= DESTDIR=$(HOST_OUTPUT_DIR)/ install-bin
+
+$(INCLUDE_DIR)/vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL) | $(INCLUDE_DIR)
+ifeq ($(VMLINUX_H),)
+ $(call msg,GEN,,$@)
+ $(Q)$(BPFTOOL) btf dump file $(VMLINUX_BTF) format c > $@
+else
+ $(call msg,CP,,$@)
+ $(Q)cp "$(VMLINUX_H)" $@
+endif
+
+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \
+ | $(BPFOBJ) $(SCXOBJ_DIR)
+ $(call msg,CLNG-BPF,,$(notdir $@))
+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@
+
+$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL)
+ $(eval sched=$(notdir $@))
+ $(call msg,GEN-SKEL,,$(sched))
+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $<
+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o)
+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o)
+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o)
+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@
+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h)
+
+SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR)
+
+c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg
+
+$(addprefix $(BINDIR)/,$(c-sched-targets)): \
+ $(BINDIR)/%: \
+ $(filter-out %.bpf.c,%.c) \
+ $(INCLUDE_DIR)/%.bpf.skel.h \
+ $(SCX_COMMON_DEPS)
+ $(eval sched=$(notdir $@))
+ $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o
+ $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS)
+
+$(c-sched-targets): %: $(BINDIR)/%
+
+install: all
+ $(Q)mkdir -p $(DESTDIR)/usr/local/bin/
+ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/
+
+clean:
+ rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR)
+ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h
+ rm -f $(c-sched-targets)
+
+help:
+ @echo 'Building targets'
+ @echo '================'
+ @echo ''
+ @echo ' all - Compile all schedulers'
+ @echo ''
+ @echo 'Alternatively, you may compile individual schedulers:'
+ @echo ''
+ @printf ' %s\n' $(c-sched-targets)
+ @echo ''
+ @echo 'For any scheduler build target, you may specify an alternative'
+ @echo 'build output path with the O= environment variable. For example:'
+ @echo ''
+ @echo ' O=/tmp/sched_ext make all'
+ @echo ''
+ @echo 'will compile all schedulers, and emit the build artifacts to'
+ @echo '/tmp/sched_ext/build.'
+ @echo ''
+ @echo ''
+ @echo 'Installing targets'
+ @echo '=================='
+ @echo ''
+ @echo ' install - Compile and install all schedulers to /usr/bin.'
+ @echo ' You may specify the DESTDIR= environment variable'
+ @echo ' to indicate a prefix for /usr/bin. For example:'
+ @echo ''
+ @echo ' DESTDIR=/tmp/sched_ext make install'
+ @echo ''
+ @echo ' will build the schedulers in CWD/build, and'
+ @echo ' install the schedulers to /tmp/sched_ext/usr/bin.'
+ @echo ''
+ @echo ''
+ @echo 'Cleaning targets'
+ @echo '================'
+ @echo ''
+ @echo ' clean - Remove all generated files'
+
+all_targets: $(c-sched-targets)
+
+.PHONY: all all_targets $(c-sched-targets) clean help
+
+# delete failed targets
+.DELETE_ON_ERROR:
+
+# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets
+.SECONDARY:
diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md
new file mode 100644
index 000000000000..16a42e4060f6
--- /dev/null
+++ b/tools/sched_ext/README.md
@@ -0,0 +1,270 @@
+SCHED_EXT EXAMPLE SCHEDULERS
+============================
+
+# Introduction
+
+This directory contains a number of example sched_ext schedulers. These
+schedulers are meant to provide examples of different types of schedulers
+that can be built using sched_ext, and illustrate how various features of
+sched_ext can be used.
+
+Some of the examples are performant, production-ready schedulers. That is, for
+the correct workload and with the correct tuning, they may be deployed in a
+production environment with acceptable or possibly even improved performance.
+Others are just examples that in practice, would not provide acceptable
+performance (though they could be improved to get there).
+
+This README will describe these example schedulers, including describing the
+types of workloads or scenarios they're designed to accommodate, and whether or
+not they're production ready. For more details on any of these schedulers,
+please see the header comment in their .bpf.c file.
+
+
+# Compiling the examples
+
+There are a few toolchain dependencies for compiling the example schedulers.
+
+## Toolchain dependencies
+
+1. clang >= 16.0.0
+
+The schedulers are BPF programs, and therefore must be compiled with clang. gcc
+is actively working on adding a BPF backend compiler as well, but are still
+missing some features such as BTF type tags which are necessary for using
+kptrs.
+
+2. pahole >= 1.25
+
+You may need pahole in order to generate BTF from DWARF.
+
+3. rust >= 1.70.0
+
+Rust schedulers uses features present in the rust toolchain >= 1.70.0. You
+should be able to use the stable build from rustup, but if that doesn't
+work, try using the rustup nightly build.
+
+There are other requirements as well, such as make, but these are the main /
+non-trivial ones.
+
+## Compiling the kernel
+
+In order to run a sched_ext scheduler, you'll have to run a kernel compiled
+with the patches in this repository, and with a minimum set of necessary
+Kconfig options:
+
+```
+CONFIG_BPF=y
+CONFIG_SCHED_CLASS_EXT=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_BPF_JIT=y
+CONFIG_DEBUG_INFO_BTF=y
+```
+
+It's also recommended that you also include the following Kconfig options:
+
+```
+CONFIG_BPF_JIT_ALWAYS_ON=y
+CONFIG_BPF_JIT_DEFAULT_ON=y
+CONFIG_PAHOLE_HAS_SPLIT_BTF=y
+CONFIG_PAHOLE_HAS_BTF_TAG=y
+```
+
+There is a `Kconfig` file in this directory whose contents you can append to
+your local `.config` file, as long as there are no conflicts with any existing
+options in the file.
+
+## Getting a vmlinux.h file
+
+You may notice that most of the example schedulers include a "vmlinux.h" file.
+This is a large, auto-generated header file that contains all of the types
+defined in some vmlinux binary that was compiled with
+[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig
+options specified above).
+
+The header file is created using `bpftool`, by passing it a vmlinux binary
+compiled with BTF as follows:
+
+```bash
+$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h
+```
+
+`bpftool` analyzes all of the BTF encodings in the binary, and produces a
+header file that can be included by BPF programs to access those types. For
+example, using vmlinux.h allows a scheduler to access fields defined directly
+in vmlinux as follows:
+
+```c
+#include "vmlinux.h"
+// vmlinux.h is also implicitly included by scx_common.bpf.h.
+#include "scx_common.bpf.h"
+
+/*
+ * vmlinux.h provides definitions for struct task_struct and
+ * struct scx_enable_args.
+ */
+void BPF_STRUCT_OPS(example_enable, struct task_struct *p,
+ struct scx_enable_args *args)
+{
+ bpf_printk("Task %s enabled in example scheduler", p->comm);
+}
+
+// vmlinux.h provides the definition for struct sched_ext_ops.
+SEC(".struct_ops.link")
+struct sched_ext_ops example_ops {
+ .enable = (void *)example_enable,
+ .name = "example",
+}
+```
+
+The scheduler build system will generate this vmlinux.h file as part of the
+scheduler build pipeline. It looks for a vmlinux file in the following
+dependency order:
+
+1. If the O= environment variable is defined, at `$O/vmlinux`
+2. If the KBUILD_OUTPUT= environment variable is defined, at
+ `$KBUILD_OUTPUT/vmlinux`
+3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're
+ compiling the schedulers)
+3. `/sys/kernel/btf/vmlinux`
+4. `/boot/vmlinux-$(uname -r)`
+
+In other words, if you have compiled a kernel in your local repo, its vmlinux
+file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of
+the kernel you're currently running on. This means that if you're running on a
+kernel with sched_ext support, you may not need to compile a local kernel at
+all.
+
+### Aside on CO-RE
+
+One of the cooler features of BPF is that it supports
+[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run
+Everywhere). This feature allows you to reference fields inside of structs with
+types defined internal to the kernel, and not have to recompile if you load the
+BPF program on a different kernel with the field at a different offset. In our
+example above, we print out a task name with `p->comm`. CO-RE would perform
+relocations for that access when the program is loaded to ensure that it's
+referencing the correct offset for the currently running kernel.
+
+## Compiling the schedulers
+
+Once you have your toolchain setup, and a vmlinux that can be used to generate
+a full vmlinux.h file, you can compile the schedulers using `make`:
+
+```bash
+$ make -j($nproc)
+```
+
+# Example schedulers
+
+This directory contains the following example schedulers. These schedulers are
+for testing and demonstrating different aspects of sched_ext. While some may be
+useful in limited scenarios, they are not intended to be practical.
+
+For more scheduler implementations, tools and documentation, visit
+https://github.com/sched-ext/scx.
+
+## scx_simple
+
+A simple scheduler that provides an example of a minimal sched_ext scheduler.
+scx_simple can be run in either global weighted vtime mode, or FIFO mode.
+
+Though very simple, in limited scenarios, this scheduler can perform reasonably
+well on single-socket systems with a unified L3 cache.
+
+## scx_qmap
+
+Another simple, yet slightly more complex scheduler that provides an example of
+a basic weighted FIFO queuing policy. It also provides examples of some common
+useful BPF features, such as sleepable per-task storage allocation in the
+`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to
+enqueue tasks. It also illustrates how core-sched support could be implemented.
+
+## scx_central
+
+A "central" scheduler where scheduling decisions are made from a single CPU.
+This scheduler illustrates how scheduling decisions can be dispatched from a
+single CPU, allowing other cores to run with infinite slices, without timer
+ticks, and without having to incur the overhead of making scheduling decisions.
+
+The approach demonstrated by this scheduler may be useful for any workload that
+benefits from minimizing scheduling overhead and timer ticks. An example of
+where this could be particularly useful is running VMs, where running with
+infinite slices and no timer ticks allows the VM to avoid unnecessary expensive
+vmexits.
+
+## scx_flatcg
+
+A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical
+weight-based cgroup CPU control by flattening the cgroup hierarchy into a single
+layer, by compounding the active weight share at each level. The effect of this
+is a much more performant CPU controller, which does not need to descend down
+cgroup trees in order to properly compute a cgroup's share.
+
+Similar to scx_simple, in limited scenarios, this scheduler can perform
+reasonably well on single socket-socket systems with a unified L3 cache and show
+significantly lowered hierarchical scheduling overhead.
+
+
+# Troubleshooting
+
+There are a number of common issues that you may run into when building the
+schedulers. We'll go over some of the common ones here.
+
+## Build Failures
+
+### Old version of clang
+
+```
+error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole
+ _Static_assert(SCX_DSQ_FLAG_BUILTIN,
+ ^~~~~~~~~~~~~~~~~~~~
+1 error generated.
+```
+
+This means you built the kernel or the schedulers with an older version of
+clang than what's supported (i.e. older than 16.0.0). To remediate this:
+
+1. `which clang` to make sure you're using a sufficiently new version of clang.
+
+2. `make fullclean` in the root path of the repository, and rebuild the kernel
+ and schedulers.
+
+3. Rebuild the kernel, and then your example schedulers.
+
+The schedulers are also cleaned if you invoke `make mrproper` in the root
+directory of the tree.
+
+### Stale kernel build / incomplete vmlinux.h file
+
+As described above, you'll need a `vmlinux.h` file that was generated from a
+vmlinux built with BTF, and with sched_ext support enabled. If you don't,
+you'll see errors such as the following which indicate that a type being
+referenced in a scheduler is unknown:
+
+```
+/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info'
+
+const struct scx_exit_info *ei)
+
+^
+```
+
+In order to resolve this, please follow the steps above in
+[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your
+schedulers are using a vmlinux.h file that includes the requisite types.
+
+## Misc
+
+### llvm: [OFF]
+
+You may see the following output when building the schedulers:
+
+```
+Auto-detecting system features:
+... clang-bpf-co-re: [ on ]
+... llvm: [ OFF ]
+... libcap: [ on ]
+... libbfd: [ on ]
+```
+
+Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore.
diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h
new file mode 100644
index 000000000000..ad7d139ce907
--- /dev/null
+++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h
@@ -0,0 +1,11 @@
+/*
+ * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when
+ * compiling BPF files although its content doesn't play any role. The file in
+ * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is
+ * defined. When compiling a BPF source, __x86_64__ isn't set and thus
+ * stubs-32.h is selected. However, the file is not there if the system doesn't
+ * have 32bit glibc devel package installed leading to a build failure.
+ *
+ * The problem is worked around by making this file available in the include
+ * search paths before the system one when building BPF.
+ */
diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h
new file mode 100644
index 000000000000..f538c75db183
--- /dev/null
+++ b/tools/sched_ext/include/scx/common.bpf.h
@@ -0,0 +1,412 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef __SCX_COMMON_BPF_H
+#define __SCX_COMMON_BPF_H
+
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_tracing.h>
+#include <asm-generic/errno.h>
+#include "user_exit_info.h"
+
+#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
+#define PF_EXITING 0x00000004
+#define CLOCK_MONOTONIC 1
+
+/*
+ * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can
+ * lead to really confusing misbehaviors. Let's trigger a build failure.
+ */
+static inline void ___vmlinux_h_sanity_check___(void)
+{
+ _Static_assert(SCX_DSQ_FLAG_BUILTIN,
+ "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole");
+}
+
+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym;
+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym;
+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym;
+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym;
+u32 scx_bpf_dispatch_nr_slots(void) __ksym;
+void scx_bpf_dispatch_cancel(void) __ksym;
+bool scx_bpf_consume(u64 dsq_id) __ksym;
+void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym;
+void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym;
+bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak;
+u32 scx_bpf_reenqueue_local(void) __ksym;
+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym;
+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym;
+void scx_bpf_destroy_dsq(u64 dsq_id) __ksym;
+int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak;
+struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak;
+void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak;
+void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak;
+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym;
+void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak;
+u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak;
+u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak;
+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak;
+u32 scx_bpf_nr_cpu_ids(void) __ksym __weak;
+const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak;
+const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak;
+void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak;
+const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym;
+const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym;
+void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym;
+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym;
+s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
+s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym;
+bool scx_bpf_task_running(const struct task_struct *p) __ksym;
+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym;
+struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym;
+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym;
+
+/*
+ * Use the following as @it__iter when calling
+ * scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops.
+ */
+#define BPF_FOR_EACH_ITER (&___it)
+
+static inline __attribute__((format(printf, 1, 2)))
+void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {}
+
+/*
+ * Helper macro for initializing the fmt and variadic argument inputs to both
+ * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to
+ * refer to the initialized list of inputs to the bstr kfunc.
+ */
+#define scx_bpf_bstr_preamble(fmt, args...) \
+ static char ___fmt[] = fmt; \
+ /* \
+ * Note that __param[] must have at least one \
+ * element to keep the verifier happy. \
+ */ \
+ unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \
+ \
+ _Pragma("GCC diagnostic push") \
+ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \
+ ___bpf_fill(___param, args); \
+ _Pragma("GCC diagnostic pop") \
+
+/*
+ * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments
+ * instead of an array of u64. Using this macro will cause the scheduler to
+ * exit cleanly with the specified exit code being passed to user space.
+ */
+#define scx_bpf_exit(code, fmt, args...) \
+({ \
+ scx_bpf_bstr_preamble(fmt, args) \
+ scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \
+ ___scx_bpf_bstr_format_checker(fmt, ##args); \
+})
+
+/*
+ * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments
+ * instead of an array of u64. Invoking this macro will cause the scheduler to
+ * exit in an erroneous state, with diagnostic information being passed to the
+ * user.
+ */
+#define scx_bpf_error(fmt, args...) \
+({ \
+ scx_bpf_bstr_preamble(fmt, args) \
+ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \
+ ___scx_bpf_bstr_format_checker(fmt, ##args); \
+})
+
+/*
+ * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments
+ * instead of an array of u64. To be used from ops.dump() and friends.
+ */
+#define scx_bpf_dump(fmt, args...) \
+({ \
+ scx_bpf_bstr_preamble(fmt, args) \
+ scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \
+ ___scx_bpf_bstr_format_checker(fmt, ##args); \
+})
+
+#define BPF_STRUCT_OPS(name, args...) \
+SEC("struct_ops/"#name) \
+BPF_PROG(name, ##args)
+
+#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \
+SEC("struct_ops.s/"#name) \
+BPF_PROG(name, ##args)
+
+/**
+ * RESIZABLE_ARRAY - Generates annotations for an array that may be resized
+ * @elfsec: the data section of the BPF program in which to place the array
+ * @arr: the name of the array
+ *
+ * libbpf has an API for setting map value sizes. Since data sections (i.e.
+ * bss, data, rodata) themselves are maps, a data section can be resized. If
+ * a data section has an array as its last element, the BTF info for that
+ * array will be adjusted so that length of the array is extended to meet the
+ * new length of the data section. This macro annotates an array to have an
+ * element count of one with the assumption that this array can be resized
+ * within the userspace program. It also annotates the section specifier so
+ * this array exists in a custom sub data section which can be resized
+ * independently.
+ *
+ * See RESIZE_ARRAY() for the userspace convenience macro for resizing an
+ * array declared with RESIZABLE_ARRAY().
+ */
+#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr)
+
+/**
+ * MEMBER_VPTR - Obtain the verified pointer to a struct or array member
+ * @base: struct or array to index
+ * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...)
+ *
+ * The verifier often gets confused by the instruction sequence the compiler
+ * generates for indexing struct fields or arrays. This macro forces the
+ * compiler to generate a code sequence which first calculates the byte offset,
+ * checks it against the struct or array size and add that byte offset to
+ * generate the pointer to the member to help the verifier.
+ *
+ * Ideally, we want to abort if the calculated offset is out-of-bounds. However,
+ * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller
+ * must check for %NULL and take appropriate action to appease the verifier. To
+ * avoid confusing the verifier, it's best to check for %NULL and dereference
+ * immediately.
+ *
+ * vptr = MEMBER_VPTR(my_array, [i][j]);
+ * if (!vptr)
+ * return error;
+ * *vptr = new_value;
+ *
+ * sizeof(@base) should encompass the memory area to be accessed and thus can't
+ * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of
+ * `MEMBER_VPTR(ptr, ->member)`.
+ */
+#define MEMBER_VPTR(base, member) (typeof((base) member) *) \
+({ \
+ u64 __base = (u64)&(base); \
+ u64 __addr = (u64)&((base) member) - __base; \
+ _Static_assert(sizeof(base) >= sizeof((base) member), \
+ "@base is smaller than @member, is @base a pointer?"); \
+ asm volatile ( \
+ "if %0 <= %[max] goto +2\n" \
+ "%0 = 0\n" \
+ "goto +1\n" \
+ "%0 += %1\n" \
+ : "+r"(__addr) \
+ : "r"(__base), \
+ [max]"i"(sizeof(base) - sizeof((base) member))); \
+ __addr; \
+})
+
+/**
+ * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element
+ * @arr: array to index into
+ * @i: array index
+ * @n: number of elements in array
+ *
+ * Similar to MEMBER_VPTR() but is intended for use with arrays where the
+ * element count needs to be explicit.
+ * It can be used in cases where a global array is defined with an initial
+ * size but is intended to be be resized before loading the BPF program.
+ * Without this version of the macro, MEMBER_VPTR() will use the compile time
+ * size of the array to compute the max, which will result in rejection by
+ * the verifier.
+ */
+#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \
+({ \
+ u64 __base = (u64)arr; \
+ u64 __addr = (u64)&(arr[i]) - __base; \
+ asm volatile ( \
+ "if %0 <= %[max] goto +2\n" \
+ "%0 = 0\n" \
+ "goto +1\n" \
+ "%0 += %1\n" \
+ : "+r"(__addr) \
+ : "r"(__base), \
+ [max]"r"(sizeof(arr[0]) * ((n) - 1))); \
+ __addr; \
+})
+
+
+/*
+ * BPF declarations and helpers
+ */
+
+/* list and rbtree */
+#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node)))
+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8)))
+
+void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym;
+void bpf_obj_drop_impl(void *kptr, void *meta) __ksym;
+
+#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL))
+#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL)
+
+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym;
+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym;
+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym;
+struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root,
+ struct bpf_rb_node *node) __ksym;
+int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node,
+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b),
+ void *meta, __u64 off) __ksym;
+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0)
+
+struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym;
+
+void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym;
+#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL)
+
+/* task */
+struct task_struct *bpf_task_from_pid(s32 pid) __ksym;
+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym;
+void bpf_task_release(struct task_struct *p) __ksym;
+
+/* cgroup */
+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym;
+void bpf_cgroup_release(struct cgroup *cgrp) __ksym;
+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym;
+
+/* css iteration */
+struct bpf_iter_css;
+struct cgroup_subsys_state;
+extern int bpf_iter_css_new(struct bpf_iter_css *it,
+ struct cgroup_subsys_state *start,
+ unsigned int flags) __weak __ksym;
+extern struct cgroup_subsys_state *
+bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym;
+extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym;
+
+/* cpumask */
+struct bpf_cpumask *bpf_cpumask_create(void) __ksym;
+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym;
+u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym;
+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym;
+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym;
+void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym;
+bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1,
+ const struct cpumask *src2) __ksym;
+void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1,
+ const struct cpumask *src2) __ksym;
+void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1,
+ const struct cpumask *src2) __ksym;
+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym;
+bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym;
+bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym;
+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym;
+u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym;
+u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1,
+ const struct cpumask *src2) __ksym;
+
+/* rcu */
+void bpf_rcu_read_lock(void) __ksym;
+void bpf_rcu_read_unlock(void) __ksym;
+
+
+/*
+ * Other helpers
+ */
+
+/* useful compiler attributes */
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+#define __maybe_unused __attribute__((__unused__))
+
+/*
+ * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They
+ * prevent compiler from caching, redoing or reordering reads or writes.
+ */
+typedef __u8 __attribute__((__may_alias__)) __u8_alias_t;
+typedef __u16 __attribute__((__may_alias__)) __u16_alias_t;
+typedef __u32 __attribute__((__may_alias__)) __u32_alias_t;
+typedef __u64 __attribute__((__may_alias__)) __u64_alias_t;
+
+static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+{
+ switch (size) {
+ case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break;
+ case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break;
+ case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break;
+ case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break;
+ default:
+ barrier();
+ __builtin_memcpy((void *)res, (const void *)p, size);
+ barrier();
+ }
+}
+
+static __always_inline void __write_once_size(volatile void *p, void *res, int size)
+{
+ switch (size) {
+ case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break;
+ case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break;
+ case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break;
+ case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break;
+ default:
+ barrier();
+ __builtin_memcpy((void *)p, (const void *)res, size);
+ barrier();
+ }
+}
+
+#define READ_ONCE(x) \
+({ \
+ union { typeof(x) __val; char __c[1]; } __u = \
+ { .__c = { 0 } }; \
+ __read_once_size(&(x), __u.__c, sizeof(x)); \
+ __u.__val; \
+})
+
+#define WRITE_ONCE(x, val) \
+({ \
+ union { typeof(x) __val; char __c[1]; } __u = \
+ { .__val = (val) }; \
+ __write_once_size(&(x), __u.__c, sizeof(x)); \
+ __u.__val; \
+})
+
+/*
+ * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value.
+ * @v: The value for which we're computing the base 2 logarithm.
+ */
+static inline u32 log2_u32(u32 v)
+{
+ u32 r;
+ u32 shift;
+
+ r = (v > 0xFFFF) << 4; v >>= r;
+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift;
+ shift = (v > 0xF) << 2; v >>= shift; r |= shift;
+ shift = (v > 0x3) << 1; v >>= shift; r |= shift;
+ r |= (v >> 1);
+ return r;
+}
+
+/*
+ * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value.
+ * @v: The value for which we're computing the base 2 logarithm.
+ */
+static inline u32 log2_u64(u64 v)
+{
+ u32 hi = v >> 32;
+ if (hi)
+ return log2_u32(hi) + 32 + 1;
+ else
+ return log2_u32(v) + 1;
+}
+
+#include "compat.bpf.h"
+
+#endif /* __SCX_COMMON_BPF_H */
diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h
new file mode 100644
index 000000000000..5b0f90152152
--- /dev/null
+++ b/tools/sched_ext/include/scx/common.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
+ */
+#ifndef __SCHED_EXT_COMMON_H
+#define __SCHED_EXT_COMMON_H
+
+#ifdef __KERNEL__
+#error "Should not be included by BPF programs"
+#endif
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+typedef uint64_t u64;
+typedef int8_t s8;
+typedef int16_t s16;
+typedef int32_t s32;
+typedef int64_t s64;
+
+#define SCX_BUG(__fmt, ...) \
+ do { \
+ fprintf(stderr, "[SCX_BUG] %s:%d", __FILE__, __LINE__); \
+ if (errno) \
+ fprintf(stderr, " (%s)\n", strerror(errno)); \
+ else \
+ fprintf(stderr, "\n"); \
+ fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \
+ fprintf(stderr, "\n"); \
+ \
+ exit(EXIT_FAILURE); \
+ } while (0)
+
+#define SCX_BUG_ON(__cond, __fmt, ...) \
+ do { \
+ if (__cond) \
+ SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \
+ } while (0)
+
+/**
+ * RESIZE_ARRAY - Convenience macro for resizing a BPF array
+ * @__skel: the skeleton containing the array
+ * @elfsec: the data section of the BPF program in which the array exists
+ * @arr: the name of the array
+ * @n: the desired array element count
+ *
+ * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two
+ * operations. It resizes the map which corresponds to the custom data
+ * section that contains the target array. As a side effect, the BTF info for
+ * the array is adjusted so that the array length is sized to cover the new
+ * data section size. The second operation is reassigning the skeleton pointer
+ * for that custom data section so that it points to the newly memory mapped
+ * region.
+ */
+#define RESIZE_ARRAY(__skel, elfsec, arr, n) \
+ do { \
+ size_t __sz; \
+ bpf_map__set_value_size((__skel)->maps.elfsec##_##arr, \
+ sizeof((__skel)->elfsec##_##arr->arr[0]) * (n)); \
+ (__skel)->elfsec##_##arr = \
+ bpf_map__initial_value((__skel)->maps.elfsec##_##arr, &__sz); \
+ } while (0)
+
+#include "user_exit_info.h"
+#include "compat.h"
+
+#endif /* __SCHED_EXT_COMMON_H */
diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h
new file mode 100644
index 000000000000..3d2fe1208900
--- /dev/null
+++ b/tools/sched_ext/include/scx/compat.bpf.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
+ */
+#ifndef __SCX_COMPAT_BPF_H
+#define __SCX_COMPAT_BPF_H
+
+#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \
+({ \
+ __type __ret = 0; \
+ if (bpf_core_enum_value_exists(__type, __ent)) \
+ __ret = __ent; \
+ __ret; \
+})
+
+/*
+ * Define sched_ext_ops. This may be expanded to define multiple variants for
+ * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH().
+ */
+#define SCX_OPS_DEFINE(__name, ...) \
+ SEC(".struct_ops.link") \
+ struct sched_ext_ops __name = { \
+ __VA_ARGS__, \
+ };
+
+#endif /* __SCX_COMPAT_BPF_H */
diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h
new file mode 100644
index 000000000000..cc56ff9aa252
--- /dev/null
+++ b/tools/sched_ext/include/scx/compat.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2024 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2024 David Vernet <dvernet@meta.com>
+ */
+#ifndef __SCX_COMPAT_H
+#define __SCX_COMPAT_H
+
+#include <bpf/btf.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+struct btf *__COMPAT_vmlinux_btf __attribute__((weak));
+
+static inline void __COMPAT_load_vmlinux_btf(void)
+{
+ if (!__COMPAT_vmlinux_btf) {
+ __COMPAT_vmlinux_btf = btf__load_vmlinux_btf();
+ SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()");
+ }
+}
+
+static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v)
+{
+ const struct btf_type *t;
+ const char *n;
+ s32 tid;
+ int i;
+
+ __COMPAT_load_vmlinux_btf();
+
+ tid = btf__find_by_name(__COMPAT_vmlinux_btf, type);
+ if (tid < 0)
+ return false;
+
+ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
+ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);
+
+ if (btf_is_enum(t)) {
+ struct btf_enum *e = btf_enum(t);
+
+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
+ SCX_BUG_ON(!n, "btf__name_by_offset()");
+ if (!strcmp(n, name)) {
+ *v = e[i].val;
+ return true;
+ }
+ }
+ } else if (btf_is_enum64(t)) {
+ struct btf_enum64 *e = btf_enum64(t);
+
+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off);
+ SCX_BUG_ON(!n, "btf__name_by_offset()");
+ if (!strcmp(n, name)) {
+ *v = btf_enum64_value(&e[i]);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \
+({ \
+ u64 __val = 0; \
+ __COMPAT_read_enum(__type, __ent, &__val); \
+ __val; \
+})
+
+static inline bool __COMPAT_has_ksym(const char *ksym)
+{
+ __COMPAT_load_vmlinux_btf();
+ return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0;
+}
+
+static inline bool __COMPAT_struct_has_field(const char *type, const char *field)
+{
+ const struct btf_type *t;
+ const struct btf_member *m;
+ const char *n;
+ s32 tid;
+ int i;
+
+ __COMPAT_load_vmlinux_btf();
+ tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT);
+ if (tid < 0)
+ return false;
+
+ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid);
+ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid);
+
+ m = btf_members(t);
+
+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) {
+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off);
+ SCX_BUG_ON(!n, "btf__name_by_offset()");
+ if (!strcmp(n, field))
+ return true;
+ }
+
+ return false;
+}
+
+#define SCX_OPS_SWITCH_PARTIAL \
+ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL")
+
+static inline long scx_hotplug_seq(void)
+{
+ int fd;
+ char buf[32];
+ ssize_t len;
+ long val;
+
+ fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY);
+ if (fd < 0)
+ return -ENOENT;
+
+ len = read(fd, buf, sizeof(buf) - 1);
+ SCX_BUG_ON(len <= 0, "read failed (%ld)", len);
+ buf[len] = 0;
+ close(fd);
+
+ val = strtoul(buf, NULL, 10);
+ SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val);
+
+ return val;
+}
+
+/*
+ * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE()
+ * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load
+ * and attach it, backward compatibility is automatically maintained where
+ * reasonable.
+ *
+ * ec7e3b0463e1 ("implement-ops") in https://github.com/sched-ext/sched_ext is
+ * the current minimum required kernel version.
+ */
+#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \
+ struct __scx_name *__skel; \
+ \
+ SCX_BUG_ON(!__COMPAT_struct_has_field("sched_ext_ops", "dump"), \
+ "sched_ext_ops.dump() missing, kernel too old?"); \
+ \
+ __skel = __scx_name##__open(); \
+ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \
+ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \
+ __skel; \
+})
+
+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \
+ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \
+ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \
+})
+
+/*
+ * New versions of bpftool now emit additional link placeholders for BPF maps,
+ * and set up BPF skeleton in such a way that libbpf will auto-attach BPF maps
+ * automatically, assumming libbpf is recent enough (v1.5+). Old libbpf will do
+ * nothing with those links and won't attempt to auto-attach maps.
+ *
+ * To maintain compatibility with older libbpf while avoiding trying to attach
+ * twice, disable the autoattach feature on newer libbpf.
+ */
+#if LIBBPF_MAJOR_VERSION > 1 || \
+ (LIBBPF_MAJOR_VERSION == 1 && LIBBPF_MINOR_VERSION >= 5)
+#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) \
+ bpf_map__set_autoattach((__skel)->maps.__ops_name, false)
+#else
+#define __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name) do {} while (0)
+#endif
+
+#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \
+ struct bpf_link *__link; \
+ __SCX_OPS_DISABLE_AUTOATTACH(__skel, __ops_name); \
+ SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \
+ __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \
+ SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \
+ __link; \
+})
+
+#endif /* __SCX_COMPAT_H */
diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h
new file mode 100644
index 000000000000..891693ee604e
--- /dev/null
+++ b/tools/sched_ext/include/scx/user_exit_info.h
@@ -0,0 +1,111 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Define struct user_exit_info which is shared between BPF and userspace parts
+ * to communicate exit status and other information.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#ifndef __USER_EXIT_INFO_H
+#define __USER_EXIT_INFO_H
+
+enum uei_sizes {
+ UEI_REASON_LEN = 128,
+ UEI_MSG_LEN = 1024,
+ UEI_DUMP_DFL_LEN = 32768,
+};
+
+struct user_exit_info {
+ int kind;
+ s64 exit_code;
+ char reason[UEI_REASON_LEN];
+ char msg[UEI_MSG_LEN];
+};
+
+#ifdef __bpf__
+
+#include "vmlinux.h"
+#include <bpf/bpf_core_read.h>
+
+#define UEI_DEFINE(__name) \
+ char RESIZABLE_ARRAY(data, __name##_dump); \
+ const volatile u32 __name##_dump_len; \
+ struct user_exit_info __name SEC(".data")
+
+#define UEI_RECORD(__uei_name, __ei) ({ \
+ bpf_probe_read_kernel_str(__uei_name.reason, \
+ sizeof(__uei_name.reason), (__ei)->reason); \
+ bpf_probe_read_kernel_str(__uei_name.msg, \
+ sizeof(__uei_name.msg), (__ei)->msg); \
+ bpf_probe_read_kernel_str(__uei_name##_dump, \
+ __uei_name##_dump_len, (__ei)->dump); \
+ if (bpf_core_field_exists((__ei)->exit_code)) \
+ __uei_name.exit_code = (__ei)->exit_code; \
+ /* use __sync to force memory barrier */ \
+ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \
+ (__ei)->kind); \
+})
+
+#else /* !__bpf__ */
+
+#include <stdio.h>
+#include <stdbool.h>
+
+/* no need to call the following explicitly if SCX_OPS_LOAD() is used */
+#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \
+ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \
+ (__skel)->rodata->__uei_name##_dump_len = __len; \
+ RESIZE_ARRAY((__skel), data, __uei_name##_dump, __len); \
+})
+
+#define UEI_EXITED(__skel, __uei_name) ({ \
+ /* use __sync to force memory barrier */ \
+ __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \
+})
+
+#define UEI_REPORT(__skel, __uei_name) ({ \
+ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \
+ char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \
+ if (__uei_dump[0] != '\0') { \
+ fputs("\nDEBUG DUMP\n", stderr); \
+ fputs("================================================================================\n\n", stderr); \
+ fputs(__uei_dump, stderr); \
+ fputs("\n================================================================================\n\n", stderr); \
+ } \
+ fprintf(stderr, "EXIT: %s", __uei->reason); \
+ if (__uei->msg[0] != '\0') \
+ fprintf(stderr, " (%s)", __uei->msg); \
+ fputs("\n", stderr); \
+ __uei->exit_code; \
+})
+
+/*
+ * We can't import vmlinux.h while compiling user C code. Let's duplicate
+ * scx_exit_code definition.
+ */
+enum scx_exit_code {
+ /* Reasons */
+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32,
+
+ /* Actions */
+ SCX_ECODE_ACT_RESTART = 1LLU << 48,
+};
+
+enum uei_ecode_mask {
+ UEI_ECODE_USER_MASK = ((1LLU << 32) - 1),
+ UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32,
+ UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48,
+};
+
+/*
+ * These macro interpret the ecode returned from UEI_REPORT().
+ */
+#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK)
+#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK)
+#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK)
+
+#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART)
+
+#endif /* __bpf__ */
+#endif /* __USER_EXIT_INFO_H */
diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c
new file mode 100644
index 000000000000..8dd8eb73b6b8
--- /dev/null
+++ b/tools/sched_ext/scx_central.bpf.c
@@ -0,0 +1,361 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A central FIFO sched_ext scheduler which demonstrates the followings:
+ *
+ * a. Making all scheduling decisions from one CPU:
+ *
+ * The central CPU is the only one making scheduling decisions. All other
+ * CPUs kick the central CPU when they run out of tasks to run.
+ *
+ * There is one global BPF queue and the central CPU schedules all CPUs by
+ * dispatching from the global queue to each CPU's local dsq from dispatch().
+ * This isn't the most straightforward. e.g. It'd be easier to bounce
+ * through per-CPU BPF queues. The current design is chosen to maximally
+ * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching.
+ *
+ * b. Tickless operation
+ *
+ * All tasks are dispatched with the infinite slice which allows stopping the
+ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full
+ * parameter. The tickless operation can be observed through
+ * /proc/interrupts.
+ *
+ * Periodic switching is enforced by a periodic timer checking all CPUs and
+ * preempting them as necessary. Unfortunately, BPF timer currently doesn't
+ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to
+ * the central CPU.
+ *
+ * c. Preemption
+ *
+ * Kthreads are unconditionally queued to the head of a matching local dsq
+ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always
+ * prioritized over user threads, which is required for ensuring forward
+ * progress as e.g. the periodic timer may run on a ksoftirqd and if the
+ * ksoftirqd gets starved by a user thread, there may not be anything else to
+ * vacate that user thread.
+ *
+ * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the
+ * next tasks.
+ *
+ * This scheduler is designed to maximize usage of various SCX mechanisms. A
+ * more practical implementation would likely put the scheduling loop outside
+ * the central CPU's dispatch() path and add some form of priority mechanism.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+enum {
+ FALLBACK_DSQ_ID = 0,
+ MS_TO_NS = 1000LLU * 1000,
+ TIMER_INTERVAL_NS = 1 * MS_TO_NS,
+};
+
+const volatile s32 central_cpu;
+const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */
+const volatile u64 slice_ns = SCX_SLICE_DFL;
+
+bool timer_pinned = true;
+u64 nr_total, nr_locals, nr_queued, nr_lost_pids;
+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries;
+u64 nr_overflows;
+
+UEI_DEFINE(uei);
+
+struct {
+ __uint(type, BPF_MAP_TYPE_QUEUE);
+ __uint(max_entries, 4096);
+ __type(value, s32);
+} central_q SEC(".maps");
+
+/* can't use percpu map due to bad lookups */
+bool RESIZABLE_ARRAY(data, cpu_gimme_task);
+u64 RESIZABLE_ARRAY(data, cpu_started_at);
+
+struct central_timer {
+ struct bpf_timer timer;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, u32);
+ __type(value, struct central_timer);
+} central_timer SEC(".maps");
+
+static bool vtime_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags)
+{
+ /*
+ * Steer wakeups to the central CPU as much as possible to avoid
+ * disturbing other CPUs. It's safe to blindly return the central cpu as
+ * select_cpu() is a hint and if @p can't be on it, the kernel will
+ * automatically pick a fallback CPU.
+ */
+ return central_cpu;
+}
+
+void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ s32 pid = p->pid;
+
+ __sync_fetch_and_add(&nr_total, 1);
+
+ /*
+ * Push per-cpu kthreads at the head of local dsq's and preempt the
+ * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked
+ * behind other threads which is necessary for forward progress
+ * guarantee as we depend on the BPF timer which may run from ksoftirqd.
+ */
+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) {
+ __sync_fetch_and_add(&nr_locals, 1);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF,
+ enq_flags | SCX_ENQ_PREEMPT);
+ return;
+ }
+
+ if (bpf_map_push_elem(&central_q, &pid, 0)) {
+ __sync_fetch_and_add(&nr_overflows, 1);
+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags);
+ return;
+ }
+
+ __sync_fetch_and_add(&nr_queued, 1);
+
+ if (!scx_bpf_task_running(p))
+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT);
+}
+
+static bool dispatch_to_cpu(s32 cpu)
+{
+ struct task_struct *p;
+ s32 pid;
+
+ bpf_repeat(BPF_MAX_LOOPS) {
+ if (bpf_map_pop_elem(&central_q, &pid))
+ break;
+
+ __sync_fetch_and_sub(&nr_queued, 1);
+
+ p = bpf_task_from_pid(pid);
+ if (!p) {
+ __sync_fetch_and_add(&nr_lost_pids, 1);
+ continue;
+ }
+
+ /*
+ * If we can't run the task at the top, do the dumb thing and
+ * bounce it to the fallback dsq.
+ */
+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) {
+ __sync_fetch_and_add(&nr_mismatches, 1);
+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0);
+ bpf_task_release(p);
+ /*
+ * We might run out of dispatch buffer slots if we continue dispatching
+ * to the fallback DSQ, without dispatching to the local DSQ of the
+ * target CPU. In such a case, break the loop now as will fail the
+ * next dispatch operation.
+ */
+ if (!scx_bpf_dispatch_nr_slots())
+ break;
+ continue;
+ }
+
+ /* dispatch to local and mark that @cpu doesn't need more */
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0);
+
+ if (cpu != central_cpu)
+ scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+
+ bpf_task_release(p);
+ return true;
+ }
+
+ return false;
+}
+
+void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev)
+{
+ if (cpu == central_cpu) {
+ /* dispatch for all other CPUs first */
+ __sync_fetch_and_add(&nr_dispatches, 1);
+
+ bpf_for(cpu, 0, nr_cpu_ids) {
+ bool *gimme;
+
+ if (!scx_bpf_dispatch_nr_slots())
+ break;
+
+ /* central's gimme is never set */
+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids);
+ if (!gimme || !*gimme)
+ continue;
+
+ if (dispatch_to_cpu(cpu))
+ *gimme = false;
+ }
+
+ /*
+ * Retry if we ran out of dispatch buffer slots as we might have
+ * skipped some CPUs and also need to dispatch for self. The ext
+ * core automatically retries if the local dsq is empty but we
+ * can't rely on that as we're dispatching for other CPUs too.
+ * Kick self explicitly to retry.
+ */
+ if (!scx_bpf_dispatch_nr_slots()) {
+ __sync_fetch_and_add(&nr_retries, 1);
+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT);
+ return;
+ }
+
+ /* look for a task to run on the central CPU */
+ if (scx_bpf_consume(FALLBACK_DSQ_ID))
+ return;
+ dispatch_to_cpu(central_cpu);
+ } else {
+ bool *gimme;
+
+ if (scx_bpf_consume(FALLBACK_DSQ_ID))
+ return;
+
+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids);
+ if (gimme)
+ *gimme = true;
+
+ /*
+ * Force dispatch on the scheduling CPU so that it finds a task
+ * to run for us.
+ */
+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT);
+ }
+}
+
+void BPF_STRUCT_OPS(central_running, struct task_struct *p)
+{
+ s32 cpu = scx_bpf_task_cpu(p);
+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+ if (started_at)
+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */
+}
+
+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable)
+{
+ s32 cpu = scx_bpf_task_cpu(p);
+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+ if (started_at)
+ *started_at = 0;
+}
+
+static int central_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+ u64 now = bpf_ktime_get_ns();
+ u64 nr_to_kick = nr_queued;
+ s32 i, curr_cpu;
+
+ curr_cpu = bpf_get_smp_processor_id();
+ if (timer_pinned && (curr_cpu != central_cpu)) {
+ scx_bpf_error("Central timer ran on CPU %d, not central CPU %d",
+ curr_cpu, central_cpu);
+ return 0;
+ }
+
+ bpf_for(i, 0, nr_cpu_ids) {
+ s32 cpu = (nr_timers + i) % nr_cpu_ids;
+ u64 *started_at;
+
+ if (cpu == central_cpu)
+ continue;
+
+ /* kick iff the current one exhausted its slice */
+ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids);
+ if (started_at && *started_at &&
+ vtime_before(now, *started_at + slice_ns))
+ continue;
+
+ /* and there's something pending */
+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) ||
+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu))
+ ;
+ else if (nr_to_kick)
+ nr_to_kick--;
+ else
+ continue;
+
+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
+ }
+
+ bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN);
+ __sync_fetch_and_add(&nr_timers, 1);
+ return 0;
+}
+
+int BPF_STRUCT_OPS_SLEEPABLE(central_init)
+{
+ u32 key = 0;
+ struct bpf_timer *timer;
+ int ret;
+
+ ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1);
+ if (ret)
+ return ret;
+
+ timer = bpf_map_lookup_elem(&central_timer, &key);
+ if (!timer)
+ return -ESRCH;
+
+ if (bpf_get_smp_processor_id() != central_cpu) {
+ scx_bpf_error("init from non-central CPU");
+ return -EINVAL;
+ }
+
+ bpf_timer_init(timer, &central_timer, CLOCK_MONOTONIC);
+ bpf_timer_set_callback(timer, central_timerfn);
+
+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN);
+ /*
+ * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a
+ * kernel which doesn't have it, bpf_timer_start() will return -EINVAL.
+ * Retry without the PIN. This would be the perfect use case for
+ * bpf_core_enum_value_exists() but the enum type doesn't have a name
+ * and can't be used with bpf_core_enum_value_exists(). Oh well...
+ */
+ if (ret == -EINVAL) {
+ timer_pinned = false;
+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0);
+ }
+ if (ret)
+ scx_bpf_error("bpf_timer_start failed (%d)", ret);
+ return ret;
+}
+
+void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(central_ops,
+ /*
+ * We are offloading all scheduling decisions to the central CPU
+ * and thus being the last task on a given CPU doesn't mean
+ * anything special. Enqueue the last tasks like any other tasks.
+ */
+ .flags = SCX_OPS_ENQ_LAST,
+
+ .select_cpu = (void *)central_select_cpu,
+ .enqueue = (void *)central_enqueue,
+ .dispatch = (void *)central_dispatch,
+ .running = (void *)central_running,
+ .stopping = (void *)central_stopping,
+ .init = (void *)central_init,
+ .exit = (void *)central_exit,
+ .name = "central");
diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c
new file mode 100644
index 000000000000..21deea320bd7
--- /dev/null
+++ b/tools/sched_ext/scx_central.c
@@ -0,0 +1,135 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#define _GNU_SOURCE
+#include <sched.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_central.bpf.skel.h"
+
+const char help_fmt[] =
+"A central FIFO sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-s SLICE_US] [-c CPU]\n"
+"\n"
+" -s SLICE_US Override slice duration\n"
+" -c CPU Override the central CPU (default: 0)\n"
+" -v Print libbpf debug messages\n"
+" -h Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+ if (level == LIBBPF_DEBUG && !verbose)
+ return 0;
+ return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int dummy)
+{
+ exit_req = 1;
+}
+
+int main(int argc, char **argv)
+{
+ struct scx_central *skel;
+ struct bpf_link *link;
+ __u64 seq = 0, ecode;
+ __s32 opt;
+ cpu_set_t *cpuset;
+
+ libbpf_set_print(libbpf_print_fn);
+ signal(SIGINT, sigint_handler);
+ signal(SIGTERM, sigint_handler);
+restart:
+ skel = SCX_OPS_OPEN(central_ops, scx_central);
+
+ skel->rodata->central_cpu = 0;
+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus();
+
+ while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) {
+ switch (opt) {
+ case 's':
+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
+ break;
+ case 'c':
+ skel->rodata->central_cpu = strtoul(optarg, NULL, 0);
+ break;
+ case 'v':
+ verbose = true;
+ break;
+ default:
+ fprintf(stderr, help_fmt, basename(argv[0]));
+ return opt != 'h';
+ }
+ }
+
+ /* Resize arrays so their element count is equal to cpu count. */
+ RESIZE_ARRAY(skel, data, cpu_gimme_task, skel->rodata->nr_cpu_ids);
+ RESIZE_ARRAY(skel, data, cpu_started_at, skel->rodata->nr_cpu_ids);
+
+ SCX_OPS_LOAD(skel, central_ops, scx_central, uei);
+
+ /*
+ * Affinitize the loading thread to the central CPU, as:
+ * - That's where the BPF timer is first invoked in the BPF program.
+ * - We probably don't want this user space component to take up a core
+ * from a task that would benefit from avoiding preemption on one of
+ * the tickless cores.
+ *
+ * Until BPF supports pinning the timer, it's not guaranteed that it
+ * will always be invoked on the central CPU. In practice, this
+ * suffices the majority of the time.
+ */
+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids);
+ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset");
+ CPU_ZERO(cpuset);
+ CPU_SET(skel->rodata->central_cpu, cpuset);
+ SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset),
+ "Failed to affinitize to central CPU %d (max %d)",
+ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1);
+ CPU_FREE(cpuset);
+
+ link = SCX_OPS_ATTACH(skel, central_ops, scx_central);
+
+ if (!skel->data->timer_pinned)
+ printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n");
+
+ while (!exit_req && !UEI_EXITED(skel, uei)) {
+ printf("[SEQ %llu]\n", seq++);
+ printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n",
+ skel->bss->nr_total,
+ skel->bss->nr_locals,
+ skel->bss->nr_queued,
+ skel->bss->nr_lost_pids);
+ printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n",
+ skel->bss->nr_timers,
+ skel->bss->nr_dispatches,
+ skel->bss->nr_mismatches,
+ skel->bss->nr_retries);
+ printf("overflow:%10" PRIu64 "\n",
+ skel->bss->nr_overflows);
+ fflush(stdout);
+ sleep(1);
+ }
+
+ bpf_link__destroy(link);
+ ecode = UEI_REPORT(skel, uei);
+ scx_central__destroy(skel);
+
+ if (UEI_ECODE_RESTART(ecode))
+ goto restart;
+ return 0;
+}
diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c
new file mode 100644
index 000000000000..3ab2b60781a0
--- /dev/null
+++ b/tools/sched_ext/scx_flatcg.bpf.c
@@ -0,0 +1,949 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements
+ * hierarchical weight-based cgroup CPU control by flattening the cgroup
+ * hierarchy into a single layer by compounding the active weight share at each
+ * level. Consider the following hierarchy with weights in parentheses:
+ *
+ * R + A (100) + B (100)
+ * | \ C (100)
+ * \ D (200)
+ *
+ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks.
+ * Let's say all three have runnable tasks. The total share that each of these
+ * three cgroups is entitled to can be calculated by compounding its share at
+ * each level.
+ *
+ * For example, B is competing against C and in that competition its share is
+ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's
+ * share in that competition is 100/(200+100) == 1/3. B's eventual share in the
+ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's
+ * eventual shaer is the same at 1/6. D is only competing at the top level and
+ * its share is 200/(100+200) == 2/3.
+ *
+ * So, instead of hierarchically scheduling level-by-level, we can consider it
+ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3
+ * and keep updating the eventual shares as the cgroups' runnable states change.
+ *
+ * This flattening of hierarchy can bring a substantial performance gain when
+ * the cgroup hierarchy is nested multiple levels. in a simple benchmark using
+ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it
+ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two
+ * apache instances competing with 2:1 weight ratio nested four level deep.
+ *
+ * However, the gain comes at the cost of not being able to properly handle
+ * thundering herd of cgroups. For example, if many cgroups which are nested
+ * behind a low priority parent cgroup wake up around the same time, they may be
+ * able to consume more CPU cycles than they are entitled to. In many use cases,
+ * this isn't a real concern especially given the performance gain. Also, there
+ * are ways to mitigate the problem further by e.g. introducing an extra
+ * scheduling layer on cgroup delegation boundaries.
+ *
+ * The scheduler first picks the cgroup to run and then schedule the tasks
+ * within by using nested weighted vtime scheduling by default. The
+ * cgroup-internal scheduling can be switched to FIFO with the -f option.
+ */
+#include <scx/common.bpf.h>
+#include "scx_flatcg.h"
+
+/*
+ * Maximum amount of retries to find a valid cgroup.
+ */
+#define CGROUP_MAX_RETRIES 1024
+
+char _license[] SEC("license") = "GPL";
+
+const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */
+const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL;
+const volatile bool fifo_sched;
+
+u64 cvtime_now;
+UEI_DEFINE(uei);
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, u32);
+ __type(value, u64);
+ __uint(max_entries, FCG_NR_STATS);
+} stats SEC(".maps");
+
+static void stat_inc(enum fcg_stat_idx idx)
+{
+ u32 idx_v = idx;
+
+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v);
+ if (cnt_p)
+ (*cnt_p)++;
+}
+
+struct fcg_cpu_ctx {
+ u64 cur_cgid;
+ u64 cur_at;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __type(key, u32);
+ __type(value, struct fcg_cpu_ctx);
+ __uint(max_entries, 1);
+} cpu_ctx SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct fcg_cgrp_ctx);
+} cgrp_ctx SEC(".maps");
+
+struct cgv_node {
+ struct bpf_rb_node rb_node;
+ __u64 cvtime;
+ __u64 cgid;
+};
+
+private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock;
+private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node);
+
+struct cgv_node_stash {
+ struct cgv_node __kptr *node;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __uint(max_entries, 16384);
+ __type(key, __u64);
+ __type(value, struct cgv_node_stash);
+} cgv_node_stash SEC(".maps");
+
+struct fcg_task_ctx {
+ u64 bypassed_at;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct fcg_task_ctx);
+} task_ctx SEC(".maps");
+
+/* gets inc'd on weight tree changes to expire the cached hweights */
+u64 hweight_gen = 1;
+
+static u64 div_round_up(u64 dividend, u64 divisor)
+{
+ return (dividend + divisor - 1) / divisor;
+}
+
+static bool vtime_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b)
+{
+ struct cgv_node *cgc_a, *cgc_b;
+
+ cgc_a = container_of(a, struct cgv_node, rb_node);
+ cgc_b = container_of(b, struct cgv_node, rb_node);
+
+ return cgc_a->cvtime < cgc_b->cvtime;
+}
+
+static struct fcg_cpu_ctx *find_cpu_ctx(void)
+{
+ struct fcg_cpu_ctx *cpuc;
+ u32 idx = 0;
+
+ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx);
+ if (!cpuc) {
+ scx_bpf_error("cpu_ctx lookup failed");
+ return NULL;
+ }
+ return cpuc;
+}
+
+static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp)
+{
+ struct fcg_cgrp_ctx *cgc;
+
+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
+ if (!cgc) {
+ scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id);
+ return NULL;
+ }
+ return cgc;
+}
+
+static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level)
+{
+ struct fcg_cgrp_ctx *cgc;
+
+ cgrp = bpf_cgroup_ancestor(cgrp, level);
+ if (!cgrp) {
+ scx_bpf_error("ancestor cgroup lookup failed");
+ return NULL;
+ }
+
+ cgc = find_cgrp_ctx(cgrp);
+ if (!cgc)
+ scx_bpf_error("ancestor cgrp_ctx lookup failed");
+ bpf_cgroup_release(cgrp);
+ return cgc;
+}
+
+static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
+{
+ int level;
+
+ if (!cgc->nr_active) {
+ stat_inc(FCG_STAT_HWT_SKIP);
+ return;
+ }
+
+ if (cgc->hweight_gen == hweight_gen) {
+ stat_inc(FCG_STAT_HWT_CACHE);
+ return;
+ }
+
+ stat_inc(FCG_STAT_HWT_UPDATES);
+ bpf_for(level, 0, cgrp->level + 1) {
+ struct fcg_cgrp_ctx *cgc;
+ bool is_active;
+
+ cgc = find_ancestor_cgrp_ctx(cgrp, level);
+ if (!cgc)
+ break;
+
+ if (!level) {
+ cgc->hweight = FCG_HWEIGHT_ONE;
+ cgc->hweight_gen = hweight_gen;
+ } else {
+ struct fcg_cgrp_ctx *pcgc;
+
+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
+ if (!pcgc)
+ break;
+
+ /*
+ * We can be oppotunistic here and not grab the
+ * cgv_tree_lock and deal with the occasional races.
+ * However, hweight updates are already cached and
+ * relatively low-frequency. Let's just do the
+ * straightforward thing.
+ */
+ bpf_spin_lock(&cgv_tree_lock);
+ is_active = cgc->nr_active;
+ if (is_active) {
+ cgc->hweight_gen = pcgc->hweight_gen;
+ cgc->hweight =
+ div_round_up(pcgc->hweight * cgc->weight,
+ pcgc->child_weight_sum);
+ }
+ bpf_spin_unlock(&cgv_tree_lock);
+
+ if (!is_active) {
+ stat_inc(FCG_STAT_HWT_RACE);
+ break;
+ }
+ }
+ }
+}
+
+static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc)
+{
+ u64 delta, cvtime, max_budget;
+
+ /*
+ * A node which is on the rbtree can't be pointed to from elsewhere yet
+ * and thus can't be updated and repositioned. Instead, we collect the
+ * vtime deltas separately and apply it asynchronously here.
+ */
+ delta = cgc->cvtime_delta;
+ __sync_fetch_and_sub(&cgc->cvtime_delta, delta);
+ cvtime = cgv_node->cvtime + delta;
+
+ /*
+ * Allow a cgroup to carry the maximum budget proportional to its
+ * hweight such that a full-hweight cgroup can immediately take up half
+ * of the CPUs at the most while staying at the front of the rbtree.
+ */
+ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) /
+ (2 * FCG_HWEIGHT_ONE);
+ if (vtime_before(cvtime, cvtime_now - max_budget))
+ cvtime = cvtime_now - max_budget;
+
+ cgv_node->cvtime = cvtime;
+}
+
+static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc)
+{
+ struct cgv_node_stash *stash;
+ struct cgv_node *cgv_node;
+ u64 cgid = cgrp->kn->id;
+
+ /* paired with cmpxchg in try_pick_next_cgroup() */
+ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) {
+ stat_inc(FCG_STAT_ENQ_SKIP);
+ return;
+ }
+
+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
+ if (!stash) {
+ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid);
+ return;
+ }
+
+ /* NULL if the node is already on the rbtree */
+ cgv_node = bpf_kptr_xchg(&stash->node, NULL);
+ if (!cgv_node) {
+ stat_inc(FCG_STAT_ENQ_RACE);
+ return;
+ }
+
+ bpf_spin_lock(&cgv_tree_lock);
+ cgrp_cap_budget(cgv_node, cgc);
+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
+ bpf_spin_unlock(&cgv_tree_lock);
+}
+
+static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc)
+{
+ /*
+ * Tell fcg_stopping() that this bypassed the regular scheduling path
+ * and should be force charged to the cgroup. 0 is used to indicate that
+ * the task isn't bypassing, so if the current runtime is 0, go back by
+ * one nanosecond.
+ */
+ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1;
+}
+
+s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+ struct fcg_task_ctx *taskc;
+ bool is_idle = false;
+ s32 cpu;
+
+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
+
+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
+ if (!taskc) {
+ scx_bpf_error("task_ctx lookup failed");
+ return cpu;
+ }
+
+ /*
+ * If select_cpu_dfl() is recommending local enqueue, the target CPU is
+ * idle. Follow it and charge the cgroup later in fcg_stopping() after
+ * the fact.
+ */
+ if (is_idle) {
+ set_bypassed_at(p, taskc);
+ stat_inc(FCG_STAT_LOCAL);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+ }
+
+ return cpu;
+}
+
+void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ struct fcg_task_ctx *taskc;
+ struct cgroup *cgrp;
+ struct fcg_cgrp_ctx *cgc;
+
+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
+ if (!taskc) {
+ scx_bpf_error("task_ctx lookup failed");
+ return;
+ }
+
+ /*
+ * Use the direct dispatching and force charging to deal with tasks with
+ * custom affinities so that we don't have to worry about per-cgroup
+ * dq's containing tasks that can't be executed from some CPUs.
+ */
+ if (p->nr_cpus_allowed != nr_cpus) {
+ set_bypassed_at(p, taskc);
+
+ /*
+ * The global dq is deprioritized as we don't want to let tasks
+ * to boost themselves by constraining its cpumask. The
+ * deprioritization is rather severe, so let's not apply that to
+ * per-cpu kernel threads. This is ham-fisted. We probably wanna
+ * implement per-cgroup fallback dq's instead so that we have
+ * more control over when tasks with custom cpumask get issued.
+ */
+ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) {
+ stat_inc(FCG_STAT_LOCAL);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
+ } else {
+ stat_inc(FCG_STAT_GLOBAL);
+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+ }
+ return;
+ }
+
+ cgrp = scx_bpf_task_cgroup(p);
+ cgc = find_cgrp_ctx(cgrp);
+ if (!cgc)
+ goto out_release;
+
+ if (fifo_sched) {
+ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags);
+ } else {
+ u64 tvtime = p->scx.dsq_vtime;
+
+ /*
+ * Limit the amount of budget that an idling task can accumulate
+ * to one slice.
+ */
+ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL))
+ tvtime = cgc->tvtime_now - SCX_SLICE_DFL;
+
+ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL,
+ tvtime, enq_flags);
+ }
+
+ cgrp_enqueued(cgrp, cgc);
+out_release:
+ bpf_cgroup_release(cgrp);
+}
+
+/*
+ * Walk the cgroup tree to update the active weight sums as tasks wake up and
+ * sleep. The weight sums are used as the base when calculating the proportion a
+ * given cgroup or task is entitled to at each level.
+ */
+static void update_active_weight_sums(struct cgroup *cgrp, bool runnable)
+{
+ struct fcg_cgrp_ctx *cgc;
+ bool updated = false;
+ int idx;
+
+ cgc = find_cgrp_ctx(cgrp);
+ if (!cgc)
+ return;
+
+ /*
+ * In most cases, a hot cgroup would have multiple threads going to
+ * sleep and waking up while the whole cgroup stays active. In leaf
+ * cgroups, ->nr_runnable which is updated with __sync operations gates
+ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock
+ * repeatedly for a busy cgroup which is staying active.
+ */
+ if (runnable) {
+ if (__sync_fetch_and_add(&cgc->nr_runnable, 1))
+ return;
+ stat_inc(FCG_STAT_ACT);
+ } else {
+ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1))
+ return;
+ stat_inc(FCG_STAT_DEACT);
+ }
+
+ /*
+ * If @cgrp is becoming runnable, its hweight should be refreshed after
+ * it's added to the weight tree so that enqueue has the up-to-date
+ * value. If @cgrp is becoming quiescent, the hweight should be
+ * refreshed before it's removed from the weight tree so that the usage
+ * charging which happens afterwards has access to the latest value.
+ */
+ if (!runnable)
+ cgrp_refresh_hweight(cgrp, cgc);
+
+ /* propagate upwards */
+ bpf_for(idx, 0, cgrp->level) {
+ int level = cgrp->level - idx;
+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
+ bool propagate = false;
+
+ cgc = find_ancestor_cgrp_ctx(cgrp, level);
+ if (!cgc)
+ break;
+ if (level) {
+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1);
+ if (!pcgc)
+ break;
+ }
+
+ /*
+ * We need the propagation protected by a lock to synchronize
+ * against weight changes. There's no reason to drop the lock at
+ * each level but bpf_spin_lock() doesn't want any function
+ * calls while locked.
+ */
+ bpf_spin_lock(&cgv_tree_lock);
+
+ if (runnable) {
+ if (!cgc->nr_active++) {
+ updated = true;
+ if (pcgc) {
+ propagate = true;
+ pcgc->child_weight_sum += cgc->weight;
+ }
+ }
+ } else {
+ if (!--cgc->nr_active) {
+ updated = true;
+ if (pcgc) {
+ propagate = true;
+ pcgc->child_weight_sum -= cgc->weight;
+ }
+ }
+ }
+
+ bpf_spin_unlock(&cgv_tree_lock);
+
+ if (!propagate)
+ break;
+ }
+
+ if (updated)
+ __sync_fetch_and_add(&hweight_gen, 1);
+
+ if (runnable)
+ cgrp_refresh_hweight(cgrp, cgc);
+}
+
+void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags)
+{
+ struct cgroup *cgrp;
+
+ cgrp = scx_bpf_task_cgroup(p);
+ update_active_weight_sums(cgrp, true);
+ bpf_cgroup_release(cgrp);
+}
+
+void BPF_STRUCT_OPS(fcg_running, struct task_struct *p)
+{
+ struct cgroup *cgrp;
+ struct fcg_cgrp_ctx *cgc;
+
+ if (fifo_sched)
+ return;
+
+ cgrp = scx_bpf_task_cgroup(p);
+ cgc = find_cgrp_ctx(cgrp);
+ if (cgc) {
+ /*
+ * @cgc->tvtime_now always progresses forward as tasks start
+ * executing. The test and update can be performed concurrently
+ * from multiple CPUs and thus racy. Any error should be
+ * contained and temporary. Let's just live with it.
+ */
+ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime))
+ cgc->tvtime_now = p->scx.dsq_vtime;
+ }
+ bpf_cgroup_release(cgrp);
+}
+
+void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable)
+{
+ struct fcg_task_ctx *taskc;
+ struct cgroup *cgrp;
+ struct fcg_cgrp_ctx *cgc;
+
+ /*
+ * Scale the execution time by the inverse of the weight and charge.
+ *
+ * Note that the default yield implementation yields by setting
+ * @p->scx.slice to zero and the following would treat the yielding task
+ * as if it has consumed all its slice. If this penalizes yielding tasks
+ * too much, determine the execution time by taking explicit timestamps
+ * instead of depending on @p->scx.slice.
+ */
+ if (!fifo_sched)
+ p->scx.dsq_vtime +=
+ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+
+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0);
+ if (!taskc) {
+ scx_bpf_error("task_ctx lookup failed");
+ return;
+ }
+
+ if (!taskc->bypassed_at)
+ return;
+
+ cgrp = scx_bpf_task_cgroup(p);
+ cgc = find_cgrp_ctx(cgrp);
+ if (cgc) {
+ __sync_fetch_and_add(&cgc->cvtime_delta,
+ p->se.sum_exec_runtime - taskc->bypassed_at);
+ taskc->bypassed_at = 0;
+ }
+ bpf_cgroup_release(cgrp);
+}
+
+void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags)
+{
+ struct cgroup *cgrp;
+
+ cgrp = scx_bpf_task_cgroup(p);
+ update_active_weight_sums(cgrp, false);
+ bpf_cgroup_release(cgrp);
+}
+
+void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight)
+{
+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL;
+
+ cgc = find_cgrp_ctx(cgrp);
+ if (!cgc)
+ return;
+
+ if (cgrp->level) {
+ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1);
+ if (!pcgc)
+ return;
+ }
+
+ bpf_spin_lock(&cgv_tree_lock);
+ if (pcgc && cgc->nr_active)
+ pcgc->child_weight_sum += (s64)weight - cgc->weight;
+ cgc->weight = weight;
+ bpf_spin_unlock(&cgv_tree_lock);
+}
+
+static bool try_pick_next_cgroup(u64 *cgidp)
+{
+ struct bpf_rb_node *rb_node;
+ struct cgv_node_stash *stash;
+ struct cgv_node *cgv_node;
+ struct fcg_cgrp_ctx *cgc;
+ struct cgroup *cgrp;
+ u64 cgid;
+
+ /* pop the front cgroup and wind cvtime_now accordingly */
+ bpf_spin_lock(&cgv_tree_lock);
+
+ rb_node = bpf_rbtree_first(&cgv_tree);
+ if (!rb_node) {
+ bpf_spin_unlock(&cgv_tree_lock);
+ stat_inc(FCG_STAT_PNC_NO_CGRP);
+ *cgidp = 0;
+ return true;
+ }
+
+ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node);
+ bpf_spin_unlock(&cgv_tree_lock);
+
+ if (!rb_node) {
+ /*
+ * This should never happen. bpf_rbtree_first() was called
+ * above while the tree lock was held, so the node should
+ * always be present.
+ */
+ scx_bpf_error("node could not be removed");
+ return true;
+ }
+
+ cgv_node = container_of(rb_node, struct cgv_node, rb_node);
+ cgid = cgv_node->cgid;
+
+ if (vtime_before(cvtime_now, cgv_node->cvtime))
+ cvtime_now = cgv_node->cvtime;
+
+ /*
+ * If lookup fails, the cgroup's gone. Free and move on. See
+ * fcg_cgroup_exit().
+ */
+ cgrp = bpf_cgroup_from_id(cgid);
+ if (!cgrp) {
+ stat_inc(FCG_STAT_PNC_GONE);
+ goto out_free;
+ }
+
+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
+ if (!cgc) {
+ bpf_cgroup_release(cgrp);
+ stat_inc(FCG_STAT_PNC_GONE);
+ goto out_free;
+ }
+
+ if (!scx_bpf_consume(cgid)) {
+ bpf_cgroup_release(cgrp);
+ stat_inc(FCG_STAT_PNC_EMPTY);
+ goto out_stash;
+ }
+
+ /*
+ * Successfully consumed from the cgroup. This will be our current
+ * cgroup for the new slice. Refresh its hweight.
+ */
+ cgrp_refresh_hweight(cgrp, cgc);
+
+ bpf_cgroup_release(cgrp);
+
+ /*
+ * As the cgroup may have more tasks, add it back to the rbtree. Note
+ * that here we charge the full slice upfront and then exact later
+ * according to the actual consumption. This prevents lowpri thundering
+ * herd from saturating the machine.
+ */
+ bpf_spin_lock(&cgv_tree_lock);
+ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1);
+ cgrp_cap_budget(cgv_node, cgc);
+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
+ bpf_spin_unlock(&cgv_tree_lock);
+
+ *cgidp = cgid;
+ stat_inc(FCG_STAT_PNC_NEXT);
+ return true;
+
+out_stash:
+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
+ if (!stash) {
+ stat_inc(FCG_STAT_PNC_GONE);
+ goto out_free;
+ }
+
+ /*
+ * Paired with cmpxchg in cgrp_enqueued(). If they see the following
+ * transition, they'll enqueue the cgroup. If they are earlier, we'll
+ * see their task in the dq below and requeue the cgroup.
+ */
+ __sync_val_compare_and_swap(&cgc->queued, 1, 0);
+
+ if (scx_bpf_dsq_nr_queued(cgid)) {
+ bpf_spin_lock(&cgv_tree_lock);
+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less);
+ bpf_spin_unlock(&cgv_tree_lock);
+ stat_inc(FCG_STAT_PNC_RACE);
+ } else {
+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
+ if (cgv_node) {
+ scx_bpf_error("unexpected !NULL cgv_node stash");
+ goto out_free;
+ }
+ }
+
+ return false;
+
+out_free:
+ bpf_obj_drop(cgv_node);
+ return false;
+}
+
+void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev)
+{
+ struct fcg_cpu_ctx *cpuc;
+ struct fcg_cgrp_ctx *cgc;
+ struct cgroup *cgrp;
+ u64 now = bpf_ktime_get_ns();
+ bool picked_next = false;
+
+ cpuc = find_cpu_ctx();
+ if (!cpuc)
+ return;
+
+ if (!cpuc->cur_cgid)
+ goto pick_next_cgroup;
+
+ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) {
+ if (scx_bpf_consume(cpuc->cur_cgid)) {
+ stat_inc(FCG_STAT_CNS_KEEP);
+ return;
+ }
+ stat_inc(FCG_STAT_CNS_EMPTY);
+ } else {
+ stat_inc(FCG_STAT_CNS_EXPIRE);
+ }
+
+ /*
+ * The current cgroup is expiring. It was already charged a full slice.
+ * Calculate the actual usage and accumulate the delta.
+ */
+ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid);
+ if (!cgrp) {
+ stat_inc(FCG_STAT_CNS_GONE);
+ goto pick_next_cgroup;
+ }
+
+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0);
+ if (cgc) {
+ /*
+ * We want to update the vtime delta and then look for the next
+ * cgroup to execute but the latter needs to be done in a loop
+ * and we can't keep the lock held. Oh well...
+ */
+ bpf_spin_lock(&cgv_tree_lock);
+ __sync_fetch_and_add(&cgc->cvtime_delta,
+ (cpuc->cur_at + cgrp_slice_ns - now) *
+ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1));
+ bpf_spin_unlock(&cgv_tree_lock);
+ } else {
+ stat_inc(FCG_STAT_CNS_GONE);
+ }
+
+ bpf_cgroup_release(cgrp);
+
+pick_next_cgroup:
+ cpuc->cur_at = now;
+
+ if (scx_bpf_consume(SCX_DSQ_GLOBAL)) {
+ cpuc->cur_cgid = 0;
+ return;
+ }
+
+ bpf_repeat(CGROUP_MAX_RETRIES) {
+ if (try_pick_next_cgroup(&cpuc->cur_cgid)) {
+ picked_next = true;
+ break;
+ }
+ }
+
+ /*
+ * This only happens if try_pick_next_cgroup() races against enqueue
+ * path for more than CGROUP_MAX_RETRIES times, which is extremely
+ * unlikely and likely indicates an underlying bug. There shouldn't be
+ * any stall risk as the race is against enqueue.
+ */
+ if (!picked_next)
+ stat_inc(FCG_STAT_PNC_FAIL);
+}
+
+s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
+{
+ struct fcg_task_ctx *taskc;
+ struct fcg_cgrp_ctx *cgc;
+
+ /*
+ * @p is new. Let's ensure that its task_ctx is available. We can sleep
+ * in this function and the following will automatically use GFP_KERNEL.
+ */
+ taskc = bpf_task_storage_get(&task_ctx, p, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!taskc)
+ return -ENOMEM;
+
+ taskc->bypassed_at = 0;
+
+ if (!(cgc = find_cgrp_ctx(args->cgroup)))
+ return -ENOENT;
+
+ p->scx.dsq_vtime = cgc->tvtime_now;
+
+ return 0;
+}
+
+int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp,
+ struct scx_cgroup_init_args *args)
+{
+ struct fcg_cgrp_ctx *cgc;
+ struct cgv_node *cgv_node;
+ struct cgv_node_stash empty_stash = {}, *stash;
+ u64 cgid = cgrp->kn->id;
+ int ret;
+
+ /*
+ * Technically incorrect as cgroup ID is full 64bit while dq ID is
+ * 63bit. Should not be a problem in practice and easy to spot in the
+ * unlikely case that it breaks.
+ */
+ ret = scx_bpf_create_dsq(cgid, -1);
+ if (ret)
+ return ret;
+
+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!cgc) {
+ ret = -ENOMEM;
+ goto err_destroy_dsq;
+ }
+
+ cgc->weight = args->weight;
+ cgc->hweight = FCG_HWEIGHT_ONE;
+
+ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash,
+ BPF_NOEXIST);
+ if (ret) {
+ if (ret != -ENOMEM)
+ scx_bpf_error("unexpected stash creation error (%d)",
+ ret);
+ goto err_destroy_dsq;
+ }
+
+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid);
+ if (!stash) {
+ scx_bpf_error("unexpected cgv_node stash lookup failure");
+ ret = -ENOENT;
+ goto err_destroy_dsq;
+ }
+
+ cgv_node = bpf_obj_new(struct cgv_node);
+ if (!cgv_node) {
+ ret = -ENOMEM;
+ goto err_del_cgv_node;
+ }
+
+ cgv_node->cgid = cgid;
+ cgv_node->cvtime = cvtime_now;
+
+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node);
+ if (cgv_node) {
+ scx_bpf_error("unexpected !NULL cgv_node stash");
+ ret = -EBUSY;
+ goto err_drop;
+ }
+
+ return 0;
+
+err_drop:
+ bpf_obj_drop(cgv_node);
+err_del_cgv_node:
+ bpf_map_delete_elem(&cgv_node_stash, &cgid);
+err_destroy_dsq:
+ scx_bpf_destroy_dsq(cgid);
+ return ret;
+}
+
+void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp)
+{
+ u64 cgid = cgrp->kn->id;
+
+ /*
+ * For now, there's no way find and remove the cgv_node if it's on the
+ * cgv_tree. Let's drain them in the dispatch path as they get popped
+ * off the front of the tree.
+ */
+ bpf_map_delete_elem(&cgv_node_stash, &cgid);
+ scx_bpf_destroy_dsq(cgid);
+}
+
+void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p,
+ struct cgroup *from, struct cgroup *to)
+{
+ struct fcg_cgrp_ctx *from_cgc, *to_cgc;
+ s64 vtime_delta;
+
+ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */
+ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to)))
+ return;
+
+ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now;
+ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta;
+}
+
+void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(flatcg_ops,
+ .select_cpu = (void *)fcg_select_cpu,
+ .enqueue = (void *)fcg_enqueue,
+ .dispatch = (void *)fcg_dispatch,
+ .runnable = (void *)fcg_runnable,
+ .running = (void *)fcg_running,
+ .stopping = (void *)fcg_stopping,
+ .quiescent = (void *)fcg_quiescent,
+ .init_task = (void *)fcg_init_task,
+ .cgroup_set_weight = (void *)fcg_cgroup_set_weight,
+ .cgroup_init = (void *)fcg_cgroup_init,
+ .cgroup_exit = (void *)fcg_cgroup_exit,
+ .cgroup_move = (void *)fcg_cgroup_move,
+ .exit = (void *)fcg_exit,
+ .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING,
+ .name = "flatcg");
diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c
new file mode 100644
index 000000000000..5d24ca9c29d9
--- /dev/null
+++ b/tools/sched_ext/scx_flatcg.c
@@ -0,0 +1,233 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2023 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2023 David Vernet <dvernet@meta.com>
+ */
+#include <stdio.h>
+#include <signal.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <limits.h>
+#include <inttypes.h>
+#include <fcntl.h>
+#include <time.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_flatcg.h"
+#include "scx_flatcg.bpf.skel.h"
+
+#ifndef FILEID_KERNFS
+#define FILEID_KERNFS 0xfe
+#endif
+
+const char help_fmt[] =
+"A flattened cgroup hierarchy sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n"
+"\n"
+" -s SLICE_US Override slice duration\n"
+" -i INTERVAL Report interval\n"
+" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
+" -v Print libbpf debug messages\n"
+" -h Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+ if (level == LIBBPF_DEBUG && !verbose)
+ return 0;
+ return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int dummy)
+{
+ exit_req = 1;
+}
+
+static float read_cpu_util(__u64 *last_sum, __u64 *last_idle)
+{
+ FILE *fp;
+ char buf[4096];
+ char *line, *cur = NULL, *tok;
+ __u64 sum = 0, idle = 0;
+ __u64 delta_sum, delta_idle;
+ int idx;
+
+ fp = fopen("/proc/stat", "r");
+ if (!fp) {
+ perror("fopen(\"/proc/stat\")");
+ return 0.0;
+ }
+
+ if (!fgets(buf, sizeof(buf), fp)) {
+ perror("fgets(\"/proc/stat\")");
+ fclose(fp);
+ return 0.0;
+ }
+ fclose(fp);
+
+ line = buf;
+ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) {
+ char *endp = NULL;
+ __u64 v;
+
+ if (idx == 0) {
+ line = NULL;
+ continue;
+ }
+ v = strtoull(tok, &endp, 0);
+ if (!endp || *endp != '\0') {
+ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n",
+ idx, tok);
+ continue;
+ }
+ sum += v;
+ if (idx == 4)
+ idle = v;
+ }
+
+ delta_sum = sum - *last_sum;
+ delta_idle = idle - *last_idle;
+ *last_sum = sum;
+ *last_idle = idle;
+
+ return delta_sum ? (float)(delta_sum - delta_idle) / delta_sum : 0.0;
+}
+
+static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats)
+{
+ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus];
+ __u32 idx;
+
+ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS);
+
+ for (idx = 0; idx < FCG_NR_STATS; idx++) {
+ int ret, cpu;
+
+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+ &idx, cnts[idx]);
+ if (ret < 0)
+ continue;
+ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++)
+ stats[idx] += cnts[idx][cpu];
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct scx_flatcg *skel;
+ struct bpf_link *link;
+ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 };
+ bool dump_cgrps = false;
+ __u64 last_cpu_sum = 0, last_cpu_idle = 0;
+ __u64 last_stats[FCG_NR_STATS] = {};
+ unsigned long seq = 0;
+ __s32 opt;
+ __u64 ecode;
+
+ libbpf_set_print(libbpf_print_fn);
+ signal(SIGINT, sigint_handler);
+ signal(SIGTERM, sigint_handler);
+restart:
+ skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg);
+
+ skel->rodata->nr_cpus = libbpf_num_possible_cpus();
+
+ while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) {
+ double v;
+
+ switch (opt) {
+ case 's':
+ v = strtod(optarg, NULL);
+ skel->rodata->cgrp_slice_ns = v * 1000;
+ break;
+ case 'i':
+ v = strtod(optarg, NULL);
+ intv_ts.tv_sec = v;
+ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000;
+ break;
+ case 'd':
+ dump_cgrps = true;
+ break;
+ case 'f':
+ skel->rodata->fifo_sched = true;
+ break;
+ case 'v':
+ verbose = true;
+ break;
+ case 'h':
+ default:
+ fprintf(stderr, help_fmt, basename(argv[0]));
+ return opt != 'h';
+ }
+ }
+
+ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d",
+ (double)skel->rodata->cgrp_slice_ns / 1000000.0,
+ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0,
+ dump_cgrps);
+
+ SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei);
+ link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg);
+
+ while (!exit_req && !UEI_EXITED(skel, uei)) {
+ __u64 acc_stats[FCG_NR_STATS];
+ __u64 stats[FCG_NR_STATS];
+ float cpu_util;
+ int i;
+
+ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle);
+
+ fcg_read_stats(skel, acc_stats);
+ for (i = 0; i < FCG_NR_STATS; i++)
+ stats[i] = acc_stats[i] - last_stats[i];
+
+ memcpy(last_stats, acc_stats, sizeof(acc_stats));
+
+ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n",
+ seq++, cpu_util * 100.0, skel->data->hweight_gen);
+ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n",
+ stats[FCG_STAT_ACT],
+ stats[FCG_STAT_DEACT],
+ stats[FCG_STAT_GLOBAL],
+ stats[FCG_STAT_LOCAL]);
+ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n",
+ stats[FCG_STAT_HWT_CACHE],
+ stats[FCG_STAT_HWT_UPDATES],
+ stats[FCG_STAT_HWT_SKIP],
+ stats[FCG_STAT_HWT_RACE]);
+ printf("ENQ skip:%6llu race:%6llu\n",
+ stats[FCG_STAT_ENQ_SKIP],
+ stats[FCG_STAT_ENQ_RACE]);
+ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n",
+ stats[FCG_STAT_CNS_KEEP],
+ stats[FCG_STAT_CNS_EXPIRE],
+ stats[FCG_STAT_CNS_EMPTY],
+ stats[FCG_STAT_CNS_GONE]);
+ printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n",
+ stats[FCG_STAT_PNC_NEXT],
+ stats[FCG_STAT_PNC_EMPTY],
+ stats[FCG_STAT_PNC_NO_CGRP],
+ stats[FCG_STAT_PNC_GONE],
+ stats[FCG_STAT_PNC_RACE],
+ stats[FCG_STAT_PNC_FAIL]);
+ printf("BAD remove:%6llu\n",
+ acc_stats[FCG_STAT_BAD_REMOVAL]);
+ fflush(stdout);
+
+ nanosleep(&intv_ts, NULL);
+ }
+
+ bpf_link__destroy(link);
+ ecode = UEI_REPORT(skel, uei);
+ scx_flatcg__destroy(skel);
+
+ if (UEI_ECODE_RESTART(ecode))
+ goto restart;
+ return 0;
+}
diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h
new file mode 100644
index 000000000000..6f2ea50acb1c
--- /dev/null
+++ b/tools/sched_ext/scx_flatcg.h
@@ -0,0 +1,51 @@
+#ifndef __SCX_EXAMPLE_FLATCG_H
+#define __SCX_EXAMPLE_FLATCG_H
+
+enum {
+ FCG_HWEIGHT_ONE = 1LLU << 16,
+};
+
+enum fcg_stat_idx {
+ FCG_STAT_ACT,
+ FCG_STAT_DEACT,
+ FCG_STAT_LOCAL,
+ FCG_STAT_GLOBAL,
+
+ FCG_STAT_HWT_UPDATES,
+ FCG_STAT_HWT_CACHE,
+ FCG_STAT_HWT_SKIP,
+ FCG_STAT_HWT_RACE,
+
+ FCG_STAT_ENQ_SKIP,
+ FCG_STAT_ENQ_RACE,
+
+ FCG_STAT_CNS_KEEP,
+ FCG_STAT_CNS_EXPIRE,
+ FCG_STAT_CNS_EMPTY,
+ FCG_STAT_CNS_GONE,
+
+ FCG_STAT_PNC_NO_CGRP,
+ FCG_STAT_PNC_NEXT,
+ FCG_STAT_PNC_EMPTY,
+ FCG_STAT_PNC_GONE,
+ FCG_STAT_PNC_RACE,
+ FCG_STAT_PNC_FAIL,
+
+ FCG_STAT_BAD_REMOVAL,
+
+ FCG_NR_STATS,
+};
+
+struct fcg_cgrp_ctx {
+ u32 nr_active;
+ u32 nr_runnable;
+ u32 queued;
+ u32 weight;
+ u32 hweight;
+ u64 child_weight_sum;
+ u64 hweight_gen;
+ s64 cvtime_delta;
+ u64 tvtime_now;
+};
+
+#endif /* __SCX_EXAMPLE_FLATCG_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
new file mode 100644
index 000000000000..83c8f54c1e31
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -0,0 +1,827 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A simple five-level FIFO queue scheduler.
+ *
+ * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets
+ * assigned to one depending on its compound weight. Each CPU round robins
+ * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from
+ * queue0, 2 from queue1, 4 from queue2 and so on.
+ *
+ * This scheduler demonstrates:
+ *
+ * - BPF-side queueing using PIDs.
+ * - Sleepable per-task storage allocation using ops.prep_enable().
+ * - Using ops.cpu_release() to handle a higher priority scheduling class taking
+ * the CPU away.
+ * - Core-sched support.
+ *
+ * This scheduler is primarily for demonstration and testing of sched_ext
+ * features and unlikely to be useful for actual workloads.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <scx/common.bpf.h>
+
+enum consts {
+ ONE_SEC_IN_NS = 1000000000,
+ SHARED_DSQ = 0,
+ HIGHPRI_DSQ = 1,
+ HIGHPRI_WEIGHT = 8668, /* this is what -20 maps to */
+};
+
+char _license[] SEC("license") = "GPL";
+
+const volatile u64 slice_ns = SCX_SLICE_DFL;
+const volatile u32 stall_user_nth;
+const volatile u32 stall_kernel_nth;
+const volatile u32 dsp_inf_loop_after;
+const volatile u32 dsp_batch;
+const volatile bool highpri_boosting;
+const volatile bool print_shared_dsq;
+const volatile s32 disallow_tgid;
+const volatile bool suppress_dump;
+
+u64 nr_highpri_queued;
+u32 test_error_cnt;
+
+UEI_DEFINE(uei);
+
+struct qmap {
+ __uint(type, BPF_MAP_TYPE_QUEUE);
+ __uint(max_entries, 4096);
+ __type(value, u32);
+} queue0 SEC(".maps"),
+ queue1 SEC(".maps"),
+ queue2 SEC(".maps"),
+ queue3 SEC(".maps"),
+ queue4 SEC(".maps");
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
+ __uint(max_entries, 5);
+ __type(key, int);
+ __array(values, struct qmap);
+} queue_arr SEC(".maps") = {
+ .values = {
+ [0] = &queue0,
+ [1] = &queue1,
+ [2] = &queue2,
+ [3] = &queue3,
+ [4] = &queue4,
+ },
+};
+
+/*
+ * If enabled, CPU performance target is set according to the queue index
+ * according to the following table.
+ */
+static const u32 qidx_to_cpuperf_target[] = {
+ [0] = SCX_CPUPERF_ONE * 0 / 4,
+ [1] = SCX_CPUPERF_ONE * 1 / 4,
+ [2] = SCX_CPUPERF_ONE * 2 / 4,
+ [3] = SCX_CPUPERF_ONE * 3 / 4,
+ [4] = SCX_CPUPERF_ONE * 4 / 4,
+};
+
+/*
+ * Per-queue sequence numbers to implement core-sched ordering.
+ *
+ * Tail seq is assigned to each queued task and incremented. Head seq tracks the
+ * sequence number of the latest dispatched task. The distance between the a
+ * task's seq and the associated queue's head seq is called the queue distance
+ * and used when comparing two tasks for ordering. See qmap_core_sched_before().
+ */
+static u64 core_sched_head_seqs[5];
+static u64 core_sched_tail_seqs[5];
+
+/* Per-task scheduling context */
+struct task_ctx {
+ bool force_local; /* Dispatch directly to local_dsq */
+ bool highpri;
+ u64 core_sched_seq;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct task_ctx);
+} task_ctx_stor SEC(".maps");
+
+struct cpu_ctx {
+ u64 dsp_idx; /* dispatch index */
+ u64 dsp_cnt; /* remaining count */
+ u32 avg_weight;
+ u32 cpuperf_target;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, u32);
+ __type(value, struct cpu_ctx);
+} cpu_ctx_stor SEC(".maps");
+
+/* Statistics */
+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued, nr_ddsp_from_enq;
+u64 nr_core_sched_execed;
+u64 nr_expedited_local, nr_expedited_remote, nr_expedited_lost, nr_expedited_from_timer;
+u32 cpuperf_min, cpuperf_avg, cpuperf_max;
+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max;
+
+static s32 pick_direct_dispatch_cpu(struct task_struct *p, s32 prev_cpu)
+{
+ s32 cpu;
+
+ if (p->nr_cpus_allowed == 1 ||
+ scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+ return prev_cpu;
+
+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+ if (cpu >= 0)
+ return cpu;
+
+ return -1;
+}
+
+static struct task_ctx *lookup_task_ctx(struct task_struct *p)
+{
+ struct task_ctx *tctx;
+
+ if (!(tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) {
+ scx_bpf_error("task_ctx lookup failed");
+ return NULL;
+ }
+ return tctx;
+}
+
+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p,
+ s32 prev_cpu, u64 wake_flags)
+{
+ struct task_ctx *tctx;
+ s32 cpu;
+
+ if (!(tctx = lookup_task_ctx(p)))
+ return -ESRCH;
+
+ cpu = pick_direct_dispatch_cpu(p, prev_cpu);
+
+ if (cpu >= 0) {
+ tctx->force_local = true;
+ return cpu;
+ } else {
+ return prev_cpu;
+ }
+}
+
+static int weight_to_idx(u32 weight)
+{
+ /* Coarsely map the compound weight to a FIFO. */
+ if (weight <= 25)
+ return 0;
+ else if (weight <= 50)
+ return 1;
+ else if (weight < 200)
+ return 2;
+ else if (weight < 400)
+ return 3;
+ else
+ return 4;
+}
+
+void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ static u32 user_cnt, kernel_cnt;
+ struct task_ctx *tctx;
+ u32 pid = p->pid;
+ int idx = weight_to_idx(p->scx.weight);
+ void *ring;
+ s32 cpu;
+
+ if (p->flags & PF_KTHREAD) {
+ if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
+ return;
+ } else {
+ if (stall_user_nth && !(++user_cnt % stall_user_nth))
+ return;
+ }
+
+ if (test_error_cnt && !--test_error_cnt)
+ scx_bpf_error("test triggering error");
+
+ if (!(tctx = lookup_task_ctx(p)))
+ return;
+
+ /*
+ * All enqueued tasks must have their core_sched_seq updated for correct
+ * core-sched ordering. Also, take a look at the end of qmap_dispatch().
+ */
+ tctx->core_sched_seq = core_sched_tail_seqs[idx]++;
+
+ /*
+ * If qmap_select_cpu() is telling us to or this is the last runnable
+ * task on the CPU, enqueue locally.
+ */
+ if (tctx->force_local) {
+ tctx->force_local = false;
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags);
+ return;
+ }
+
+ /* if !WAKEUP, select_cpu() wasn't called, try direct dispatch */
+ if (!(enq_flags & SCX_ENQ_WAKEUP) &&
+ (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) {
+ __sync_fetch_and_add(&nr_ddsp_from_enq, 1);
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags);
+ return;
+ }
+
+ /*
+ * If the task was re-enqueued due to the CPU being preempted by a
+ * higher priority scheduling class, just re-enqueue the task directly
+ * on the global DSQ. As we want another CPU to pick it up, find and
+ * kick an idle CPU.
+ */
+ if (enq_flags & SCX_ENQ_REENQ) {
+ s32 cpu;
+
+ scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags);
+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
+ if (cpu >= 0)
+ scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
+ return;
+ }
+
+ ring = bpf_map_lookup_elem(&queue_arr, &idx);
+ if (!ring) {
+ scx_bpf_error("failed to find ring %d", idx);
+ return;
+ }
+
+ /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */
+ if (bpf_map_push_elem(ring, &pid, 0)) {
+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags);
+ return;
+ }
+
+ if (highpri_boosting && p->scx.weight >= HIGHPRI_WEIGHT) {
+ tctx->highpri = true;
+ __sync_fetch_and_add(&nr_highpri_queued, 1);
+ }
+ __sync_fetch_and_add(&nr_enqueued, 1);
+}
+
+/*
+ * The BPF queue map doesn't support removal and sched_ext can handle spurious
+ * dispatches. qmap_dequeue() is only used to collect statistics.
+ */
+void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags)
+{
+ __sync_fetch_and_add(&nr_dequeued, 1);
+ if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC)
+ __sync_fetch_and_add(&nr_core_sched_execed, 1);
+}
+
+static void update_core_sched_head_seq(struct task_struct *p)
+{
+ int idx = weight_to_idx(p->scx.weight);
+ struct task_ctx *tctx;
+
+ if ((tctx = lookup_task_ctx(p)))
+ core_sched_head_seqs[idx] = tctx->core_sched_seq;
+}
+
+/*
+ * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly
+ * selective priority boosting mechanism by scanning SHARED_DSQ looking for
+ * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This
+ * makes minor difference only when dsp_batch is larger than 1.
+ *
+ * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
+ * non-rq-lock holding BPF programs. As demonstration, this function is called
+ * from qmap_dispatch() and monitor_timerfn().
+ */
+static bool dispatch_highpri(bool from_timer)
+{
+ struct task_struct *p;
+ s32 this_cpu = bpf_get_smp_processor_id();
+
+ /* scan SHARED_DSQ and move highpri tasks to HIGHPRI_DSQ */
+ bpf_for_each(scx_dsq, p, SHARED_DSQ, 0) {
+ static u64 highpri_seq;
+ struct task_ctx *tctx;
+
+ if (!(tctx = lookup_task_ctx(p)))
+ return false;
+
+ if (tctx->highpri) {
+ /* exercise the set_*() and vtime interface too */
+ scx_bpf_dispatch_from_dsq_set_slice(
+ BPF_FOR_EACH_ITER, slice_ns * 2);
+ scx_bpf_dispatch_from_dsq_set_vtime(
+ BPF_FOR_EACH_ITER, highpri_seq++);
+ scx_bpf_dispatch_vtime_from_dsq(
+ BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
+ }
+ }
+
+ /*
+ * Scan HIGHPRI_DSQ and dispatch until a task that can run on this CPU
+ * is found.
+ */
+ bpf_for_each(scx_dsq, p, HIGHPRI_DSQ, 0) {
+ bool dispatched = false;
+ s32 cpu;
+
+ if (bpf_cpumask_test_cpu(this_cpu, p->cpus_ptr))
+ cpu = this_cpu;
+ else
+ cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
+
+ if (scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
+ SCX_DSQ_LOCAL_ON | cpu,
+ SCX_ENQ_PREEMPT)) {
+ if (cpu == this_cpu) {
+ dispatched = true;
+ __sync_fetch_and_add(&nr_expedited_local, 1);
+ } else {
+ __sync_fetch_and_add(&nr_expedited_remote, 1);
+ }
+ if (from_timer)
+ __sync_fetch_and_add(&nr_expedited_from_timer, 1);
+ } else {
+ __sync_fetch_and_add(&nr_expedited_lost, 1);
+ }
+
+ if (dispatched)
+ return true;
+ }
+
+ return false;
+}
+
+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev)
+{
+ struct task_struct *p;
+ struct cpu_ctx *cpuc;
+ struct task_ctx *tctx;
+ u32 zero = 0, batch = dsp_batch ?: 1;
+ void *fifo;
+ s32 i, pid;
+
+ if (dispatch_highpri(false))
+ return;
+
+ if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ))
+ return;
+
+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) {
+ /*
+ * PID 2 should be kthreadd which should mostly be idle and off
+ * the scheduler. Let's keep dispatching it to force the kernel
+ * to call this function over and over again.
+ */
+ p = bpf_task_from_pid(2);
+ if (p) {
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0);
+ bpf_task_release(p);
+ return;
+ }
+ }
+
+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
+ scx_bpf_error("failed to look up cpu_ctx");
+ return;
+ }
+
+ for (i = 0; i < 5; i++) {
+ /* Advance the dispatch cursor and pick the fifo. */
+ if (!cpuc->dsp_cnt) {
+ cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5;
+ cpuc->dsp_cnt = 1 << cpuc->dsp_idx;
+ }
+
+ fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx);
+ if (!fifo) {
+ scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx);
+ return;
+ }
+
+ /* Dispatch or advance. */
+ bpf_repeat(BPF_MAX_LOOPS) {
+ struct task_ctx *tctx;
+
+ if (bpf_map_pop_elem(fifo, &pid))
+ break;
+
+ p = bpf_task_from_pid(pid);
+ if (!p)
+ continue;
+
+ if (!(tctx = lookup_task_ctx(p))) {
+ bpf_task_release(p);
+ return;
+ }
+
+ if (tctx->highpri)
+ __sync_fetch_and_sub(&nr_highpri_queued, 1);
+
+ update_core_sched_head_seq(p);
+ __sync_fetch_and_add(&nr_dispatched, 1);
+
+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0);
+ bpf_task_release(p);
+
+ batch--;
+ cpuc->dsp_cnt--;
+ if (!batch || !scx_bpf_dispatch_nr_slots()) {
+ if (dispatch_highpri(false))
+ return;
+ scx_bpf_consume(SHARED_DSQ);
+ return;
+ }
+ if (!cpuc->dsp_cnt)
+ break;
+ }
+
+ cpuc->dsp_cnt = 0;
+ }
+
+ /*
+ * No other tasks. @prev will keep running. Update its core_sched_seq as
+ * if the task were enqueued and dispatched immediately.
+ */
+ if (prev) {
+ tctx = bpf_task_storage_get(&task_ctx_stor, prev, 0, 0);
+ if (!tctx) {
+ scx_bpf_error("task_ctx lookup failed");
+ return;
+ }
+
+ tctx->core_sched_seq =
+ core_sched_tail_seqs[weight_to_idx(prev->scx.weight)]++;
+ }
+}
+
+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p)
+{
+ struct cpu_ctx *cpuc;
+ u32 zero = 0;
+ int idx;
+
+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) {
+ scx_bpf_error("failed to look up cpu_ctx");
+ return;
+ }
+
+ /*
+ * Use the running avg of weights to select the target cpuperf level.
+ * This is a demonstration of the cpuperf feature rather than a
+ * practical strategy to regulate CPU frequency.
+ */
+ cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4;
+ idx = weight_to_idx(cpuc->avg_weight);
+ cpuc->cpuperf_target = qidx_to_cpuperf_target[idx];
+
+ scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target);
+}
+
+/*
+ * The distance from the head of the queue scaled by the weight of the queue.
+ * The lower the number, the older the task and the higher the priority.
+ */
+static s64 task_qdist(struct task_struct *p)
+{
+ int idx = weight_to_idx(p->scx.weight);
+ struct task_ctx *tctx;
+ s64 qdist;
+
+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
+ if (!tctx) {
+ scx_bpf_error("task_ctx lookup failed");
+ return 0;
+ }
+
+ qdist = tctx->core_sched_seq - core_sched_head_seqs[idx];
+
+ /*
+ * As queue index increments, the priority doubles. The queue w/ index 3
+ * is dispatched twice more frequently than 2. Reflect the difference by
+ * scaling qdists accordingly. Note that the shift amount needs to be
+ * flipped depending on the sign to avoid flipping priority direction.
+ */
+ if (qdist >= 0)
+ return qdist << (4 - idx);
+ else
+ return qdist << idx;
+}
+
+/*
+ * This is called to determine the task ordering when core-sched is picking
+ * tasks to execute on SMT siblings and should encode about the same ordering as
+ * the regular scheduling path. Use the priority-scaled distances from the head
+ * of the queues to compare the two tasks which should be consistent with the
+ * dispatch path behavior.
+ */
+bool BPF_STRUCT_OPS(qmap_core_sched_before,
+ struct task_struct *a, struct task_struct *b)
+{
+ return task_qdist(a) > task_qdist(b);
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+ u32 cnt;
+
+ /*
+ * Called when @cpu is taken by a higher priority scheduling class. This
+ * makes @cpu no longer available for executing sched_ext tasks. As we
+ * don't want the tasks in @cpu's local dsq to sit there until @cpu
+ * becomes available again, re-enqueue them into the global dsq. See
+ * %SCX_ENQ_REENQ handling in qmap_enqueue().
+ */
+ cnt = scx_bpf_reenqueue_local();
+ if (cnt)
+ __sync_fetch_and_add(&nr_reenqueued, cnt);
+}
+
+s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
+ struct scx_init_task_args *args)
+{
+ if (p->tgid == disallow_tgid)
+ p->scx.disallow = true;
+
+ /*
+ * @p is new. Let's ensure that its task_ctx is available. We can sleep
+ * in this function and the following will automatically use GFP_KERNEL.
+ */
+ if (bpf_task_storage_get(&task_ctx_stor, p, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE))
+ return 0;
+ else
+ return -ENOMEM;
+}
+
+void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx)
+{
+ s32 i, pid;
+
+ if (suppress_dump)
+ return;
+
+ bpf_for(i, 0, 5) {
+ void *fifo;
+
+ if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i)))
+ return;
+
+ scx_bpf_dump("QMAP FIFO[%d]:", i);
+ bpf_repeat(4096) {
+ if (bpf_map_pop_elem(fifo, &pid))
+ break;
+ scx_bpf_dump(" %d", pid);
+ }
+ scx_bpf_dump("\n");
+ }
+}
+
+void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle)
+{
+ u32 zero = 0;
+ struct cpu_ctx *cpuc;
+
+ if (suppress_dump || idle)
+ return;
+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu)))
+ return;
+
+ scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u",
+ cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight,
+ cpuc->cpuperf_target);
+}
+
+void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p)
+{
+ struct task_ctx *taskc;
+
+ if (suppress_dump)
+ return;
+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0)))
+ return;
+
+ scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu",
+ taskc->force_local, taskc->core_sched_seq);
+}
+
+/*
+ * Print out the online and possible CPU map using bpf_printk() as a
+ * demonstration of using the cpumask kfuncs and ops.cpu_on/offline().
+ */
+static void print_cpus(void)
+{
+ const struct cpumask *possible, *online;
+ s32 cpu;
+ char buf[128] = "", *p;
+ int idx;
+
+ possible = scx_bpf_get_possible_cpumask();
+ online = scx_bpf_get_online_cpumask();
+
+ idx = 0;
+ bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) {
+ if (!(p = MEMBER_VPTR(buf, [idx++])))
+ break;
+ if (bpf_cpumask_test_cpu(cpu, online))
+ *p++ = 'O';
+ else if (bpf_cpumask_test_cpu(cpu, possible))
+ *p++ = 'X';
+ else
+ *p++ = ' ';
+
+ if ((cpu & 7) == 7) {
+ if (!(p = MEMBER_VPTR(buf, [idx++])))
+ break;
+ *p++ = '|';
+ }
+ }
+ buf[sizeof(buf) - 1] = '\0';
+
+ scx_bpf_put_cpumask(online);
+ scx_bpf_put_cpumask(possible);
+
+ bpf_printk("CPUS: |%s", buf);
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu)
+{
+ bpf_printk("CPU %d coming online", cpu);
+ /* @cpu is already online at this point */
+ print_cpus();
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu)
+{
+ bpf_printk("CPU %d going offline", cpu);
+ /* @cpu is still online at this point */
+ print_cpus();
+}
+
+struct monitor_timer {
+ struct bpf_timer timer;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_ARRAY);
+ __uint(max_entries, 1);
+ __type(key, u32);
+ __type(value, struct monitor_timer);
+} monitor_timer SEC(".maps");
+
+/*
+ * Print out the min, avg and max performance levels of CPUs every second to
+ * demonstrate the cpuperf interface.
+ */
+static void monitor_cpuperf(void)
+{
+ u32 zero = 0, nr_cpu_ids;
+ u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0;
+ u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0;
+ const struct cpumask *online;
+ int i, nr_online_cpus = 0;
+
+ nr_cpu_ids = scx_bpf_nr_cpu_ids();
+ online = scx_bpf_get_online_cpumask();
+
+ bpf_for(i, 0, nr_cpu_ids) {
+ struct cpu_ctx *cpuc;
+ u32 cap, cur;
+
+ if (!bpf_cpumask_test_cpu(i, online))
+ continue;
+ nr_online_cpus++;
+
+ /* collect the capacity and current cpuperf */
+ cap = scx_bpf_cpuperf_cap(i);
+ cur = scx_bpf_cpuperf_cur(i);
+
+ cur_min = cur < cur_min ? cur : cur_min;
+ cur_max = cur > cur_max ? cur : cur_max;
+
+ /*
+ * $cur is relative to $cap. Scale it down accordingly so that
+ * it's in the same scale as other CPUs and $cur_sum/$cap_sum
+ * makes sense.
+ */
+ cur_sum += cur * cap / SCX_CPUPERF_ONE;
+ cap_sum += cap;
+
+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) {
+ scx_bpf_error("failed to look up cpu_ctx");
+ goto out;
+ }
+
+ /* collect target */
+ cur = cpuc->cpuperf_target;
+ target_sum += cur;
+ target_min = cur < target_min ? cur : target_min;
+ target_max = cur > target_max ? cur : target_max;
+ }
+
+ cpuperf_min = cur_min;
+ cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum;
+ cpuperf_max = cur_max;
+
+ cpuperf_target_min = target_min;
+ cpuperf_target_avg = target_sum / nr_online_cpus;
+ cpuperf_target_max = target_max;
+out:
+ scx_bpf_put_cpumask(online);
+}
+
+/*
+ * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of
+ * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to
+ * see meaningful dumps in the trace pipe.
+ */
+static void dump_shared_dsq(void)
+{
+ struct task_struct *p;
+ s32 nr;
+
+ if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ)))
+ return;
+
+ bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr);
+
+ bpf_rcu_read_lock();
+ bpf_for_each(scx_dsq, p, SHARED_DSQ, SCX_DSQ_ITER_REV)
+ bpf_printk("%s[%d]", p->comm, p->pid);
+ bpf_rcu_read_unlock();
+}
+
+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer)
+{
+ bpf_rcu_read_lock();
+ dispatch_highpri(true);
+ bpf_rcu_read_unlock();
+
+ monitor_cpuperf();
+
+ if (print_shared_dsq)
+ dump_shared_dsq();
+
+ bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+ return 0;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init)
+{
+ u32 key = 0;
+ struct bpf_timer *timer;
+ s32 ret;
+
+ print_cpus();
+
+ ret = scx_bpf_create_dsq(SHARED_DSQ, -1);
+ if (ret)
+ return ret;
+
+ ret = scx_bpf_create_dsq(HIGHPRI_DSQ, -1);
+ if (ret)
+ return ret;
+
+ timer = bpf_map_lookup_elem(&monitor_timer, &key);
+ if (!timer)
+ return -ESRCH;
+
+ bpf_timer_init(timer, &monitor_timer, CLOCK_MONOTONIC);
+ bpf_timer_set_callback(timer, monitor_timerfn);
+
+ return bpf_timer_start(timer, ONE_SEC_IN_NS, 0);
+}
+
+void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(qmap_ops,
+ .select_cpu = (void *)qmap_select_cpu,
+ .enqueue = (void *)qmap_enqueue,
+ .dequeue = (void *)qmap_dequeue,
+ .dispatch = (void *)qmap_dispatch,
+ .tick = (void *)qmap_tick,
+ .core_sched_before = (void *)qmap_core_sched_before,
+ .cpu_release = (void *)qmap_cpu_release,
+ .init_task = (void *)qmap_init_task,
+ .dump = (void *)qmap_dump,
+ .dump_cpu = (void *)qmap_dump_cpu,
+ .dump_task = (void *)qmap_dump_task,
+ .cpu_online = (void *)qmap_cpu_online,
+ .cpu_offline = (void *)qmap_cpu_offline,
+ .init = (void *)qmap_init,
+ .exit = (void *)qmap_exit,
+ .timeout_ms = 5000U,
+ .name = "qmap");
diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c
new file mode 100644
index 000000000000..ac45a02b4055
--- /dev/null
+++ b/tools/sched_ext/scx_qmap.c
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_qmap.bpf.skel.h"
+
+const char help_fmt[] =
+"A simple five-level FIFO queue sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n"
+" [-P] [-d PID] [-D LEN] [-p] [-v]\n"
+"\n"
+" -s SLICE_US Override slice duration\n"
+" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n"
+" -t COUNT Stall every COUNT'th user thread\n"
+" -T COUNT Stall every COUNT'th kernel thread\n"
+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n"
+" -b COUNT Dispatch upto COUNT tasks together\n"
+" -P Print out DSQ content to trace_pipe every second, use with -b\n"
+" -H Boost nice -20 tasks in SHARED_DSQ, use with -b\n"
+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n"
+" -D LEN Set scx_exit_info.dump buffer length\n"
+" -S Suppress qmap-specific debug dump\n"
+" -p Switch only tasks on SCHED_EXT policy instead of all\n"
+" -v Print libbpf debug messages\n"
+" -h Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+ if (level == LIBBPF_DEBUG && !verbose)
+ return 0;
+ return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int dummy)
+{
+ exit_req = 1;
+}
+
+int main(int argc, char **argv)
+{
+ struct scx_qmap *skel;
+ struct bpf_link *link;
+ int opt;
+
+ libbpf_set_print(libbpf_print_fn);
+ signal(SIGINT, sigint_handler);
+ signal(SIGTERM, sigint_handler);
+
+ skel = SCX_OPS_OPEN(qmap_ops, scx_qmap);
+
+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PHd:D:Spvh")) != -1) {
+ switch (opt) {
+ case 's':
+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000;
+ break;
+ case 'e':
+ skel->bss->test_error_cnt = strtoul(optarg, NULL, 0);
+ break;
+ case 't':
+ skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0);
+ break;
+ case 'T':
+ skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0);
+ break;
+ case 'l':
+ skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0);
+ break;
+ case 'b':
+ skel->rodata->dsp_batch = strtoul(optarg, NULL, 0);
+ break;
+ case 'P':
+ skel->rodata->print_shared_dsq = true;
+ break;
+ case 'H':
+ skel->rodata->highpri_boosting = true;
+ break;
+ case 'd':
+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0);
+ if (skel->rodata->disallow_tgid < 0)
+ skel->rodata->disallow_tgid = getpid();
+ break;
+ case 'D':
+ skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0);
+ break;
+ case 'S':
+ skel->rodata->suppress_dump = true;
+ break;
+ case 'p':
+ skel->struct_ops.qmap_ops->flags |= SCX_OPS_SWITCH_PARTIAL;
+ break;
+ case 'v':
+ verbose = true;
+ break;
+ default:
+ fprintf(stderr, help_fmt, basename(argv[0]));
+ return opt != 'h';
+ }
+ }
+
+ SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei);
+ link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap);
+
+ while (!exit_req && !UEI_EXITED(skel, uei)) {
+ long nr_enqueued = skel->bss->nr_enqueued;
+ long nr_dispatched = skel->bss->nr_dispatched;
+
+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" enq_ddsp=%"PRIu64"\n",
+ nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched,
+ skel->bss->nr_reenqueued, skel->bss->nr_dequeued,
+ skel->bss->nr_core_sched_execed,
+ skel->bss->nr_ddsp_from_enq);
+ printf(" exp_local=%"PRIu64" exp_remote=%"PRIu64" exp_timer=%"PRIu64" exp_lost=%"PRIu64"\n",
+ skel->bss->nr_expedited_local,
+ skel->bss->nr_expedited_remote,
+ skel->bss->nr_expedited_from_timer,
+ skel->bss->nr_expedited_lost);
+ if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur"))
+ printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n",
+ skel->bss->cpuperf_min,
+ skel->bss->cpuperf_avg,
+ skel->bss->cpuperf_max,
+ skel->bss->cpuperf_target_min,
+ skel->bss->cpuperf_target_avg,
+ skel->bss->cpuperf_target_max);
+ fflush(stdout);
+ sleep(1);
+ }
+
+ bpf_link__destroy(link);
+ UEI_REPORT(skel, uei);
+ scx_qmap__destroy(skel);
+ /*
+ * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart
+ * on CPU hotplug events.
+ */
+ return 0;
+}
diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py
new file mode 100644
index 000000000000..8bc626ede1c4
--- /dev/null
+++ b/tools/sched_ext/scx_show_state.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env drgn
+#
+# Copyright (C) 2024 Tejun Heo <tj@kernel.org>
+# Copyright (C) 2024 Meta Platforms, Inc. and affiliates.
+
+desc = """
+This is a drgn script to show the current sched_ext state.
+For more info on drgn, visit https://github.com/osandov/drgn.
+"""
+
+import drgn
+import sys
+
+def err(s):
+ print(s, file=sys.stderr, flush=True)
+ sys.exit(1)
+
+def read_int(name):
+ return int(prog[name].value_())
+
+def read_atomic(name):
+ return prog[name].counter.value_()
+
+def read_static_key(name):
+ return prog[name].key.enabled.counter.value_()
+
+def ops_state_str(state):
+ return prog['scx_ops_enable_state_str'][state].string_().decode()
+
+ops = prog['scx_ops']
+enable_state = read_atomic("scx_ops_enable_state_var")
+
+print(f'ops : {ops.name.string_().decode()}')
+print(f'enabled : {read_static_key("__scx_ops_enabled")}')
+print(f'switching_all : {read_int("scx_switching_all")}')
+print(f'switched_all : {read_static_key("__scx_switched_all")}')
+print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
+print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
+print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
+print(f'enable_seq : {read_atomic("scx_enable_seq")}')
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
new file mode 100644
index 000000000000..ed7e8d535fc5
--- /dev/null
+++ b/tools/sched_ext/scx_simple.bpf.c
@@ -0,0 +1,156 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * A simple scheduler.
+ *
+ * By default, it operates as a simple global weighted vtime scheduler and can
+ * be switched to FIFO scheduling. It also demonstrates the following niceties.
+ *
+ * - Statistics tracking how many tasks are queued to local and global dsq's.
+ * - Termination notification for userspace.
+ *
+ * While very simple, this scheduler should work reasonably well on CPUs with a
+ * uniform L3 cache topology. While preemption is not implemented, the fact that
+ * the scheduling queue is shared across all CPUs means that whatever is at the
+ * front of the queue is likely to be executed fairly quickly given enough
+ * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads
+ * but comes with the usual problems with FIFO scheduling where saturating
+ * threads can easily drown out interactive ones.
+ *
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <scx/common.bpf.h>
+
+char _license[] SEC("license") = "GPL";
+
+const volatile bool fifo_sched;
+
+static u64 vtime_now;
+UEI_DEFINE(uei);
+
+/*
+ * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues
+ * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We
+ * therefore create a separate DSQ with ID 0 that we dispatch to and consume
+ * from. If scx_simple only supported global FIFO scheduling, then we could
+ * just use SCX_DSQ_GLOBAL.
+ */
+#define SHARED_DSQ 0
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+ __uint(key_size, sizeof(u32));
+ __uint(value_size, sizeof(u64));
+ __uint(max_entries, 2); /* [local, global] */
+} stats SEC(".maps");
+
+static void stat_inc(u32 idx)
+{
+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx);
+ if (cnt_p)
+ (*cnt_p)++;
+}
+
+static inline bool vtime_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
+s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
+{
+ bool is_idle = false;
+ s32 cpu;
+
+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
+ if (is_idle) {
+ stat_inc(0); /* count local queueing */
+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
+ }
+
+ return cpu;
+}
+
+void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
+{
+ stat_inc(1); /* count global queueing */
+
+ if (fifo_sched) {
+ scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+ } else {
+ u64 vtime = p->scx.dsq_vtime;
+
+ /*
+ * Limit the amount of budget that an idling task can accumulate
+ * to one slice.
+ */
+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
+ vtime = vtime_now - SCX_SLICE_DFL;
+
+ scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
+ enq_flags);
+ }
+}
+
+void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
+{
+ scx_bpf_consume(SHARED_DSQ);
+}
+
+void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
+{
+ if (fifo_sched)
+ return;
+
+ /*
+ * Global vtime always progresses forward as tasks start executing. The
+ * test and update can be performed concurrently from multiple CPUs and
+ * thus racy. Any error should be contained and temporary. Let's just
+ * live with it.
+ */
+ if (vtime_before(vtime_now, p->scx.dsq_vtime))
+ vtime_now = p->scx.dsq_vtime;
+}
+
+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
+{
+ if (fifo_sched)
+ return;
+
+ /*
+ * Scale the execution time by the inverse of the weight and charge.
+ *
+ * Note that the default yield implementation yields by setting
+ * @p->scx.slice to zero and the following would treat the yielding task
+ * as if it has consumed all its slice. If this penalizes yielding tasks
+ * too much, determine the execution time by taking explicit timestamps
+ * instead of depending on @p->scx.slice.
+ */
+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+}
+
+void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
+{
+ p->scx.dsq_vtime = vtime_now;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
+{
+ return scx_bpf_create_dsq(SHARED_DSQ, -1);
+}
+
+void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
+{
+ UEI_RECORD(uei, ei);
+}
+
+SCX_OPS_DEFINE(simple_ops,
+ .select_cpu = (void *)simple_select_cpu,
+ .enqueue = (void *)simple_enqueue,
+ .dispatch = (void *)simple_dispatch,
+ .running = (void *)simple_running,
+ .stopping = (void *)simple_stopping,
+ .enable = (void *)simple_enable,
+ .init = (void *)simple_init,
+ .exit = (void *)simple_exit,
+ .name = "simple");
diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c
new file mode 100644
index 000000000000..76d83199545c
--- /dev/null
+++ b/tools/sched_ext/scx_simple.c
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <signal.h>
+#include <libgen.h>
+#include <bpf/bpf.h>
+#include <scx/common.h>
+#include "scx_simple.bpf.skel.h"
+
+const char help_fmt[] =
+"A simple sched_ext scheduler.\n"
+"\n"
+"See the top-level comment in .bpf.c for more details.\n"
+"\n"
+"Usage: %s [-f] [-v]\n"
+"\n"
+" -f Use FIFO scheduling instead of weighted vtime scheduling\n"
+" -v Print libbpf debug messages\n"
+" -h Display this help and exit\n";
+
+static bool verbose;
+static volatile int exit_req;
+
+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
+{
+ if (level == LIBBPF_DEBUG && !verbose)
+ return 0;
+ return vfprintf(stderr, format, args);
+}
+
+static void sigint_handler(int simple)
+{
+ exit_req = 1;
+}
+
+static void read_stats(struct scx_simple *skel, __u64 *stats)
+{
+ int nr_cpus = libbpf_num_possible_cpus();
+ __u64 cnts[2][nr_cpus];
+ __u32 idx;
+
+ memset(stats, 0, sizeof(stats[0]) * 2);
+
+ for (idx = 0; idx < 2; idx++) {
+ int ret, cpu;
+
+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats),
+ &idx, cnts[idx]);
+ if (ret < 0)
+ continue;
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ stats[idx] += cnts[idx][cpu];
+ }
+}
+
+int main(int argc, char **argv)
+{
+ struct scx_simple *skel;
+ struct bpf_link *link;
+ __u32 opt;
+ __u64 ecode;
+
+ libbpf_set_print(libbpf_print_fn);
+ signal(SIGINT, sigint_handler);
+ signal(SIGTERM, sigint_handler);
+restart:
+ skel = SCX_OPS_OPEN(simple_ops, scx_simple);
+
+ while ((opt = getopt(argc, argv, "fvh")) != -1) {
+ switch (opt) {
+ case 'f':
+ skel->rodata->fifo_sched = true;
+ break;
+ case 'v':
+ verbose = true;
+ break;
+ default:
+ fprintf(stderr, help_fmt, basename(argv[0]));
+ return opt != 'h';
+ }
+ }
+
+ SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei);
+ link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple);
+
+ while (!exit_req && !UEI_EXITED(skel, uei)) {
+ __u64 stats[2];
+
+ read_stats(skel, stats);
+ printf("local=%llu global=%llu\n", stats[0], stats[1]);
+ fflush(stdout);
+ sleep(1);
+ }
+
+ bpf_link__destroy(link);
+ ecode = UEI_REPORT(skel, uei);
+ scx_simple__destroy(skel);
+
+ if (UEI_ECODE_RESTART(ecode))
+ goto restart;
+ return 0;
+}