diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-01 17:22:14 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-01 17:22:14 -0700 |
commit | 3dbdb38e286903ec220aaf1fb29a8d94297da246 (patch) | |
tree | f80cd38836298d6bb465462e30f4dd644b018735 | |
parent | e267992f9ef0bf717d70a9ee18049782f77e4b3a (diff) | |
parent | 3958e2d0c34e18c41b60dc01832bd670a59ef70f (diff) |
Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- cgroup.kill is added which implements atomic killing of the whole
subtree.
Down the line, this should be able to replace the multiple userland
implementations of "keep killing till empty".
- PSI can now be turned off at boot time to avoid overhead for
configurations which don't care about PSI.
* 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
cgroup: make per-cgroup pressure stall tracking configurable
cgroup: Fix kernel-doc
cgroup: inline cgroup_task_freeze()
tests/cgroup: test cgroup.kill
tests/cgroup: move cg_wait_for(), cg_prepare_for_wait()
tests/cgroup: use cgroup.kill in cg_killall()
docs/cgroup: add entry for cgroup.kill
cgroup: introduce cgroup.kill
-rw-r--r-- | Documentation/admin-guide/cgroup-v2.rst | 15 | ||||
-rw-r--r-- | Documentation/admin-guide/kernel-parameters.txt | 9 | ||||
-rw-r--r-- | include/linux/cgroup-defs.h | 4 | ||||
-rw-r--r-- | include/linux/cgroup.h | 25 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 180 | ||||
-rw-r--r-- | kernel/cgroup/rstat.c | 2 | ||||
-rw-r--r-- | kernel/sched/psi.c | 30 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/.gitignore | 3 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/Makefile | 2 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/cgroup_util.c | 51 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/cgroup_util.h | 2 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/test_freezer.c | 57 | ||||
-rw-r--r-- | tools/testing/selftests/cgroup/test_kill.c | 297 |
13 files changed, 569 insertions, 108 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4e59925e6583..5c7377b5bd3e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -953,6 +953,21 @@ All cgroup core files are prefixed with "cgroup." it's possible to delete a frozen (and empty) cgroup, as well as create new sub-cgroups. + cgroup.kill + A write-only single value file which exists in non-root cgroups. + The only allowed value is "1". + + Writing "1" to the file causes the cgroup and all descendant cgroups to + be killed. This means that all processes located in the affected cgroup + tree will be killed via SIGKILL. + + Killing a cgroup tree will deal with concurrent forks appropriately and + is protected against migrations. + + In a threaded cgroup, writing this file fails with EOPNOTSUPP as + killing cgroups is a process directed operation, i.e. it affects + the whole thread-group. + Controllers =========== diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 2991f6e692bd..0315407f5f57 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -497,16 +497,21 @@ ccw_timeout_log [S390] See Documentation/s390/common_io.rst for details. - cgroup_disable= [KNL] Disable a particular controller - Format: {name of the controller(s) to disable} + cgroup_disable= [KNL] Disable a particular controller or optional feature + Format: {name of the controller(s) or feature(s) to disable} The effects of cgroup_disable=foo are: - foo isn't auto-mounted if you mount all cgroups in a single hierarchy - foo isn't visible as an individually mountable subsystem + - if foo is an optional feature then the feature is + disabled and corresponding cgroup files are not + created {Currently only "memory" controller deal with this and cut the overhead, others just disable the usage. So only cgroup_disable=memory is actually worthy} + Specifying "pressure" disables per-cgroup pressure + stall information accounting feature cgroup_no_v1= [KNL] Disable cgroup controllers and named hierarchies in v1 Format: { { controller | "all" | "named" } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index fb8f6d2cd104..e1c705fdfa7c 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -71,6 +71,9 @@ enum { /* Cgroup is frozen. */ CGRP_FROZEN, + + /* Control group has to be killed. */ + CGRP_KILL, }; /* cgroup_root->flags */ @@ -110,6 +113,7 @@ enum { CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ + CFTYPE_PRESSURE = (1 << 6), /* only if pressure feature is enabled */ /* internal flags, do not use outside cgroup core proper */ __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 6bc9c76680b2..2cc237e3e8b3 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -676,6 +676,8 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) return &cgrp->psi; } +bool cgroup_psi_enabled(void); + static inline void cgroup_init_kthreadd(void) { /* @@ -735,6 +737,11 @@ static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) return NULL; } +static inline bool cgroup_psi_enabled(void) +{ + return false; +} + static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { @@ -906,20 +913,6 @@ void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); -static inline bool cgroup_task_freeze(struct task_struct *task) -{ - bool ret; - - if (task->flags & PF_KTHREAD) - return false; - - rcu_read_lock(); - ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags); - rcu_read_unlock(); - - return ret; -} - static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; @@ -929,10 +922,6 @@ static inline bool cgroup_task_frozen(struct task_struct *task) static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } -static inline bool cgroup_task_freeze(struct task_struct *task) -{ - return false; -} static inline bool cgroup_task_frozen(struct task_struct *task) { return false; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9cc8c3a686b1..ea5b0f01d2b3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = { static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_base_files[]; +/* cgroup optional features */ +enum cgroup_opt_features { +#ifdef CONFIG_PSI + OPT_FEATURE_PRESSURE, +#endif + OPT_FEATURE_COUNT +}; + +static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = { +#ifdef CONFIG_PSI + "pressure", +#endif +}; + +static u16 cgroup_feature_disable_mask __read_mostly; + static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_skip(struct css_task_iter *it, @@ -2390,7 +2406,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, } /** - * cgroup_taskset_migrate - migrate a taskset + * cgroup_migrate_execute - migrate a taskset * @mgctx: migration context * * Migrate tasks in @mgctx as setup by migration preparation functions. @@ -3632,6 +3648,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of) { psi_trigger_replace(&of->priv, NULL); } + +bool cgroup_psi_enabled(void) +{ + return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0; +} + +#else /* CONFIG_PSI */ +bool cgroup_psi_enabled(void) +{ + return false; +} + #endif /* CONFIG_PSI */ static int cgroup_freeze_show(struct seq_file *seq, void *v) @@ -3668,6 +3696,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, return nbytes; } +static void __cgroup_kill(struct cgroup *cgrp) +{ + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&cgroup_mutex); + + spin_lock_irq(&css_set_lock); + set_bit(CGRP_KILL, &cgrp->flags); + spin_unlock_irq(&css_set_lock); + + css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it); + while ((task = css_task_iter_next(&it))) { + /* Ignore kernel threads here. */ + if (task->flags & PF_KTHREAD) + continue; + + /* Skip tasks that are already dying. */ + if (__fatal_signal_pending(task)) + continue; + + send_sig(SIGKILL, task, 0); + } + css_task_iter_end(&it); + + spin_lock_irq(&css_set_lock); + clear_bit(CGRP_KILL, &cgrp->flags); + spin_unlock_irq(&css_set_lock); +} + +static void cgroup_kill(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *css; + struct cgroup *dsct; + + lockdep_assert_held(&cgroup_mutex); + + cgroup_for_each_live_descendant_pre(dsct, css, cgrp) + __cgroup_kill(dsct); +} + +static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + ssize_t ret = 0; + int kill; + struct cgroup *cgrp; + + ret = kstrtoint(strstrip(buf), 0, &kill); + if (ret) + return ret; + + if (kill != 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + /* + * Killing is a process directed operation, i.e. the whole thread-group + * is taken down so act like we do for cgroup.procs and only make this + * writable in non-threaded cgroups. + */ + if (cgroup_is_threaded(cgrp)) + ret = -EOPNOTSUPP; + else + cgroup_kill(cgrp); + + cgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); @@ -3882,6 +3984,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, restart: for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ + if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) + continue; if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) continue; if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) @@ -3959,6 +4063,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) WARN_ON(cft->ss || cft->kf_ops); + if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) + continue; + if (cft->seq_start) kf_ops = &cgroup_kf_ops; else @@ -4861,12 +4968,18 @@ static struct cftype cgroup_base_files[] = { .write = cgroup_freeze_write, }, { + .name = "cgroup.kill", + .flags = CFTYPE_NOT_ON_ROOT, + .write = cgroup_kill_write, + }, + { .name = "cpu.stat", .seq_show = cpu_stat_show, }, #ifdef CONFIG_PSI { .name = "io.pressure", + .flags = CFTYPE_PRESSURE, .seq_show = cgroup_io_pressure_show, .write = cgroup_io_pressure_write, .poll = cgroup_pressure_poll, @@ -4874,6 +4987,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "memory.pressure", + .flags = CFTYPE_PRESSURE, .seq_show = cgroup_memory_pressure_show, .write = cgroup_memory_pressure_write, .poll = cgroup_pressure_poll, @@ -4881,6 +4995,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cpu.pressure", + .flags = CFTYPE_PRESSURE, .seq_show = cgroup_cpu_pressure_show, .write = cgroup_cpu_pressure_write, .poll = cgroup_pressure_poll, @@ -6080,6 +6195,8 @@ void cgroup_post_fork(struct task_struct *child, struct kernel_clone_args *kargs) __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex) { + unsigned long cgrp_flags = 0; + bool kill = false; struct cgroup_subsys *ss; struct css_set *cset; int i; @@ -6091,6 +6208,11 @@ void cgroup_post_fork(struct task_struct *child, /* init tasks are special, only link regular threads */ if (likely(child->pid)) { + if (kargs->cgrp) + cgrp_flags = kargs->cgrp->flags; + else + cgrp_flags = cset->dfl_cgrp->flags; + WARN_ON_ONCE(!list_empty(&child->cg_list)); cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); @@ -6099,23 +6221,32 @@ void cgroup_post_fork(struct task_struct *child, cset = NULL; } - /* - * If the cgroup has to be frozen, the new task has too. Let's set - * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the - * frozen state. - */ - if (unlikely(cgroup_task_freeze(child))) { - spin_lock(&child->sighand->siglock); - WARN_ON_ONCE(child->frozen); - child->jobctl |= JOBCTL_TRAP_FREEZE; - spin_unlock(&child->sighand->siglock); + if (!(child->flags & PF_KTHREAD)) { + if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) { + /* + * If the cgroup has to be frozen, the new task has + * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to + * get the task into the frozen state. + */ + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later from + * do_freezer_trap(). So we avoid cgroup's transient + * switch from the frozen state and back. + */ + } /* - * Calling cgroup_update_frozen() isn't required here, - * because it will be called anyway a bit later from - * do_freezer_trap(). So we avoid cgroup's transient switch - * from the frozen state and back. + * If the cgroup is to be killed notice it now and take the + * child down right after we finished preparing it for + * userspace. */ + kill = test_bit(CGRP_KILL, &cgrp_flags); } spin_unlock_irq(&css_set_lock); @@ -6138,6 +6269,10 @@ void cgroup_post_fork(struct task_struct *child, put_css_set(rcset); } + /* Cgroup has to be killed so take down child immediately. */ + if (unlikely(kill)) + do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID); + cgroup_css_set_put_fork(kargs); } @@ -6163,7 +6298,8 @@ void cgroup_exit(struct task_struct *tsk) cset->nr_tasks--; WARN_ON_ONCE(cgroup_task_frozen(tsk)); - if (unlikely(cgroup_task_freeze(tsk))) + if (unlikely(!(tsk->flags & PF_KTHREAD) && + test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags))) cgroup_update_frozen(task_dfl_cgroup(tsk)); spin_unlock_irq(&css_set_lock); @@ -6214,6 +6350,15 @@ static int __init cgroup_disable(char *str) pr_info("Disabling %s control group subsystem\n", ss->name); } + + for (i = 0; i < OPT_FEATURE_COUNT; i++) { + if (strcmp(token, cgroup_opt_feature_names[i])) + continue; + cgroup_feature_disable_mask |= 1 << i; + pr_info("Disabling %s control group feature\n", + cgroup_opt_feature_names[i]); + break; + } } return 1; } @@ -6512,6 +6657,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf, if (!(cft->flags & CFTYPE_NS_DELEGATABLE)) continue; + if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled()) + continue; + if (prefix) ret += snprintf(buf + ret, size - ret, "%s.", prefix); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index cee265cb535c..7f0e58917432 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -220,7 +220,7 @@ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) } /** - * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold + * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold * @cgrp: target cgroup * * Flush stats in @cgrp's subtree and prevent further flushes. Must be diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 58b36d17a09a..1652f2bb54b7 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -148,6 +148,7 @@ static int psi_bug __read_mostly; DEFINE_STATIC_KEY_FALSE(psi_disabled); +DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled); #ifdef CONFIG_PSI_DEFAULT_DISABLED static bool psi_enable; @@ -215,6 +216,9 @@ void __init psi_init(void) return; } + if (!cgroup_psi_enabled()) + static_branch_disable(&psi_cgroups_enabled); + psi_period = jiffies_to_nsecs(PSI_FREQ); group_init(&psi_system); } @@ -748,23 +752,23 @@ static void psi_group_change(struct psi_group *group, int cpu, static struct psi_group *iterate_groups(struct task_struct *task, void **iter) { + if (*iter == &psi_system) + return NULL; + #ifdef CONFIG_CGROUPS - struct cgroup *cgroup = NULL; + if (static_branch_likely(&psi_cgroups_enabled)) { + struct cgroup *cgroup = NULL; - if (!*iter) - cgroup = task->cgroups->dfl_cgrp; - else if (*iter == &psi_system) - return NULL; - else - cgroup = cgroup_parent(*iter); + if (!*iter) + cgroup = task->cgroups->dfl_cgrp; + else + cgroup = cgroup_parent(*iter); - if (cgroup && cgroup_parent(cgroup)) { - *iter = cgroup; - return cgroup_psi(cgroup); + if (cgroup && cgroup_parent(cgroup)) { + *iter = cgroup; + return cgroup_psi(cgroup); + } } -#else - if (*iter) - return NULL; #endif *iter = &psi_system; return &psi_system; diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore index 84cfcabea838..be9643ef6285 100644 --- a/tools/testing/selftests/cgroup/.gitignore +++ b/tools/testing/selftests/cgroup/.gitignore @@ -2,4 +2,5 @@ test_memcontrol test_core test_freezer -test_kmem
\ No newline at end of file +test_kmem +test_kill diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile index f027d933595b..59e222460581 100644 --- a/tools/testing/selftests/cgroup/Makefile +++ b/tools/testing/selftests/cgroup/Makefile @@ -9,6 +9,7 @@ TEST_GEN_PROGS = test_memcontrol TEST_GEN_PROGS += test_kmem TEST_GEN_PROGS += test_core TEST_GEN_PROGS += test_freezer +TEST_GEN_PROGS += test_kill include ../lib.mk @@ -16,3 +17,4 @@ $(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h $(OUTPUT)/test_kmem: cgroup_util.c ../clone3/clone3_selftests.h $(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h $(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h +$(OUTPUT)/test_kill: cgroup_util.c ../clone3/clone3_selftests.h ../pidfd/pidfd.h diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 027014662fb2..623cec04ad42 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -5,10 +5,12 @@ #include <errno.h> #include <fcntl.h> #include <linux/limits.h> +#include <poll.h> #include <signal.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <sys/inotify.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/wait.h> @@ -252,6 +254,10 @@ int cg_killall(const char *cgroup) char buf[PAGE_SIZE]; char *ptr = buf; + /* If cgroup.kill exists use it. */ + if (!cg_write(cgroup, "cgroup.kill", "1")) + return 0; + if (cg_read(cgroup, "cgroup.procs", buf, sizeof(buf))) return -1; @@ -576,3 +582,48 @@ int clone_into_cgroup_run_wait(const char *cgroup) (void)clone_reap(pid, WEXITED); return 0; } + +int cg_prepare_for_wait(const char *cgroup) +{ + int fd, ret = -1; + + fd = inotify_init1(0); + if (fd == -1) + return fd; + + ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"), + IN_MODIFY); + if (ret == -1) { + close(fd); + fd = -1; + } + + return fd; +} + +int cg_wait_for(int fd) +{ + int ret = -1; + struct pollfd fds = { + .fd = fd, + .events = POLLIN, + }; + + while (true) { + ret = poll(&fds, 1, 10000); + + if (ret == -1) { + if (errno == EINTR) + continue; + + break; + } + + if (ret > 0 && fds.revents & POLLIN) { + ret = 0; + break; + } + } + + return ret; +} diff --git a/tools/testing/selftests/cgroup/cgroup_util.h b/tools/testing/selftests/cgroup/cgroup_util.h index 5a1305dd1f0b..82e59cdf16e7 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.h +++ b/tools/testing/selftests/cgroup/cgroup_util.h @@ -54,3 +54,5 @@ extern pid_t clone_into_cgroup(int cgroup_fd); extern int clone_reap(pid_t pid, int options); extern int clone_into_cgroup_run_wait(const char *cgroup); extern int dirfd_open_opath(const char *dir); +extern int cg_prepare_for_wait(const char *cgroup); +extern int cg_wait_for(int fd); diff --git a/tools/testing/selftests/cgroup/test_freezer.c b/tools/testing/selftests/cgroup/test_freezer.c index 23d8fa4a3e4e..ff519029f6f4 100644 --- a/tools/testing/selftests/cgroup/test_freezer.c +++ b/tools/testing/selftests/cgroup/test_freezer.c @@ -7,9 +7,7 @@ #include <unistd.h> #include <stdio.h> #include <errno.h> -#include <poll.h> #include <stdlib.h> -#include <sys/inotify.h> #include <string.h> #include <sys/wait.h> @@ -55,61 +53,6 @@ static int cg_freeze_nowait(const char *cgroup, bool freeze) } /* - * Prepare for waiting on cgroup.events file. - */ -static int cg_prepare_for_wait(const char *cgroup) -{ - int fd, ret = -1; - - fd = inotify_init1(0); - if (fd == -1) { - debug("Error: inotify_init1() failed\n"); - return fd; - } - - ret = inotify_add_watch(fd, cg_control(cgroup, "cgroup.events"), - IN_MODIFY); - if (ret == -1) { - debug("Error: inotify_add_watch() failed\n"); - close(fd); - fd = -1; - } - - return fd; -} - -/* - * Wait for an event. If there are no events for 10 seconds, - * treat this an error. - */ -static int cg_wait_for(int fd) -{ - int ret = -1; - struct pollfd fds = { - .fd = fd, - .events = POLLIN, - }; - - while (true) { - ret = poll(&fds, 1, 10000); - - if (ret == -1) { - if (errno == EINTR) - continue; - debug("Error: poll() failed\n"); - break; - } - - if (ret > 0 && fds.revents & POLLIN) { - ret = 0; - break; - } - } - - return ret; -} - -/* * Attach a task to the given cgroup and wait for a cgroup frozen event. * All transient events (e.g. populated) are ignored. */ diff --git a/tools/testing/selftests/cgroup/test_kill.c b/tools/testing/selftests/cgroup/test_kill.c new file mode 100644 index 000000000000..6153690319c9 --- /dev/null +++ b/tools/testing/selftests/cgroup/test_kill.c @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <errno.h> +#include <linux/limits.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/types.h> +#include <unistd.h> + +#include "../kselftest.h" +#include "../pidfd/pidfd.h" +#include "cgroup_util.h" + +/* + * Kill the given cgroup and wait for the inotify signal. + * If there are no events in 10 seconds, treat this as an error. + * Then check that the cgroup is in the desired state. + */ +static int cg_kill_wait(const char *cgroup) +{ + int fd, ret = -1; + + fd = cg_prepare_for_wait(cgroup); + if (fd < 0) + return fd; + + ret = cg_write(cgroup, "cgroup.kill", "1"); + if (ret) + goto out; + + ret = cg_wait_for(fd); + if (ret) + goto out; + +out: + close(fd); + return ret; +} + +/* + * A simple process running in a sleep loop until being + * re-parented. + */ +static int child_fn(const char *cgroup, void *arg) +{ + int ppid = getppid(); + + while (getppid() == ppid) + usleep(1000); + + return getppid() == ppid; +} + +static int test_cgkill_simple(const char *root) +{ + pid_t pids[100]; + int ret = KSFT_FAIL; + char *cgroup = NULL; + int i; + + cgroup = cg_name(root, "cg_test_simple"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + for (i = 0; i < 100; i++) + pids[i] = cg_run_nowait(cgroup, child_fn, NULL); + + if (cg_wait_for_proc_count(cgroup, 100)) + goto cleanup; + + if (cg_read_strcmp(cgroup, "cgroup.events", "populated 1\n")) + goto cleanup; + + if (cg_kill_wait(cgroup)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + for (i = 0; i < 100; i++) + wait_for_pid(pids[i]); + + if (ret == KSFT_PASS && + cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n")) + ret = KSFT_FAIL; + + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +/* + * The test creates the following hierarchy: + * A + * / / \ \ + * B E I K + * /\ | + * C D F + * | + * G + * | + * H + * + * with a process in C, H and 3 processes in K. + * Then it tries to kill the whole tree. + */ +static int test_cgkill_tree(const char *root) +{ + pid_t pids[5]; + char *cgroup[10] = {0}; + int ret = KSFT_FAIL; + int i; + + cgroup[0] = cg_name(root, "cg_test_tree_A"); + if (!cgroup[0]) + goto cleanup; + + cgroup[1] = cg_name(cgroup[0], "B"); + if (!cgroup[1]) + goto cleanup; + + cgroup[2] = cg_name(cgroup[1], "C"); + if (!cgroup[2]) + goto cleanup; + + cgroup[3] = cg_name(cgroup[1], "D"); + if (!cgroup[3]) + goto cleanup; + + cgroup[4] = cg_name(cgroup[0], "E"); + if (!cgroup[4]) + goto cleanup; + + cgroup[5] = cg_name(cgroup[4], "F"); + if (!cgroup[5]) + goto cleanup; + + cgroup[6] = cg_name(cgroup[5], "G"); + if (!cgroup[6]) + goto cleanup; + + cgroup[7] = cg_name(cgroup[6], "H"); + if (!cgroup[7]) + goto cleanup; + + cgroup[8] = cg_name(cgroup[0], "I"); + if (!cgroup[8]) + goto cleanup; + + cgroup[9] = cg_name(cgroup[0], "K"); + if (!cgroup[9]) + goto cleanup; + + for (i = 0; i < 10; i++) + if (cg_create(cgroup[i])) + goto cleanup; + + pids[0] = cg_run_nowait(cgroup[2], child_fn, NULL); + pids[1] = cg_run_nowait(cgroup[7], child_fn, NULL); + pids[2] = cg_run_nowait(cgroup[9], child_fn, NULL); + pids[3] = cg_run_nowait(cgroup[9], child_fn, NULL); + pids[4] = cg_run_nowait(cgroup[9], child_fn, NULL); + + /* + * Wait until all child processes will enter + * corresponding cgroups. + */ + + if (cg_wait_for_proc_count(cgroup[2], 1) || + cg_wait_for_proc_count(cgroup[7], 1) || + cg_wait_for_proc_count(cgroup[9], 3)) + goto cleanup; + + /* + * Kill A and check that we get an empty notification. + */ + if (cg_kill_wait(cgroup[0])) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + for (i = 0; i < 5; i++) + wait_for_pid(pids[i]); + + if (ret == KSFT_PASS && + cg_read_strcmp(cgroup[0], "cgroup.events", "populated 0\n")) + ret = KSFT_FAIL; + + for (i = 9; i >= 0 && cgroup[i]; i--) { + cg_destroy(cgroup[i]); + free(cgroup[i]); + } + + return ret; +} + +static int forkbomb_fn(const char *cgroup, void *arg) +{ + int ppid; + + fork(); + fork(); + + ppid = getppid(); + + while (getppid() == ppid) + usleep(1000); + + return getppid() == ppid; +} + +/* + * The test runs a fork bomb in a cgroup and tries to kill it. + */ +static int test_cgkill_forkbomb(const char *root) +{ + int ret = KSFT_FAIL; + char *cgroup = NULL; + pid_t pid = -ESRCH; + + cgroup = cg_name(root, "cg_forkbomb_test"); + if (!cgroup) + goto cleanup; + + if (cg_create(cgroup)) + goto cleanup; + + pid = cg_run_nowait(cgroup, forkbomb_fn, NULL); + if (pid < 0) + goto cleanup; + + usleep(100000); + + if (cg_kill_wait(cgroup)) + goto cleanup; + + if (cg_wait_for_proc_count(cgroup, 0)) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + if (pid > 0) + wait_for_pid(pid); + + if (ret == KSFT_PASS && + cg_read_strcmp(cgroup, "cgroup.events", "populated 0\n")) + ret = KSFT_FAIL; + + if (cgroup) + cg_destroy(cgroup); + free(cgroup); + return ret; +} + +#define T(x) { x, #x } +struct cgkill_test { + int (*fn)(const char *root); + const char *name; +} tests[] = { + T(test_cgkill_simple), + T(test_cgkill_tree), + T(test_cgkill_forkbomb), +}; +#undef T + +int main(int argc, char *argv[]) +{ + char root[PATH_MAX]; + int i, ret = EXIT_SUCCESS; + + if (cg_find_unified_root(root, sizeof(root))) + ksft_exit_skip("cgroup v2 isn't mounted\n"); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + switch (tests[i].fn(root)) { + case KSFT_PASS: + ksft_test_result_pass("%s\n", tests[i].name); + break; + case KSFT_SKIP: + ksft_test_result_skip("%s\n", tests[i].name); + break; + default: + ret = EXIT_FAILURE; + ksft_test_result_fail("%s\n", tests[i].name); + break; + } + } + + return ret; +} |