cgroup: fix handling of multi-destination migration from subtree_control enabling

Consider the following v2 hierarchy.

  P0 (+memory) --- P1 (-memory) --- A
                                 \- B

P0 has memory enabled in its subtree_control while P1 doesn't. If both A and B contain processes, they would belong to the memory css of P1. Now if memory is enabled on P1's subtree_control, memory csses should be created on both A and B and A's processes should be moved to the former and B's processes to the latter. IOW, enabling controllers can cause atomic migrations into different csses.

The core cgroup migration logic has been updated accordingly but the controller migration methods haven't and still assume that all tasks migrate to a single target css; furthermore, the methods were fed the css in which subtree_control was updated which is the parent of the target csses. pids controller depends on the migration methods to move charges and this made the controller attribute charges to the wrong csses, often triggering the following warning by driving a counter negative.

 WARNING: CPU: 1 PID: 1 at kernel/cgroup_pids.c:97 pids_cancel.constprop.6+0x31/0x40()
 Modules linked in:
 CPU: 1 PID: 1 Comm: systemd Not tainted 4.4.0-rc1+ #29
 ...
  ffffffff81f65382 ffff88007c043b90 ffffffff81551ffc 0000000000000000
  ffff88007c043bc8 ffffffff810de202 ffff88007a752000 ffff88007a29ab00
  ffff88007c043c80 ffff88007a1d8400 0000000000000001 ffff88007c043bd8
 Call Trace:
  [<ffffffff81551ffc>] dump_stack+0x4e/0x82
  [<ffffffff810de202>] warn_slowpath_common+0x82/0xc0
  [<ffffffff810de2fa>] warn_slowpath_null+0x1a/0x20
  [<ffffffff8118e031>] pids_cancel.constprop.6+0x31/0x40
  [<ffffffff8118e0fd>] pids_can_attach+0x6d/0xf0
  [<ffffffff81188a4c>] cgroup_taskset_migrate+0x6c/0x330
  [<ffffffff81188e05>] cgroup_migrate+0xf5/0x190
  [<ffffffff81189016>] cgroup_attach_task+0x176/0x200
  [<ffffffff8118949d>] __cgroup_procs_write+0x2ad/0x460
  [<ffffffff81189684>] cgroup_procs_write+0x14/0x20
  [<ffffffff811854e5>] cgroup_file_write+0x35/0x1c0
  [<ffffffff812e26f1>] kernfs_fop_write+0x141/0x190
  [<ffffffff81265f88>] __vfs_write+0x28/0xe0
  [<ffffffff812666fc>] vfs_write+0xac/0x1a0
  [<ffffffff81267019>] SyS_write+0x49/0xb0
  [<ffffffff81bcef32>] entry_SYSCALL_64_fastpath+0x12/0x76

This patch fixes the bug by removing @css parameter from the three migration methods, ->can_attach(), ->cancel_attach() and ->attach(), and updating cgroup_taskset iteration helpers to also return the destination css in addition to the task being migrated. All controllers are updated accordingly.

* Controllers which don't care whether there are one or multiple target csses can be converted trivially. cpu, io, freezer, perf, netclassid and netprio fall in this category.

* cpuset's current implementation assumes that there's single source and destination and thus doesn't support v2 hierarchy already. The only change made by this patchset is how that single destination css is obtained.

* memory migration path already doesn't do anything on v2. How the single destination css is obtained is updated and the prep stage of mem_cgroup_can_attach() is reordered to accommodate the change.

* pids is the only controller which was affected by this bug.
It now correctly handles multi-destination migrations and no longer causes counter underflow from incorrect accounting.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Cc: Aleksa Sarai <cyphar@cyphar.com>
author: Tejun Heo <tj@kernel.org> 2015-12-03 10:18:21 -0500
committer: Tejun Heo <tj@kernel.org> 2015-12-03 10:18:21 -0500
commit: 1f7dd3e5a6e4f093017fff12232572ee1aa4639b (patch)
tree: 2820e6f3fefd3c92ef2f7e58f688a8e2f2211aff /mm
parent: 599c963a0f19b14132065788322207eaa58bc7f8 (diff)
1 files changed, 23 insertions, 22 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9acfb165eb52..c92a65b2b4ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4779,23 +4779,18 @@ static void mem_cgroup_clear_mc(void)
 	spin_unlock(&mc.lock);
 }
 
-static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
-				 struct cgroup_taskset *tset)
+static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
 {
-	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+	struct cgroup_subsys_state *css;
+	struct mem_cgroup *memcg;
 	struct mem_cgroup *from;
 	struct task_struct *leader, *p;
 	struct mm_struct *mm;
 	unsigned long move_flags;
 	int ret = 0;
 
-	/*
-	 * We are now commited to this value whatever it is. Changes in this
-	 * tunable will only affect upcoming migrations, not the current one.
-	 * So we need to save it, and keep it going.
-	 */
-	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
-	if (!move_flags)
+	/* charge immigration isn't supported on the default hierarchy */
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		return 0;
 
 	/*
@@ -4805,13 +4800,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 	 * multiple.
 	 */
 	p = NULL;
-	cgroup_taskset_for_each_leader(leader, tset) {
+	cgroup_taskset_for_each_leader(leader, css, tset) {
 		WARN_ON_ONCE(p);
 		p = leader;
+		memcg = mem_cgroup_from_css(css);
 	}
 	if (!p)
 		return 0;
 
+	/*
+	 * We are now commited to this value whatever it is. Changes in this
+	 * tunable will only affect upcoming migrations, not the current one.
+	 * So we need to save it, and keep it going.
+	 */
+	move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
+	if (!move_flags)
+		return 0;
+
 	from = mem_cgroup_from_task(p);
 
 	VM_BUG_ON(from == memcg);
@@ -4842,8 +4847,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 	return ret;
 }
 
-static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
-				     struct cgroup_taskset *tset)
+static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
 {
 	if (mc.to)
 		mem_cgroup_clear_mc();
@@ -4985,10 +4989,10 @@ retry:
 	atomic_dec(&mc.from->moving_account);
 }
 
-static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
-				 struct cgroup_taskset *tset)
+static void mem_cgroup_move_task(struct cgroup_taskset *tset)
 {
-	struct task_struct *p = cgroup_taskset_first(tset);
+	struct cgroup_subsys_state *css;
+	struct task_struct *p = cgroup_taskset_first(tset, &css);
 	struct mm_struct *mm = get_task_mm(p);
 
 	if (mm) {
@@ -5000,17 +5004,14 @@ static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
 		mem_cgroup_clear_mc();
 }
 #else	/* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
-				 struct cgroup_taskset *tset)
+static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
 {
 	return 0;
 }
-static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css,
-				     struct cgroup_taskset *tset)
+static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
 {
 }
-static void mem_cgroup_move_task(struct cgroup_subsys_state *css,
-				 struct cgroup_taskset *tset)
+static void mem_cgroup_move_task(struct cgroup_taskset *tset)
 {
 }
 #endif
author	Tejun Heo <tj@kernel.org>	2015-12-03 10:18:21 -0500
committer	Tejun Heo <tj@kernel.org>	2015-12-03 10:18:21 -0500
commit	1f7dd3e5a6e4f093017fff12232572ee1aa4639b (patch)
tree	2820e6f3fefd3c92ef2f7e58f688a8e2f2211aff /mm
parent	599c963a0f19b14132065788322207eaa58bc7f8 (diff)