diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-15 17:06:08 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-05-15 17:06:08 -0700 |
commit | de6fef50eaf40789b11841474726fd918a3a84a1 (patch) | |
tree | df6e1395240b9275e1f5d064edd6957cf3d8d510 /kernel | |
parent | f4b0c4b508364fde023e4f7b9f23f7e38c663dfe (diff) | |
parent | 21c38a3bd4ee3fb7337d013a638302fb5e5f9dc2 (diff) |
Merge tag 'cgroup-for-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- The locking around cpuset hotplug processing has always been a bit of
mess which was worked around by making hotplug processing
asynchronous. The asynchronity isn't great and led to other issues.
We tried to make the behavior synchronous a while ago but that led to
lockdep splats. Waiman took another stab at cleaning up and making it
synchronous. The patch has been in -next for well over a month and
there haven't been any complaints, so fingers crossed.
- Tracepoints added to help understanding rstat lock contentions.
- A bunch of minor changes - doc updates, code cleanups and selftests.
* tag 'cgroup-for-6.10' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (24 commits)
cgroup/rstat: add cgroup_rstat_cpu_lock helpers and tracepoints
selftests/cgroup: Drop define _GNU_SOURCE
docs: cgroup-v1: Update page cache removal functions
selftests/cgroup: fix uninitialized variables in test_zswap.c
selftests/cgroup: cpu_hogger init: use {} instead of {NULL}
selftests/cgroup: fix clang warnings: uninitialized fd variable
selftests/cgroup: fix clang build failures for abs() calls
cgroup/cpuset: Remove outdated comment in sched_partition_write()
cgroup/cpuset: Fix incorrect top_cpuset flags
cgroup/cpuset: Avoid clearing CS_SCHED_LOAD_BALANCE twice
cgroup/cpuset: Statically initialize more members of top_cpuset
cgroup: Avoid unnecessary looping in cgroup_no_v1()
cgroup, legacy_freezer: update comment for freezer_css_offline()
docs, cgroup: add entries for pids to cgroup-v2.rst
cgroup: don't call cgroup1_pidlist_destroy_all() for v2
cgroup_freezer: update comment for freezer_css_online()
cgroup/rstat: desc member cgrp in cgroup_rstat_flush_release
cgroup/rstat: add cgroup_rstat_lock helpers and tracepoints
cgroup/pids: Remove superfluous zeroing
docs: cgroup-v1: Fix description for css_online
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cgroup/cgroup-v1.c | 1 | ||||
-rw-r--r-- | kernel/cgroup/cgroup.c | 3 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 156 | ||||
-rw-r--r-- | kernel/cgroup/legacy_freezer.c | 5 | ||||
-rw-r--r-- | kernel/cgroup/pids.c | 2 | ||||
-rw-r--r-- | kernel/cgroup/rstat.c | 118 | ||||
-rw-r--r-- | kernel/cpu.c | 48 | ||||
-rw-r--r-- | kernel/power/process.c | 2 |
8 files changed, 162 insertions, 173 deletions
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 520a11cb12f4..b9dbf6bf2779 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1335,6 +1335,7 @@ static int __init cgroup_no_v1(char *str) continue; cgroup_no_v1_mask |= 1 << i; + break; } } return 1; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a66c088c851c..e32b6972c478 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5368,7 +5368,8 @@ static void css_free_rwork_fn(struct work_struct *work) } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); - cgroup1_pidlist_destroy_all(cgrp); + if (!cgroup_on_dfl(cgrp)) + cgroup1_pidlist_destroy_all(cgrp); cancel_work_sync(&cgrp->release_agent_work); bpf_cgrp_storage_free(cgrp); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4237c8748715..a10e4bd0c0c1 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -202,6 +202,14 @@ struct cpuset { }; /* + * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously + */ +struct cpuset_remove_tasks_struct { + struct work_struct work; + struct cpuset *cs; +}; + +/* * Exclusive CPUs distributed out to sub-partitions of top_cpuset */ static cpumask_var_t subpartitions_cpus; @@ -360,9 +368,10 @@ static inline void notify_partition_change(struct cpuset *cs, int old_prs) } static struct cpuset top_cpuset = { - .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | - (1 << CS_MEM_EXCLUSIVE)), + .flags = BIT(CS_ONLINE) | BIT(CS_CPU_EXCLUSIVE) | + BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE), .partition_root_state = PRS_ROOT, + .relax_domain_level = -1, .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; @@ -449,12 +458,6 @@ static DEFINE_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; -/* - * CPU / memory hotplug is handled asynchronously. - */ -static void cpuset_hotplug_workfn(struct work_struct *work); -static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn); - static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq); static inline void check_insane_mems_config(nodemask_t *nodes) @@ -540,22 +543,10 @@ static void guarantee_online_cpus(struct task_struct *tsk, rcu_read_lock(); cs = task_cs(tsk); - while (!cpumask_intersects(cs->effective_cpus, pmask)) { + while (!cpumask_intersects(cs->effective_cpus, pmask)) cs = parent_cs(cs); - if (unlikely(!cs)) { - /* - * The top cpuset doesn't have any online cpu as a - * consequence of a race between cpuset_hotplug_work - * and cpu hotplug notifier. But we know the top - * cpuset's effective_cpus is on its way to be - * identical to cpu_online_mask. - */ - goto out_unlock; - } - } - cpumask_and(pmask, pmask, cs->effective_cpus); -out_unlock: + cpumask_and(pmask, pmask, cs->effective_cpus); rcu_read_unlock(); } @@ -1217,7 +1208,7 @@ static void rebuild_sched_domains_locked(void) /* * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * Anyways, cpuset_handle_hotplug() will rebuild sched domains. * * With no CPUs in any subpartitions, top_cpuset's effective CPUs * should be the same as the active CPUs, so checking only top_cpuset @@ -1260,12 +1251,17 @@ static void rebuild_sched_domains_locked(void) } #endif /* CONFIG_SMP */ -void rebuild_sched_domains(void) +static void rebuild_sched_domains_cpuslocked(void) { - cpus_read_lock(); mutex_lock(&cpuset_mutex); rebuild_sched_domains_locked(); mutex_unlock(&cpuset_mutex); +} + +void rebuild_sched_domains(void) +{ + cpus_read_lock(); + rebuild_sched_domains_cpuslocked(); cpus_read_unlock(); } @@ -2079,14 +2075,11 @@ write_error: /* * For partcmd_update without newmask, it is being called from - * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. - * Update the load balance flag and scheduling domain if - * cpus_read_trylock() is successful. + * cpuset_handle_hotplug(). Update the load balance flag and + * scheduling domain accordingly. */ - if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { + if ((cmd == partcmd_update) && !newmask) update_partition_sd_lb(cs, old_prs); - cpus_read_unlock(); - } notify_partition_change(cs, old_prs); return 0; @@ -3599,8 +3592,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, * proceeding, so that we don't end up keep removing tasks added * after execution capability is restored. * - * cpuset_hotplug_work calls back into cgroup core via - * cgroup_transfer_tasks() and waiting for it from a cgroupfs + * cpuset_handle_hotplug may call back into cgroup core asynchronously + * via cgroup_transfer_tasks() and waiting for it from a cgroupfs * operation like this one can lead to a deadlock through kernfs * active_ref protection. Let's break the protection. Losing the * protection is okay as we check whether @cs is online after @@ -3609,7 +3602,6 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of, */ css_get(&cs->css); kernfs_break_active_protection(of->kn); - flush_work(&cpuset_hotplug_work); cpus_read_lock(); mutex_lock(&cpuset_mutex); @@ -3782,9 +3774,6 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, buf = strstrip(buf); - /* - * Convert "root" to ENABLED, and convert "member" to DISABLED. - */ if (!strcmp(buf, "root")) val = PRS_ROOT; else if (!strcmp(buf, "member")) @@ -4060,11 +4049,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; - /* - * Clear CS_SCHED_LOAD_BALANCE if parent is isolated - */ - if (!is_sched_load_balance(parent)) - clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } /* @@ -4318,8 +4302,6 @@ int __init cpuset_init(void) nodes_setall(top_cpuset.effective_mems); fmeter_init(&top_cpuset.fmeter); - set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); - top_cpuset.relax_domain_level = -1; INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4354,6 +4336,16 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) } } +static void cpuset_migrate_tasks_workfn(struct work_struct *work) +{ + struct cpuset_remove_tasks_struct *s; + + s = container_of(work, struct cpuset_remove_tasks_struct, work); + remove_tasks_in_empty_cpuset(s->cs); + css_put(&s->cs->css); + kfree(s); +} + static void hotplug_update_tasks_legacy(struct cpuset *cs, struct cpumask *new_cpus, nodemask_t *new_mems, @@ -4383,12 +4375,21 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Move tasks to the nearest ancestor with execution resources, * This is full cgroup operation which will also call back into - * cpuset. Should be done outside any lock. + * cpuset. Execute it asynchronously using workqueue. */ - if (is_empty) { - mutex_unlock(&cpuset_mutex); - remove_tasks_in_empty_cpuset(cs); - mutex_lock(&cpuset_mutex); + if (is_empty && cs->css.cgroup->nr_populated_csets && + css_tryget_online(&cs->css)) { + struct cpuset_remove_tasks_struct *s; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (WARN_ON_ONCE(!s)) { + css_put(&cs->css); + return; + } + + s->cs = cs; + INIT_WORK(&s->work, cpuset_migrate_tasks_workfn); + schedule_work(&s->work); } } @@ -4421,30 +4422,6 @@ void cpuset_force_rebuild(void) force_rebuild = true; } -/* - * Attempt to acquire a cpus_read_lock while a hotplug operation may be in - * progress. - * Return: true if successful, false otherwise - * - * To avoid circular lock dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ -static bool cpuset_hotplug_cpus_read_trylock(void) -{ - int retries = 0; - - while (!cpus_read_trylock()) { - /* - * CPU hotplug still in progress. Retry 5 times - * with a 10ms wait before bailing out. - */ - if (++retries > 5) - return false; - msleep(10); - } - return true; -} - /** * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug * @cs: cpuset in interest @@ -4493,13 +4470,11 @@ retry: compute_partition_effective_cpumask(cs, &new_cpus); if (remote && cpumask_empty(&new_cpus) && - partition_is_populated(cs, NULL) && - cpuset_hotplug_cpus_read_trylock()) { + partition_is_populated(cs, NULL)) { remote_partition_disable(cs, tmp); compute_effective_cpumask(&new_cpus, cs, parent); remote = false; cpuset_force_rebuild(); - cpus_read_unlock(); } /* @@ -4519,18 +4494,8 @@ retry: else if (is_partition_valid(parent) && is_partition_invalid(cs)) partcmd = partcmd_update; - /* - * cpus_read_lock needs to be held before calling - * update_parent_effective_cpumask(). To avoid circular lock - * dependency between cpuset_mutex and cpus_read_lock, - * cpus_read_trylock() is used here to acquire the lock. - */ if (partcmd >= 0) { - if (!cpuset_hotplug_cpus_read_trylock()) - goto update_tasks; - update_parent_effective_cpumask(cs, partcmd, NULL, tmp); - cpus_read_unlock(); if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) { compute_partition_effective_cpumask(cs, &new_cpus); cpuset_force_rebuild(); @@ -4558,8 +4523,7 @@ unlock: } /** - * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset - * @work: unused + * cpuset_handle_hotplug - handle CPU/memory hot{,un}plug for a cpuset * * This function is called after either CPU or memory configuration has * changed and updates cpuset accordingly. The top_cpuset is always @@ -4573,8 +4537,10 @@ unlock: * * Note that CPU offlining during suspend is ignored. We don't modify * cpusets across suspend/resume cycles at all. + * + * CPU / memory hotplug is handled synchronously. */ -static void cpuset_hotplug_workfn(struct work_struct *work) +static void cpuset_handle_hotplug(void) { static cpumask_t new_cpus; static nodemask_t new_mems; @@ -4585,6 +4551,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) if (on_dfl && !alloc_cpumasks(NULL, &tmp)) ptmp = &tmp; + lockdep_assert_cpus_held(); mutex_lock(&cpuset_mutex); /* fetch the available cpus/mems and find out which changed how */ @@ -4666,7 +4633,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* rebuild sched domains if cpus_allowed has changed */ if (cpus_updated || force_rebuild) { force_rebuild = false; - rebuild_sched_domains(); + rebuild_sched_domains_cpuslocked(); } free_cpumasks(NULL, ptmp); @@ -4679,12 +4646,7 @@ void cpuset_update_active_cpus(void) * inside cgroup synchronization. Bounce actual hotplug processing * to a work item to avoid reverse locking order. */ - schedule_work(&cpuset_hotplug_work); -} - -void cpuset_wait_for_hotplug(void) -{ - flush_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); } /* @@ -4695,7 +4657,7 @@ void cpuset_wait_for_hotplug(void) static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - schedule_work(&cpuset_hotplug_work); + cpuset_handle_hotplug(); return NOTIFY_OK; } diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c index 66d1708042a7..074653f964c1 100644 --- a/kernel/cgroup/legacy_freezer.c +++ b/kernel/cgroup/legacy_freezer.c @@ -106,8 +106,7 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css) * @css: css being created * * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. + * parent's freezing state while holding cpus read lock and freezer_mutex. */ static int freezer_css_online(struct cgroup_subsys_state *css) { @@ -133,7 +132,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) * freezer_css_offline - initiate destruction of a freezer css * @css: css being destroyed * - * @css is going away. Mark it dead and decrement system_freezing_count if + * @css is going away. Mark it dead and decrement freezer_active if * it was holding one. */ static void freezer_css_offline(struct cgroup_subsys_state *css) diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 7695e60bcb40..0e5ec7d59b4d 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -75,9 +75,7 @@ pids_css_alloc(struct cgroup_subsys_state *parent) if (!pids) return ERR_PTR(-ENOMEM); - atomic64_set(&pids->counter, 0); atomic64_set(&pids->limit, PIDS_MAX); - atomic64_set(&pids->events_limit, 0); return &pids->css; } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 07e2284bb499..fb8b49437573 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -7,6 +7,8 @@ #include <linux/btf.h> #include <linux/btf_ids.h> +#include <trace/events/cgroup.h> + static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); @@ -17,6 +19,60 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) return per_cpu_ptr(cgrp->rstat_cpu, cpu); } +/* + * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @fast_path determine the + * tracepoints being added, allowing us to diagnose "flush" related + * operations without handling high-frequency fast-path "update" events. + */ +static __always_inline +unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, const bool fast_path) +{ + unsigned long flags; + bool contended; + + /* + * The _irqsave() is needed because cgroup_rstat_lock is + * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring + * this lock with the _irq() suffix only disables interrupts on + * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables + * interrupts on both configurations. The _irqsave() ensures + * that interrupts are always disabled and later restored. + */ + contended = !raw_spin_trylock_irqsave(cpu_lock, flags); + if (contended) { + if (fast_path) + trace_cgroup_rstat_cpu_lock_contended_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_lock_contended(cgrp, cpu, contended); + + raw_spin_lock_irqsave(cpu_lock, flags); + } + + if (fast_path) + trace_cgroup_rstat_cpu_locked_fastpath(cgrp, cpu, contended); + else + trace_cgroup_rstat_cpu_locked(cgrp, cpu, contended); + + return flags; +} + +static __always_inline +void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup *cgrp, unsigned long flags, + const bool fast_path) +{ + if (fast_path) + trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); + else + trace_cgroup_rstat_cpu_unlock(cgrp, cpu, false); + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + /** * cgroup_rstat_updated - keep track of updated rstat_cpu * @cgrp: target cgroup @@ -42,7 +98,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { @@ -70,7 +126,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) cgrp = parent; } - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); } /** @@ -151,15 +207,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) struct cgroup *head = NULL, *parent, *child; unsigned long flags; - /* - * The _irqsave() is needed because cgroup_rstat_lock is - * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring - * this lock with the _irq() suffix only disables interrupts on - * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables - * interrupts on both configurations. The _irqsave() ensures - * that interrupts are always disabled and later restored. - */ - raw_spin_lock_irqsave(cpu_lock, flags); + flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -196,7 +244,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) if (child != root) head = cgroup_rstat_push_children(head, child, cpu); unlock_ret: - raw_spin_unlock_irqrestore(cpu_lock, flags); + _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); return head; } @@ -222,6 +270,35 @@ __weak noinline void bpf_rstat_flush(struct cgroup *cgrp, __bpf_hook_end(); +/* + * Helper functions for locking cgroup_rstat_lock. + * + * This makes it easier to diagnose locking issues and contention in + * production environments. The parameter @cpu_in_loop indicate lock + * was released and re-taken when collection data from the CPUs. The + * value -1 is used when obtaining the main lock else this is the CPU + * number processed last. + */ +static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) + __acquires(&cgroup_rstat_lock) +{ + bool contended; + + contended = !spin_trylock_irq(&cgroup_rstat_lock); + if (contended) { + trace_cgroup_rstat_lock_contended(cgrp, cpu_in_loop, contended); + spin_lock_irq(&cgroup_rstat_lock); + } + trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); +} + +static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) + __releases(&cgroup_rstat_lock) +{ + trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); + spin_unlock_irq(&cgroup_rstat_lock); +} + /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) @@ -248,10 +325,10 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) /* play nice and yield if necessary */ if (need_resched() || spin_needbreak(&cgroup_rstat_lock)) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, cpu); if (!cond_resched()) cpu_relax(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, cpu); } } } @@ -273,9 +350,9 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } /** @@ -291,17 +368,18 @@ void cgroup_rstat_flush_hold(struct cgroup *cgrp) __acquires(&cgroup_rstat_lock) { might_sleep(); - spin_lock_irq(&cgroup_rstat_lock); + __cgroup_rstat_lock(cgrp, -1); cgroup_rstat_flush_locked(cgrp); } /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + * @cgrp: cgroup used by tracepoint */ -void cgroup_rstat_flush_release(void) +void cgroup_rstat_flush_release(struct cgroup *cgrp) __releases(&cgroup_rstat_lock) { - spin_unlock_irq(&cgroup_rstat_lock); + __cgroup_rstat_unlock(cgrp, -1); } int cgroup_rstat_init(struct cgroup *cgrp) @@ -533,7 +611,7 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) #ifdef CONFIG_SCHED_CORE forceidle_time = cgrp->bstat.forceidle_sum; #endif - cgroup_rstat_flush_release(); + cgroup_rstat_flush_release(cgrp); } else { root_cgroup_cputime(&bstat); usage = bstat.cputime.sum_exec_runtime; diff --git a/kernel/cpu.c b/kernel/cpu.c index 63447eb85dab..563877d6c28b 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1208,52 +1208,6 @@ void __init cpuhp_threads_init(void) kthread_unpark(this_cpu_read(cpuhp_state.thread)); } -/* - * - * Serialize hotplug trainwrecks outside of the cpu_hotplug_lock - * protected region. - * - * The operation is still serialized against concurrent CPU hotplug via - * cpu_add_remove_lock, i.e. CPU map protection. But it is _not_ - * serialized against other hotplug related activity like adding or - * removing of state callbacks and state instances, which invoke either the - * startup or the teardown callback of the affected state. - * - * This is required for subsystems which are unfixable vs. CPU hotplug and - * evade lock inversion problems by scheduling work which has to be - * completed _before_ cpu_up()/_cpu_down() returns. - * - * Don't even think about adding anything to this for any new code or even - * drivers. It's only purpose is to keep existing lock order trainwrecks - * working. - * - * For cpu_down() there might be valid reasons to finish cleanups which are - * not required to be done under cpu_hotplug_lock, but that's a different - * story and would be not invoked via this. - */ -static void cpu_up_down_serialize_trainwrecks(bool tasks_frozen) -{ - /* - * cpusets delegate hotplug operations to a worker to "solve" the - * lock order problems. Wait for the worker, but only if tasks are - * _not_ frozen (suspend, hibernate) as that would wait forever. - * - * The wait is required because otherwise the hotplug operation - * returns with inconsistent state, which could even be observed in - * user space when a new CPU is brought up. The CPU plug uevent - * would be delivered and user space reacting on it would fail to - * move tasks to the newly plugged CPU up to the point where the - * work has finished because up to that point the newly plugged CPU - * is not assignable in cpusets/cgroups. On unplug that's not - * necessarily a visible issue, but it is still inconsistent state, - * which is the real problem which needs to be "fixed". This can't - * prevent the transient state between scheduling the work and - * returning from waiting for it. - */ - if (!tasks_frozen) - cpuset_wait_for_hotplug(); -} - #ifdef CONFIG_HOTPLUG_CPU #ifndef arch_clear_mm_cpumask_cpu #define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) @@ -1494,7 +1448,6 @@ out: */ lockup_detector_cleanup(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; } @@ -1728,7 +1681,6 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) out: cpus_write_unlock(); arch_smt_update(); - cpu_up_down_serialize_trainwrecks(tasks_frozen); return ret; } diff --git a/kernel/power/process.c b/kernel/power/process.c index cae81a87cc91..66ac067d9ae6 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -194,8 +194,6 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); - cpuset_wait_for_hotplug(); - read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ |