summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/bpf_local_storage.c4
-rw-r--r--kernel/bpf/helpers.c99
-rw-r--r--kernel/cgroup/cgroup.c36
-rw-r--r--kernel/cgroup/cpuset.c197
-rw-r--r--kernel/cgroup/misc.c80
-rw-r--r--kernel/cgroup/pids.c129
-rw-r--r--kernel/cgroup/rstat.c37
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c34
-rw-r--r--kernel/irq/irq_sim.c60
-rw-r--r--kernel/kcsan/kcsan_test.c1
-rw-r--r--kernel/locking/locktorture.c1
-rw-r--r--kernel/pid_namespace.c17
-rw-r--r--kernel/rcu/rcuscale.c1
-rw-r--r--kernel/rcu/rcutorture.c49
-rw-r--r--kernel/rcu/refscale.c1
-rw-r--r--kernel/rcu/srcutiny.c3
-rw-r--r--kernel/rcu/srcutree.c13
-rw-r--r--kernel/rcu/sync.c12
-rw-r--r--kernel/rcu/tasks.h26
-rw-r--r--kernel/rcu/tree.c92
-rw-r--r--kernel/rcu/tree.h2
-rw-r--r--kernel/rcu/tree_exp.h24
-rw-r--r--kernel/rcu/tree_nocb.h147
-rw-r--r--kernel/rcu/tree_plugin.h14
-rw-r--r--kernel/rcu/tree_stall.h4
-rw-r--r--kernel/scftorture.c3
-rw-r--r--kernel/sched/core.c21
-rw-r--r--kernel/sched/deadline.c7
-rw-r--r--kernel/sched/fair.c12
-rw-r--r--kernel/sched/psi.c21
-rw-r--r--kernel/sched/sched.h1
-rw-r--r--kernel/sched/stats.h11
-rw-r--r--kernel/signal.c8
-rw-r--r--kernel/smp.c5
-rw-r--r--kernel/sys_ni.c2
-rw-r--r--kernel/time/clocksource-wdtest.c1
-rw-r--r--kernel/time/test_udelay.c1
-rw-r--r--kernel/time/tick-broadcast.c23
-rw-r--r--kernel/time/tick-sched.c22
-rw-r--r--kernel/time/time_test.c1
-rw-r--r--kernel/time/timekeeping.c131
-rw-r--r--kernel/torture.c1
-rw-r--r--kernel/workqueue.c352
45 files changed, 1115 insertions, 612 deletions
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index 976cb258a0ed..c938dea5ddbf 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
- nbuckets, GFP_USER | __GFP_NOWARN);
+ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
+ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
err = -ENOMEM;
goto free_smap;
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 2a69a9a36c0f..3243c83ef3e3 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
- struct rcu_head rcu;
+ union {
+ struct rcu_head rcu;
+ struct work_struct delete_work;
+ };
u64 flags;
};
@@ -1107,6 +1110,7 @@ struct bpf_async_cb {
struct bpf_hrtimer {
struct bpf_async_cb cb;
struct hrtimer timer;
+ atomic_t cancelling;
};
struct bpf_work {
@@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
kfree_rcu(w, cb.rcu);
}
+static void bpf_timer_delete_work(struct work_struct *work)
+{
+ struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
+
+ /* Cancel the timer and wait for callback to complete if it was running.
+ * If hrtimer_cancel() can be safely called it's safe to call
+ * kfree_rcu(t) right after for both preallocated and non-preallocated
+ * maps. The async->cb = NULL was already done and no code path can see
+ * address 't' anymore. Timer if armed for existing bpf_hrtimer before
+ * bpf_timer_cancel_and_free will have been cancelled.
+ */
+ hrtimer_cancel(&t->timer);
+ kfree_rcu(t, cb.rcu);
+}
+
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
enum bpf_async_type type)
{
@@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
clockid = flags & (MAX_CLOCKS - 1);
t = (struct bpf_hrtimer *)cb;
+ atomic_set(&t->cancelling, 0);
+ INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
t->timer.function = bpf_timer_cb;
cb->value = (void *)async - map->record->timer_off;
@@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)
BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
- struct bpf_hrtimer *t;
+ struct bpf_hrtimer *t, *cur_t;
+ bool inc = false;
int ret = 0;
if (in_nmi())
@@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
ret = -EINVAL;
goto out;
}
- if (this_cpu_read(hrtimer_running) == t) {
+
+ cur_t = this_cpu_read(hrtimer_running);
+ if (cur_t == t) {
/* If bpf callback_fn is trying to bpf_timer_cancel()
* its own timer the hrtimer_cancel() will deadlock
- * since it waits for callback_fn to finish
+ * since it waits for callback_fn to finish.
*/
ret = -EDEADLK;
goto out;
}
+
+ /* Only account in-flight cancellations when invoked from a timer
+ * callback, since we want to avoid waiting only if other _callbacks_
+ * are waiting on us, to avoid introducing lockups. Non-callback paths
+ * are ok, since nobody would synchronously wait for their completion.
+ */
+ if (!cur_t)
+ goto drop;
+ atomic_inc(&t->cancelling);
+ /* Need full barrier after relaxed atomic_inc */
+ smp_mb__after_atomic();
+ inc = true;
+ if (atomic_read(&cur_t->cancelling)) {
+ /* We're cancelling timer t, while some other timer callback is
+ * attempting to cancel us. In such a case, it might be possible
+ * that timer t belongs to the other callback, or some other
+ * callback waiting upon it (creating transitive dependencies
+ * upon us), and we will enter a deadlock if we continue
+ * cancelling and waiting for it synchronously, since it might
+ * do the same. Bail!
+ */
+ ret = -EDEADLK;
+ goto out;
+ }
+drop:
drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
@@ -1467,6 +1516,8 @@ out:
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
+ if (inc)
+ atomic_dec(&t->cancelling);
rcu_read_unlock();
return ret;
}
@@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)
if (!t)
return;
- /* Cancel the timer and wait for callback to complete if it was running.
- * If hrtimer_cancel() can be safely called it's safe to call kfree(t)
- * right after for both preallocated and non-preallocated maps.
- * The async->cb = NULL was already done and no code path can
- * see address 't' anymore.
- *
- * Check that bpf_map_delete/update_elem() wasn't called from timer
- * callback_fn. In such case don't call hrtimer_cancel() (since it will
- * deadlock) and don't call hrtimer_try_to_cancel() (since it will just
- * return -1). Though callback_fn is still running on this cpu it's
+ /* We check that bpf_map_delete/update_elem() was called from timer
+ * callback_fn. In such case we don't call hrtimer_cancel() (since it
+ * will deadlock) and don't call hrtimer_try_to_cancel() (since it will
+ * just return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
* since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
+ *
+ * However, it is possible the timer callback_fn calling us armed the
+ * timer _before_ calling us, such that failing to cancel it here will
+ * cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
+ * Therefore, we _need_ to cancel any outstanding timers before we do
+ * kfree_rcu, even though no more timers can be armed.
+ *
+ * Moreover, we need to schedule work even if timer does not belong to
+ * the calling callback_fn, as on two different CPUs, we can end up in a
+ * situation where both sides run in parallel, try to cancel one
+ * another, and we end up waiting on both sides in hrtimer_cancel
+ * without making forward progress, since timer1 depends on time2
+ * callback to finish, and vice versa.
+ *
+ * CPU 1 (timer1_cb) CPU 2 (timer2_cb)
+ * bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
+ *
+ * To avoid these issues, punt to workqueue context when we are in a
+ * timer callback.
*/
- if (this_cpu_read(hrtimer_running) != t)
- hrtimer_cancel(&t->timer);
- kfree_rcu(t, cb.rcu);
+ if (this_cpu_read(hrtimer_running))
+ queue_work(system_unbound_wq, &t->cb.delete_work);
+ else
+ bpf_timer_delete_work(&t->cb.delete_work);
}
/* This function is called by map_delete/update_elem for individual element and
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index e32b6972c478..c8e4b62b436a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1744,8 +1744,11 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
if (cgroup_psi_enabled()) {
ret = cgroup_addrm_files(css, cgrp,
cgroup_psi_files, true);
- if (ret < 0)
+ if (ret < 0) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
return ret;
+ }
}
} else {
ret = cgroup_addrm_files(css, cgrp,
@@ -1839,9 +1842,9 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
rcu_assign_pointer(dcgrp->subsys[ssid], css);
ss->root = dst_root;
- css->cgroup = dcgrp;
spin_lock_irq(&css_set_lock);
+ css->cgroup = dcgrp;
WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
e_cset_node[ss->id]) {
@@ -1922,6 +1925,7 @@ enum cgroup2_param {
Opt_memory_localevents,
Opt_memory_recursiveprot,
Opt_memory_hugetlb_accounting,
+ Opt_pids_localevents,
nr__cgroup2_params
};
@@ -1931,6 +1935,7 @@ static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("memory_localevents", Opt_memory_localevents),
fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
fsparam_flag("memory_hugetlb_accounting", Opt_memory_hugetlb_accounting),
+ fsparam_flag("pids_localevents", Opt_pids_localevents),
{}
};
@@ -1960,6 +1965,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_memory_hugetlb_accounting:
ctx->flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
return 0;
+ case Opt_pids_localevents:
+ ctx->flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ return 0;
}
return -EINVAL;
}
@@ -1989,6 +1997,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING;
+
+ if (root_flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ cgrp_dfl_root.flags |= CGRP_ROOT_PIDS_LOCAL_EVENTS;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_PIDS_LOCAL_EVENTS;
}
}
@@ -2004,6 +2017,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
seq_puts(seq, ",memory_recursiveprot");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)
seq_puts(seq, ",memory_hugetlb_accounting");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ seq_puts(seq, ",pids_localevents");
return 0;
}
@@ -6686,8 +6701,10 @@ void cgroup_exit(struct task_struct *tsk)
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
css_set_move_task(tsk, cset, NULL, false);
- list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+ /* matches the signal->live check in css_task_iter_advance() */
+ if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))
+ list_add_tail(&tsk->cg_list, &cset->dying_tasks);
if (dl_task(tsk))
dec_dl_tasks_cs(tsk);
@@ -6714,10 +6731,12 @@ void cgroup_release(struct task_struct *task)
ss->release(task);
} while_each_subsys_mask();
- spin_lock_irq(&css_set_lock);
- css_set_skip_task_iters(task_css_set(task), task);
- list_del_init(&task->cg_list);
- spin_unlock_irq(&css_set_lock);
+ if (!list_empty(&task->cg_list)) {
+ spin_lock_irq(&css_set_lock);
+ css_set_skip_task_iters(task_css_set(task), task);
+ list_del_init(&task->cg_list);
+ spin_unlock_irq(&css_set_lock);
+ }
}
void cgroup_free(struct task_struct *task)
@@ -7062,7 +7081,8 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
"favordynmods\n"
"memory_localevents\n"
"memory_recursiveprot\n"
- "memory_hugetlb_accounting\n");
+ "memory_hugetlb_accounting\n"
+ "pids_localevents\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c12b9fdb22a4..40ec4abaf440 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,6 +21,7 @@
* License. See the file COPYING in the main directory of the Linux
* distribution for more details.
*/
+#include "cgroup-internal.h"
#include <linux/cpu.h>
#include <linux/cpumask.h>
@@ -87,7 +88,7 @@ static const char * const perr_strings[] = {
[PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
[PERR_HOTPLUG] = "No cpu available due to hotplug",
- [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+ [PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
};
@@ -127,19 +128,28 @@ struct cpuset {
/*
* Exclusive CPUs dedicated to current cgroup (default hierarchy only)
*
- * This exclusive CPUs must be a subset of cpus_allowed. A parent
- * cgroup can only grant exclusive CPUs to one of its children.
+ * The effective_cpus of a valid partition root comes solely from its
+ * effective_xcpus and some of the effective_xcpus may be distributed
+ * to sub-partitions below & hence excluded from its effective_cpus.
+ * For a valid partition root, its effective_cpus have no relationship
+ * with cpus_allowed unless its exclusive_cpus isn't set.
*
- * When the cgroup becomes a valid partition root, effective_xcpus
- * defaults to cpus_allowed if not set. The effective_cpus of a valid
- * partition root comes solely from its effective_xcpus and some of the
- * effective_xcpus may be distributed to sub-partitions below & hence
- * excluded from its effective_cpus.
+ * This value will only be set if either exclusive_cpus is set or
+ * when this cpuset becomes a local partition root.
*/
cpumask_var_t effective_xcpus;
/*
* Exclusive CPUs as requested by the user (default hierarchy only)
+ *
+ * Its value is independent of cpus_allowed and designates the set of
+ * CPUs that can be granted to the current cpuset or its children when
+ * it becomes a valid partition root. The effective set of exclusive
+ * CPUs granted (effective_xcpus) depends on whether those exclusive
+ * CPUs are passed down by its ancestors and not yet taken up by
+ * another sibling partition root along the way.
+ *
+ * If its value isn't set, it defaults to cpus_allowed.
*/
cpumask_var_t exclusive_cpus;
@@ -169,7 +179,7 @@ struct cpuset {
/* for custom sched domain */
int relax_domain_level;
- /* number of valid sub-partitions */
+ /* number of valid local child partitions */
int nr_subparts;
/* partition root state */
@@ -230,6 +240,17 @@ static struct list_head remote_children;
* 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
* -2 - invalid isolated partition root
+ *
+ * There are 2 types of partitions - local or remote. Local partitions are
+ * those whose parents are partition root themselves. Setting of
+ * cpuset.cpus.exclusive are optional in setting up local partitions.
+ * Remote partitions are those whose parents are not partition roots. Passing
+ * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
+ * nodes are mandatory in creating a remote partition.
+ *
+ * For simplicity, a local partition can be created under a local or remote
+ * partition but a remote partition cannot have any partition root in its
+ * ancestor chain except the cgroup root.
*/
#define PRS_MEMBER 0
#define PRS_ROOT 1
@@ -434,7 +455,7 @@ static struct cpuset top_cpuset = {
* by other task, we use alloc_lock in the task_struct fields to protect
* them.
*
- * The cpuset_common_file_read() handlers only hold callback_lock across
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
* small pieces of code, such as when reading out possibly multi-word
* cpumasks and nodemasks.
*
@@ -709,6 +730,19 @@ static inline void free_cpuset(struct cpuset *cs)
kfree(cs);
}
+/* Return user specified exclusive CPUs */
+static inline struct cpumask *user_xcpus(struct cpuset *cs)
+{
+ return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
+ : cs->exclusive_cpus;
+}
+
+static inline bool xcpus_empty(struct cpuset *cs)
+{
+ return cpumask_empty(cs->cpus_allowed) &&
+ cpumask_empty(cs->exclusive_cpus);
+}
+
static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
{
return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
@@ -825,17 +859,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/*
* If either I or some sibling (!= me) is exclusive, we can't
- * overlap
+ * overlap. exclusive_cpus cannot overlap with each other if set.
*/
ret = -EINVAL;
cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur) {
+ bool txset, cxset; /* Are exclusive_cpus set? */
+
+ if (c == cur)
+ continue;
+
+ txset = !cpumask_empty(trial->exclusive_cpus);
+ cxset = !cpumask_empty(c->exclusive_cpus);
+ if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
+ (txset && cxset)) {
if (!cpusets_are_exclusive(trial, c))
goto out;
+ } else if (txset || cxset) {
+ struct cpumask *xcpus, *acpus;
+
+ /*
+ * When just one of the exclusive_cpus's is set,
+ * cpus_allowed of the other cpuset, if set, cannot be
+ * a subset of it or none of those CPUs will be
+ * available if these exclusive CPUs are activated.
+ */
+ if (txset) {
+ xcpus = trial->exclusive_cpus;
+ acpus = c->cpus_allowed;
+ } else {
+ xcpus = c->exclusive_cpus;
+ acpus = trial->cpus_allowed;
+ }
+ if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
+ goto out;
}
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
nodes_intersects(trial->mems_allowed, c->mems_allowed))
goto out;
}
@@ -957,13 +1015,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
+ bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
doms = NULL;
dattr = NULL;
csa = NULL;
/* Special case for the 99% of systems with one, full, sched domain */
- if (root_load_balance && !top_cpuset.nr_subparts) {
+ if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
ndoms = 1;
doms = alloc_sched_domains(ndoms);
if (!doms)
@@ -991,16 +1051,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if (cp == &top_cpuset)
continue;
+
+ if (cgrpv2)
+ goto v2;
+
/*
+ * v1:
* Continue traversing beyond @cp iff @cp has some CPUs and
* isn't load balancing. The former is obvious. The
* latter: All child cpusets contain a subset of the
* parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
* the corresponding sched domain.
- *
- * If root is load-balancing, we can skip @cp if it
- * is a subset of the root's effective_cpus.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
@@ -1008,20 +1070,39 @@ static int generate_sched_domains(cpumask_var_t **domains,
housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
- if (root_load_balance &&
- cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
- continue;
-
if (is_sched_load_balance(cp) &&
!cpumask_empty(cp->effective_cpus))
csa[csn++] = cp;
- /* skip @cp's subtree if not a partition root */
- if (!is_partition_valid(cp))
+ /* skip @cp's subtree */
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+
+v2:
+ /*
+ * Only valid partition roots that are not isolated and with
+ * non-empty effective_cpus will be saved into csn[].
+ */
+ if ((cp->partition_root_state == PRS_ROOT) &&
+ !cpumask_empty(cp->effective_cpus))
+ csa[csn++] = cp;
+
+ /*
+ * Skip @cp's subtree if not a partition root and has no
+ * exclusive CPUs to be granted to child cpusets.
+ */
+ if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
+ /*
+ * If there are only isolated partitions underneath the cgroup root,
+ * we can optimize out unneeded sched domains scanning.
+ */
+ if (root_load_balance && (csn == 1))
+ goto single_root_domain;
+
for (i = 0; i < csn; i++)
csa[i]->pn = i;
ndoms = csn;
@@ -1064,6 +1145,20 @@ restart:
dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
GFP_KERNEL);
+ /*
+ * Cgroup v2 doesn't support domain attributes, just set all of them
+ * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+ * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+ */
+ if (cgrpv2) {
+ for (i = 0; i < ndoms; i++) {
+ cpumask_copy(doms[i], csa[i]->effective_cpus);
+ if (dattr)
+ dattr[i] = SD_ATTR_INIT;
+ }
+ goto done;
+ }
+
for (nslot = 0, i = 0; i < csn; i++) {
struct cpuset *a = csa[i];
struct cpumask *dp;
@@ -1223,7 +1318,7 @@ static void rebuild_sched_domains_locked(void)
* root should be only a subset of the active CPUs. Since a CPU in any
* partition root could be offlined, all must be checked.
*/
- if (top_cpuset.nr_subparts) {
+ if (!cpumask_empty(subpartitions_cpus)) {
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (!is_partition_valid(cs)) {
@@ -1338,7 +1433,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
*/
static int update_partition_exclusive(struct cpuset *cs, int new_prs)
{
- bool exclusive = (new_prs > 0);
+ bool exclusive = (new_prs > PRS_MEMBER);
if (exclusive && !is_cpu_exclusive(cs)) {
if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
@@ -1532,7 +1627,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
* Return: true if xcpus is not empty, false otherwise.
*
* Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ * it must be a subset of parent's effective_xcpus.
*/
static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
struct cpumask *xcpus)
@@ -1542,12 +1637,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
if (!xcpus)
xcpus = cs->effective_xcpus;
- if (!cpumask_empty(cs->exclusive_cpus))
- cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
- else
- cpumask_copy(xcpus, cs->cpus_allowed);
-
- return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+ return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
}
static inline bool is_remote_partition(struct cpuset *cs)
@@ -1826,8 +1916,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
*/
adding = deleting = false;
old_prs = new_prs = cs->partition_root_state;
- xcpus = !cpumask_empty(cs->exclusive_cpus)
- ? cs->effective_xcpus : cs->cpus_allowed;
+ xcpus = user_xcpus(cs);
if (cmd == partcmd_invalidate) {
if (is_prs_invalid(old_prs))
@@ -1855,7 +1944,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
return is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART;
}
- if (!newmask && cpumask_empty(cs->cpus_allowed))
+ if (!newmask && xcpus_empty(cs))
return PERR_CPUSEMPTY;
nocpu = tasks_nocpu_error(parent, cs, xcpus);
@@ -2583,8 +2672,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
retval = cpulist_parse(buf, trialcs->exclusive_cpus);
if (retval < 0)
return retval;
- if (!is_cpu_exclusive(cs))
- set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
}
/* Nothing to do if the CPUs didn't change */
@@ -3071,9 +3158,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
? partcmd_enable : partcmd_enablei;
/*
- * cpus_allowed cannot be empty.
+ * cpus_allowed and exclusive_cpus cannot be both empty.
*/
- if (cpumask_empty(cs->cpus_allowed)) {
+ if (xcpus_empty(cs)) {
err = PERR_CPUSEMPTY;
goto out;
}
@@ -4009,8 +4096,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
}
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
- nodes_clear(cs->mems_allowed);
- nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
INIT_LIST_HEAD(&cs->remote_sibling);
@@ -4040,6 +4125,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
+ /*
+ * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+ */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ !is_sched_load_balance(parent))
+ clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset_inc();
@@ -4050,14 +4141,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->use_parent_ecpus = true;
parent->child_ecpus_count++;
}
-
- /*
- * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
- */
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_sched_load_balance(parent))
- clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-
spin_unlock_irq(&callback_lock);
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
@@ -4571,7 +4654,7 @@ static void cpuset_handle_hotplug(void)
* In the rare case that hotplug removes all the cpus in
* subpartitions_cpus, we assumed that cpus are updated.
*/
- if (!cpus_updated && top_cpuset.nr_subparts)
+ if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
cpus_updated = true;
/* For v1, synchronize cpus_allowed to cpu_active_mask */
@@ -5051,10 +5134,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
if (!buf)
goto out;
- css = task_get_css(tsk, cpuset_cgrp_id);
- retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
- current->nsproxy->cgroup_ns);
- css_put(css);
+ rcu_read_lock();
+ spin_lock_irq(&css_set_lock);
+ css = task_css(tsk, cpuset_cgrp_id);
+ retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+ current->nsproxy->cgroup_ns);
+ spin_unlock_irq(&css_set_lock);
+ rcu_read_unlock();
+
if (retval == -E2BIG)
retval = -ENAMETOOLONG;
if (retval < 0)
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 79a3717a5803..0e26068995a6 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -121,6 +121,30 @@ static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
misc_res_name[type]);
}
+static void misc_cg_update_watermark(struct misc_res *res, u64 new_usage)
+{
+ u64 old;
+
+ while (true) {
+ old = atomic64_read(&res->watermark);
+ if (new_usage <= old)
+ break;
+ if (atomic64_cmpxchg(&res->watermark, old, new_usage) == old)
+ break;
+ }
+}
+
+static void misc_cg_event(enum misc_res_type type, struct misc_cg *cg)
+{
+ atomic64_inc(&cg->res[type].events_local);
+ cgroup_file_notify(&cg->events_local_file);
+
+ for (; parent_misc(cg); cg = parent_misc(cg)) {
+ atomic64_inc(&cg->res[type].events);
+ cgroup_file_notify(&cg->events_file);
+ }
+}
+
/**
* misc_cg_try_charge() - Try charging the misc cgroup.
* @type: Misc res type to charge.
@@ -159,14 +183,12 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount)
ret = -EBUSY;
goto err_charge;
}
+ misc_cg_update_watermark(res, new_usage);
}
return 0;
err_charge:
- for (j = i; j; j = parent_misc(j)) {
- atomic64_inc(&j->res[type].events);
- cgroup_file_notify(&j->events_file);
- }
+ misc_cg_event(type, i);
for (j = cg; j != i; j = parent_misc(j))
misc_cg_cancel_charge(type, j, amount);
@@ -308,6 +330,29 @@ static int misc_cg_current_show(struct seq_file *sf, void *v)
}
/**
+ * misc_cg_peak_show() - Show the peak usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_peak_show(struct seq_file *sf, void *v)
+{
+ int i;
+ u64 watermark;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ watermark = atomic64_read(&cg->res[i].watermark);
+ if (READ_ONCE(misc_res_capacity[i]) || watermark)
+ seq_printf(sf, "%s %llu\n", misc_res_name[i], watermark);
+ }
+
+ return 0;
+}
+
+/**
* misc_cg_capacity_show() - Show the total capacity of misc res on the host.
* @sf: Interface file
* @v: Arguments passed
@@ -331,20 +376,33 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
return 0;
}
-static int misc_events_show(struct seq_file *sf, void *v)
+static int __misc_events_show(struct seq_file *sf, bool local)
{
struct misc_cg *cg = css_misc(seq_css(sf));
u64 events;
int i;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
- events = atomic64_read(&cg->res[i].events);
+ if (local)
+ events = atomic64_read(&cg->res[i].events_local);
+ else
+ events = atomic64_read(&cg->res[i].events);
if (READ_ONCE(misc_res_capacity[i]) || events)
seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events);
}
return 0;
}
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, false);
+}
+
+static int misc_events_local_show(struct seq_file *sf, void *v)
+{
+ return __misc_events_show(sf, true);
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -358,6 +416,10 @@ static struct cftype misc_cg_files[] = {
.seq_show = misc_cg_current_show,
},
{
+ .name = "peak",
+ .seq_show = misc_cg_peak_show,
+ },
+ {
.name = "capacity",
.seq_show = misc_cg_capacity_show,
.flags = CFTYPE_ONLY_ON_ROOT,
@@ -368,6 +430,12 @@ static struct cftype misc_cg_files[] = {
.file_offset = offsetof(struct misc_cg, events_file),
.seq_show = misc_events_show,
},
+ {
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_local_file),
+ .seq_show = misc_events_local_show,
+ },
{}
};
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 0e5ec7d59b4d..f5cb0ec45b9d 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -38,6 +38,14 @@
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
+enum pidcg_event {
+ /* Fork failed in subtree because this pids_cgroup limit was hit. */
+ PIDCG_MAX,
+ /* Fork failed in this pids_cgroup because ancestor limit was hit. */
+ PIDCG_FORKFAIL,
+ NR_PIDCG_EVENTS,
+};
+
struct pids_cgroup {
struct cgroup_subsys_state css;
@@ -49,11 +57,12 @@ struct pids_cgroup {
atomic64_t limit;
int64_t watermark;
- /* Handle for "pids.events" */
+ /* Handles for pids.events[.local] */
struct cgroup_file events_file;
+ struct cgroup_file events_local_file;
- /* Number of times fork failed because limit was hit. */
- atomic64_t events_limit;
+ atomic64_t events[NR_PIDCG_EVENTS];
+ atomic64_t events_local[NR_PIDCG_EVENTS];
};
static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
@@ -148,12 +157,13 @@ static void pids_charge(struct pids_cgroup *pids, int num)
* pids_try_charge - hierarchically try to charge the pid count
* @pids: the pid cgroup state
* @num: the number of pids to charge
+ * @fail: storage of pid cgroup causing the fail
*
* This function follows the set limit. It will fail if the charge would cause
* the new value to exceed the hierarchical limit. Returns 0 if the charge
* succeeded, otherwise -EAGAIN.
*/
-static int pids_try_charge(struct pids_cgroup *pids, int num)
+static int pids_try_charge(struct pids_cgroup *pids, int num, struct pids_cgroup **fail)
{
struct pids_cgroup *p, *q;
@@ -166,9 +176,10 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
* p->limit is %PIDS_MAX then we know that this test will never
* fail.
*/
- if (new > limit)
+ if (new > limit) {
+ *fail = p;
goto revert;
-
+ }
/*
* Not technically accurate if we go over limit somewhere up
* the hierarchy, but that's tolerable for the watermark.
@@ -229,6 +240,36 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
}
}
+static void pids_event(struct pids_cgroup *pids_forking,
+ struct pids_cgroup *pids_over_limit)
+{
+ struct pids_cgroup *p = pids_forking;
+ bool limit = false;
+
+ /* Only log the first time limit is hit. */
+ if (atomic64_inc_return(&p->events_local[PIDCG_FORKFAIL]) == 1) {
+ pr_info("cgroup: fork rejected by pids controller in ");
+ pr_cont_cgroup_path(p->css.cgroup);
+ pr_cont("\n");
+ }
+ cgroup_file_notify(&p->events_local_file);
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS)
+ return;
+
+ for (; parent_pids(p); p = parent_pids(p)) {
+ if (p == pids_over_limit) {
+ limit = true;
+ atomic64_inc(&p->events_local[PIDCG_MAX]);
+ cgroup_file_notify(&p->events_local_file);
+ }
+ if (limit)
+ atomic64_inc(&p->events[PIDCG_MAX]);
+
+ cgroup_file_notify(&p->events_file);
+ }
+}
+
/*
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on cgroup_threadgroup_change_begin() held by the copy_process().
@@ -236,7 +277,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
- struct pids_cgroup *pids;
+ struct pids_cgroup *pids, *pids_over_limit;
int err;
if (cset)
@@ -244,16 +285,10 @@ static int pids_can_fork(struct task_struct *task, struct css_set *cset)
else
css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
- err = pids_try_charge(pids, 1);
- if (err) {
- /* Only log the first time events_limit is incremented. */
- if (atomic64_inc_return(&pids->events_limit) == 1) {
- pr_info("cgroup: fork rejected by pids controller in ");
- pr_cont_cgroup_path(css->cgroup);
- pr_cont("\n");
- }
- cgroup_file_notify(&pids->events_file);
- }
+ err = pids_try_charge(pids, 1, &pids_over_limit);
+ if (err)
+ pids_event(pids, pids_over_limit);
+
return err;
}
@@ -337,11 +372,32 @@ static s64 pids_peak_read(struct cgroup_subsys_state *css,
return READ_ONCE(pids->watermark);
}
-static int pids_events_show(struct seq_file *sf, void *v)
+static int __pids_events_show(struct seq_file *sf, bool local)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
+ enum pidcg_event pe = PIDCG_MAX;
+ atomic64_t *events;
+
+ if (!cgroup_subsys_on_dfl(pids_cgrp_subsys) ||
+ cgrp_dfl_root.flags & CGRP_ROOT_PIDS_LOCAL_EVENTS) {
+ pe = PIDCG_FORKFAIL;
+ local = true;
+ }
+ events = local ? pids->events_local : pids->events;
+
+ seq_printf(sf, "max %lld\n", (s64)atomic64_read(&events[pe]));
+ return 0;
+}
- seq_printf(sf, "max %lld\n", (s64)atomic64_read(&pids->events_limit));
+static int pids_events_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, false);
+ return 0;
+}
+
+static int pids_events_local_show(struct seq_file *sf, void *v)
+{
+ __pids_events_show(sf, true);
return 0;
}
@@ -368,9 +424,42 @@ static struct cftype pids_files[] = {
.file_offset = offsetof(struct pids_cgroup, events_file),
.flags = CFTYPE_NOT_ON_ROOT,
},
+ {
+ .name = "events.local",
+ .seq_show = pids_events_local_show,
+ .file_offset = offsetof(struct pids_cgroup, events_local_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ { } /* terminate */
+};
+
+static struct cftype pids_files_legacy[] = {
+ {
+ .name = "max",
+ .write = pids_max_write,
+ .seq_show = pids_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .read_s64 = pids_current_read,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
+ .name = "events",
+ .seq_show = pids_events_show,
+ .file_offset = offsetof(struct pids_cgroup, events_file),
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
{ } /* terminate */
};
+
struct cgroup_subsys pids_cgrp_subsys = {
.css_alloc = pids_css_alloc,
.css_free = pids_css_free,
@@ -379,7 +468,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
.release = pids_release,
- .legacy_cftypes = pids_files,
+ .legacy_cftypes = pids_files_legacy,
.dfl_cftypes = pids_files,
.threaded = true,
};
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index fb8b49437573..a06b45272411 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -594,49 +594,46 @@ static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
}
}
+
+static void cgroup_force_idle_show(struct seq_file *seq, struct cgroup_base_stat *bstat)
+{
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time = bstat->forceidle_sum;
+
+ do_div(forceidle_time, NSEC_PER_USEC);
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct cgroup_base_stat bstat;
-#ifdef CONFIG_SCHED_CORE
- u64 forceidle_time;
-#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
-#ifdef CONFIG_SCHED_CORE
- forceidle_time = cgrp->bstat.forceidle_sum;
-#endif
cgroup_rstat_flush_release(cgrp);
} else {
- root_cgroup_cputime(&bstat);
- usage = bstat.cputime.sum_exec_runtime;
- utime = bstat.cputime.utime;
- stime = bstat.cputime.stime;
-#ifdef CONFIG_SCHED_CORE
- forceidle_time = bstat.forceidle_sum;
-#endif
+ /* cgrp->bstat of root is not actually used, reuse it */
+ root_cgroup_cputime(&cgrp->bstat);
+ usage = cgrp->bstat.cputime.sum_exec_runtime;
+ utime = cgrp->bstat.cputime.utime;
+ stime = cgrp->bstat.cputime.stime;
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
-#ifdef CONFIG_SCHED_CORE
- do_div(forceidle_time, NSEC_PER_USEC);
-#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
-#ifdef CONFIG_SCHED_CORE
- seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
-#endif
+ cgroup_force_idle_show(seq, &cgrp->bstat);
}
/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3d2bf1d50a0c..1209ddaec026 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -483,6 +483,8 @@ static int cpu_hotplug_disabled;
DEFINE_STATIC_PERCPU_RWSEM(cpu_hotplug_lock);
+static bool cpu_hotplug_offline_disabled __ro_after_init;
+
void cpus_read_lock(void)
{
percpu_down_read(&cpu_hotplug_lock);
@@ -542,6 +544,14 @@ static void lockdep_release_cpus_lock(void)
rwsem_release(&cpu_hotplug_lock.dep_map, _THIS_IP_);
}
+/* Declare CPU offlining not supported */
+void cpu_hotplug_disable_offlining(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_offline_disabled = true;
+ cpu_maps_update_done();
+}
+
/*
* Wait for currently running CPU hotplug operations to complete (if any) and
* disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@ -1471,7 +1481,7 @@ static int cpu_down_maps_locked(unsigned int cpu, enum cpuhp_state target)
* If the platform does not support hotplug, report it explicitly to
* differentiate it from a transient offlining failure.
*/
- if (cc_platform_has(CC_ATTR_HOTPLUG_DISABLED))
+ if (cpu_hotplug_offline_disabled)
return -EOPNOTSUPP;
if (cpu_hotplug_disabled)
return -EBUSY;
@@ -1894,8 +1904,8 @@ int freeze_secondary_cpus(int primary)
cpumask_clear(frozen_cpus);
pr_info("Disabling non-boot CPUs ...\n");
- for_each_online_cpu(cpu) {
- if (cpu == primary)
+ for (cpu = nr_cpu_ids - 1; cpu >= 0; cpu--) {
+ if (!cpu_online(cpu) || cpu == primary)
continue;
if (pm_wakeup_pending()) {
@@ -3072,6 +3082,9 @@ EXPORT_SYMBOL(__cpu_possible_mask);
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);
+struct cpumask __cpu_enabled_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_enabled_mask);
+
struct cpumask __cpu_present_mask __read_mostly;
EXPORT_SYMBOL(__cpu_present_mask);
diff --git a/kernel/exit.c b/kernel/exit.c
index f95a2c1338a8..81fcee45d630 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -484,6 +484,8 @@ retry:
* Search through everything else, we should not get here often.
*/
for_each_process(g) {
+ if (atomic_read(&mm->mm_users) <= 1)
+ break;
if (g->flags & PF_KTHREAD)
continue;
for_each_thread(g, c) {
diff --git a/kernel/fork.c b/kernel/fork.c
index 99076dbe27d8..63a76166497a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -616,12 +616,6 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm)
exe_file = get_mm_exe_file(oldmm);
RCU_INIT_POINTER(mm->exe_file, exe_file);
- /*
- * We depend on the oldmm having properly denied write access to the
- * exe_file already.
- */
- if (exe_file && deny_write_access(exe_file))
- pr_warn_once("deny_write_access() failed in %s\n", __func__);
}
#ifdef CONFIG_MMU
@@ -1412,20 +1406,11 @@ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
*/
old_exe_file = rcu_dereference_raw(mm->exe_file);
- if (new_exe_file) {
- /*
- * We expect the caller (i.e., sys_execve) to already denied
- * write access, so this is unlikely to fail.
- */
- if (unlikely(deny_write_access(new_exe_file)))
- return -EACCES;
+ if (new_exe_file)
get_file(new_exe_file);
- }
rcu_assign_pointer(mm->exe_file, new_exe_file);
- if (old_exe_file) {
- allow_write_access(old_exe_file);
+ if (old_exe_file)
fput(old_exe_file);
- }
return 0;
}
@@ -1464,9 +1449,6 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
return ret;
}
- ret = deny_write_access(new_exe_file);
- if (ret)
- return -EACCES;
get_file(new_exe_file);
/* set the new file */
@@ -1475,10 +1457,8 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
rcu_assign_pointer(mm->exe_file, new_exe_file);
mmap_write_unlock(mm);
- if (old_exe_file) {
- allow_write_access(old_exe_file);
+ if (old_exe_file)
fput(old_exe_file);
- }
return 0;
}
@@ -2941,8 +2921,6 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
}
#endif
-#ifdef __ARCH_WANT_SYS_CLONE3
-
noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
struct clone_args __user *uargs,
size_t usize)
@@ -3086,6 +3064,11 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
struct kernel_clone_args kargs;
pid_t set_tid[MAX_PID_NS_LEVEL];
+#ifdef __ARCH_BROKEN_SYS_CLONE3
+#warning clone3() entry point is missing, please fix
+ return -ENOSYS;
+#endif
+
kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size);
@@ -3097,7 +3080,6 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
return kernel_clone(&kargs);
}
-#endif
void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data)
{
diff --git a/kernel/irq/irq_sim.c b/kernel/irq/irq_sim.c
index 38d6ae651ac7..3d4036db15ac 100644
--- a/kernel/irq/irq_sim.c
+++ b/kernel/irq/irq_sim.c
@@ -17,6 +17,8 @@ struct irq_sim_work_ctx {
unsigned int irq_count;
unsigned long *pending;
struct irq_domain *domain;
+ struct irq_sim_ops ops;
+ void *user_data;
};
struct irq_sim_irq_ctx {
@@ -88,6 +90,31 @@ static int irq_sim_set_irqchip_state(struct irq_data *data,
return 0;
}
+static int irq_sim_request_resources(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx;
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ if (work_ctx->ops.irq_sim_irq_requested)
+ return work_ctx->ops.irq_sim_irq_requested(work_ctx->domain,
+ hwirq,
+ work_ctx->user_data);
+
+ return 0;
+}
+
+static void irq_sim_release_resources(struct irq_data *data)
+{
+ struct irq_sim_irq_ctx *irq_ctx = irq_data_get_irq_chip_data(data);
+ struct irq_sim_work_ctx *work_ctx = irq_ctx->work_ctx;
+ irq_hw_number_t hwirq = irqd_to_hwirq(data);
+
+ if (work_ctx->ops.irq_sim_irq_released)
+ work_ctx->ops.irq_sim_irq_released(work_ctx->domain, hwirq,
+ work_ctx->user_data);
+}
+
static struct irq_chip irq_sim_irqchip = {
.name = "irq_sim",
.irq_mask = irq_sim_irqmask,
@@ -95,6 +122,8 @@ static struct irq_chip irq_sim_irqchip = {
.irq_set_type = irq_sim_set_type,
.irq_get_irqchip_state = irq_sim_get_irqchip_state,
.irq_set_irqchip_state = irq_sim_set_irqchip_state,
+ .irq_request_resources = irq_sim_request_resources,
+ .irq_release_resources = irq_sim_release_resources,
};
static void irq_sim_handle_irq(struct irq_work *work)
@@ -164,6 +193,15 @@ static const struct irq_domain_ops irq_sim_domain_ops = {
struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
+ return irq_domain_create_sim_full(fwnode, num_irqs, NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(irq_domain_create_sim);
+
+struct irq_domain *irq_domain_create_sim_full(struct fwnode_handle *fwnode,
+ unsigned int num_irqs,
+ const struct irq_sim_ops *ops,
+ void *data)
+{
struct irq_sim_work_ctx *work_ctx __free(kfree) =
kmalloc(sizeof(*work_ctx), GFP_KERNEL);
@@ -183,10 +221,14 @@ struct irq_domain *irq_domain_create_sim(struct fwnode_handle *fwnode,
work_ctx->irq_count = num_irqs;
work_ctx->work = IRQ_WORK_INIT_HARD(irq_sim_handle_irq);
work_ctx->pending = no_free_ptr(pending);
+ work_ctx->user_data = data;
+
+ if (ops)
+ memcpy(&work_ctx->ops, ops, sizeof(*ops));
return no_free_ptr(work_ctx)->domain;
}
-EXPORT_SYMBOL_GPL(irq_domain_create_sim);
+EXPORT_SYMBOL_GPL(irq_domain_create_sim_full);
/**
* irq_domain_remove_sim - Deinitialize the interrupt simulator domain: free
@@ -228,10 +270,22 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
struct fwnode_handle *fwnode,
unsigned int num_irqs)
{
+ return devm_irq_domain_create_sim_full(dev, fwnode, num_irqs,
+ NULL, NULL);
+}
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
+
+struct irq_domain *
+devm_irq_domain_create_sim_full(struct device *dev,
+ struct fwnode_handle *fwnode,
+ unsigned int num_irqs,
+ const struct irq_sim_ops *ops,
+ void *data)
+{
struct irq_domain *domain;
int ret;
- domain = irq_domain_create_sim(fwnode, num_irqs);
+ domain = irq_domain_create_sim_full(fwnode, num_irqs, ops, data);
if (IS_ERR(domain))
return domain;
@@ -241,4 +295,4 @@ struct irq_domain *devm_irq_domain_create_sim(struct device *dev,
return domain;
}
-EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim);
+EXPORT_SYMBOL_GPL(devm_irq_domain_create_sim_full);
diff --git a/kernel/kcsan/kcsan_test.c b/kernel/kcsan/kcsan_test.c
index 0c17b4c83e1c..117d9d4d3c3b 100644
--- a/kernel/kcsan/kcsan_test.c
+++ b/kernel/kcsan/kcsan_test.c
@@ -1620,5 +1620,6 @@ static struct kunit_suite kcsan_test_suite = {
kunit_test_suites(&kcsan_test_suite);
+MODULE_DESCRIPTION("KCSAN test suite");
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Marco Elver <elver@google.com>");
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 415d81e6ce70..de95ec07e477 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -30,6 +30,7 @@
#include <linux/torture.h>
#include <linux/reboot.h>
+MODULE_DESCRIPTION("torture test facility for locking");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 25f3cf679b35..bdf0087d6442 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -249,24 +249,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->pid_allocated == init_pids)
break;
- /*
- * Release tasks_rcu_exit_srcu to avoid following deadlock:
- *
- * 1) TASK A unshare(CLONE_NEWPID)
- * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
- * and TASK C
- * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
- * 4) TASK A calls synchronize_rcu_tasks()
- * -> synchronize_srcu(tasks_rcu_exit_srcu)
- * 5) *DEADLOCK*
- *
- * It is considered safe to release tasks_rcu_exit_srcu here
- * because we assume the current task can not be concurrently
- * reaped at this point.
- */
- exit_tasks_rcu_stop();
schedule();
- exit_tasks_rcu_start();
}
__set_current_state(TASK_RUNNING);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 8db4fedaaa1e..b53a9e8f5904 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -42,6 +42,7 @@
#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 807fbf6123a7..08bf7c669dd3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,6 +51,7 @@
#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
@@ -390,6 +391,7 @@ struct rcu_torture_ops {
int extendables;
int slow_gps;
int no_pi_lock;
+ int debug_objects;
const char *name;
};
@@ -577,6 +579,7 @@ static struct rcu_torture_ops rcu_ops = {
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
+ .debug_objects = 1,
.name = "rcu"
};
@@ -747,6 +750,7 @@ static struct rcu_torture_ops srcu_ops = {
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
.name = "srcu"
};
@@ -786,6 +790,7 @@ static struct rcu_torture_ops srcud_ops = {
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
.name = "srcud"
};
@@ -2626,7 +2631,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
rfcpp = rfp->rcu_fwd_cb_tail;
rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
- WRITE_ONCE(*rfcpp, rfcp);
+ smp_store_release(rfcpp, rfcp);
WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
if (i >= ARRAY_SIZE(rfp->n_launders_hist))
@@ -3455,7 +3460,6 @@ rcu_torture_cleanup(void)
cur_ops->gp_slow_unregister(NULL);
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
static void rcu_torture_leak_cb(struct rcu_head *rhp)
{
}
@@ -3473,7 +3477,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
}
-#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
/*
* Verify that double-free causes debug-objects to complain, but only
@@ -3482,39 +3485,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
static void rcu_test_debug_objects(void)
{
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ int idx;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) {
+ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n",
+ KBUILD_MODNAME, cur_ops->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(cur_ops->debug_objects &&
+ (!cur_ops->call || !cur_ops->cb_barrier)))
+ return;
+
struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
- pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME);
+ pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name);
/* Try to queue the rh2 pair of callbacks for the same grace period. */
- preempt_disable(); /* Prevent preemption from interrupting test. */
- rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */
- local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu_hurry(&rh2, rcu_torture_leak_cb);
- call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */
+ cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ cur_ops->call(&rh2, rcu_torture_leak_cb);
+ cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
if (rhp) {
- call_rcu_hurry(rhp, rcu_torture_leak_cb);
- call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ cur_ops->call(rhp, rcu_torture_leak_cb);
+ cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
}
- local_irq_enable();
- rcu_read_unlock();
- preempt_enable();
+ cur_ops->readunlock(idx);
/* Wait for them all to get done so we can safely return. */
- rcu_barrier();
- pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
+ cur_ops->cb_barrier();
+ pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
kfree(rhp);
-#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
- pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
-#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
static void rcutorture_sync(void)
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 2c2648a3ad30..f4ea5b1ec068 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -63,6 +63,7 @@ do { \
#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
+MODULE_DESCRIPTION("Scalability test for object reference mechanisms");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 5afd5cf494db..549c03336ee9 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -277,7 +277,8 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
barrier();
- return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
+ return cookie == SRCU_GET_STATE_COMPLETED ||
+ ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index bc4b58b0204e..b24db425f16d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -667,7 +667,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
__func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
- return; /* Caller forgot to stop doing call_srcu()? */
+ return; // Caller forgot to stop doing call_srcu()?
+ // Or caller invoked start_poll_synchronize_srcu()
+ // and then cleanup_srcu_struct() before that grace
+ // period ended?
}
kfree(sup->node);
sup->node = NULL;
@@ -845,7 +848,6 @@ static void srcu_gp_end(struct srcu_struct *ssp)
bool cbs;
bool last_lvl;
int cpu;
- unsigned long flags;
unsigned long gpseq;
int idx;
unsigned long mask;
@@ -907,12 +909,12 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (!(gpseq & counter_wrap_check))
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ spin_lock_irq_rcu_node(sdp);
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_irq_rcu_node(sdp);
}
/* Callback initiation done, allow grace periods after next. */
@@ -1540,7 +1542,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ if (cookie != SRCU_GET_STATE_COMPLETED &&
+ !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 6c2bd9001adc..da60a9947c00 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* we are called at early boot time but this shouldn't happen.
*/
}
- WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1);
+ rsp->gp_count++;
spin_unlock_irq(&rsp->rss_lock);
if (gp_state == GP_IDLE) {
@@ -151,15 +151,11 @@ void rcu_sync_enter(struct rcu_sync *rsp)
*/
void rcu_sync_exit(struct rcu_sync *rsp)
{
- int gpc;
-
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
- WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
spin_lock_irq(&rsp->rss_lock);
- gpc = rsp->gp_count - 1;
- WRITE_ONCE(rsp->gp_count, gpc);
- if (!gpc) {
+ WARN_ON_ONCE(rsp->gp_count == 0);
+ if (!--rsp->gp_count) {
if (rsp->gp_state == GP_PASSED) {
WRITE_ONCE(rsp->gp_state, GP_EXIT);
rcu_sync_call(rsp);
@@ -178,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
{
int gp_state;
- WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
spin_lock_irq(&rsp->rss_lock);
+ WARN_ON_ONCE(rsp->gp_count);
if (rsp->gp_state == GP_REPLAY)
WRITE_ONCE(rsp->gp_state, GP_EXIT);
gp_state = rsp->gp_state;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index e1bf33018e6d..ba3440a45b6d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -858,7 +858,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// not know to synchronize with this RCU Tasks grace period) have
// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
// will take care of any tasks stuck in the non-preemptible region
-// of do_exit() following its call to exit_tasks_rcu_stop().
+// of do_exit() following its call to exit_tasks_rcu_finish().
// check_all_holdout_tasks(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
@@ -1220,7 +1220,7 @@ void exit_tasks_rcu_start(void)
* Remove the task from the "yet another list" because do_exit() is now
* non-preemptible, allowing synchronize_rcu() to wait beyond this point.
*/
-void exit_tasks_rcu_stop(void)
+void exit_tasks_rcu_finish(void)
{
unsigned long flags;
struct rcu_tasks_percpu *rtpcp;
@@ -1231,22 +1231,12 @@ void exit_tasks_rcu_stop(void)
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
list_del_init(&t->rcu_tasks_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-}
-/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
- */
-void exit_tasks_rcu_finish(void)
-{
- exit_tasks_rcu_stop();
- exit_tasks_rcu_finish_trace(current);
+ exit_tasks_rcu_finish_trace(t);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
-void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -1757,6 +1747,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
// allow safe access to the hop list.
for_each_online_cpu(cpu) {
rcu_read_lock();
+ // Note that cpu_curr_snapshot() picks up the target
+ // CPU's current task while its runqueue is locked with
+ // an smp_mb__after_spinlock(). This ensures that either
+ // the grace-period kthread will see that task's read-side
+ // critical section or the task will see the updater's pre-GP
+ // accesses. The trailing smp_mb() in cpu_curr_snapshot()
+ // does not currently play a role other than simplify
+ // that function's ordering semantics. If these simplified
+ // ordering semantics continue to be redundant, that smp_mb()
+ // might be removed.
t = cpu_curr_snapshot(cpu);
if (rcu_tasks_trace_pertask_prep(t, true))
trc_add_holdout(t, hop);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 28c7031711a3..e641cc681901 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
.srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
rcu_sr_normal_gp_cleanup_work),
+ .srs_cleanups_pending = ATOMIC_INIT(0),
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -175,6 +176,9 @@ static int gp_init_delay;
module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+static int nohz_full_patience_delay;
+module_param(nohz_full_patience_delay, int, 0444);
+static int nohz_full_patience_delay_jiffies;
// Add delay to rcu_read_unlock() for strict grace periods.
static int rcu_unlock_delay;
@@ -296,16 +300,6 @@ static void rcu_dynticks_eqs_online(void)
}
/*
- * Snapshot the ->dynticks counter with full ordering so as to allow
- * stable comparison of this counter with past and future snapshots.
- */
-static int rcu_dynticks_snap(int cpu)
-{
- smp_mb(); // Fundamental RCU ordering guarantee.
- return ct_dynticks_cpu_acquire(cpu);
-}
-
-/*
* Return true if the snapshot returned from rcu_dynticks_snap()
* indicates that RCU is in an extended quiescent state.
*/
@@ -321,7 +315,15 @@ static bool rcu_dynticks_in_eqs(int snap)
*/
static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
{
- return snap != rcu_dynticks_snap(rdp->cpu);
+ /*
+ * The first failing snapshot is already ordered against the accesses
+ * performed by the remote CPU after it exits idle.
+ *
+ * The second snapshot therefore only needs to order against accesses
+ * performed by the remote CPU prior to entering idle and therefore can
+ * rely solely on acquire semantics.
+ */
+ return snap != ct_dynticks_cpu_acquire(rdp->cpu);
}
/*
@@ -769,7 +771,18 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
+ /*
+ * Full ordering between remote CPU's post idle accesses and updater's
+ * accesses prior to current GP (and also the started GP sequence number)
+ * is enforced by rcu_seq_start() implicit barrier and even further by
+ * smp_mb__after_unlock_lock() barriers chained all the way throughout the
+ * rnp locking tree since rcu_gp_init() and up to the current leaf rnp
+ * locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and post grace period
+ * updater's accesses is enforced by the below acquire semantic.
+ */
+ rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
@@ -1660,6 +1673,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
rcu_sr_put_wait_head(rcu);
}
+
+ /* Order list manipulations with atomic access. */
+ atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
}
/*
@@ -1667,7 +1683,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
*/
static void rcu_sr_normal_gp_cleanup(void)
{
- struct llist_node *wait_tail, *next, *rcu;
+ struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
int done = 0;
wait_tail = rcu_state.srs_wait_tail;
@@ -1693,16 +1709,34 @@ static void rcu_sr_normal_gp_cleanup(void)
break;
}
- // concurrent sr_normal_gp_cleanup work might observe this update.
- smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+ /*
+ * Fast path, no more users to process except putting the second last
+ * wait head if no inflight-workers. If there are in-flight workers,
+ * they will remove the last wait head.
+ *
+ * Note that the ACQUIRE orders atomic access with list manipulation.
+ */
+ if (wait_tail->next && wait_tail->next->next == NULL &&
+ rcu_sr_is_wait_head(wait_tail->next) &&
+ !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+ rcu_sr_put_wait_head(wait_tail->next);
+ wait_tail->next = NULL;
+ }
+
+ /* Concurrent sr_normal_gp_cleanup work might observe this update. */
ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
/*
* We schedule a work in order to perform a final processing
* of outstanding users(if still left) and releasing wait-heads
* added by rcu_sr_normal_gp_init() call.
*/
- queue_work(sync_wq, &rcu_state.srs_cleanup_work);
+ if (wait_tail->next) {
+ atomic_inc(&rcu_state.srs_cleanups_pending);
+ if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
+ atomic_dec(&rcu_state.srs_cleanups_pending);
+ }
}
/*
@@ -1810,7 +1844,7 @@ static noinline_for_stack bool rcu_gp_init(void)
WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
/* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- local_irq_save(flags);
+ local_irq_disable();
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1818,7 +1852,7 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Nothing to do on this leaf rcu_node structure. */
raw_spin_unlock_rcu_node(rnp);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ local_irq_enable();
continue;
}
@@ -1855,7 +1889,7 @@ static noinline_for_stack bool rcu_gp_init(void)
raw_spin_unlock_rcu_node(rnp);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ local_irq_enable();
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -4313,11 +4347,15 @@ static int rcu_pending(int user)
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
- if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
+ gp_in_progress = rcu_gp_in_progress();
+ if ((user || rcu_is_cpu_rrupt_from_idle() ||
+ (gp_in_progress &&
+ time_before(jiffies, READ_ONCE(rcu_state.gp_start) +
+ nohz_full_patience_delay_jiffies))) &&
+ rcu_nohz_full_cpu())
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- gp_in_progress = rcu_gp_in_progress();
if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
return 1;
@@ -4767,7 +4805,7 @@ rcu_boot_init_percpu_data(int cpu)
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
INIT_WORK(&rdp->strict_work, strict_work_handler);
WARN_ON_ONCE(ct->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)));
rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
@@ -5110,11 +5148,15 @@ void rcutree_migrate_callbacks(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
bool needwake;
- if (rcu_rdp_is_offloaded(rdp) ||
- rcu_segcblist_empty(&rdp->cblist))
- return; /* No callbacks to migrate. */
+ if (rcu_rdp_is_offloaded(rdp))
+ return;
raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (rcu_segcblist_empty(&rdp->cblist)) {
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ return; /* No callbacks to migrate. */
+ }
+
WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bae7925c497f..fcf2b4aa3441 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -223,7 +223,6 @@ struct rcu_data {
struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
- atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
@@ -420,6 +419,7 @@ struct rcu_state {
struct llist_node *srs_done_tail; /* ready for GP users. */
struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
struct work_struct srs_cleanup_work;
+ atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
};
/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8a1d9c8bd9f7..4acd29d16fdb 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -265,7 +265,12 @@ static bool sync_exp_work_done(unsigned long s)
{
if (rcu_exp_gp_seq_done(s)) {
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
- smp_mb(); /* Ensure test happens before caller kfree(). */
+ /*
+ * Order GP completion with preceding accesses. Order also GP
+ * completion with post GP update side accesses. Pairs with
+ * rcu_seq_end().
+ */
+ smp_mb();
return true;
}
return false;
@@ -357,7 +362,21 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
!(rnp->qsmaskinitnext & mask)) {
mask_ofl_test |= mask;
} else {
- snap = rcu_dynticks_snap(cpu);
+ /*
+ * Full ordering between remote CPU's post idle accesses
+ * and updater's accesses prior to current GP (and also
+ * the started GP sequence number) is enforced by
+ * rcu_seq_start() implicit barrier, relayed by kworkers
+ * locking and even further by smp_mb__after_unlock_lock()
+ * barriers chained all the way throughout the rnp locking
+ * tree since sync_exp_reset_tree() and up to the current
+ * leaf rnp locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and
+ * post grace period updater's accesses is enforced by the
+ * below acquire semantic.
+ */
+ snap = ct_dynticks_cpu_acquire(cpu);
if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
else
@@ -953,7 +972,6 @@ void synchronize_rcu_expedited(void)
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 3f85577bddd4..3ce30841119a 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -91,8 +91,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0);
/*
* Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
+ * lock isn't immediately available, perform minimal sanity check.
*/
static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
__acquires(&rdp->nocb_bypass_lock)
@@ -100,29 +99,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
lockdep_assert_irqs_disabled();
if (raw_spin_trylock(&rdp->nocb_bypass_lock))
return;
- atomic_inc(&rdp->nocb_lock_contended);
+ /*
+ * Contention expected only when local enqueue collide with
+ * remote flush from kthreads.
+ */
WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
}
/*
@@ -510,7 +492,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
}
// We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
rcu_nocb_bypass_lock(rdp);
ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
@@ -635,8 +616,7 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
}
}
-static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
- bool *wake_state)
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp)
{
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
@@ -650,8 +630,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
* We will handle this rdp until it ever gets de-offloaded.
*/
rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *wake_state = true;
ret = 1;
} else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
@@ -660,8 +638,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
* We will ignore this rdp until it ever gets re-offloaded.
*/
rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *wake_state = true;
ret = 0;
} else {
WARN_ON_ONCE(1);
@@ -877,16 +853,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (rdp_toggling) {
- bool wake_state = false;
int ret;
- ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+ ret = nocb_gp_toggle_rdp(rdp_toggling);
if (ret == 1)
list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
else if (ret == 0)
list_del(&rdp_toggling->nocb_entry_rdp);
- if (wake_state)
- swake_up_one(&rdp_toggling->nocb_state_wq);
+
+ swake_up_one(&rdp_toggling->nocb_state_wq);
}
my_rdp->nocb_gp_seq = -1;
@@ -913,16 +888,9 @@ static int rcu_nocb_gp_kthread(void *arg)
return 0;
}
-static inline bool nocb_cb_can_run(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
{
- return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+ return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park();
}
/*
@@ -934,21 +902,19 @@ static void nocb_cb_wait(struct rcu_data *rdp)
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long cur_gp_seq;
unsigned long flags;
- bool needwake_state = false;
bool needwake_gp = false;
- bool can_sleep = true;
struct rcu_node *rnp = rdp->mynode;
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- if (READ_ONCE(rdp->nocb_cb_sleep)) {
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+ if (kthread_should_park()) {
+ kthread_parkme();
+ } else if (READ_ONCE(rdp->nocb_cb_sleep)) {
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
local_irq_save(flags);
rcu_momentary_dyntick_idle();
@@ -971,37 +937,16 @@ static void nocb_cb_wait(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
- if (rcu_segcblist_ready_cbs(cblist))
- can_sleep = false;
+ if (!rcu_segcblist_ready_cbs(cblist)) {
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
} else {
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We won't touch the callbacks and keep sleeping until we ever
- * get re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
}
- WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
-
- if (rdp->nocb_cb_sleep)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
-
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_gp)
rcu_gp_kthread_wake();
-
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
}
/*
@@ -1094,17 +1039,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
bool wake_gp = false;
rcu_segcblist_offload(cblist, offload);
-
- if (rdp->nocb_cb_sleep)
- rdp->nocb_cb_sleep = false;
rcu_nocb_unlock_irqrestore(rdp, flags);
- /*
- * Ignore former value of nocb_cb_sleep and force wake up as it could
- * have been spuriously set to false already.
- */
- swake_up_one(&rdp->nocb_cb_wq);
-
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
// Queue this rdp for add/del to/from the list to iterate on rcuog
WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
@@ -1161,19 +1097,11 @@ static long rcu_nocb_rdp_deoffload(void *arg)
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
- /*
- * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
- * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
- */
- if (!rdp->nocb_cb_kthread) {
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
-
swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist,
- SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ !rcu_segcblist_test_flags(cblist,
+ SEGCBLIST_KTHREAD_GP));
+ if (rdp->nocb_cb_kthread)
+ kthread_park(rdp->nocb_cb_kthread);
} else {
/*
* No kthread to clear the flags for us or remove the rdp from the nocb list
@@ -1181,8 +1109,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
* but we stick to paranoia in this rare path.
*/
rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist,
- SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
rcu_nocb_unlock_irqrestore(rdp, flags);
list_del(&rdp->nocb_entry_rdp);
@@ -1282,8 +1209,10 @@ static long rcu_nocb_rdp_offload(void *arg)
wake_gp = rdp_offload_toggle(rdp, true, flags);
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ kthread_unpark(rdp->nocb_cb_kthread);
+
swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
/*
@@ -1468,7 +1397,7 @@ void __init rcu_init_nohz(void)
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
}
rcu_organize_nocb_kthreads();
@@ -1526,11 +1455,16 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
/* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
+ t = kthread_create(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
goto end;
+ if (rcu_rdp_is_offloaded(rdp))
+ wake_up_process(t);
+ else
+ kthread_park(t);
+
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
@@ -1678,12 +1612,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
"lL"[raw_spin_is_locked(&rdp->nocb_lock)],
"sS"[!!rdp->nocb_cb_sleep],
".W"[swait_active(&rdp->nocb_cb_wq)],
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 340bbefe5f65..c569da65b421 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,8 +28,8 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
!(lockdep_is_held(&rcu_state.barrier_mutex) ||
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
rcu_lockdep_is_held_nocb(rdp) ||
- (rdp == this_cpu_ptr(&rcu_data) &&
- !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
+ (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
+ rdp == this_cpu_ptr(&rcu_data)) ||
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
@@ -93,6 +93,16 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (nohz_full_patience_delay < 0) {
+ pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay);
+ nohz_full_patience_delay = 0;
+ } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) {
+ pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC);
+ nohz_full_patience_delay = 5 * MSEC_PER_SEC;
+ } else if (nohz_full_patience_delay) {
+ pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay);
+ }
+ nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 460efecd077b..4b0e9d7c4c68 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu)
}
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
+ rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu));
rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
if (rcuc_starved)
// Print signed value, as negative values indicate a probable bug.
@@ -515,7 +515,7 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(cpu) & 0xffff,
+ ct_dynticks_cpu(cpu) & 0xffff,
ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 59032aaccd18..44e83a646264 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -43,6 +43,7 @@
#define SCFTORTOUT_ERRSTRING(s, x...) pr_alert(SCFTORT_FLAG "!!! " s "\n", ## x)
+MODULE_DESCRIPTION("Torture tests on the smp_call_function() family of primitives");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
@@ -67,7 +68,7 @@ torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operation
torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations.");
torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations.");
-char *torture_type = "";
+static char *torture_type = "";
#ifdef MODULE
# define SCFTORT_SHUTDOWN 0
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index bcf2c4cc0522..35a35e36024b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -723,7 +723,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
- psi_account_irqtime(rq->curr, irq_delta);
delayacct_irq(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
@@ -4467,12 +4466,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
* @cpu: The CPU on which to snapshot the task.
*
* Returns the task_struct pointer of the task "currently" running on
- * the specified CPU. If the same task is running on that CPU throughout,
- * the return value will be a pointer to that task's task_struct structure.
- * If the CPU did any context switches even vaguely concurrently with the
- * execution of this function, the return value will be a pointer to the
- * task_struct structure of a randomly chosen task that was running on
- * that CPU somewhere around the time that this function was executing.
+ * the specified CPU.
*
* If the specified CPU was offline, the return value is whatever it
* is, perhaps a pointer to the task_struct structure of that CPU's idle
@@ -4486,11 +4480,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
*/
struct task_struct *cpu_curr_snapshot(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
+ struct rq_flags rf;
- smp_mb(); /* Pairing determined by caller's synchronization design. */
+ rq_lock_irqsave(rq, &rf);
+ smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */
t = rcu_dereference(cpu_curr(cpu));
+ rq_unlock_irqrestore(rq, &rf);
smp_mb(); /* Pairing determined by caller's synchronization design. */
+
return t;
}
@@ -5665,7 +5664,7 @@ void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr = rq->curr;
+ struct task_struct *curr;
struct rq_flags rf;
unsigned long hw_pressure;
u64 resched_latency;
@@ -5677,6 +5676,9 @@ void sched_tick(void)
rq_lock(rq, &rf);
+ curr = rq->curr;
+ psi_account_irqtime(rq, curr, NULL);
+
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
@@ -6737,6 +6739,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
++*switch_count;
migrate_disable_switch(rq, prev);
+ psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c75d1307d86d..9bedd148f007 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1804,8 +1804,13 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* The replenish timer needs to be canceled. No
* problem if it fires concurrently: boosted threads
* are ignored in dl_task_timer().
+ *
+ * If the timer callback was running (hrtimer_try_to_cancel == -1),
+ * it will eventually call put_task_struct().
*/
- hrtimer_try_to_cancel(&p->dl.dl_timer);
+ if (hrtimer_try_to_cancel(&p->dl.dl_timer) == 1 &&
+ !dl_server(&p->dl))
+ put_task_struct(p);
p->dl.dl_throttled = 0;
}
} else if (!dl_prio(p->normal_prio)) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a5b1ae0aa55..24dda708b699 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9149,12 +9149,8 @@ static int detach_tasks(struct lb_env *env)
break;
env->loop++;
- /*
- * We've more or less seen every task there is, call it quits
- * unless we haven't found any movable task yet.
- */
- if (env->loop > env->loop_max &&
- !(env->flags & LBF_ALL_PINNED))
+ /* We've more or less seen every task there is, call it quits */
+ if (env->loop > env->loop_max)
break;
/* take a breather every nr_migrate tasks */
@@ -11393,9 +11389,7 @@ more_balance:
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
- /* Stop if we tried all running tasks */
- if (env.loop < busiest->nr_running)
- goto more_balance;
+ goto more_balance;
}
/*
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 7b4aa5809c0f..507d7b8d79af 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -773,6 +773,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
enum psi_states s;
u32 state_mask;
+ lockdep_assert_rq_held(cpu_rq(cpu));
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
@@ -991,22 +992,32 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-void psi_account_irqtime(struct task_struct *task, u32 delta)
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev)
{
- int cpu = task_cpu(task);
+ int cpu = task_cpu(curr);
struct psi_group *group;
struct psi_group_cpu *groupc;
- u64 now;
+ u64 now, irq;
+ s64 delta;
if (static_branch_likely(&psi_disabled))
return;
- if (!task->pid)
+ if (!curr->pid)
+ return;
+
+ lockdep_assert_rq_held(rq);
+ group = task_psi_group(curr);
+ if (prev && task_psi_group(prev) == group)
return;
now = cpu_clock(cpu);
+ irq = irq_time_read(cpu);
+ delta = (s64)(irq - rq->psi_irq_time);
+ if (delta < 0)
+ return;
+ rq->psi_irq_time = irq;
- group = task_psi_group(task);
do {
if (!group->enabled)
continue;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a831af102070..ef20c61004eb 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1126,6 +1126,7 @@ struct rq {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
+ u64 psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRT
u64 prev_steal_time;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 38f3698f5e5b..b02dfc322951 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -110,8 +110,12 @@ __schedstats_from_se(struct sched_entity *se)
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
-void psi_account_irqtime(struct task_struct *task, u32 delta);
-
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct rq *rq, struct task_struct *curr, struct task_struct *prev);
+#else
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
+#endif /*CONFIG_IRQ_TIME_ACCOUNTING */
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@@ -192,7 +196,8 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
-static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
+static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
+ struct task_struct *prev) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
diff --git a/kernel/signal.c b/kernel/signal.c
index 1f9dd41c04be..60c737e423a1 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2600,6 +2600,14 @@ static void do_freezer_trap(void)
spin_unlock_irq(&current->sighand->siglock);
cgroup_enter_frozen();
schedule();
+
+ /*
+ * We could've been woken by task_work, run it to clear
+ * TIF_NOTIFY_SIGNAL. The caller will retry if necessary.
+ */
+ clear_notify_signal();
+ if (unlikely(task_work_pending(current)))
+ task_work_run();
}
static int ptrace_signal(int signr, kernel_siginfo_t *info, enum pid_type type)
diff --git a/kernel/smp.c b/kernel/smp.c
index f085ebcdf9e7..aaffecdad319 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -25,6 +25,7 @@
#include <linux/nmi.h>
#include <linux/sched/debug.h>
#include <linux/jump_label.h>
+#include <linux/string_choices.h>
#include <trace/events/ipi.h>
#define CREATE_TRACE_POINTS
@@ -982,8 +983,7 @@ void __init smp_init(void)
num_nodes = num_online_nodes();
num_cpus = num_online_cpus();
pr_info("Brought up %d node%s, %d CPU%s\n",
- num_nodes, (num_nodes > 1 ? "s" : ""),
- num_cpus, (num_cpus > 1 ? "s" : ""));
+ num_nodes, str_plural(num_nodes), num_cpus, str_plural(num_cpus));
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
@@ -1119,6 +1119,7 @@ int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys)
queue_work_on(cpu, system_wq, &sscs.work);
wait_for_completion(&sscs.done);
+ destroy_work_on_stack(&sscs.work);
return sscs.ret;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index b696b85ac63e..2ef820a2d067 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -76,8 +76,6 @@ COND_SYSCALL(timerfd_gettime32);
COND_SYSCALL(acct);
COND_SYSCALL(capget);
COND_SYSCALL(capset);
-/* __ARCH_WANT_SYS_CLONE3 */
-COND_SYSCALL(clone3);
COND_SYSCALL(futex);
COND_SYSCALL(futex_time32);
COND_SYSCALL(set_robust_list);
diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c
index d06185e054ea..62e73444ffe4 100644
--- a/kernel/time/clocksource-wdtest.c
+++ b/kernel/time/clocksource-wdtest.c
@@ -22,6 +22,7 @@
#include "tick-internal.h"
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Clocksource watchdog unit test");
MODULE_AUTHOR("Paul E. McKenney <paulmck@kernel.org>");
static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0;
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
index 20d5df631570..783f2297111b 100644
--- a/kernel/time/test_udelay.c
+++ b/kernel/time/test_udelay.c
@@ -155,5 +155,6 @@ static void __exit udelay_test_exit(void)
module_exit(udelay_test_exit);
+MODULE_DESCRIPTION("udelay test module");
MODULE_AUTHOR("David Riley <davidriley@chromium.org>");
MODULE_LICENSE("GPL");
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 771d1e040303..b4843099a8da 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -1141,6 +1141,7 @@ void tick_broadcast_switch_to_oneshot(void)
#ifdef CONFIG_HOTPLUG_CPU
void hotplug_cpu__broadcast_tick_pull(int deadcpu)
{
+ struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
struct clock_event_device *bc;
unsigned long flags;
@@ -1148,6 +1149,28 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu)
bc = tick_broadcast_device.evtdev;
if (bc && broadcast_needs_cpu(bc, deadcpu)) {
+ /*
+ * If the broadcast force bit of the current CPU is set,
+ * then the current CPU has not yet reprogrammed the local
+ * timer device to avoid a ping-pong race. See
+ * ___tick_broadcast_oneshot_control().
+ *
+ * If the broadcast device is hrtimer based then
+ * programming the broadcast event below does not have any
+ * effect because the local clockevent device is not
+ * running and not programmed because the broadcast event
+ * is not earlier than the pending event of the local clock
+ * event device. As a consequence all CPUs waiting for a
+ * broadcast event are stuck forever.
+ *
+ * Detect this condition and reprogram the cpu local timer
+ * device to avoid the starvation.
+ */
+ if (tick_check_broadcast_expired()) {
+ cpumask_clear_cpu(smp_processor_id(), tick_broadcast_force_mask);
+ tick_program_event(td->evtdev->next_event, 1);
+ }
+
/* This moves the broadcast assignment to this CPU: */
clockevents_program_event(bc, bc->next_event, 1);
}
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 71a792cd8936..753a184c7090 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -1026,10 +1026,10 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
if (expires == KTIME_MAX || ts->next_tick == hrtimer_get_expires(&ts->sched_timer))
return;
- WARN_ON_ONCE(1);
- printk_once("basemono: %llu ts->next_tick: %llu dev->next_event: %llu timer->active: %d timer->expires: %llu\n",
- basemono, ts->next_tick, dev->next_event,
- hrtimer_active(&ts->sched_timer), hrtimer_get_expires(&ts->sched_timer));
+ WARN_ONCE(1, "basemono: %llu ts->next_tick: %llu dev->next_event: %llu "
+ "timer->active: %d timer->expires: %llu\n", basemono, ts->next_tick,
+ dev->next_event, hrtimer_active(&ts->sched_timer),
+ hrtimer_get_expires(&ts->sched_timer));
}
/*
@@ -1385,20 +1385,6 @@ unsigned long tick_nohz_get_idle_calls_cpu(int cpu)
return ts->idle_calls;
}
-/**
- * tick_nohz_get_idle_calls - return the current idle calls counter value
- *
- * Called from the schedutil frequency scaling governor in scheduler context.
- *
- * Return: the current idle calls counter value for the current CPU
- */
-unsigned long tick_nohz_get_idle_calls(void)
-{
- struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
-
- return ts->idle_calls;
-}
-
static void tick_nohz_account_idle_time(struct tick_sched *ts,
ktime_t now)
{
diff --git a/kernel/time/time_test.c b/kernel/time/time_test.c
index 3e5d422dd15c..2889763165e5 100644
--- a/kernel/time/time_test.c
+++ b/kernel/time/time_test.c
@@ -96,4 +96,5 @@ static struct kunit_suite time_test_suite = {
};
kunit_test_suite(time_test_suite);
+MODULE_DESCRIPTION("time unit test suite");
MODULE_LICENSE("GPL");
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4e18db1819f8..2fa87dcfeda9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1195,6 +1195,108 @@ static bool timestamp_in_interval(u64 start, u64 end, u64 ts)
return false;
}
+static bool convert_clock(u64 *val, u32 numerator, u32 denominator)
+{
+ u64 rem, res;
+
+ if (!numerator || !denominator)
+ return false;
+
+ res = div64_u64_rem(*val, denominator, &rem) * numerator;
+ *val = res + div_u64(rem * numerator, denominator);
+ return true;
+}
+
+static bool convert_base_to_cs(struct system_counterval_t *scv)
+{
+ struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
+ struct clocksource_base *base;
+ u32 num, den;
+
+ /* The timestamp was taken from the time keeper clock source */
+ if (cs->id == scv->cs_id)
+ return true;
+
+ /*
+ * Check whether cs_id matches the base clock. Prevent the compiler from
+ * re-evaluating @base as the clocksource might change concurrently.
+ */
+ base = READ_ONCE(cs->base);
+ if (!base || base->id != scv->cs_id)
+ return false;
+
+ num = scv->use_nsecs ? cs->freq_khz : base->numerator;
+ den = scv->use_nsecs ? USEC_PER_SEC : base->denominator;
+
+ if (!convert_clock(&scv->cycles, num, den))
+ return false;
+
+ scv->cycles += base->offset;
+ return true;
+}
+
+static bool convert_cs_to_base(u64 *cycles, enum clocksource_ids base_id)
+{
+ struct clocksource *cs = tk_core.timekeeper.tkr_mono.clock;
+ struct clocksource_base *base;
+
+ /*
+ * Check whether base_id matches the base clock. Prevent the compiler from
+ * re-evaluating @base as the clocksource might change concurrently.
+ */
+ base = READ_ONCE(cs->base);
+ if (!base || base->id != base_id)
+ return false;
+
+ *cycles -= base->offset;
+ if (!convert_clock(cycles, base->denominator, base->numerator))
+ return false;
+ return true;
+}
+
+static bool convert_ns_to_cs(u64 *delta)
+{
+ struct tk_read_base *tkr = &tk_core.timekeeper.tkr_mono;
+
+ if (BITS_TO_BYTES(fls64(*delta) + tkr->shift) >= sizeof(*delta))
+ return false;
+
+ *delta = div_u64((*delta << tkr->shift) - tkr->xtime_nsec, tkr->mult);
+ return true;
+}
+
+/**
+ * ktime_real_to_base_clock() - Convert CLOCK_REALTIME timestamp to a base clock timestamp
+ * @treal: CLOCK_REALTIME timestamp to convert
+ * @base_id: base clocksource id
+ * @cycles: pointer to store the converted base clock timestamp
+ *
+ * Converts a supplied, future realtime clock value to the corresponding base clock value.
+ *
+ * Return: true if the conversion is successful, false otherwise.
+ */
+bool ktime_real_to_base_clock(ktime_t treal, enum clocksource_ids base_id, u64 *cycles)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+ unsigned int seq;
+ u64 delta;
+
+ do {
+ seq = read_seqcount_begin(&tk_core.seq);
+ if ((u64)treal < tk->tkr_mono.base_real)
+ return false;
+ delta = (u64)treal - tk->tkr_mono.base_real;
+ if (!convert_ns_to_cs(&delta))
+ return false;
+ *cycles = tk->tkr_mono.cycle_last + delta;
+ if (!convert_cs_to_base(cycles, base_id))
+ return false;
+ } while (read_seqcount_retry(&tk_core.seq, seq));
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(ktime_real_to_base_clock);
+
/**
* get_device_system_crosststamp - Synchronously capture system/device timestamp
* @get_time_fn: Callback to get simultaneous device time and
@@ -1241,7 +1343,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
* installed timekeeper clocksource
*/
if (system_counterval.cs_id == CSID_GENERIC ||
- tk->tkr_mono.clock->id != system_counterval.cs_id)
+ !convert_base_to_cs(&system_counterval))
return -ENODEV;
cycles = system_counterval.cycles;
@@ -1307,6 +1409,30 @@ int get_device_system_crosststamp(int (*get_time_fn)
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
/**
+ * timekeeping_clocksource_has_base - Check whether the current clocksource
+ * is based on given a base clock
+ * @id: base clocksource ID
+ *
+ * Note: The return value is a snapshot which can become invalid right
+ * after the function returns.
+ *
+ * Return: true if the timekeeper clocksource has a base clock with @id,
+ * false otherwise
+ */
+bool timekeeping_clocksource_has_base(enum clocksource_ids id)
+{
+ /*
+ * This is a snapshot, so no point in using the sequence
+ * count. Just prevent the compiler from re-evaluating @base as the
+ * clocksource might change concurrently.
+ */
+ struct clocksource_base *base = READ_ONCE(tk_core.timekeeper.tkr_mono.clock->base);
+
+ return base ? base->id == id : false;
+}
+EXPORT_SYMBOL_GPL(timekeeping_clocksource_has_base);
+
+/**
* do_settimeofday64 - Sets the time of day.
* @ts: pointer to the timespec64 variable containing the new time
*
@@ -2421,6 +2547,7 @@ EXPORT_SYMBOL_GPL(random_get_entropy_fallback);
/**
* do_adjtimex() - Accessor function to NTP __do_adjtimex function
+ * @txc: Pointer to kernel_timex structure containing NTP parameters
*/
int do_adjtimex(struct __kernel_timex *txc)
{
@@ -2489,6 +2616,8 @@ int do_adjtimex(struct __kernel_timex *txc)
#ifdef CONFIG_NTP_PPS
/**
* hardpps() - Accessor function to NTP __hardpps function
+ * @phase_ts: Pointer to timespec64 structure representing phase timestamp
+ * @raw_ts: Pointer to timespec64 structure representing raw timestamp
*/
void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts)
{
diff --git a/kernel/torture.c b/kernel/torture.c
index c72ab2d251f4..dede150aef01 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -40,6 +40,7 @@
#include <linux/sched/rt.h>
#include "rcu/rcu.h"
+MODULE_DESCRIPTION("Common functions for in-kernel torture tests");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3fbaecfc88c2..1745ca788ede 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -216,8 +216,6 @@ struct worker_pool {
struct worker *manager; /* L: purely informational */
struct list_head workers; /* A: attached workers */
- struct list_head dying_workers; /* A: workers about to die */
- struct completion *detach_completion; /* all workers detached */
struct ida worker_ida; /* worker IDs for task name */
@@ -436,7 +434,7 @@ static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
-static struct workqueue_attrs *wq_update_pod_attrs_buf;
+static struct workqueue_attrs *unbound_wq_update_pwq_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -447,6 +445,9 @@ static struct rcuwait manager_wait = __RCUWAIT_INITIALIZER(manager_wait);
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
+/* PL: mirror the cpu_online_mask excluding the CPU in the midst of hotplugging */
+static cpumask_var_t wq_online_cpumask;
+
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
@@ -1684,33 +1685,6 @@ static void __pwq_activate_work(struct pool_workqueue *pwq,
__clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb);
}
-/**
- * pwq_activate_work - Activate a work item if inactive
- * @pwq: pool_workqueue @work belongs to
- * @work: work item to activate
- *
- * Returns %true if activated. %false if already active.
- */
-static bool pwq_activate_work(struct pool_workqueue *pwq,
- struct work_struct *work)
-{
- struct worker_pool *pool = pwq->pool;
- struct wq_node_nr_active *nna;
-
- lockdep_assert_held(&pool->lock);
-
- if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE))
- return false;
-
- nna = wq_node_nr_active(pwq->wq, pool->node);
- if (nna)
- atomic_inc(&nna->nr);
-
- pwq->nr_active++;
- __pwq_activate_work(pwq, work);
- return true;
-}
-
static bool tryinc_node_nr_active(struct wq_node_nr_active *nna)
{
int max = READ_ONCE(nna->max);
@@ -2117,7 +2091,7 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
*/
pwq = get_work_pwq(work);
if (pwq && pwq->pool == pool) {
- unsigned long work_data;
+ unsigned long work_data = *work_data_bits(work);
debug_work_deactivate(work);
@@ -2126,13 +2100,17 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
* pwq->inactive_works since a queued barrier can't be
* canceled (see the comments in insert_wq_barrier()).
*
- * An inactive work item cannot be grabbed directly because
+ * An inactive work item cannot be deleted directly because
* it might have linked barrier work items which, if left
* on the inactive_works list, will confuse pwq->nr_active
- * management later on and cause stall. Make sure the work
- * item is activated before grabbing.
+ * management later on and cause stall. Move the linked
+ * barrier work items to the worklist when deleting the grabbed
+ * item. Also keep WORK_STRUCT_INACTIVE in work_data, so that
+ * it doesn't participate in nr_active management in later
+ * pwq_dec_nr_in_flight().
*/
- pwq_activate_work(pwq, work);
+ if (work_data & WORK_STRUCT_INACTIVE)
+ move_linked_works(work, &pwq->pool->worklist, NULL);
list_del_init(&work->entry);
@@ -2140,7 +2118,6 @@ static int try_to_grab_pending(struct work_struct *work, u32 cflags,
* work->data points to pwq iff queued. Let's point to pool. As
* this destroys work->data needed by the next step, stash it.
*/
- work_data = *work_data_bits(work);
set_work_pool_and_keep_pending(work, pool->id,
pool_offq_flags(pool));
@@ -2298,9 +2275,13 @@ retry:
* If @work was previously on a different pool, it might still be
* running there, in which case the work needs to be queued on that
* pool to guarantee non-reentrancy.
+ *
+ * For ordered workqueue, work items must be queued on the newest pwq
+ * for accurate order management. Guaranteed order also guarantees
+ * non-reentrancy. See the comments above unplug_oldest_pwq().
*/
last_pool = get_work_pool(work);
- if (last_pool && last_pool != pool) {
+ if (last_pool && last_pool != pool && !(wq->flags & __WQ_ORDERED)) {
struct worker *worker;
raw_spin_lock(&last_pool->lock);
@@ -2710,6 +2691,27 @@ static void worker_attach_to_pool(struct worker *worker,
mutex_unlock(&wq_pool_attach_mutex);
}
+static void unbind_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ kthread_set_per_cpu(worker->task, -1);
+ if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
+ else
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+}
+
+
+static void detach_worker(struct worker *worker)
+{
+ lockdep_assert_held(&wq_pool_attach_mutex);
+
+ unbind_worker(worker);
+ list_del(&worker->node);
+ worker->pool = NULL;
+}
+
/**
* worker_detach_from_pool() - detach a worker from its pool
* @worker: worker which is attached to its pool
@@ -2721,26 +2723,16 @@ static void worker_attach_to_pool(struct worker *worker,
static void worker_detach_from_pool(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- struct completion *detach_completion = NULL;
/* there is one permanent BH worker per CPU which should never detach */
WARN_ON_ONCE(pool->flags & POOL_BH);
mutex_lock(&wq_pool_attach_mutex);
-
- kthread_set_per_cpu(worker->task, -1);
- list_del(&worker->node);
- worker->pool = NULL;
-
- if (list_empty(&pool->workers) && list_empty(&pool->dying_workers))
- detach_completion = pool->detach_completion;
+ detach_worker(worker);
mutex_unlock(&wq_pool_attach_mutex);
/* clear leftover flags without pool->lock after it is detached */
worker->flags &= ~(WORKER_UNBOUND | WORKER_REBOUND);
-
- if (detach_completion)
- complete(detach_completion);
}
static int format_worker_id(char *buf, size_t size, struct worker *worker,
@@ -2844,35 +2836,22 @@ fail:
return NULL;
}
-static void unbind_worker(struct worker *worker)
+static void detach_dying_workers(struct list_head *cull_list)
{
- lockdep_assert_held(&wq_pool_attach_mutex);
+ struct worker *worker;
- kthread_set_per_cpu(worker->task, -1);
- if (cpumask_intersects(wq_unbound_cpumask, cpu_active_mask))
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, wq_unbound_cpumask) < 0);
- else
- WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+ list_for_each_entry(worker, cull_list, entry)
+ detach_worker(worker);
}
-static void wake_dying_workers(struct list_head *cull_list)
+static void reap_dying_workers(struct list_head *cull_list)
{
struct worker *worker, *tmp;
list_for_each_entry_safe(worker, tmp, cull_list, entry) {
list_del_init(&worker->entry);
- unbind_worker(worker);
- /*
- * If the worker was somehow already running, then it had to be
- * in pool->idle_list when set_worker_dying() happened or we
- * wouldn't have gotten here.
- *
- * Thus, the worker must either have observed the WORKER_DIE
- * flag, or have set its state to TASK_IDLE. Either way, the
- * below will be observed by the worker and is safe to do
- * outside of pool->lock.
- */
- wake_up_process(worker->task);
+ kthread_stop_put(worker->task);
+ kfree(worker);
}
}
@@ -2906,7 +2885,9 @@ static void set_worker_dying(struct worker *worker, struct list_head *list)
worker->flags |= WORKER_DIE;
list_move(&worker->entry, list);
- list_move(&worker->node, &pool->dying_workers);
+
+ /* get an extra task struct reference for later kthread_stop_put() */
+ get_task_struct(worker->task);
}
/**
@@ -2965,9 +2946,9 @@ static void idle_cull_fn(struct work_struct *work)
/*
* Grabbing wq_pool_attach_mutex here ensures an already-running worker
- * cannot proceed beyong worker_detach_from_pool() in its self-destruct
- * path. This is required as a previously-preempted worker could run after
- * set_worker_dying() has happened but before wake_dying_workers() did.
+ * cannot proceed beyong set_pf_worker() in its self-destruct path.
+ * This is required as a previously-preempted worker could run after
+ * set_worker_dying() has happened but before detach_dying_workers() did.
*/
mutex_lock(&wq_pool_attach_mutex);
raw_spin_lock_irq(&pool->lock);
@@ -2988,8 +2969,10 @@ static void idle_cull_fn(struct work_struct *work)
}
raw_spin_unlock_irq(&pool->lock);
- wake_dying_workers(&cull_list);
+ detach_dying_workers(&cull_list);
mutex_unlock(&wq_pool_attach_mutex);
+
+ reap_dying_workers(&cull_list);
}
static void send_mayday(struct work_struct *work)
@@ -3368,9 +3351,7 @@ woke_up:
set_pf_worker(false);
ida_free(&pool->worker_ida, worker->id);
- worker_detach_from_pool(worker);
WARN_ON_ONCE(!list_empty(&worker->entry));
- kfree(worker);
return 0;
}
@@ -4761,7 +4742,6 @@ static int init_worker_pool(struct worker_pool *pool)
timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);
INIT_LIST_HEAD(&pool->workers);
- INIT_LIST_HEAD(&pool->dying_workers);
ida_init(&pool->worker_ida);
INIT_HLIST_NODE(&pool->hash_node);
@@ -4903,7 +4883,6 @@ static void rcu_free_pool(struct rcu_head *rcu)
*/
static void put_unbound_pool(struct worker_pool *pool)
{
- DECLARE_COMPLETION_ONSTACK(detach_completion);
struct worker *worker;
LIST_HEAD(cull_list);
@@ -4955,14 +4934,11 @@ static void put_unbound_pool(struct worker_pool *pool)
WARN_ON(pool->nr_workers || pool->nr_idle);
raw_spin_unlock_irq(&pool->lock);
- wake_dying_workers(&cull_list);
+ detach_dying_workers(&cull_list);
- if (!list_empty(&pool->workers) || !list_empty(&pool->dying_workers))
- pool->detach_completion = &detach_completion;
mutex_unlock(&wq_pool_attach_mutex);
- if (pool->detach_completion)
- wait_for_completion(pool->detach_completion);
+ reap_dying_workers(&cull_list);
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
@@ -5038,12 +5014,6 @@ fail:
return NULL;
}
-static void rcu_free_pwq(struct rcu_head *rcu)
-{
- kmem_cache_free(pwq_cache,
- container_of(rcu, struct pool_workqueue, rcu));
-}
-
/*
* Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero
* refcnt and needs to be destroyed.
@@ -5089,7 +5059,7 @@ static void pwq_release_workfn(struct kthread_work *work)
raw_spin_unlock_irq(&nna->lock);
}
- call_rcu(&pwq->rcu, rcu_free_pwq);
+ kfree_rcu(pwq, rcu);
/*
* If we're the last pwq going away, @wq is already dead and no one
@@ -5161,14 +5131,22 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
return pwq;
}
+static void apply_wqattrs_lock(void)
+{
+ mutex_lock(&wq_pool_mutex);
+}
+
+static void apply_wqattrs_unlock(void)
+{
+ mutex_unlock(&wq_pool_mutex);
+}
+
/**
* wq_calc_pod_cpumask - calculate a wq_attrs' cpumask for a pod
* @attrs: the wq_attrs of the default pwq of the target workqueue
* @cpu: the target CPU
- * @cpu_going_down: if >= 0, the CPU to consider as offline
*
- * Calculate the cpumask a workqueue with @attrs should use on @pod. If
- * @cpu_going_down is >= 0, that cpu is considered offline during calculation.
+ * Calculate the cpumask a workqueue with @attrs should use on @pod.
* The result is stored in @attrs->__pod_cpumask.
*
* If pod affinity is not enabled, @attrs->cpumask is always used. If enabled
@@ -5177,29 +5155,18 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
*
* The caller is responsible for ensuring that the cpumask of @pod stays stable.
*/
-static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu,
- int cpu_going_down)
+static void wq_calc_pod_cpumask(struct workqueue_attrs *attrs, int cpu)
{
const struct wq_pod_type *pt = wqattrs_pod_type(attrs);
int pod = pt->cpu_pod[cpu];
- /* does @pod have any online CPUs @attrs wants? */
+ /* calculate possible CPUs in @pod that @attrs wants */
cpumask_and(attrs->__pod_cpumask, pt->pod_cpus[pod], attrs->cpumask);
- cpumask_and(attrs->__pod_cpumask, attrs->__pod_cpumask, cpu_online_mask);
- if (cpu_going_down >= 0)
- cpumask_clear_cpu(cpu_going_down, attrs->__pod_cpumask);
-
- if (cpumask_empty(attrs->__pod_cpumask)) {
+ /* does @pod have any online CPUs @attrs wants? */
+ if (!cpumask_intersects(attrs->__pod_cpumask, wq_online_cpumask)) {
cpumask_copy(attrs->__pod_cpumask, attrs->cpumask);
return;
}
-
- /* yeap, return possible CPUs in @pod that @attrs wants */
- cpumask_and(attrs->__pod_cpumask, attrs->cpumask, pt->pod_cpus[pod]);
-
- if (cpumask_empty(attrs->__pod_cpumask))
- pr_warn_once("WARNING: workqueue cpumask: online intersect > "
- "possible intersect\n");
}
/* install @pwq into @wq and return the old pwq, @cpu < 0 for dfl_pwq */
@@ -5284,7 +5251,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
ctx->dfl_pwq->refcnt++;
ctx->pwq_tbl[cpu] = ctx->dfl_pwq;
} else {
- wq_calc_pod_cpumask(new_attrs, cpu, -1);
+ wq_calc_pod_cpumask(new_attrs, cpu);
ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, new_attrs);
if (!ctx->pwq_tbl[cpu])
goto out_free;
@@ -5375,8 +5342,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
*
* Performs GFP_KERNEL allocations.
*
- * Assumes caller has CPU hotplug read exclusion, i.e. cpus_read_lock().
- *
* Return: 0 on success and -errno on failure.
*/
int apply_workqueue_attrs(struct workqueue_struct *wq,
@@ -5384,8 +5349,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
{
int ret;
- lockdep_assert_cpus_held();
-
mutex_lock(&wq_pool_mutex);
ret = apply_workqueue_attrs_locked(wq, attrs);
mutex_unlock(&wq_pool_mutex);
@@ -5394,15 +5357,12 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
}
/**
- * wq_update_pod - update pod affinity of a wq for CPU hot[un]plug
+ * unbound_wq_update_pwq - update a pwq slot for CPU hot[un]plug
* @wq: the target workqueue
- * @cpu: the CPU to update pool association for
- * @hotplug_cpu: the CPU coming up or going down
- * @online: whether @cpu is coming up or going down
+ * @cpu: the CPU to update the pwq slot for
*
* This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
- * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update pod affinity of
- * @wq accordingly.
+ * %CPU_DOWN_FAILED. @cpu is in the same pod of the CPU being hot[un]plugged.
*
*
* If pod affinity can't be adjusted due to memory allocation failure, it falls
@@ -5415,10 +5375,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
* CPU_DOWN. If a workqueue user wants strict affinity, it's the user's
* responsibility to flush the work item from CPU_DOWN_PREPARE.
*/
-static void wq_update_pod(struct workqueue_struct *wq, int cpu,
- int hotplug_cpu, bool online)
+static void unbound_wq_update_pwq(struct workqueue_struct *wq, int cpu)
{
- int off_cpu = online ? -1 : hotplug_cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
@@ -5432,13 +5390,13 @@ static void wq_update_pod(struct workqueue_struct *wq, int cpu,
* Let's use a preallocated one. The following buf is protected by
* CPU hotplug exclusion.
*/
- target_attrs = wq_update_pod_attrs_buf;
+ target_attrs = unbound_wq_update_pwq_attrs_buf;
copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
wqattrs_actualize_cpumask(target_attrs, wq_unbound_cpumask);
/* nothing to do if the target cpumask matches the current pwq */
- wq_calc_pod_cpumask(target_attrs, cpu, off_cpu);
+ wq_calc_pod_cpumask(target_attrs, cpu);
if (wqattrs_equal(target_attrs, unbound_pwq(wq, cpu)->pool->attrs))
return;
@@ -5472,21 +5430,24 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
bool highpri = wq->flags & WQ_HIGHPRI;
int cpu, ret;
+ lockdep_assert_held(&wq_pool_mutex);
+
wq->cpu_pwq = alloc_percpu(struct pool_workqueue *);
if (!wq->cpu_pwq)
goto enomem;
if (!(wq->flags & WQ_UNBOUND)) {
+ struct worker_pool __percpu *pools;
+
+ if (wq->flags & WQ_BH)
+ pools = bh_worker_pools;
+ else
+ pools = cpu_worker_pools;
+
for_each_possible_cpu(cpu) {
struct pool_workqueue **pwq_p;
- struct worker_pool __percpu *pools;
struct worker_pool *pool;
- if (wq->flags & WQ_BH)
- pools = bh_worker_pools;
- else
- pools = cpu_worker_pools;
-
pool = &(per_cpu_ptr(pools, cpu)[highpri]);
pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu);
@@ -5504,26 +5465,18 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq)
return 0;
}
- cpus_read_lock();
if (wq->flags & __WQ_ORDERED) {
struct pool_workqueue *dfl_pwq;
- ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, ordered_wq_attrs[highpri]);
/* there should only be single pwq for ordering guarantee */
dfl_pwq = rcu_access_pointer(wq->dfl_pwq);
WARN(!ret && (wq->pwqs.next != &dfl_pwq->pwqs_node ||
wq->pwqs.prev != &dfl_pwq->pwqs_node),
"ordering guarantee broken for workqueue %s\n", wq->name);
} else {
- ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
+ ret = apply_workqueue_attrs_locked(wq, unbound_std_wq_attrs[highpri]);
}
- cpus_read_unlock();
-
- /* for unbound pwq, flush the pwq_release_worker ensures that the
- * pwq_release_workfn() completes before calling kfree(wq).
- */
- if (ret)
- kthread_flush_worker(pwq_release_worker);
return ret;
@@ -5561,6 +5514,8 @@ static int init_rescuer(struct workqueue_struct *wq)
char id_buf[WORKER_ID_LEN];
int ret;
+ lockdep_assert_held(&wq_pool_mutex);
+
if (!(wq->flags & WQ_MEM_RECLAIM))
return 0;
@@ -5585,7 +5540,7 @@ static int init_rescuer(struct workqueue_struct *wq)
wq->rescuer = rescuer;
if (wq->flags & WQ_UNBOUND)
- kthread_bind_mask(rescuer->task, wq_unbound_cpumask);
+ kthread_bind_mask(rescuer->task, unbound_effective_cpumask(wq));
else
kthread_bind_mask(rescuer->task, cpu_possible_mask);
wake_up_process(rescuer->task);
@@ -5733,21 +5688,14 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
goto err_unreg_lockdep;
}
- if (alloc_and_link_pwqs(wq) < 0)
- goto err_free_node_nr_active;
-
- if (wq_online && init_rescuer(wq) < 0)
- goto err_destroy;
-
- if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
- goto err_destroy;
-
/*
- * wq_pool_mutex protects global freeze state and workqueues list.
- * Grab it, adjust max_active and add the new @wq to workqueues
- * list.
+ * wq_pool_mutex protects the workqueues list, allocations of PWQs,
+ * and the global freeze state.
*/
- mutex_lock(&wq_pool_mutex);
+ apply_wqattrs_lock();
+
+ if (alloc_and_link_pwqs(wq) < 0)
+ goto err_unlock_free_node_nr_active;
mutex_lock(&wq->mutex);
wq_adjust_max_active(wq);
@@ -5755,13 +5703,27 @@ struct workqueue_struct *alloc_workqueue(const char *fmt,
list_add_tail_rcu(&wq->list, &workqueues);
- mutex_unlock(&wq_pool_mutex);
+ if (wq_online && init_rescuer(wq) < 0)
+ goto err_unlock_destroy;
+
+ apply_wqattrs_unlock();
+
+ if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
+ goto err_destroy;
return wq;
-err_free_node_nr_active:
- if (wq->flags & WQ_UNBOUND)
+err_unlock_free_node_nr_active:
+ apply_wqattrs_unlock();
+ /*
+ * Failed alloc_and_link_pwqs() may leave pending pwq->release_work,
+ * flushing the pwq_release_worker ensures that the pwq_release_workfn()
+ * completes before calling kfree(wq).
+ */
+ if (wq->flags & WQ_UNBOUND) {
+ kthread_flush_worker(pwq_release_worker);
free_node_nr_active(wq->node_nr_active);
+ }
err_unreg_lockdep:
wq_unregister_lockdep(wq);
wq_free_lockdep(wq);
@@ -5769,6 +5731,8 @@ err_free_wq:
free_workqueue_attrs(wq->unbound_attrs);
kfree(wq);
return NULL;
+err_unlock_destroy:
+ apply_wqattrs_unlock();
err_destroy:
destroy_workqueue(wq);
return NULL;
@@ -6607,6 +6571,8 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_lock(&wq_pool_mutex);
+ cpumask_set_cpu(cpu, wq_online_cpumask);
+
for_each_pool(pool, pi) {
/* BH pools aren't affected by hotplug */
if (pool->flags & POOL_BH)
@@ -6629,7 +6595,7 @@ int workqueue_online_cpu(unsigned int cpu)
int tcpu;
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
- wq_update_pod(wq, tcpu, cpu, true);
+ unbound_wq_update_pwq(wq, tcpu);
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, -1);
@@ -6653,6 +6619,9 @@ int workqueue_offline_cpu(unsigned int cpu)
/* update pod affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
+
+ cpumask_clear_cpu(cpu, wq_online_cpumask);
+
list_for_each_entry(wq, &workqueues, list) {
struct workqueue_attrs *attrs = wq->unbound_attrs;
@@ -6661,7 +6630,7 @@ int workqueue_offline_cpu(unsigned int cpu)
int tcpu;
for_each_cpu(tcpu, pt->pod_cpus[pt->cpu_pod[cpu]])
- wq_update_pod(wq, tcpu, cpu, false);
+ unbound_wq_update_pwq(wq, tcpu);
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, cpu);
@@ -6887,8 +6856,7 @@ static int workqueue_apply_unbound_cpumask(const cpumask_var_t unbound_cpumask)
* @exclude_cpumask: the cpumask to be excluded from wq_unbound_cpumask
*
* This function can be called from cpuset code to provide a set of isolated
- * CPUs that should be excluded from wq_unbound_cpumask. The caller must hold
- * either cpus_read_lock or cpus_write_lock.
+ * CPUs that should be excluded from wq_unbound_cpumask.
*/
int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
{
@@ -6898,12 +6866,8 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
return -ENOMEM;
- lockdep_assert_cpus_held();
mutex_lock(&wq_pool_mutex);
- /* Save the current isolated cpumask & export it via sysfs */
- cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
-
/*
* If the operation fails, it will fall back to
* wq_requested_unbound_cpumask which is initially set to
@@ -6915,6 +6879,10 @@ int workqueue_unbound_exclude_cpumask(cpumask_var_t exclude_cpumask)
if (!cpumask_equal(cpumask, wq_unbound_cpumask))
ret = workqueue_apply_unbound_cpumask(cpumask);
+ /* Save the current isolated cpumask & export it via sysfs */
+ if (!ret)
+ cpumask_copy(wq_isolated_cpumask, exclude_cpumask);
+
mutex_unlock(&wq_pool_mutex);
free_cpumask_var(cpumask);
return ret;
@@ -6948,9 +6916,8 @@ static int wq_affn_dfl_set(const char *val, const struct kernel_param *kp)
wq_affn_dfl = affn;
list_for_each_entry(wq, &workqueues, list) {
- for_each_online_cpu(cpu) {
- wq_update_pod(wq, cpu, cpu, true);
- }
+ for_each_online_cpu(cpu)
+ unbound_wq_update_pwq(wq, cpu);
}
mutex_unlock(&wq_pool_mutex);
@@ -7038,19 +7005,6 @@ static struct attribute *wq_sysfs_attrs[] = {
};
ATTRIBUTE_GROUPS(wq_sysfs);
-static void apply_wqattrs_lock(void)
-{
- /* CPUs should stay stable across pwq creations and installations */
- cpus_read_lock();
- mutex_lock(&wq_pool_mutex);
-}
-
-static void apply_wqattrs_unlock(void)
-{
- mutex_unlock(&wq_pool_mutex);
- cpus_read_unlock();
-}
-
static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -7249,16 +7203,12 @@ static int workqueue_set_unbound_cpumask(cpumask_var_t cpumask)
*/
cpumask_and(cpumask, cpumask, cpu_possible_mask);
if (!cpumask_empty(cpumask)) {
+ ret = 0;
apply_wqattrs_lock();
- cpumask_copy(wq_requested_unbound_cpumask, cpumask);
- if (cpumask_equal(cpumask, wq_unbound_cpumask)) {
- ret = 0;
- goto out_unlock;
- }
-
- ret = workqueue_apply_unbound_cpumask(cpumask);
-
-out_unlock:
+ if (!cpumask_equal(cpumask, wq_unbound_cpumask))
+ ret = workqueue_apply_unbound_cpumask(cpumask);
+ if (!ret)
+ cpumask_copy(wq_requested_unbound_cpumask, cpumask);
apply_wqattrs_unlock();
}
@@ -7577,10 +7527,18 @@ static void wq_watchdog_timer_fn(struct timer_list *unused)
notrace void wq_watchdog_touch(int cpu)
{
+ unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ;
+ unsigned long touch_ts = READ_ONCE(wq_watchdog_touched);
+ unsigned long now = jiffies;
+
if (cpu >= 0)
- per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies;
+ per_cpu(wq_watchdog_touched_cpu, cpu) = now;
+ else
+ WARN_ONCE(1, "%s should be called with valid CPU", __func__);
- wq_watchdog_touched = jiffies;
+ /* Don't unnecessarily store to global cacheline */
+ if (time_after(now, touch_ts + thresh / 4))
+ WRITE_ONCE(wq_watchdog_touched, jiffies);
}
static void wq_watchdog_set_thresh(unsigned long thresh)
@@ -7690,10 +7648,12 @@ void __init workqueue_init_early(void)
BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
+ BUG_ON(!alloc_cpumask_var(&wq_online_cpumask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&wq_requested_unbound_cpumask, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&wq_isolated_cpumask, GFP_KERNEL));
+ cpumask_copy(wq_online_cpumask, cpu_online_mask);
cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
@@ -7704,8 +7664,8 @@ void __init workqueue_init_early(void)
pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
- wq_update_pod_attrs_buf = alloc_workqueue_attrs();
- BUG_ON(!wq_update_pod_attrs_buf);
+ unbound_wq_update_pwq_attrs_buf = alloc_workqueue_attrs();
+ BUG_ON(!unbound_wq_update_pwq_attrs_buf);
/*
* If nohz_full is enabled, set power efficient workqueue as unbound.
@@ -7970,12 +7930,12 @@ void __init workqueue_init_topology(void)
/*
* Workqueues allocated earlier would have all CPUs sharing the default
- * worker pool. Explicitly call wq_update_pod() on all workqueue and CPU
- * combinations to apply per-pod sharing.
+ * worker pool. Explicitly call unbound_wq_update_pwq() on all workqueue
+ * and CPU combinations to apply per-pod sharing.
*/
list_for_each_entry(wq, &workqueues, list) {
for_each_online_cpu(cpu)
- wq_update_pod(wq, cpu, cpu, true);
+ unbound_wq_update_pwq(wq, cpu);
if (wq->flags & WQ_UNBOUND) {
mutex_lock(&wq->mutex);
wq_update_node_max_active(wq, -1);