summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 15:25:27 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2024-07-15 15:25:27 -0700
commit9855e873285f1395b9a04613c56101f04a5ec9fb (patch)
treef4208bc03ebcc3011feec1bf7534b5a85d7183fd /kernel
parent253e1e98180a124475327f2ce7b6f15f2e4d0d45 (diff)
parent02219caa92b5b0ed97f8d8b9cf580f6f34a9be31 (diff)
Merge tag 'rcu.2024.07.12a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu
Pull RCU updates from Paul McKenney: - Update Tasks RCU and Tasks Rude RCU description in Requirements.rst and clarify rcu_assign_pointer() and rcu_dereference() ordering properties - Add lockdep assertions for RCU readers, limit inline wakeups for callback-bypass synchronize_rcu(), add an rcutree.nohz_full_patience_delay to reduce nohz_full OS jitter, add Uladzislau Rezki as RCU maintainer, and fix a subtle callback-migration memory-ordering issue - Remove a number of redundant memory barriers - Remove unnecessary bypass-list lock-contention mitigation, use parking API instead of open-coded ad-hoc equivalent, and upgrade obsolete comments - Revert avoidance of a deadlock that can no longer occur and properly synchronize Tasks Trace RCU checking of runqueues - Add tests for handling of double-call_rcu() bug, add missing MODULE_DESCRIPTION, and add a script that histograms the number of calls to RCU updaters - Fill out SRCU polled-grace-period API * tag 'rcu.2024.07.12a' of git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu: (29 commits) rcu: Fix rcu_barrier() VS post CPUHP_TEARDOWN_CPU invocation rcu: Eliminate lockless accesses to rcu_sync->gp_count MAINTAINERS: Add Uladzislau Rezki as RCU maintainer rcu: Add rcutree.nohz_full_patience_delay to reduce nohz_full OS jitter rcu/exp: Remove redundant full memory barrier at the end of GP rcu: Remove full memory barrier on RCU stall printout rcu: Remove full memory barrier on boot time eqs sanity check rcu/exp: Remove superfluous full memory barrier upon first EQS snapshot rcu: Remove superfluous full memory barrier upon first EQS snapshot rcu: Remove full ordering on second EQS snapshot srcu: Fill out polled grace-period APIs srcu: Update cleanup_srcu_struct() comment srcu: Add NUM_ACTIVE_SRCU_POLL_OLDSTATE srcu: Disable interrupts directly in srcu_gp_end() rcu: Disable interrupts directly in rcu_gp_init() rcu/tree: Reduce wake up for synchronize_rcu() common case rcu/tasks: Fix stale task snaphot for Tasks Trace tools/rcu: Add rcu-updaters.sh script rcutorture: Add missing MODULE_DESCRIPTION() macros rcutorture: Fix rcu_torture_fwd_cb_cr() data race ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/pid_namespace.c17
-rw-r--r--kernel/rcu/rcuscale.c1
-rw-r--r--kernel/rcu/rcutorture.c49
-rw-r--r--kernel/rcu/refscale.c1
-rw-r--r--kernel/rcu/srcutiny.c3
-rw-r--r--kernel/rcu/srcutree.c13
-rw-r--r--kernel/rcu/sync.c12
-rw-r--r--kernel/rcu/tasks.h26
-rw-r--r--kernel/rcu/tree.c92
-rw-r--r--kernel/rcu/tree.h2
-rw-r--r--kernel/rcu/tree_exp.h24
-rw-r--r--kernel/rcu/tree_nocb.h147
-rw-r--r--kernel/rcu/tree_plugin.h14
-rw-r--r--kernel/rcu/tree_stall.h4
-rw-r--r--kernel/sched/core.c14
15 files changed, 207 insertions, 212 deletions
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 25f3cf679b35..bdf0087d6442 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -249,24 +249,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->pid_allocated == init_pids)
break;
- /*
- * Release tasks_rcu_exit_srcu to avoid following deadlock:
- *
- * 1) TASK A unshare(CLONE_NEWPID)
- * 2) TASK A fork() twice -> TASK B (child reaper for new ns)
- * and TASK C
- * 3) TASK B exits, kills TASK C, waits for TASK A to reap it
- * 4) TASK A calls synchronize_rcu_tasks()
- * -> synchronize_srcu(tasks_rcu_exit_srcu)
- * 5) *DEADLOCK*
- *
- * It is considered safe to release tasks_rcu_exit_srcu here
- * because we assume the current task can not be concurrently
- * reaped at this point.
- */
- exit_tasks_rcu_stop();
schedule();
- exit_tasks_rcu_start();
}
__set_current_state(TASK_RUNNING);
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index 8db4fedaaa1e..b53a9e8f5904 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -42,6 +42,7 @@
#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based scalability-test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com>");
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 807fbf6123a7..08bf7c669dd3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -51,6 +51,7 @@
#include "rcu.h"
+MODULE_DESCRIPTION("Read-Copy Update module-based torture test facility");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@linux.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
@@ -390,6 +391,7 @@ struct rcu_torture_ops {
int extendables;
int slow_gps;
int no_pi_lock;
+ int debug_objects;
const char *name;
};
@@ -577,6 +579,7 @@ static struct rcu_torture_ops rcu_ops = {
.irq_capable = 1,
.can_boost = IS_ENABLED(CONFIG_RCU_BOOST),
.extendables = RCUTORTURE_MAX_EXTEND,
+ .debug_objects = 1,
.name = "rcu"
};
@@ -747,6 +750,7 @@ static struct rcu_torture_ops srcu_ops = {
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
.name = "srcu"
};
@@ -786,6 +790,7 @@ static struct rcu_torture_ops srcud_ops = {
.cbflood_max = 50000,
.irq_capable = 1,
.no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU),
+ .debug_objects = 1,
.name = "srcud"
};
@@ -2626,7 +2631,7 @@ static void rcu_torture_fwd_cb_cr(struct rcu_head *rhp)
spin_lock_irqsave(&rfp->rcu_fwd_lock, flags);
rfcpp = rfp->rcu_fwd_cb_tail;
rfp->rcu_fwd_cb_tail = &rfcp->rfc_next;
- WRITE_ONCE(*rfcpp, rfcp);
+ smp_store_release(rfcpp, rfcp);
WRITE_ONCE(rfp->n_launders_cb, rfp->n_launders_cb + 1);
i = ((jiffies - rfp->rcu_fwd_startat) / (HZ / FWD_CBS_HIST_DIV));
if (i >= ARRAY_SIZE(rfp->n_launders_hist))
@@ -3455,7 +3460,6 @@ rcu_torture_cleanup(void)
cur_ops->gp_slow_unregister(NULL);
}
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
static void rcu_torture_leak_cb(struct rcu_head *rhp)
{
}
@@ -3473,7 +3477,6 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
pr_alert("%s: duplicated callback was invoked.\n", KBUILD_MODNAME);
}
-#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
/*
* Verify that double-free causes debug-objects to complain, but only
@@ -3482,39 +3485,43 @@ static void rcu_torture_err_cb(struct rcu_head *rhp)
*/
static void rcu_test_debug_objects(void)
{
-#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
struct rcu_head rh1;
struct rcu_head rh2;
+ int idx;
+
+ if (!IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) {
+ pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_%s()\n",
+ KBUILD_MODNAME, cur_ops->name);
+ return;
+ }
+
+ if (WARN_ON_ONCE(cur_ops->debug_objects &&
+ (!cur_ops->call || !cur_ops->cb_barrier)))
+ return;
+
struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL);
init_rcu_head_on_stack(&rh1);
init_rcu_head_on_stack(&rh2);
- pr_alert("%s: WARN: Duplicate call_rcu() test starting.\n", KBUILD_MODNAME);
+ pr_alert("%s: WARN: Duplicate call_%s() test starting.\n", KBUILD_MODNAME, cur_ops->name);
/* Try to queue the rh2 pair of callbacks for the same grace period. */
- preempt_disable(); /* Prevent preemption from interrupting test. */
- rcu_read_lock(); /* Make it impossible to finish a grace period. */
- call_rcu_hurry(&rh1, rcu_torture_leak_cb); /* Start grace period. */
- local_irq_disable(); /* Make it harder to start a new grace period. */
- call_rcu_hurry(&rh2, rcu_torture_leak_cb);
- call_rcu_hurry(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
+ idx = cur_ops->readlock(); /* Make it impossible to finish a grace period. */
+ cur_ops->call(&rh1, rcu_torture_leak_cb); /* Start grace period. */
+ cur_ops->call(&rh2, rcu_torture_leak_cb);
+ cur_ops->call(&rh2, rcu_torture_err_cb); /* Duplicate callback. */
if (rhp) {
- call_rcu_hurry(rhp, rcu_torture_leak_cb);
- call_rcu_hurry(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
+ cur_ops->call(rhp, rcu_torture_leak_cb);
+ cur_ops->call(rhp, rcu_torture_err_cb); /* Another duplicate callback. */
}
- local_irq_enable();
- rcu_read_unlock();
- preempt_enable();
+ cur_ops->readunlock(idx);
/* Wait for them all to get done so we can safely return. */
- rcu_barrier();
- pr_alert("%s: WARN: Duplicate call_rcu() test complete.\n", KBUILD_MODNAME);
+ cur_ops->cb_barrier();
+ pr_alert("%s: WARN: Duplicate call_%s() test complete.\n", KBUILD_MODNAME, cur_ops->name);
destroy_rcu_head_on_stack(&rh1);
destroy_rcu_head_on_stack(&rh2);
kfree(rhp);
-#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
- pr_alert("%s: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n", KBUILD_MODNAME);
-#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
}
static void rcutorture_sync(void)
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index 2c2648a3ad30..f4ea5b1ec068 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -63,6 +63,7 @@ do { \
#define SCALEOUT_ERRSTRING(s, x...) pr_alert("%s" SCALE_FLAG "!!! " s "\n", scale_type, ## x)
+MODULE_DESCRIPTION("Scalability test for object reference mechanisms");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Joel Fernandes (Google) <joel@joelfernandes.org>");
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 5afd5cf494db..549c03336ee9 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -277,7 +277,8 @@ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
unsigned long cur_s = READ_ONCE(ssp->srcu_idx);
barrier();
- return ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
+ return cookie == SRCU_GET_STATE_COMPLETED ||
+ ULONG_CMP_GE(cur_s, cookie) || ULONG_CMP_LT(cur_s, cookie - 3);
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index bc4b58b0204e..b24db425f16d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -667,7 +667,10 @@ void cleanup_srcu_struct(struct srcu_struct *ssp)
pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
__func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
- return; /* Caller forgot to stop doing call_srcu()? */
+ return; // Caller forgot to stop doing call_srcu()?
+ // Or caller invoked start_poll_synchronize_srcu()
+ // and then cleanup_srcu_struct() before that grace
+ // period ended?
}
kfree(sup->node);
sup->node = NULL;
@@ -845,7 +848,6 @@ static void srcu_gp_end(struct srcu_struct *ssp)
bool cbs;
bool last_lvl;
int cpu;
- unsigned long flags;
unsigned long gpseq;
int idx;
unsigned long mask;
@@ -907,12 +909,12 @@ static void srcu_gp_end(struct srcu_struct *ssp)
if (!(gpseq & counter_wrap_check))
for_each_possible_cpu(cpu) {
sdp = per_cpu_ptr(ssp->sda, cpu);
- spin_lock_irqsave_rcu_node(sdp, flags);
+ spin_lock_irq_rcu_node(sdp);
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
sdp->srcu_gp_seq_needed = gpseq;
if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
sdp->srcu_gp_seq_needed_exp = gpseq;
- spin_unlock_irqrestore_rcu_node(sdp, flags);
+ spin_unlock_irq_rcu_node(sdp);
}
/* Callback initiation done, allow grace periods after next. */
@@ -1540,7 +1542,8 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
*/
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
- if (!rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
+ if (cookie != SRCU_GET_STATE_COMPLETED &&
+ !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
return false;
// Ensure that the end of the SRCU grace period happens before
// any subsequent code that the caller might execute.
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 6c2bd9001adc..da60a9947c00 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
* we are called at early boot time but this shouldn't happen.
*/
}
- WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1);
+ rsp->gp_count++;
spin_unlock_irq(&rsp->rss_lock);
if (gp_state == GP_IDLE) {
@@ -151,15 +151,11 @@ void rcu_sync_enter(struct rcu_sync *rsp)
*/
void rcu_sync_exit(struct rcu_sync *rsp)
{
- int gpc;
-
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
- WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
spin_lock_irq(&rsp->rss_lock);
- gpc = rsp->gp_count - 1;
- WRITE_ONCE(rsp->gp_count, gpc);
- if (!gpc) {
+ WARN_ON_ONCE(rsp->gp_count == 0);
+ if (!--rsp->gp_count) {
if (rsp->gp_state == GP_PASSED) {
WRITE_ONCE(rsp->gp_state, GP_EXIT);
rcu_sync_call(rsp);
@@ -178,10 +174,10 @@ void rcu_sync_dtor(struct rcu_sync *rsp)
{
int gp_state;
- WARN_ON_ONCE(READ_ONCE(rsp->gp_count));
WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_PASSED);
spin_lock_irq(&rsp->rss_lock);
+ WARN_ON_ONCE(rsp->gp_count);
if (rsp->gp_state == GP_REPLAY)
WRITE_ONCE(rsp->gp_state, GP_EXIT);
gp_state = rsp->gp_state;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index e1bf33018e6d..ba3440a45b6d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -858,7 +858,7 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp)
// not know to synchronize with this RCU Tasks grace period) have
// completed exiting. The synchronize_rcu() in rcu_tasks_postgp()
// will take care of any tasks stuck in the non-preemptible region
-// of do_exit() following its call to exit_tasks_rcu_stop().
+// of do_exit() following its call to exit_tasks_rcu_finish().
// check_all_holdout_tasks(), repeatedly until holdout list is empty:
// Scans the holdout list, attempting to identify a quiescent state
// for each task on the list. If there is a quiescent state, the
@@ -1220,7 +1220,7 @@ void exit_tasks_rcu_start(void)
* Remove the task from the "yet another list" because do_exit() is now
* non-preemptible, allowing synchronize_rcu() to wait beyond this point.
*/
-void exit_tasks_rcu_stop(void)
+void exit_tasks_rcu_finish(void)
{
unsigned long flags;
struct rcu_tasks_percpu *rtpcp;
@@ -1231,22 +1231,12 @@ void exit_tasks_rcu_stop(void)
raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
list_del_init(&t->rcu_tasks_exit_list);
raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
-}
-/*
- * Contribute to protect against tasklist scan blind spot while the
- * task is exiting and may be removed from the tasklist. See
- * corresponding synchronize_srcu() for further details.
- */
-void exit_tasks_rcu_finish(void)
-{
- exit_tasks_rcu_stop();
- exit_tasks_rcu_finish_trace(current);
+ exit_tasks_rcu_finish_trace(t);
}
#else /* #ifdef CONFIG_TASKS_RCU */
void exit_tasks_rcu_start(void) { }
-void exit_tasks_rcu_stop(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
@@ -1757,6 +1747,16 @@ static void rcu_tasks_trace_pregp_step(struct list_head *hop)
// allow safe access to the hop list.
for_each_online_cpu(cpu) {
rcu_read_lock();
+ // Note that cpu_curr_snapshot() picks up the target
+ // CPU's current task while its runqueue is locked with
+ // an smp_mb__after_spinlock(). This ensures that either
+ // the grace-period kthread will see that task's read-side
+ // critical section or the task will see the updater's pre-GP
+ // accesses. The trailing smp_mb() in cpu_curr_snapshot()
+ // does not currently play a role other than simplify
+ // that function's ordering semantics. If these simplified
+ // ordering semantics continue to be redundant, that smp_mb()
+ // might be removed.
t = cpu_curr_snapshot(cpu);
if (rcu_tasks_trace_pertask_prep(t, true))
trc_add_holdout(t, hop);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 28c7031711a3..e641cc681901 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -96,6 +96,7 @@ static struct rcu_state rcu_state = {
.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
.srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
rcu_sr_normal_gp_cleanup_work),
+ .srs_cleanups_pending = ATOMIC_INIT(0),
};
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -175,6 +176,9 @@ static int gp_init_delay;
module_param(gp_init_delay, int, 0444);
static int gp_cleanup_delay;
module_param(gp_cleanup_delay, int, 0444);
+static int nohz_full_patience_delay;
+module_param(nohz_full_patience_delay, int, 0444);
+static int nohz_full_patience_delay_jiffies;
// Add delay to rcu_read_unlock() for strict grace periods.
static int rcu_unlock_delay;
@@ -296,16 +300,6 @@ static void rcu_dynticks_eqs_online(void)
}
/*
- * Snapshot the ->dynticks counter with full ordering so as to allow
- * stable comparison of this counter with past and future snapshots.
- */
-static int rcu_dynticks_snap(int cpu)
-{
- smp_mb(); // Fundamental RCU ordering guarantee.
- return ct_dynticks_cpu_acquire(cpu);
-}
-
-/*
* Return true if the snapshot returned from rcu_dynticks_snap()
* indicates that RCU is in an extended quiescent state.
*/
@@ -321,7 +315,15 @@ static bool rcu_dynticks_in_eqs(int snap)
*/
static bool rcu_dynticks_in_eqs_since(struct rcu_data *rdp, int snap)
{
- return snap != rcu_dynticks_snap(rdp->cpu);
+ /*
+ * The first failing snapshot is already ordered against the accesses
+ * performed by the remote CPU after it exits idle.
+ *
+ * The second snapshot therefore only needs to order against accesses
+ * performed by the remote CPU prior to entering idle and therefore can
+ * rely solely on acquire semantics.
+ */
+ return snap != ct_dynticks_cpu_acquire(rdp->cpu);
}
/*
@@ -769,7 +771,18 @@ static void rcu_gpnum_ovf(struct rcu_node *rnp, struct rcu_data *rdp)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- rdp->dynticks_snap = rcu_dynticks_snap(rdp->cpu);
+ /*
+ * Full ordering between remote CPU's post idle accesses and updater's
+ * accesses prior to current GP (and also the started GP sequence number)
+ * is enforced by rcu_seq_start() implicit barrier and even further by
+ * smp_mb__after_unlock_lock() barriers chained all the way throughout the
+ * rnp locking tree since rcu_gp_init() and up to the current leaf rnp
+ * locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and post grace period
+ * updater's accesses is enforced by the below acquire semantic.
+ */
+ rdp->dynticks_snap = ct_dynticks_cpu_acquire(rdp->cpu);
if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
trace_rcu_fqs(rcu_state.name, rdp->gp_seq, rdp->cpu, TPS("dti"));
rcu_gpnum_ovf(rdp->mynode, rdp);
@@ -1660,6 +1673,9 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
rcu_sr_put_wait_head(rcu);
}
+
+ /* Order list manipulations with atomic access. */
+ atomic_dec_return_release(&rcu_state.srs_cleanups_pending);
}
/*
@@ -1667,7 +1683,7 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
*/
static void rcu_sr_normal_gp_cleanup(void)
{
- struct llist_node *wait_tail, *next, *rcu;
+ struct llist_node *wait_tail, *next = NULL, *rcu = NULL;
int done = 0;
wait_tail = rcu_state.srs_wait_tail;
@@ -1693,16 +1709,34 @@ static void rcu_sr_normal_gp_cleanup(void)
break;
}
- // concurrent sr_normal_gp_cleanup work might observe this update.
- smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+ /*
+ * Fast path, no more users to process except putting the second last
+ * wait head if no inflight-workers. If there are in-flight workers,
+ * they will remove the last wait head.
+ *
+ * Note that the ACQUIRE orders atomic access with list manipulation.
+ */
+ if (wait_tail->next && wait_tail->next->next == NULL &&
+ rcu_sr_is_wait_head(wait_tail->next) &&
+ !atomic_read_acquire(&rcu_state.srs_cleanups_pending)) {
+ rcu_sr_put_wait_head(wait_tail->next);
+ wait_tail->next = NULL;
+ }
+
+ /* Concurrent sr_normal_gp_cleanup work might observe this update. */
ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+ smp_store_release(&rcu_state.srs_done_tail, wait_tail);
/*
* We schedule a work in order to perform a final processing
* of outstanding users(if still left) and releasing wait-heads
* added by rcu_sr_normal_gp_init() call.
*/
- queue_work(sync_wq, &rcu_state.srs_cleanup_work);
+ if (wait_tail->next) {
+ atomic_inc(&rcu_state.srs_cleanups_pending);
+ if (!queue_work(sync_wq, &rcu_state.srs_cleanup_work))
+ atomic_dec(&rcu_state.srs_cleanups_pending);
+ }
}
/*
@@ -1810,7 +1844,7 @@ static noinline_for_stack bool rcu_gp_init(void)
WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF);
/* Exclude CPU hotplug operations. */
rcu_for_each_leaf_node(rnp) {
- local_irq_save(flags);
+ local_irq_disable();
arch_spin_lock(&rcu_state.ofl_lock);
raw_spin_lock_rcu_node(rnp);
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
@@ -1818,7 +1852,7 @@ static noinline_for_stack bool rcu_gp_init(void)
/* Nothing to do on this leaf rcu_node structure. */
raw_spin_unlock_rcu_node(rnp);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ local_irq_enable();
continue;
}
@@ -1855,7 +1889,7 @@ static noinline_for_stack bool rcu_gp_init(void)
raw_spin_unlock_rcu_node(rnp);
arch_spin_unlock(&rcu_state.ofl_lock);
- local_irq_restore(flags);
+ local_irq_enable();
}
rcu_gp_slow(gp_preinit_delay); /* Races with CPU hotplug. */
@@ -4313,11 +4347,15 @@ static int rcu_pending(int user)
return 1;
/* Is this a nohz_full CPU in userspace or idle? (Ignore RCU if so.) */
- if ((user || rcu_is_cpu_rrupt_from_idle()) && rcu_nohz_full_cpu())
+ gp_in_progress = rcu_gp_in_progress();
+ if ((user || rcu_is_cpu_rrupt_from_idle() ||
+ (gp_in_progress &&
+ time_before(jiffies, READ_ONCE(rcu_state.gp_start) +
+ nohz_full_patience_delay_jiffies))) &&
+ rcu_nohz_full_cpu())
return 0;
/* Is the RCU core waiting for a quiescent state from this CPU? */
- gp_in_progress = rcu_gp_in_progress();
if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm && gp_in_progress)
return 1;
@@ -4767,7 +4805,7 @@ rcu_boot_init_percpu_data(int cpu)
rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu);
INIT_WORK(&rdp->strict_work, strict_work_handler);
WARN_ON_ONCE(ct->dynticks_nesting != 1);
- WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
+ WARN_ON_ONCE(rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu)));
rdp->barrier_seq_snap = rcu_state.barrier_sequence;
rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
@@ -5110,11 +5148,15 @@ void rcutree_migrate_callbacks(int cpu)
struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
bool needwake;
- if (rcu_rdp_is_offloaded(rdp) ||
- rcu_segcblist_empty(&rdp->cblist))
- return; /* No callbacks to migrate. */
+ if (rcu_rdp_is_offloaded(rdp))
+ return;
raw_spin_lock_irqsave(&rcu_state.barrier_lock, flags);
+ if (rcu_segcblist_empty(&rdp->cblist)) {
+ raw_spin_unlock_irqrestore(&rcu_state.barrier_lock, flags);
+ return; /* No callbacks to migrate. */
+ }
+
WARN_ON_ONCE(rcu_rdp_cpu_online(rdp));
rcu_barrier_entrain(rdp);
my_rdp = this_cpu_ptr(&rcu_data);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index bae7925c497f..fcf2b4aa3441 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -223,7 +223,6 @@ struct rcu_data {
struct swait_queue_head nocb_state_wq; /* For offloading state changes */
struct task_struct *nocb_gp_kthread;
raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
- atomic_t nocb_lock_contended; /* Contention experienced. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
struct timer_list nocb_timer; /* Enforce finite deferral. */
unsigned long nocb_gp_adv_time; /* Last call_rcu() CB adv (jiffies). */
@@ -420,6 +419,7 @@ struct rcu_state {
struct llist_node *srs_done_tail; /* ready for GP users. */
struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
struct work_struct srs_cleanup_work;
+ atomic_t srs_cleanups_pending; /* srs inflight worker cleanups. */
};
/* Values for rcu_state structure's gp_flags field. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 8a1d9c8bd9f7..4acd29d16fdb 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -265,7 +265,12 @@ static bool sync_exp_work_done(unsigned long s)
{
if (rcu_exp_gp_seq_done(s)) {
trace_rcu_exp_grace_period(rcu_state.name, s, TPS("done"));
- smp_mb(); /* Ensure test happens before caller kfree(). */
+ /*
+ * Order GP completion with preceding accesses. Order also GP
+ * completion with post GP update side accesses. Pairs with
+ * rcu_seq_end().
+ */
+ smp_mb();
return true;
}
return false;
@@ -357,7 +362,21 @@ static void __sync_rcu_exp_select_node_cpus(struct rcu_exp_work *rewp)
!(rnp->qsmaskinitnext & mask)) {
mask_ofl_test |= mask;
} else {
- snap = rcu_dynticks_snap(cpu);
+ /*
+ * Full ordering between remote CPU's post idle accesses
+ * and updater's accesses prior to current GP (and also
+ * the started GP sequence number) is enforced by
+ * rcu_seq_start() implicit barrier, relayed by kworkers
+ * locking and even further by smp_mb__after_unlock_lock()
+ * barriers chained all the way throughout the rnp locking
+ * tree since sync_exp_reset_tree() and up to the current
+ * leaf rnp locking.
+ *
+ * Ordering between remote CPU's pre idle accesses and
+ * post grace period updater's accesses is enforced by the
+ * below acquire semantic.
+ */
+ snap = ct_dynticks_cpu_acquire(cpu);
if (rcu_dynticks_in_eqs(snap))
mask_ofl_test |= mask;
else
@@ -953,7 +972,6 @@ void synchronize_rcu_expedited(void)
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
sync_exp_work_done(s));
- smp_mb(); /* Work actions happen before return. */
/* Let the next expedited grace period start. */
mutex_unlock(&rcu_state.exp_mutex);
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
index 3f85577bddd4..3ce30841119a 100644
--- a/kernel/rcu/tree_nocb.h
+++ b/kernel/rcu/tree_nocb.h
@@ -91,8 +91,7 @@ module_param(nocb_nobypass_lim_per_jiffy, int, 0);
/*
* Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
+ * lock isn't immediately available, perform minimal sanity check.
*/
static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
__acquires(&rdp->nocb_bypass_lock)
@@ -100,29 +99,12 @@ static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
lockdep_assert_irqs_disabled();
if (raw_spin_trylock(&rdp->nocb_bypass_lock))
return;
- atomic_inc(&rdp->nocb_lock_contended);
+ /*
+ * Contention expected only when local enqueue collide with
+ * remote flush from kthreads.
+ */
WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
}
/*
@@ -510,7 +492,6 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
}
// We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
rcu_nocb_bypass_lock(rdp);
ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
@@ -635,8 +616,7 @@ static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
}
}
-static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
- bool *wake_state)
+static int nocb_gp_toggle_rdp(struct rcu_data *rdp)
{
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long flags;
@@ -650,8 +630,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
* We will handle this rdp until it ever gets de-offloaded.
*/
rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *wake_state = true;
ret = 1;
} else if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
@@ -660,8 +638,6 @@ static int nocb_gp_toggle_rdp(struct rcu_data *rdp,
* We will ignore this rdp until it ever gets re-offloaded.
*/
rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *wake_state = true;
ret = 0;
} else {
WARN_ON_ONCE(1);
@@ -877,16 +853,15 @@ static void nocb_gp_wait(struct rcu_data *my_rdp)
}
if (rdp_toggling) {
- bool wake_state = false;
int ret;
- ret = nocb_gp_toggle_rdp(rdp_toggling, &wake_state);
+ ret = nocb_gp_toggle_rdp(rdp_toggling);
if (ret == 1)
list_add_tail(&rdp_toggling->nocb_entry_rdp, &my_rdp->nocb_head_rdp);
else if (ret == 0)
list_del(&rdp_toggling->nocb_entry_rdp);
- if (wake_state)
- swake_up_one(&rdp_toggling->nocb_state_wq);
+
+ swake_up_one(&rdp_toggling->nocb_state_wq);
}
my_rdp->nocb_gp_seq = -1;
@@ -913,16 +888,9 @@ static int rcu_nocb_gp_kthread(void *arg)
return 0;
}
-static inline bool nocb_cb_can_run(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
{
- return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+ return !READ_ONCE(rdp->nocb_cb_sleep) || kthread_should_park();
}
/*
@@ -934,21 +902,19 @@ static void nocb_cb_wait(struct rcu_data *rdp)
struct rcu_segcblist *cblist = &rdp->cblist;
unsigned long cur_gp_seq;
unsigned long flags;
- bool needwake_state = false;
bool needwake_gp = false;
- bool can_sleep = true;
struct rcu_node *rnp = rdp->mynode;
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- if (READ_ONCE(rdp->nocb_cb_sleep)) {
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+ if (kthread_should_park()) {
+ kthread_parkme();
+ } else if (READ_ONCE(rdp->nocb_cb_sleep)) {
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
local_irq_save(flags);
rcu_momentary_dyntick_idle();
@@ -971,37 +937,16 @@ static void nocb_cb_wait(struct rcu_data *rdp)
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
- if (rcu_segcblist_ready_cbs(cblist))
- can_sleep = false;
+ if (!rcu_segcblist_ready_cbs(cblist)) {
+ WRITE_ONCE(rdp->nocb_cb_sleep, true);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
} else {
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We won't touch the callbacks and keep sleeping until we ever
- * get re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
}
- WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
-
- if (rdp->nocb_cb_sleep)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
-
rcu_nocb_unlock_irqrestore(rdp, flags);
if (needwake_gp)
rcu_gp_kthread_wake();
-
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
}
/*
@@ -1094,17 +1039,8 @@ static int rdp_offload_toggle(struct rcu_data *rdp,
bool wake_gp = false;
rcu_segcblist_offload(cblist, offload);
-
- if (rdp->nocb_cb_sleep)
- rdp->nocb_cb_sleep = false;
rcu_nocb_unlock_irqrestore(rdp, flags);
- /*
- * Ignore former value of nocb_cb_sleep and force wake up as it could
- * have been spuriously set to false already.
- */
- swake_up_one(&rdp->nocb_cb_wq);
-
raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
// Queue this rdp for add/del to/from the list to iterate on rcuog
WRITE_ONCE(rdp_gp->nocb_toggling_rdp, rdp);
@@ -1161,19 +1097,11 @@ static long rcu_nocb_rdp_deoffload(void *arg)
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
- /*
- * If rcuo[p] kthread spawn failed, directly remove SEGCBLIST_KTHREAD_CB.
- * Just wait SEGCBLIST_KTHREAD_GP to be cleared by rcuog.
- */
- if (!rdp->nocb_cb_kthread) {
- rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
-
swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist,
- SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP));
+ !rcu_segcblist_test_flags(cblist,
+ SEGCBLIST_KTHREAD_GP));
+ if (rdp->nocb_cb_kthread)
+ kthread_park(rdp->nocb_cb_kthread);
} else {
/*
* No kthread to clear the flags for us or remove the rdp from the nocb list
@@ -1181,8 +1109,7 @@ static long rcu_nocb_rdp_deoffload(void *arg)
* but we stick to paranoia in this rare path.
*/
rcu_nocb_lock_irqsave(rdp, flags);
- rcu_segcblist_clear_flags(&rdp->cblist,
- SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
rcu_nocb_unlock_irqrestore(rdp, flags);
list_del(&rdp->nocb_entry_rdp);
@@ -1282,8 +1209,10 @@ static long rcu_nocb_rdp_offload(void *arg)
wake_gp = rdp_offload_toggle(rdp, true, flags);
if (wake_gp)
wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ kthread_unpark(rdp->nocb_cb_kthread);
+
swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
/*
@@ -1468,7 +1397,7 @@ void __init rcu_init_nohz(void)
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE);
}
rcu_organize_nocb_kthreads();
@@ -1526,11 +1455,16 @@ static void rcu_spawn_cpu_nocb_kthread(int cpu)
mutex_unlock(&rdp_gp->nocb_gp_kthread_mutex);
/* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
+ t = kthread_create(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
goto end;
+ if (rcu_rdp_is_offloaded(rdp))
+ wake_up_process(t);
+ else
+ kthread_park(t);
+
if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_CB_BOOST) && kthread_prio)
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
@@ -1678,12 +1612,11 @@ static void show_rcu_nocb_state(struct rcu_data *rdp)
sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ pr_info(" CB %d^%d->%d %c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
rdp->cpu, rdp->nocb_gp_rdp->cpu,
nocb_next_rdp ? nocb_next_rdp->cpu : -1,
"kK"[!!rdp->nocb_cb_kthread],
"bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
"lL"[raw_spin_is_locked(&rdp->nocb_lock)],
"sS"[!!rdp->nocb_cb_sleep],
".W"[swait_active(&rdp->nocb_cb_wq)],
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 340bbefe5f65..c569da65b421 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,8 +28,8 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
!(lockdep_is_held(&rcu_state.barrier_mutex) ||
(IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
rcu_lockdep_is_held_nocb(rdp) ||
- (rdp == this_cpu_ptr(&rcu_data) &&
- !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
+ (!(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible()) &&
+ rdp == this_cpu_ptr(&rcu_data)) ||
rcu_current_is_nocb_kthread(rdp)),
"Unsafe read of RCU_NOCB offloaded state"
);
@@ -93,6 +93,16 @@ static void __init rcu_bootup_announce_oddness(void)
pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
if (gp_cleanup_delay)
pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
+ if (nohz_full_patience_delay < 0) {
+ pr_info("\tRCU NOCB CPU patience negative (%d), resetting to zero.\n", nohz_full_patience_delay);
+ nohz_full_patience_delay = 0;
+ } else if (nohz_full_patience_delay > 5 * MSEC_PER_SEC) {
+ pr_info("\tRCU NOCB CPU patience too large (%d), resetting to %ld.\n", nohz_full_patience_delay, 5 * MSEC_PER_SEC);
+ nohz_full_patience_delay = 5 * MSEC_PER_SEC;
+ } else if (nohz_full_patience_delay) {
+ pr_info("\tRCU NOCB CPU patience set to %d milliseconds.\n", nohz_full_patience_delay);
+ }
+ nohz_full_patience_delay_jiffies = msecs_to_jiffies(nohz_full_patience_delay);
if (!use_softirq)
pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 460efecd077b..4b0e9d7c4c68 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -501,7 +501,7 @@ static void print_cpu_stall_info(int cpu)
}
delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq);
falsepositive = rcu_is_gp_kthread_starving(NULL) &&
- rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
+ rcu_dynticks_in_eqs(ct_dynticks_cpu(cpu));
rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
if (rcuc_starved)
// Print signed value, as negative values indicate a probable bug.
@@ -515,7 +515,7 @@ static void print_cpu_stall_info(int cpu)
rdp->rcu_iw_pending ? (int)min(delta, 9UL) + '0' :
"!."[!delta],
ticks_value, ticks_title,
- rcu_dynticks_snap(cpu) & 0xffff,
+ ct_dynticks_cpu(cpu) & 0xffff,
ct_dynticks_nesting_cpu(cpu), ct_dynticks_nmi_nesting_cpu(cpu),
rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 59ce0841eb1f..35a35e36024b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4466,12 +4466,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
* @cpu: The CPU on which to snapshot the task.
*
* Returns the task_struct pointer of the task "currently" running on
- * the specified CPU. If the same task is running on that CPU throughout,
- * the return value will be a pointer to that task's task_struct structure.
- * If the CPU did any context switches even vaguely concurrently with the
- * execution of this function, the return value will be a pointer to the
- * task_struct structure of a randomly chosen task that was running on
- * that CPU somewhere around the time that this function was executing.
+ * the specified CPU.
*
* If the specified CPU was offline, the return value is whatever it
* is, perhaps a pointer to the task_struct structure of that CPU's idle
@@ -4485,11 +4480,16 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
*/
struct task_struct *cpu_curr_snapshot(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
struct task_struct *t;
+ struct rq_flags rf;
- smp_mb(); /* Pairing determined by caller's synchronization design. */
+ rq_lock_irqsave(rq, &rf);
+ smp_mb__after_spinlock(); /* Pairing determined by caller's synchronization design. */
t = rcu_dereference(cpu_curr(cpu));
+ rq_unlock_irqrestore(rq, &rf);
smp_mb(); /* Pairing determined by caller's synchronization design. */
+
return t;
}