diff options
Diffstat (limited to 'kernel/rcu')
-rw-r--r-- | kernel/rcu/Kconfig | 20 | ||||
-rw-r--r-- | kernel/rcu/rcu_segcblist.c | 10 | ||||
-rw-r--r-- | kernel/rcu/rcu_segcblist.h | 12 | ||||
-rw-r--r-- | kernel/rcu/srcutiny.c | 2 | ||||
-rw-r--r-- | kernel/rcu/tasks.h | 476 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 131 | ||||
-rw-r--r-- | kernel/rcu/tree.h | 31 | ||||
-rw-r--r-- | kernel/rcu/tree_exp.h | 14 | ||||
-rw-r--r-- | kernel/rcu/tree_nocb.h | 160 | ||||
-rw-r--r-- | kernel/rcu/tree_plugin.h | 250 | ||||
-rw-r--r-- | kernel/rcu/tree_stall.h | 27 |
11 files changed, 658 insertions, 475 deletions
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 3128b7cf8e1f..bf8e341e75b4 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -112,7 +112,7 @@ config RCU_STALL_COMMON making these warnings mandatory for the tree variants. config RCU_NEED_SEGCBLIST - def_bool ( TREE_RCU || TREE_SRCU ) + def_bool ( TREE_RCU || TREE_SRCU || TASKS_RCU_GENERIC ) config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" @@ -169,24 +169,6 @@ config RCU_FANOUT_LEAF Take the default if unsure. -config RCU_FAST_NO_HZ - bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ_COMMON && SMP && RCU_EXPERT - default n - help - This option permits CPUs to enter dynticks-idle state even if - they have RCU callbacks queued, and prevents RCU from waking - these CPUs up more than roughly once every four jiffies (by - default, you can adjust this using the rcutree.rcu_idle_gp_delay - parameter), thus improving energy efficiency. On the other - hand, this option increases the duration of RCU grace periods, - for example, slowing down synchronize_rcu(). - - Say Y if energy efficiency is critically important, and you - don't care about increased grace-period durations. - - Say N if you are unsure. - config RCU_BOOST bool "Enable RCU priority boosting" depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index aaa111237b60..81145c3ece25 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -261,16 +261,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) } /* - * Mark the specified rcu_segcblist structure as offloaded. + * Mark the specified rcu_segcblist structure as offloaded (or not) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { - if (offload) { - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); - rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); - } else { + if (offload) + rcu_segcblist_set_flags(rsclp, SEGCBLIST_LOCKING | SEGCBLIST_OFFLOADED); + else rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); - } } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 9a19328ff251..e373fbe44da5 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -80,11 +80,14 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } -/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */ +/* + * Is the specified rcu_segcblist NOCB offloaded (or in the middle of the + * [de]offloading process)? + */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + rcu_segcblist_test_flags(rsclp, SEGCBLIST_LOCKING)) return true; return false; @@ -92,9 +95,8 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) { - int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED; - - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + !rcu_segcblist_test_flags(rsclp, SEGCBLIST_RCU_CORE)) return true; return false; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index a0ba2ed49bc6..92c002d65482 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -99,7 +99,7 @@ void __srcu_read_unlock(struct srcu_struct *ssp, int idx) int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); - if (!newval && READ_ONCE(ssp->srcu_gp_waiting)) + if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task()) swake_up_one(&ssp->srcu_wq); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 7da3c81c3f59..84f1d91604cc 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -6,6 +6,7 @@ */ #ifdef CONFIG_TASKS_RCU_GENERIC +#include "rcu_segcblist.h" //////////////////////////////////////////////////////////////////////// // @@ -20,11 +21,33 @@ typedef void (*holdouts_func_t)(struct list_head *hop, bool ndrpt, bool *frptp); typedef void (*postgp_func_t)(struct rcu_tasks *rtp); /** + * struct rcu_tasks_percpu - Per-CPU component of definition for a Tasks-RCU-like mechanism. + * @cblist: Callback list. + * @lock: Lock protecting per-CPU callback list. + * @rtp_jiffies: Jiffies counter value for statistics. + * @rtp_n_lock_retries: Rough lock-contention statistic. + * @rtp_work: Work queue for invoking callbacks. + * @rtp_irq_work: IRQ work queue for deferred wakeups. + * @barrier_q_head: RCU callback for barrier operation. + * @cpu: CPU number corresponding to this entry. + * @rtpp: Pointer to the rcu_tasks structure. + */ +struct rcu_tasks_percpu { + struct rcu_segcblist cblist; + raw_spinlock_t __private lock; + unsigned long rtp_jiffies; + unsigned long rtp_n_lock_retries; + struct work_struct rtp_work; + struct irq_work rtp_irq_work; + struct rcu_head barrier_q_head; + int cpu; + struct rcu_tasks *rtpp; +}; + +/** * struct rcu_tasks - Definition for a Tasks-RCU-like mechanism. - * @cbs_head: Head of callback list. - * @cbs_tail: Tail pointer for callback list. * @cbs_wq: Wait queue allowing new callback to get kthread's attention. - * @cbs_lock: Lock protecting callback list. + * @cbs_gbl_lock: Lock protecting callback list. * @kthread_ptr: This flavor's grace-period/callback-invocation kthread. * @gp_func: This flavor's grace-period-wait function. * @gp_state: Grace period's most recent state transition (debugging). @@ -32,7 +55,7 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @init_fract: Initial backoff sleep interval. * @gp_jiffies: Time of last @gp_state transition. * @gp_start: Most recent grace-period start in jiffies. - * @n_gps: Number of grace periods completed since boot. + * @tasks_gp_seq: Number of grace periods completed since boot. * @n_ipis: Number of IPIs sent to encourage grace periods to end. * @n_ipis_fails: Number of IPI-send failures. * @pregp_func: This flavor's pre-grace-period function (optional). @@ -41,20 +64,27 @@ typedef void (*postgp_func_t)(struct rcu_tasks *rtp); * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. + * @rtpcpu: This flavor's rcu_tasks_percpu structure. + * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. + * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. + * @percpu_dequeue_lim: Number of per-CPU callback queues in use for dequeuing. + * @percpu_dequeue_gpseq: RCU grace-period number to propagate enqueue limit to dequeuers. + * @barrier_q_mutex: Serialize barrier operations. + * @barrier_q_count: Number of queues being waited on. + * @barrier_q_completion: Barrier wait/wakeup mechanism. + * @barrier_q_seq: Sequence number for barrier operations. * @name: This flavor's textual name. * @kname: This flavor's kthread name. */ struct rcu_tasks { - struct rcu_head *cbs_head; - struct rcu_head **cbs_tail; struct wait_queue_head cbs_wq; - raw_spinlock_t cbs_lock; + raw_spinlock_t cbs_gbl_lock; int gp_state; int gp_sleep; int init_fract; unsigned long gp_jiffies; unsigned long gp_start; - unsigned long n_gps; + unsigned long tasks_gp_seq; unsigned long n_ipis; unsigned long n_ipis_fails; struct task_struct *kthread_ptr; @@ -65,20 +95,40 @@ struct rcu_tasks { holdouts_func_t holdouts_func; postgp_func_t postgp_func; call_rcu_func_t call_func; + struct rcu_tasks_percpu __percpu *rtpcpu; + int percpu_enqueue_shift; + int percpu_enqueue_lim; + int percpu_dequeue_lim; + unsigned long percpu_dequeue_gpseq; + struct mutex barrier_q_mutex; + atomic_t barrier_q_count; + struct completion barrier_q_completion; + unsigned long barrier_q_seq; char *name; char *kname; }; -#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ -static struct rcu_tasks rt_name = \ -{ \ - .cbs_tail = &rt_name.cbs_head, \ - .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ - .cbs_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_lock), \ - .gp_func = gp, \ - .call_func = call, \ - .name = n, \ - .kname = #rt_name, \ +static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp); + +#define DEFINE_RCU_TASKS(rt_name, gp, call, n) \ +static DEFINE_PER_CPU(struct rcu_tasks_percpu, rt_name ## __percpu) = { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name ## __percpu.cbs_pcpu_lock), \ + .rtp_irq_work = IRQ_WORK_INIT(call_rcu_tasks_iw_wakeup), \ +}; \ +static struct rcu_tasks rt_name = \ +{ \ + .cbs_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rt_name.cbs_wq), \ + .cbs_gbl_lock = __RAW_SPIN_LOCK_UNLOCKED(rt_name.cbs_gbl_lock), \ + .gp_func = gp, \ + .call_func = call, \ + .rtpcpu = &rt_name ## __percpu, \ + .name = n, \ + .percpu_enqueue_shift = ilog2(CONFIG_NR_CPUS), \ + .percpu_enqueue_lim = 1, \ + .percpu_dequeue_lim = 1, \ + .barrier_q_mutex = __MUTEX_INITIALIZER(rt_name.barrier_q_mutex), \ + .barrier_q_seq = (0UL - 50UL) << RCU_SEQ_CTR_SHIFT, \ + .kname = #rt_name, \ } /* Track exiting tasks in order to allow them to be waited for. */ @@ -94,6 +144,15 @@ module_param(rcu_task_ipi_delay, int, 0644); static int rcu_task_stall_timeout __read_mostly = RCU_TASK_STALL_TIMEOUT; module_param(rcu_task_stall_timeout, int, 0644); +static int rcu_task_enqueue_lim __read_mostly = -1; +module_param(rcu_task_enqueue_lim, int, 0444); + +static bool rcu_task_cb_adjust; +static int rcu_task_contend_lim __read_mostly = 100; +module_param(rcu_task_contend_lim, int, 0444); +static int rcu_task_collapse_lim __read_mostly = 10; +module_param(rcu_task_collapse_lim, int, 0444); + /* RCU tasks grace-period state for debugging. */ #define RTGS_INIT 0 #define RTGS_WAIT_WAIT_CBS 1 @@ -128,6 +187,8 @@ static const char * const rcu_tasks_gp_state_names[] = { // // Generic code. +static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp); + /* Record grace-period phase and time. */ static void set_tasks_gp_state(struct rcu_tasks *rtp, int newstate) { @@ -148,23 +209,106 @@ static const char *tasks_gp_state_getname(struct rcu_tasks *rtp) } #endif /* #ifndef CONFIG_TINY_RCU */ +// Initialize per-CPU callback lists for the specified flavor of +// Tasks RCU. +static void cblist_init_generic(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + int lim; + + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rcu_task_enqueue_lim < 0) { + rcu_task_enqueue_lim = 1; + rcu_task_cb_adjust = true; + pr_info("%s: Setting adjustable number of callback queues.\n", __func__); + } else if (rcu_task_enqueue_lim == 0) { + rcu_task_enqueue_lim = 1; + } + lim = rcu_task_enqueue_lim; + + if (lim > nr_cpu_ids) + lim = nr_cpu_ids; + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids / lim)); + WRITE_ONCE(rtp->percpu_dequeue_lim, lim); + smp_store_release(&rtp->percpu_enqueue_lim, lim); + for_each_possible_cpu(cpu) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + WARN_ON_ONCE(!rtpcp); + if (cpu) + raw_spin_lock_init(&ACCESS_PRIVATE(rtpcp, lock)); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + if (rcu_segcblist_empty(&rtpcp->cblist)) + rcu_segcblist_init(&rtpcp->cblist); + INIT_WORK(&rtpcp->rtp_work, rcu_tasks_invoke_cbs_wq); + rtpcp->cpu = cpu; + rtpcp->rtpp = rtp; + raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + pr_info("%s: Setting shift to %d and lim to %d.\n", __func__, data_race(rtp->percpu_enqueue_shift), data_race(rtp->percpu_enqueue_lim)); +} + +// IRQ-work handler that does deferred wakeup for call_rcu_tasks_generic(). +static void call_rcu_tasks_iw_wakeup(struct irq_work *iwp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = container_of(iwp, struct rcu_tasks_percpu, rtp_irq_work); + + rtp = rtpcp->rtpp; + wake_up(&rtp->cbs_wq); +} + // Enqueue a callback for the specified flavor of Tasks RCU. static void call_rcu_tasks_generic(struct rcu_head *rhp, rcu_callback_t func, struct rcu_tasks *rtp) { unsigned long flags; + unsigned long j; + bool needadjust = false; bool needwake; + struct rcu_tasks_percpu *rtpcp; rhp->next = NULL; rhp->func = func; - raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - needwake = !rtp->cbs_head; - WRITE_ONCE(*rtp->cbs_tail, rhp); - rtp->cbs_tail = &rhp->next; - raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); + local_irq_save(flags); + rcu_read_lock(); + rtpcp = per_cpu_ptr(rtp->rtpcpu, + smp_processor_id() >> READ_ONCE(rtp->percpu_enqueue_shift)); + if (!raw_spin_trylock_rcu_node(rtpcp)) { // irqs already disabled. + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + j = jiffies; + if (rtpcp->rtp_jiffies != j) { + rtpcp->rtp_jiffies = j; + rtpcp->rtp_n_lock_retries = 0; + } + if (rcu_task_cb_adjust && ++rtpcp->rtp_n_lock_retries > rcu_task_contend_lim && + READ_ONCE(rtp->percpu_enqueue_lim) != nr_cpu_ids) + needadjust = true; // Defer adjustment to avoid deadlock. + } + if (!rcu_segcblist_is_enabled(&rtpcp->cblist)) { + raw_spin_unlock_rcu_node(rtpcp); // irqs remain disabled. + cblist_init_generic(rtp); + raw_spin_lock_rcu_node(rtpcp); // irqs already disabled. + } + needwake = rcu_segcblist_empty(&rtpcp->cblist); + rcu_segcblist_enqueue(&rtpcp->cblist, rhp); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + if (unlikely(needadjust)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim != nr_cpu_ids) { + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + WRITE_ONCE(rtp->percpu_dequeue_lim, nr_cpu_ids); + smp_store_release(&rtp->percpu_enqueue_lim, nr_cpu_ids); + pr_info("Switching %s to per-CPU callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + rcu_read_unlock(); /* We can't create the thread unless interrupts are enabled. */ if (needwake && READ_ONCE(rtp->kthread_ptr)) - wake_up(&rtp->cbs_wq); + irq_work_queue(&rtpcp->rtp_irq_work); } // Wait for a grace period for the specified flavor of Tasks RCU. @@ -178,12 +322,173 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) wait_rcu_gp(rtp->call_func); } +// RCU callback function for rcu_barrier_tasks_generic(). +static void rcu_barrier_tasks_generic_cb(struct rcu_head *rhp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp; + + rtpcp = container_of(rhp, struct rcu_tasks_percpu, barrier_q_head); + rtp = rtpcp->rtpp; + if (atomic_dec_and_test(&rtp->barrier_q_count)) + complete(&rtp->barrier_q_completion); +} + +// Wait for all in-flight callbacks for the specified RCU Tasks flavor. +// Operates in a manner similar to rcu_barrier(). +static void rcu_barrier_tasks_generic(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + struct rcu_tasks_percpu *rtpcp; + unsigned long s = rcu_seq_snap(&rtp->barrier_q_seq); + + mutex_lock(&rtp->barrier_q_mutex); + if (rcu_seq_done(&rtp->barrier_q_seq, s)) { + smp_mb(); + mutex_unlock(&rtp->barrier_q_mutex); + return; + } + rcu_seq_start(&rtp->barrier_q_seq); + init_completion(&rtp->barrier_q_completion); + atomic_set(&rtp->barrier_q_count, 2); + for_each_possible_cpu(cpu) { + if (cpu >= smp_load_acquire(&rtp->percpu_dequeue_lim)) + break; + rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + rtpcp->barrier_q_head.func = rcu_barrier_tasks_generic_cb; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + if (rcu_segcblist_entrain(&rtpcp->cblist, &rtpcp->barrier_q_head)) + atomic_inc(&rtp->barrier_q_count); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + if (atomic_sub_and_test(2, &rtp->barrier_q_count)) + complete(&rtp->barrier_q_completion); + wait_for_completion(&rtp->barrier_q_completion); + rcu_seq_end(&rtp->barrier_q_seq); + mutex_unlock(&rtp->barrier_q_mutex); +} + +// Advance callbacks and indicate whether either a grace period or +// callback invocation is needed. +static int rcu_tasks_need_gpcb(struct rcu_tasks *rtp) +{ + int cpu; + unsigned long flags; + long n; + long ncbs = 0; + long ncbsnz = 0; + int needgpcb = 0; + + for (cpu = 0; cpu < smp_load_acquire(&rtp->percpu_dequeue_lim); cpu++) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, cpu); + + /* Advance and accelerate any new callbacks. */ + if (!rcu_segcblist_n_cbs(&rtpcp->cblist)) + continue; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + // Should we shrink down to a single callback queue? + n = rcu_segcblist_n_cbs(&rtpcp->cblist); + if (n) { + ncbs += n; + if (cpu > 0) + ncbsnz += n; + } + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + if (rcu_segcblist_pend_cbs(&rtpcp->cblist)) + needgpcb |= 0x3; + if (!rcu_segcblist_empty(&rtpcp->cblist)) + needgpcb |= 0x1; + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + } + + // Shrink down to a single callback queue if appropriate. + // This is done in two stages: (1) If there are no more than + // rcu_task_collapse_lim callbacks on CPU 0 and none on any other + // CPU, limit enqueueing to CPU 0. (2) After an RCU grace period, + // if there has not been an increase in callbacks, limit dequeuing + // to CPU 0. Note the matching RCU read-side critical section in + // call_rcu_tasks_generic(). + if (rcu_task_cb_adjust && ncbs <= rcu_task_collapse_lim) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim > 1) { + WRITE_ONCE(rtp->percpu_enqueue_shift, ilog2(nr_cpu_ids)); + smp_store_release(&rtp->percpu_enqueue_lim, 1); + rtp->percpu_dequeue_gpseq = get_state_synchronize_rcu(); + pr_info("Starting switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + if (rcu_task_cb_adjust && !ncbsnz && + poll_state_synchronize_rcu(rtp->percpu_dequeue_gpseq)) { + raw_spin_lock_irqsave(&rtp->cbs_gbl_lock, flags); + if (rtp->percpu_enqueue_lim < rtp->percpu_dequeue_lim) { + WRITE_ONCE(rtp->percpu_dequeue_lim, 1); + pr_info("Completing switch %s to CPU-0 callback queuing.\n", rtp->name); + } + raw_spin_unlock_irqrestore(&rtp->cbs_gbl_lock, flags); + } + + return needgpcb; +} + +// Advance callbacks and invoke any that are ready. +static void rcu_tasks_invoke_cbs(struct rcu_tasks *rtp, struct rcu_tasks_percpu *rtpcp) +{ + int cpu; + int cpunext; + unsigned long flags; + int len; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + struct rcu_tasks_percpu *rtpcp_next; + + cpu = rtpcp->cpu; + cpunext = cpu * 2 + 1; + if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); + queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + cpunext++; + if (cpunext < smp_load_acquire(&rtp->percpu_dequeue_lim)) { + rtpcp_next = per_cpu_ptr(rtp->rtpcpu, cpunext); + queue_work_on(cpunext, system_wq, &rtpcp_next->rtp_work); + } + } + + if (rcu_segcblist_empty(&rtpcp->cblist)) + return; + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_advance(&rtpcp->cblist, rcu_seq_current(&rtp->tasks_gp_seq)); + rcu_segcblist_extract_done_cbs(&rtpcp->cblist, &rcl); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); + len = rcl.len; + for (rhp = rcu_cblist_dequeue(&rcl); rhp; rhp = rcu_cblist_dequeue(&rcl)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + cond_resched(); + } + raw_spin_lock_irqsave_rcu_node(rtpcp, flags); + rcu_segcblist_add_len(&rtpcp->cblist, -len); + (void)rcu_segcblist_accelerate(&rtpcp->cblist, rcu_seq_snap(&rtp->tasks_gp_seq)); + raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); +} + +// Workqueue flood to advance callbacks and invoke any that are ready. +static void rcu_tasks_invoke_cbs_wq(struct work_struct *wp) +{ + struct rcu_tasks *rtp; + struct rcu_tasks_percpu *rtpcp = container_of(wp, struct rcu_tasks_percpu, rtp_work); + + rtp = rtpcp->rtpp; + rcu_tasks_invoke_cbs(rtp, rtpcp); +} + /* RCU-tasks kthread that detects grace periods and invokes callbacks. */ static int __noreturn rcu_tasks_kthread(void *arg) { - unsigned long flags; - struct rcu_head *list; - struct rcu_head *next; + int needgpcb; struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ @@ -199,42 +504,22 @@ static int __noreturn rcu_tasks_kthread(void *arg) for (;;) { set_tasks_gp_state(rtp, RTGS_WAIT_CBS); - /* Pick up any new callbacks. */ - raw_spin_lock_irqsave(&rtp->cbs_lock, flags); - smp_mb__after_spinlock(); // Order updates vs. GP. - list = rtp->cbs_head; - rtp->cbs_head = NULL; - rtp->cbs_tail = &rtp->cbs_head; - raw_spin_unlock_irqrestore(&rtp->cbs_lock, flags); - /* If there were none, wait a bit and start over. */ - if (!list) { - wait_event_interruptible(rtp->cbs_wq, - READ_ONCE(rtp->cbs_head)); - if (!rtp->cbs_head) { - WARN_ON(signal_pending(current)); - set_tasks_gp_state(rtp, RTGS_WAIT_WAIT_CBS); - schedule_timeout_idle(HZ/10); - } - continue; + wait_event_idle(rtp->cbs_wq, (needgpcb = rcu_tasks_need_gpcb(rtp))); + + if (needgpcb & 0x2) { + // Wait for one grace period. + set_tasks_gp_state(rtp, RTGS_WAIT_GP); + rtp->gp_start = jiffies; + rcu_seq_start(&rtp->tasks_gp_seq); + rtp->gp_func(rtp); + rcu_seq_end(&rtp->tasks_gp_seq); } - // Wait for one grace period. - set_tasks_gp_state(rtp, RTGS_WAIT_GP); - rtp->gp_start = jiffies; - rtp->gp_func(rtp); - rtp->n_gps++; - - /* Invoke the callbacks. */ + /* Invoke callbacks. */ set_tasks_gp_state(rtp, RTGS_INVOKE_CBS); - while (list) { - next = list->next; - local_bh_disable(); - list->func(list); - local_bh_enable(); - list = next; - cond_resched(); - } + rcu_tasks_invoke_cbs(rtp, per_cpu_ptr(rtp->rtpcpu, 0)); + /* Paranoid sleep to keep this from entering a tight loop */ schedule_timeout_idle(rtp->gp_sleep); } @@ -279,14 +564,15 @@ static void __init rcu_tasks_bootup_oddness(void) /* Dump out rcutorture-relevant state common to all RCU-tasks flavors. */ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) { + struct rcu_tasks_percpu *rtpcp = per_cpu_ptr(rtp->rtpcpu, 0); // for_each... pr_info("%s: %s(%d) since %lu g:%lu i:%lu/%lu %c%c %s\n", rtp->kname, tasks_gp_state_getname(rtp), data_race(rtp->gp_state), jiffies - data_race(rtp->gp_jiffies), - data_race(rtp->n_gps), + data_race(rcu_seq_current(&rtp->tasks_gp_seq)), data_race(rtp->n_ipis_fails), data_race(rtp->n_ipis), ".k"[!!data_race(rtp->kthread_ptr)], - ".C"[!!data_race(rtp->cbs_head)], + ".C"[!data_race(rcu_segcblist_empty(&rtpcp->cblist))], s); } #endif // #ifndef CONFIG_TINY_RCU @@ -411,10 +697,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // exit_tasks_rcu_finish() functions begin and end, respectively, the SRCU // read-side critical sections waited for by rcu_tasks_postscan(). // -// Pre-grace-period update-side code is ordered before the grace via the -// ->cbs_lock and the smp_mb__after_spinlock(). Pre-grace-period read-side -// code is ordered before the grace period via synchronize_rcu() call -// in rcu_tasks_pregp_step() and by the scheduler's locks and interrupt +// Pre-grace-period update-side code is ordered before the grace +// via the raw_spin_lock.*rcu_node(). Pre-grace-period read-side code +// is ordered before the grace period via synchronize_rcu() call in +// rcu_tasks_pregp_step() and by the scheduler's locks and interrupt // disabling. /* Pre-grace-period preparation. */ @@ -586,13 +872,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks); */ void rcu_barrier_tasks(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks(); + rcu_barrier_tasks_generic(&rcu_tasks); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { + cblist_init_generic(&rcu_tasks); rcu_tasks.gp_sleep = HZ / 10; rcu_tasks.init_fract = HZ / 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; @@ -724,13 +1010,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_rude); */ void rcu_barrier_tasks_rude(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks_rude(); + rcu_barrier_tasks_generic(&rcu_tasks_rude); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_rude); static int __init rcu_spawn_tasks_rude_kthread(void) { + cblist_init_generic(&rcu_tasks_rude); rcu_tasks_rude.gp_sleep = HZ / 10; rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; @@ -1073,25 +1359,50 @@ static void rcu_tasks_trace_postscan(struct list_head *hop) // Any tasks that exit after this point will set ->trc_reader_checked. } +/* Communicate task state back to the RCU tasks trace stall warning request. */ +struct trc_stall_chk_rdr { + int nesting; + int ipi_to_cpu; + u8 needqs; +}; + +static int trc_check_slow_task(struct task_struct *t, void *arg) +{ + struct trc_stall_chk_rdr *trc_rdrp = arg; + + if (task_curr(t)) + return false; // It is running, so decline to inspect it. + trc_rdrp->nesting = READ_ONCE(t->trc_reader_nesting); + trc_rdrp->ipi_to_cpu = READ_ONCE(t->trc_ipi_to_cpu); + trc_rdrp->needqs = READ_ONCE(t->trc_reader_special.b.need_qs); + return true; +} + /* Show the state of a task stalling the current RCU tasks trace GP. */ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport) { int cpu; + struct trc_stall_chk_rdr trc_rdr; + bool is_idle_tsk = is_idle_task(t); if (*firstreport) { pr_err("INFO: rcu_tasks_trace detected stalls on tasks:\n"); *firstreport = false; } - // FIXME: This should attempt to use try_invoke_on_nonrunning_task(). cpu = task_cpu(t); - pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", - t->pid, - ".I"[READ_ONCE(t->trc_ipi_to_cpu) >= 0], - ".i"[is_idle_task(t)], - ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], - READ_ONCE(t->trc_reader_nesting), - " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)], - cpu); + if (!task_call_func(t, trc_check_slow_task, &trc_rdr)) + pr_alert("P%d: %c\n", + t->pid, + ".i"[is_idle_tsk]); + else + pr_alert("P%d: %c%c%c nesting: %d%c cpu: %d\n", + t->pid, + ".I"[trc_rdr.ipi_to_cpu >= 0], + ".i"[is_idle_tsk], + ".N"[cpu >= 0 && tick_nohz_full_cpu(cpu)], + trc_rdr.nesting, + " N"[!!trc_rdr.needqs], + cpu); sched_show_task(t); } @@ -1121,7 +1432,8 @@ static void check_all_holdout_tasks_trace(struct list_head *hop, trc_wait_for_one_reader(t, hop); // If check succeeded, remove this task from the list. - if (READ_ONCE(t->trc_reader_checked)) + if (smp_load_acquire(&t->trc_ipi_to_cpu) == -1 && + READ_ONCE(t->trc_reader_checked)) trc_del_holdout(t); else if (needreport) show_stalled_task_trace(t, firstreport); @@ -1156,7 +1468,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp) // Yes, this assumes that CPUs process IPIs in order. If that ever // changes, there will need to be a recheck and/or timed wait. for_each_online_cpu(cpu) - if (smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu))) + if (WARN_ON_ONCE(smp_load_acquire(per_cpu_ptr(&trc_ipi_to_cpu, cpu)))) smp_call_function_single(cpu, rcu_tasks_trace_empty_fn, NULL, 1); // Remove the safety count. @@ -1256,13 +1568,13 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_tasks_trace); */ void rcu_barrier_tasks_trace(void) { - /* There is only one callback queue, so this is easy. ;-) */ - synchronize_rcu_tasks_trace(); + rcu_barrier_tasks_generic(&rcu_tasks_trace); } EXPORT_SYMBOL_GPL(rcu_barrier_tasks_trace); static int __init rcu_spawn_tasks_trace_kthread(void) { + cblist_init_generic(&rcu_tasks_trace); if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { rcu_tasks_trace.gp_sleep = HZ / 10; rcu_tasks_trace.init_fract = HZ / 10; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ef8d36f580fc..a4c25a6283b0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -79,7 +79,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(1), #ifdef CONFIG_RCU_NOCB_CPU - .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, + .cblist.flags = SEGCBLIST_RCU_CORE, #endif }; static struct rcu_state rcu_state = { @@ -624,7 +624,6 @@ static noinstr void rcu_eqs_enter(bool user) instrumentation_begin(); trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); - rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); // instrumentation for the noinstr rcu_dynticks_eqs_enter() @@ -768,9 +767,6 @@ noinstr void rcu_nmi_exit(void) trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, atomic_read(&rdp->dynticks)); WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ - if (!in_nmi()) - rcu_prepare_for_idle(); - // instrumentation for the noinstr rcu_dynticks_eqs_enter() instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); instrumentation_end(); @@ -872,7 +868,6 @@ static void noinstr rcu_eqs_exit(bool user) // instrumentation for the noinstr rcu_dynticks_eqs_exit() instrument_atomic_write(&rdp->dynticks, sizeof(rdp->dynticks)); - rcu_cleanup_after_idle(); trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); WRITE_ONCE(rdp->dynticks_nesting, 1); @@ -1014,12 +1009,6 @@ noinstr void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); // ... but is watching here. - if (!in_nmi()) { - instrumentation_begin(); - rcu_cleanup_after_idle(); - instrumentation_end(); - } - instrumentation_begin(); // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() instrument_atomic_read(&rdp->dynticks, sizeof(rdp->dynticks)); @@ -1087,6 +1076,24 @@ void rcu_irq_enter_irqson(void) } /* + * Check to see if any future non-offloaded RCU-related work will need + * to be done by the current CPU, even if none need be done immediately, + * returning 1 if so. This function is part of the RCU implementation; + * it is -not- an exported member of the RCU API. This is used by + * the idle-entry code to figure out whether it is safe to disable the + * scheduler-clock interrupt. + * + * Just check whether or not this CPU has non-offloaded RCU callbacks + * queued. + */ +int rcu_needs_cpu(u64 basemono, u64 *nextevt) +{ + *nextevt = KTIME_MAX; + return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && + !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); +} + +/* * If any sort of urgency was applied to the current CPU (for example, * the scheduler-clock interrupt was enabled on a nohz_full CPU) in order * to get to a quiescent state, disable it. @@ -1467,7 +1474,7 @@ static void rcu_gp_kthread_wake(void) { struct task_struct *t = READ_ONCE(rcu_state.gp_kthread); - if ((current == t && !in_irq() && !in_serving_softirq()) || + if ((current == t && !in_hardirq() && !in_serving_softirq()) || !READ_ONCE(rcu_state.gp_flags) || !t) return; WRITE_ONCE(rcu_state.gp_wake_time, jiffies); @@ -1590,10 +1597,11 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, struct rcu_data *rdp) { rcu_lockdep_assert_cblist_protected(rdp); - if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || - !raw_spin_trylock_rcu_node(rnp)) + if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp)) return; - WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); + // The grace period cannot end while we hold the rcu_node lock. + if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq))) + WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp)); raw_spin_unlock_rcu_node(rnp); } @@ -2277,7 +2285,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = rcu_rdp_is_offloaded(rdp); + bool needacc = false; struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2304,15 +2312,30 @@ rcu_report_qs_rdp(struct rcu_data *rdp) /* * This GP can't end until cpu checks in, so all of our * callbacks can be processed during the next GP. + * + * NOCB kthreads have their own way to deal with that... */ - if (!offloaded) + if (!rcu_rdp_is_offloaded(rdp)) { needwake = rcu_accelerate_cbs(rnp, rdp); + } else if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) { + /* + * ...but NOCB kthreads may miss or delay callbacks acceleration + * if in the middle of a (de-)offloading process. + */ + needacc = true; + } rcu_disable_urgency_upon_qs(rdp); rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(); + + if (needacc) { + rcu_nocb_lock_irqsave(rdp, flags); + rcu_accelerate_cbs_unlocked(rnp, rdp); + rcu_nocb_unlock_irqrestore(rdp, flags); + } } } @@ -2444,7 +2467,6 @@ static void rcu_do_batch(struct rcu_data *rdp) int div; bool __maybe_unused empty; unsigned long flags; - const bool offloaded = rcu_rdp_is_offloaded(rdp); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count = 0; @@ -2462,18 +2484,17 @@ static void rcu_do_batch(struct rcu_data *rdp) } /* - * Extract the list of ready callbacks, disabling to prevent + * Extract the list of ready callbacks, disabling IRQs to prevent * races with call_rcu() from interrupt handlers. Leave the * callback counts, as rcu_barrier() needs to be conservative. */ - local_irq_save(flags); - rcu_nocb_lock(rdp); + rcu_nocb_lock_irqsave(rdp, flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (unlikely(bl > 100)) { + if (in_serving_softirq() && unlikely(bl > 100)) { long rrn = READ_ONCE(rcu_resched_ns); rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; @@ -2482,7 +2503,7 @@ static void rcu_do_batch(struct rcu_data *rdp) trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); - if (offloaded) + if (rcu_rdp_is_offloaded(rdp)) rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued")); @@ -2510,18 +2531,21 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Stop only if limit reached and CPU has something to do. */ - if (count >= bl && !offloaded && - (need_resched() || - (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) - break; - if (unlikely(tlimit)) { - /* only call local_clock() every 32 callbacks */ - if (likely((count & 31) || local_clock() < tlimit)) - continue; - /* Exceeded the time limit, so leave. */ - break; - } - if (!in_serving_softirq()) { + if (in_serving_softirq()) { + if (count >= bl && (need_resched() || !is_idle_task(current))) + break; + /* + * Make sure we don't spend too much time here and deprive other + * softirq vectors of CPU cycles. + */ + if (unlikely(tlimit)) { + /* only call local_clock() every 32 callbacks */ + if (likely((count & 31) || local_clock() < tlimit)) + continue; + /* Exceeded the time limit, so leave. */ + break; + } + } else { local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); @@ -2530,8 +2554,7 @@ static void rcu_do_batch(struct rcu_data *rdp) } } - local_irq_save(flags); - rcu_nocb_lock(rdp); + rcu_nocb_lock_irqsave(rdp, flags); rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2565,9 +2588,6 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_unlock_irqrestore(rdp, flags); - /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist)) - invoke_rcu_core(); tick_dep_clear_task(current, TICK_DEP_BIT_RCU); } @@ -2706,6 +2726,23 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + /* + * On RT rcu_core() can be preempted when IRQs aren't disabled. + * Therefore this function can race with concurrent NOCB (de-)offloading + * on this CPU and the below condition must be considered volatile. + * However if we race with: + * + * _ Offloading: In the worst case we accelerate or process callbacks + * concurrently with NOCB kthreads. We are guaranteed to + * call rcu_nocb_lock() if that happens. + * + * _ Deoffloading: In the worst case we miss callbacks acceleration or + * processing. This is fine because the early stage + * of deoffloading invokes rcu_core() after setting + * SEGCBLIST_RCU_CORE. So we guarantee that we'll process + * what could have been dismissed without the need to wait + * for the next rcu_pending() check in the next jiffy. + */ const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) @@ -2714,7 +2751,7 @@ static __latent_entropy void rcu_core(void) WARN_ON_ONCE(!rdp->beenonline); /* Report any deferred quiescent states if preemption enabled. */ - if (!(preempt_count() & PREEMPT_MASK)) { + if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { set_tsk_need_resched(current); @@ -2737,8 +2774,12 @@ static __latent_entropy void rcu_core(void) /* If there are callbacks ready, invoke them. */ if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && - likely(READ_ONCE(rcu_scheduler_fully_active))) + likely(READ_ONCE(rcu_scheduler_fully_active))) { rcu_do_batch(rdp); + /* Re-invoke RCU core processing if there are callbacks remaining. */ + if (rcu_segcblist_ready_cbs(&rdp->cblist)) + invoke_rcu_core(); + } /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); @@ -2982,7 +3023,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) head->func = func; head->next = NULL; local_irq_save(flags); - kasan_record_aux_stack(head); + kasan_record_aux_stack_noalloc(head); rdp = this_cpu_ptr(&rcu_data); /* Add the callback to our list. */ @@ -3547,7 +3588,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) return; } - kasan_record_aux_stack(ptr); + kasan_record_aux_stack_noalloc(ptr); success = add_ptr_to_bulk_krc_lock(&krcp, &flags, ptr, !head); if (!success) { run_page_cache_worker(krcp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 305cf6aeb408..486fc901bd08 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -157,7 +157,6 @@ struct rcu_data { bool core_needs_qs; /* Core waits for quiescent state. */ bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ - bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ @@ -189,11 +188,6 @@ struct rcu_data { bool rcu_urgent_qs; /* GP old need light quiescent state. */ bool rcu_forced_tick; /* Forced tick to provide QS. */ bool rcu_forced_tick_exp; /* ... provide QS to expedited GP. */ -#ifdef CONFIG_RCU_FAST_NO_HZ - unsigned long last_accelerate; /* Last jiffy CBs were accelerated. */ - unsigned long last_advance_all; /* Last jiffy CBs were all advanced. */ - int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ -#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ /* 4) rcu_barrier(), OOM callbacks, and expediting. */ struct rcu_head barrier_head; @@ -227,8 +221,11 @@ struct rcu_data { struct swait_queue_head nocb_gp_wq; /* For nocb kthreads to sleep on. */ bool nocb_cb_sleep; /* Is the nocb CB thread asleep? */ struct task_struct *nocb_cb_kthread; - struct rcu_data *nocb_next_cb_rdp; - /* Next rcu_data in wakeup chain. */ + struct list_head nocb_head_rdp; /* + * Head of rcu_data list in wakeup chain, + * if rdp_gp. + */ + struct list_head nocb_entry_rdp; /* rcu_data node in wakeup chain. */ /* The following fields are used by CB kthread, hence new cacheline. */ struct rcu_data *nocb_gp_rdp ____cacheline_internodealigned_in_smp; @@ -419,8 +416,6 @@ static bool rcu_is_callbacks_kthread(void); static void rcu_cpu_kthread_setup(unsigned int cpu); static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp); static void __init rcu_spawn_boost_kthreads(void); -static void rcu_cleanup_after_idle(void); -static void rcu_prepare_for_idle(void); static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static bool rcu_preempt_need_deferred_qs(struct task_struct *t); static void rcu_preempt_deferred_qs(struct task_struct *t); @@ -447,12 +442,16 @@ static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp, static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp); #ifdef CONFIG_RCU_NOCB_CPU static void __init rcu_organize_nocb_kthreads(void); -#define rcu_nocb_lock_irqsave(rdp, flags) \ -do { \ - if (!rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ - local_irq_save(flags); \ - else \ - raw_spin_lock_irqsave(&(rdp)->nocb_lock, (flags)); \ + +/* + * Disable IRQs before checking offloaded state so that local + * locking is safe against concurrent de-offloading. + */ +#define rcu_nocb_lock_irqsave(rdp, flags) \ +do { \ + local_irq_save(flags); \ + if (rcu_segcblist_is_offloaded(&(rdp)->cblist)) \ + raw_spin_lock(&(rdp)->nocb_lock); \ } while (0) #else /* #ifdef CONFIG_RCU_NOCB_CPU */ #define rcu_nocb_lock_irqsave(rdp, flags) local_irq_save(flags) diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index f3947c49eee7..237a79989aba 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -255,7 +255,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_node *rnp, */ static void rcu_report_exp_rdp(struct rcu_data *rdp) { - WRITE_ONCE(rdp->exp_deferred_qs, false); + WRITE_ONCE(rdp->cpu_no_qs.b.exp, false); rcu_report_exp_cpu_mult(rdp->mynode, rdp->grpmask, true); } @@ -387,6 +387,7 @@ retry_ipi: continue; } if (get_cpu() == cpu) { + mask_ofl_test |= mask; put_cpu(); continue; } @@ -506,7 +507,10 @@ static void synchronize_rcu_expedited_wait(void) if (rdp->rcu_forced_tick_exp) continue; rdp->rcu_forced_tick_exp = true; - tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + preempt_disable(); + if (cpu_online(cpu)) + tick_dep_set_cpu(cpu, TICK_DEP_BIT_RCU_EXP); + preempt_enable(); } } j = READ_ONCE(jiffies_till_first_fqs); @@ -655,7 +659,7 @@ static void rcu_exp_handler(void *unused) rcu_dynticks_curr_cpu_in_eqs()) { rcu_report_exp_rdp(rdp); } else { - rdp->exp_deferred_qs = true; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); set_tsk_need_resched(t); set_preempt_need_resched(); } @@ -677,7 +681,7 @@ static void rcu_exp_handler(void *unused) if (depth > 0) { raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmask & rdp->grpmask) { - rdp->exp_deferred_qs = true; + WRITE_ONCE(rdp->cpu_no_qs.b.exp, true); t->rcu_read_unlock_special.b.exp_hint = true; } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); @@ -759,7 +763,7 @@ static void sync_sched_exp_online_cleanup(int cpu) my_cpu = get_cpu(); /* Quiescent state either not needed or already requested, leave. */ if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || - rdp->cpu_no_qs.b.exp) { + READ_ONCE(rdp->cpu_no_qs.b.exp)) { put_cpu(); return; } diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h index 368ef7b9af4f..eeafb546a7a0 100644 --- a/kernel/rcu/tree_nocb.h +++ b/kernel/rcu/tree_nocb.h @@ -60,16 +60,22 @@ static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp) * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. * If the list is invalid, a warning is emitted and all CPUs are offloaded. */ + +static bool rcu_nocb_is_setup; + static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - if (cpulist_parse(str, rcu_nocb_mask)) { - pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); - cpumask_setall(rcu_nocb_mask); + if (*str == '=') { + if (cpulist_parse(++str, rcu_nocb_mask)) { + pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n"); + cpumask_setall(rcu_nocb_mask); + } } + rcu_nocb_is_setup = true; return 1; } -__setup("rcu_nocbs=", rcu_nocb_setup); +__setup("rcu_nocbs", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { @@ -625,7 +631,21 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * and the global grace-period kthread are awakened if needed. */ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); - for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + /* + * An rcu_data structure is removed from the list after its + * CPU is de-offloaded and added to the list before that CPU is + * (re-)offloaded. If the following loop happens to be referencing + * that rcu_data structure during the time that the corresponding + * CPU is de-offloaded and then immediately re-offloaded, this + * loop's rdp pointer will be carried to the end of the list by + * the resulting pair of list operations. This can cause the loop + * to skip over some of the rcu_data structures that were supposed + * to have been scanned. Fortunately a new iteration through the + * entire loop is forced after a given CPU's rcu_data structure + * is added to the list, so the skipped-over rcu_data structures + * won't be ignored for long. + */ + list_for_each_entry_rcu(rdp, &my_rdp->nocb_head_rdp, nocb_entry_rdp, 1) { bool needwake_state = false; if (!nocb_gp_enabled_cb(rdp)) @@ -789,6 +809,18 @@ static void nocb_cb_wait(struct rcu_data *rdp) bool can_sleep = true; struct rcu_node *rnp = rdp->mynode; + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + + // VVV Ensure CB invocation follows _sleep test. + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); + + local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); @@ -841,17 +873,6 @@ static void nocb_cb_wait(struct rcu_data *rdp) if (needwake_state) swake_up_one(&rdp->nocb_state_wq); - - do { - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - nocb_cb_wait_cond(rdp)); - - // VVV Ensure CB invocation follows _sleep test. - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); - } - } while (!nocb_cb_can_run(rdp)); } /* @@ -990,22 +1011,33 @@ static long rcu_nocb_rdp_deoffload(void *arg) * will refuse to put anything into the bypass. */ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies)); + /* + * Start with invoking rcu_core() early. This way if the current thread + * happens to preempt an ongoing call to rcu_core() in the middle, + * leaving some work dismissed because rcu_core() still thinks the rdp is + * completely offloaded, we are guaranteed a nearby future instance of + * rcu_core() to catch up. + */ + rcu_segcblist_set_flags(cblist, SEGCBLIST_RCU_CORE); + invoke_rcu_core(); ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + /* Stop nocb_gp_wait() from iterating over this structure. */ + list_del_rcu(&rdp->nocb_entry_rdp); /* * Lock one last time to acquire latest callback updates from kthreads * so we can later handle callbacks locally without locking. */ rcu_nocb_lock_irqsave(rdp, flags); /* - * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb + * Theoretically we could clear SEGCBLIST_LOCKING after the nocb * lock is released but how about being paranoid for once? */ - rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_LOCKING); /* - * With SEGCBLIST_SOFTIRQ_ONLY, we can't use + * Without SEGCBLIST_LOCKING, we can't use * rcu_nocb_unlock_irqrestore() anymore. */ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); @@ -1057,15 +1089,26 @@ static long rcu_nocb_rdp_offload(void *arg) return -EINVAL; pr_info("Offloading %d\n", rdp->cpu); + + /* + * Cause future nocb_gp_wait() invocations to iterate over + * structure, resetting ->nocb_gp_sleep and waking up the related + * "rcuog". Since nocb_gp_wait() in turn locks ->nocb_gp_lock + * before setting ->nocb_gp_sleep again, we are guaranteed to + * iterate this newly added structure before "rcuog" goes to + * sleep again. + */ + list_add_tail_rcu(&rdp->nocb_entry_rdp, &rdp->nocb_gp_rdp->nocb_head_rdp); + /* - * Can't use rcu_nocb_lock_irqsave() while we are in - * SEGCBLIST_SOFTIRQ_ONLY mode. + * Can't use rcu_nocb_lock_irqsave() before SEGCBLIST_LOCKING + * is set. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); /* * We didn't take the nocb lock while working on the - * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * rdp->cblist with SEGCBLIST_LOCKING cleared (pure softirq/rcuc mode). * Every modifications that have been done previously on * rdp->cblist must be visible remotely by the nocb kthreads * upon wake up after reading the cblist flags. @@ -1084,6 +1127,14 @@ static long rcu_nocb_rdp_offload(void *arg) rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + /* + * All kthreads are ready to work, we can finally relieve rcu_core() and + * enable nocb bypass. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_RCU_CORE); + rcu_nocb_unlock_irqrestore(rdp, flags); + return ret; } @@ -1122,13 +1173,17 @@ void __init rcu_init_nohz(void) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ - if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { - if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { - pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); - return; + if (need_rcu_nocb_mask) { + if (!cpumask_available(rcu_nocb_mask)) { + if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { + pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); + return; + } } + rcu_nocb_is_setup = true; } - if (!cpumask_available(rcu_nocb_mask)) + + if (!rcu_nocb_is_setup) return; #if defined(CONFIG_NO_HZ_FULL) @@ -1154,8 +1209,8 @@ void __init rcu_init_nohz(void) if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); - rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP); + rcu_segcblist_clear_flags(&rdp->cblist, SEGCBLIST_RCU_CORE); } rcu_organize_nocb_kthreads(); } @@ -1178,17 +1233,17 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread * for this CPU's group has not yet been created, spawn it as well. */ -static void rcu_spawn_one_nocb_kthread(int cpu) +static void rcu_spawn_cpu_nocb_kthread(int cpu) { struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_data *rdp_gp; struct task_struct *t; - /* - * If this isn't a no-CBs CPU or if it already has an rcuo kthread, - * then nothing to do. - */ - if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread) + if (!rcu_scheduler_fully_active || !rcu_nocb_is_setup) + return; + + /* If there already is an rcuo kthread, then nothing to do. */ + if (rdp->nocb_cb_kthread) return; /* If we didn't spawn the GP kthread first, reorganize! */ @@ -1211,16 +1266,6 @@ static void rcu_spawn_one_nocb_kthread(int cpu) } /* - * If the specified CPU is a no-CBs CPU that does not already have its - * rcuo kthread, spawn it. - */ -static void rcu_spawn_cpu_nocb_kthread(int cpu) -{ - if (rcu_scheduler_fully_active) - rcu_spawn_one_nocb_kthread(cpu); -} - -/* * Once the scheduler is running, spawn rcuo kthreads for all online * no-CBs CPUs. This assumes that the early_initcall()s happen before * non-boot CPUs come online -- if this changes, we will need to add @@ -1230,8 +1275,10 @@ static void __init rcu_spawn_nocb_kthreads(void) { int cpu; - for_each_online_cpu(cpu) - rcu_spawn_cpu_nocb_kthread(cpu); + if (rcu_nocb_is_setup) { + for_each_online_cpu(cpu) + rcu_spawn_cpu_nocb_kthread(cpu); + } } /* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */ @@ -1251,7 +1298,6 @@ static void __init rcu_organize_nocb_kthreads(void) int nl = 0; /* Next GP kthread. */ struct rcu_data *rdp; struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */ - struct rcu_data *rdp_prev = NULL; if (!cpumask_available(rcu_nocb_mask)) return; @@ -1265,14 +1311,14 @@ static void __init rcu_organize_nocb_kthreads(void) * Should the corresponding CPU come online in the future, then * we will spawn the needed set of rcu_nocb_kthread() kthreads. */ - for_each_cpu(cpu, rcu_nocb_mask) { + for_each_possible_cpu(cpu) { rdp = per_cpu_ptr(&rcu_data, cpu); if (rdp->cpu >= nl) { /* New GP kthread, set up for CBs & next GP. */ gotnocbs = true; nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls; - rdp->nocb_gp_rdp = rdp; rdp_gp = rdp; + INIT_LIST_HEAD(&rdp->nocb_head_rdp); if (dump_tree) { if (!firsttime) pr_cont("%s\n", gotnocbscbs @@ -1285,12 +1331,12 @@ static void __init rcu_organize_nocb_kthreads(void) } else { /* Another CB kthread, link to previous GP kthread. */ gotnocbscbs = true; - rdp->nocb_gp_rdp = rdp_gp; - rdp_prev->nocb_next_cb_rdp = rdp; if (dump_tree) pr_cont(" %d", cpu); } - rdp_prev = rdp; + rdp->nocb_gp_rdp = rdp_gp; + if (cpumask_test_cpu(cpu, rcu_nocb_mask)) + list_add_tail(&rdp->nocb_entry_rdp, &rdp_gp->nocb_head_rdp); } if (gotnocbs && dump_tree) pr_cont("%s\n", gotnocbscbs ? "" : " (self only)"); @@ -1352,6 +1398,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) { char bufw[20]; char bufr[20]; + struct rcu_data *nocb_next_rdp; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; bool wassleep; @@ -1359,11 +1406,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); + nocb_next_rdp = list_next_or_null_rcu(&rdp->nocb_gp_rdp->nocb_head_rdp, + &rdp->nocb_entry_rdp, + typeof(*rdp), + nocb_entry_rdp); + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, - rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, + nocb_next_rdp ? nocb_next_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], "cC"[!!atomic_read(&rdp->nocb_lock_contended)], diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5199559fbbf0..c5b45c2f68a1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -16,7 +16,7 @@ static bool rcu_rdp_is_offloaded(struct rcu_data *rdp) { /* - * In order to read the offloaded state of an rdp is a safe + * In order to read the offloaded state of an rdp in a safe * and stable way and prevent from its value to be changed * under us, we must either hold the barrier mutex, the cpu * hotplug lock (read or write) or the nocb lock. Local @@ -51,12 +51,10 @@ static void __init rcu_bootup_announce_oddness(void) RCU_FANOUT); if (rcu_fanout_exact) pr_info("\tHierarchical RCU autobalancing is disabled.\n"); - if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) - pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) - pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); + pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) @@ -88,13 +86,13 @@ static void __init rcu_bootup_announce_oddness(void) if (rcu_kick_kthreads) pr_info("\tKick kthreads if too-long grace period.\n"); if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD)) - pr_info("\tRCU callback double-/use-after-free debug enabled.\n"); + pr_info("\tRCU callback double-/use-after-free debug is enabled.\n"); if (gp_preinit_delay) pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay); if (gp_init_delay) pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay); if (gp_cleanup_delay) - pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_cleanup_delay); + pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay); if (!use_softirq) pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n"); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG)) @@ -260,10 +258,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * no need to check for a subsequent expedited GP. (Though we are * still in a quiescent state in any case.) */ - if (blkd_state & RCU_EXP_BLKD && rdp->exp_deferred_qs) + if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); else - WARN_ON_ONCE(rdp->exp_deferred_qs); + WARN_ON_ONCE(rdp->cpu_no_qs.b.exp); } /* @@ -277,12 +275,16 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) * current task, there might be any number of other tasks blocked while * in an RCU read-side critical section. * + * Unlike non-preemptible-RCU, quiescent state reports for expedited + * grace periods are handled separately via deferred quiescent states + * and context switch events. + * * Callers to this function must disable preemption. */ static void rcu_qs(void) { RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n"); - if (__this_cpu_read(rcu_data.cpu_no_qs.s)) { + if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) { trace_rcu_grace_period(TPS("rcu_preempt"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); @@ -350,7 +352,7 @@ void rcu_note_context_switch(bool preempt) * means that we continue to block the current grace period. */ rcu_qs(); - if (rdp->exp_deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); rcu_tasks_qs(current, preempt); trace_rcu_utilization(TPS("End context switch")); @@ -477,7 +479,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ special = t->rcu_read_unlock_special; rdp = this_cpu_ptr(&rcu_data); - if (!special.s && !rdp->exp_deferred_qs) { + if (!special.s && !rdp->cpu_no_qs.b.exp) { local_irq_restore(flags); return; } @@ -497,7 +499,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) * tasks are handled when removing the task from the * blocked-tasks list below. */ - if (rdp->exp_deferred_qs) + if (rdp->cpu_no_qs.b.exp) rcu_report_exp_rdp(rdp); /* Clean up if blocked during RCU read-side critical section. */ @@ -580,7 +582,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) */ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { - return (__this_cpu_read(rcu_data.exp_deferred_qs) || + return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) || READ_ONCE(t->rcu_read_unlock_special.s)) && rcu_preempt_depth() == 0; } @@ -642,7 +644,7 @@ static void rcu_read_unlock_special(struct task_struct *t) (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled && t->rcu_blocked_node); // Need to defer quiescent state until everything is enabled. - if (use_softirq && (in_irq() || (expboost && !irqs_were_disabled))) { + if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) { // Using softirq, safe to awaken, and either the // wakeup is free or there is either an expedited // GP in flight or a potential need to deboost. @@ -845,10 +847,8 @@ static void rcu_qs(void) trace_rcu_grace_period(TPS("rcu_sched"), __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs")); __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) - return; - __this_cpu_write(rcu_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); + if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp)) + rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); } /* @@ -925,7 +925,18 @@ static bool rcu_preempt_need_deferred_qs(struct task_struct *t) { return false; } -static void rcu_preempt_deferred_qs(struct task_struct *t) { } + +// Except that we do need to respond to a request by an expedited grace +// period for a quiescent state from this CPU. Note that requests from +// tasks are handled when removing the task from the blocked-tasks list +// below. +static void rcu_preempt_deferred_qs(struct task_struct *t) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + if (rdp->cpu_no_qs.b.exp) + rcu_report_exp_rdp(rdp); +} /* * Because there is no preemptible RCU, there can be no readers blocked, @@ -1153,7 +1164,6 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) /* * Create an RCU-boost kthread for the specified node if one does not * already exist. We only create this kthread for preemptible RCU. - * Returns zero if all is well, a negated errno otherwise. */ static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp) { @@ -1204,8 +1214,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); + cpumask_and(cm, cm, housekeeping_cpumask(HK_FLAG_RCU)); if (cpumask_weight(cm) == 0) - cpumask_setall(cm); + cpumask_copy(cm, housekeeping_cpumask(HK_FLAG_RCU)); set_cpus_allowed_ptr(t, cm); free_cpumask_var(cm); } @@ -1253,201 +1264,6 @@ static void __init rcu_spawn_boost_kthreads(void) #endif /* #else #ifdef CONFIG_RCU_BOOST */ -#if !defined(CONFIG_RCU_FAST_NO_HZ) - -/* - * Check to see if any future non-offloaded RCU-related work will need - * to be done by the current CPU, even if none need be done immediately, - * returning 1 if so. This function is part of the RCU implementation; - * it is -not- an exported member of the RCU API. - * - * Because we not have RCU_FAST_NO_HZ, just check whether or not this - * CPU has RCU callbacks queued. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - *nextevt = KTIME_MAX; - return !rcu_segcblist_empty(&this_cpu_ptr(&rcu_data)->cblist) && - !rcu_rdp_is_offloaded(this_cpu_ptr(&rcu_data)); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up - * after it. - */ -static void rcu_cleanup_after_idle(void) -{ -} - -/* - * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, - * is nothing. - */ -static void rcu_prepare_for_idle(void) -{ -} - -#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - -/* - * This code is invoked when a CPU goes idle, at which point we want - * to have the CPU do everything required for RCU so that it can enter - * the energy-efficient dyntick-idle mode. - * - * The following preprocessor symbol controls this: - * - * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted - * to sleep in dyntick-idle mode with RCU callbacks pending. This - * is sized to be roughly one RCU grace period. Those energy-efficiency - * benchmarkers who might otherwise be tempted to set this to a large - * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your - * system. And if you are -that- concerned about energy efficiency, - * just power the system down and be done with it! - * - * The value below works well in practice. If future workloads require - * adjustment, they can be converted into kernel config parameters, though - * making the state machine smarter might be a better option. - */ -#define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ - -static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; -module_param(rcu_idle_gp_delay, int, 0644); - -/* - * Try to advance callbacks on the current CPU, but only if it has been - * awhile since the last time we did so. Afterwards, if there are any - * callbacks ready for immediate invocation, return true. - */ -static bool __maybe_unused rcu_try_advance_all_cbs(void) -{ - bool cbs_ready = false; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - - /* Exit early if we advanced recently. */ - if (jiffies == rdp->last_advance_all) - return false; - rdp->last_advance_all = jiffies; - - rnp = rdp->mynode; - - /* - * Don't bother checking unless a grace period has - * completed since we last checked and there are - * callbacks not yet ready to invoke. - */ - if ((rcu_seq_completed_gp(rdp->gp_seq, - rcu_seq_current(&rnp->gp_seq)) || - unlikely(READ_ONCE(rdp->gpwrap))) && - rcu_segcblist_pend_cbs(&rdp->cblist)) - note_gp_changes(rdp); - - if (rcu_segcblist_ready_cbs(&rdp->cblist)) - cbs_ready = true; - return cbs_ready; -} - -/* - * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready - * to invoke. If the CPU has callbacks, try to advance them. Tell the - * caller about what to set the timeout. - * - * The caller must have disabled interrupts. - */ -int rcu_needs_cpu(u64 basemono, u64 *nextevt) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - unsigned long dj; - - lockdep_assert_irqs_disabled(); - - /* If no non-offloaded callbacks, RCU doesn't need the CPU. */ - if (rcu_segcblist_empty(&rdp->cblist) || - rcu_rdp_is_offloaded(rdp)) { - *nextevt = KTIME_MAX; - return 0; - } - - /* Attempt to advance callbacks. */ - if (rcu_try_advance_all_cbs()) { - /* Some ready to invoke, so initiate later invocation. */ - invoke_rcu_core(); - return 1; - } - rdp->last_accelerate = jiffies; - - /* Request timer and round. */ - dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; - - *nextevt = basemono + dj * TICK_NSEC; - return 0; -} - -/* - * Prepare a CPU for idle from an RCU perspective. The first major task is to - * sense whether nohz mode has been enabled or disabled via sysfs. The second - * major task is to accelerate (that is, assign grace-period numbers to) any - * recently arrived callbacks. - * - * The caller must have disabled interrupts. - */ -static void rcu_prepare_for_idle(void) -{ - bool needwake; - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - struct rcu_node *rnp; - int tne; - - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - return; - - /* Handle nohz enablement switches conservatively. */ - tne = READ_ONCE(tick_nohz_active); - if (tne != rdp->tick_nohz_enabled_snap) { - if (!rcu_segcblist_empty(&rdp->cblist)) - invoke_rcu_core(); /* force nohz to see update. */ - rdp->tick_nohz_enabled_snap = tne; - return; - } - if (!tne) - return; - - /* - * If we have not yet accelerated this jiffy, accelerate all - * callbacks on this CPU. - */ - if (rdp->last_accelerate == jiffies) - return; - rdp->last_accelerate = jiffies; - if (rcu_segcblist_pend_cbs(&rdp->cblist)) { - rnp = rdp->mynode; - raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ - needwake = rcu_accelerate_cbs(rnp, rdp); - raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ - if (needwake) - rcu_gp_kthread_wake(); - } -} - -/* - * Clean up for exit from idle. Attempt to advance callbacks based on - * any grace periods that elapsed while the CPU was idle, and if any - * callbacks are now ready to invoke, initiate invocation. - */ -static void rcu_cleanup_after_idle(void) -{ - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - - lockdep_assert_irqs_disabled(); - if (rcu_rdp_is_offloaded(rdp)) - return; - if (rcu_try_advance_all_cbs()) - invoke_rcu_core(); -} - -#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ - /* * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the * grace-period kthread will do force_quiescent_state() processing? @@ -1455,7 +1271,7 @@ static void rcu_cleanup_after_idle(void) * CPU unless the grace period has extended for too long. * * This code relies on the fact that all NO_HZ_FULL CPUs are also - * CONFIG_RCU_NOCB_CPU CPUs. + * RCU_NOCB_CPU CPUs. */ static bool rcu_nohz_full_cpu(void) { diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 5e2fa6fd97f1..21bebf7c9030 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -347,26 +347,6 @@ static void rcu_dump_cpu_stacks(void) } } -#ifdef CONFIG_RCU_FAST_NO_HZ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); - - sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d", - rdp->last_accelerate & 0xffff, jiffies & 0xffff, - !!rdp->tick_nohz_enabled_snap); -} - -#else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ - -static void print_cpu_stall_fast_no_hz(char *cp, int cpu) -{ - *cp = '\0'; -} - -#endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ - static const char * const gp_state_names[] = { [RCU_GP_IDLE] = "RCU_GP_IDLE", [RCU_GP_WAIT_GPS] = "RCU_GP_WAIT_GPS", @@ -408,13 +388,12 @@ static bool rcu_is_gp_kthread_starving(unsigned long *jp) * of RCU grace periods that this CPU is ignorant of, for example, "1" * if the CPU was aware of the previous grace period. * - * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. + * Also print out idle info. */ static void print_cpu_stall_info(int cpu) { unsigned long delta; bool falsepositive; - char fast_no_hz[72]; struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); char *ticks_title; unsigned long ticks_value; @@ -432,11 +411,10 @@ static void print_cpu_stall_info(int cpu) ticks_title = "ticks this GP"; ticks_value = rdp->ticks_this_gp; } - print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rcu_seq_ctr(rdp->mynode->gp_seq - rdp->rcu_iw_gp_seq); falsepositive = rcu_is_gp_kthread_starving(NULL) && rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s%s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%#lx softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -449,7 +427,6 @@ static void print_cpu_stall_info(int cpu) rdp->dynticks_nesting, rdp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), data_race(rcu_state.n_force_qs) - rcu_state.n_force_qs_gpstart, - fast_no_hz, falsepositive ? " (false positive?)" : ""); } |