From 2da4b2a7fd8de52c44fa83c59f00d38a0d90caae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 25 Apr 2017 11:34:40 -0700 Subject: srcu: Expedite first synchronize_srcu() when idle Classic SRCU in effect expedites the first synchronize_srcu() when SRCU is idle, and Mike Galbraith demonstrated that some use cases do in fact rely on this behavior. In particular, Mike showed that Steven Rostedt's hotplug stress script takes 55 seconds with Classic SRCU and more than 16 -minutes- when running Tree SRCU. Assuming that each Tree SRCU's call to synchronize_srcu() takes four milliseconds, this implies that Steven's test invokes synchronize_srcu() in isolation, but more than once per 200 microseconds. Mike used ftrace to demonstrate that the time between successive calls to synchronize_srcu() ranged from 118 to 342 microseconds, with one outlier at 80 milliseconds. This data clearly indicates that Tree SRCU needs to expedite the first invocation of synchronize_srcu() during an SRCU idle period. This commit therefor introduces a srcu_might_be_idle() function that probabilistically checks whether or not SRCU is idle. This function is used by synchronize_rcu() as an additional criterion in deciding whether or not to expedite. (Hat trick to Peter Zijlstra for his earlier suggestion that this might in fact be a problem. Which for all I know might have motivated Mike to look into it.) Reported-by: Mike Galbraith Signed-off-by: Paul E. McKenney Tested-by: Mike Galbraith --- kernel/rcu/srcutree.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) (limited to 'kernel/rcu/srcutree.c') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 4b98e6f45166..2286e06fd159 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -402,6 +402,7 @@ static void srcu_gp_start(struct srcu_struct *sp) rcu_seq_current(&sp->srcu_gp_seq)); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); + smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ rcu_seq_start(&sp->srcu_gp_seq); state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); WARN_ON_ONCE(state != SRCU_STATE_SCAN1); @@ -676,6 +677,57 @@ static void srcu_flip(struct srcu_struct *sp) smp_mb(); /* D */ /* Pairs with C. */ } +/* + * If SRCU is likely idle, return true, otherwise return false. + * + * Note that it is OK for several current from-idle requests for a new + * grace period from idle to specify expediting because they will all end + * up requesting the same grace period anyhow. So no loss. + * + * Note also that if any CPU (including the current one) is still invoking + * callbacks, this function will nevertheless say "idle". This is not + * ideal, but the overhead of checking all CPUs' callback lists is even + * less ideal, especially on large systems. Furthermore, the wakeup + * can happen before the callback is fully removed, so we have no choice + * but to accept this type of error. + * + * This function is also subject to counter-wrap errors, but let's face + * it, if this function was preempted for enough time for the counters + * to wrap, it really doesn't matter whether or not we expedite the grace + * period. The extra overhead of a needlessly expedited grace period is + * negligible when amoritized over that time period, and the extra latency + * of a needlessly non-expedited grace period is similarly negligible. + */ +static bool srcu_might_be_idle(struct srcu_struct *sp) +{ + unsigned long flags; + struct srcu_data *sdp; + unsigned long curseq; + + /* If the local srcu_data structure has callbacks, not idle. */ + local_irq_save(flags); + sdp = this_cpu_ptr(sp->sda); + if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) { + local_irq_restore(flags); + return false; /* Callbacks already present, so not idle. */ + } + local_irq_restore(flags); + + /* + * No local callbacks, so probabalistically probe global state. + * Exact information would require acquiring locks, which would + * kill scalability, hence the probabalistic nature of the probe. + */ + curseq = rcu_seq_current(&sp->srcu_gp_seq); + smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */ + if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed))) + return false; /* Grace period in progress, so not idle. */ + smp_mb(); /* Order ->srcu_gp_seq with prior access. */ + if (curseq != rcu_seq_current(&sp->srcu_gp_seq)) + return false; /* GP # changed, so not idle. */ + return true; /* With reasonable probability, idle! */ +} + /* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating @@ -823,10 +875,15 @@ EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); * Of course, these memory-ordering guarantees apply only when * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are * passed the same srcu_struct structure. + * + * If SRCU is likely idle, expedite the first request. This semantic + * was provided by Classic SRCU, and is relied upon by its users, so TREE + * SRCU must also provide it. Note that detecting idleness is heuristic + * and subject to both false positives and negatives. */ void synchronize_srcu(struct srcu_struct *sp) { - if (rcu_gp_is_expedited()) + if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) synchronize_srcu_expedited(sp); else __synchronize_srcu(sp, true); -- cgit v1.2.3-58-ga151