diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-29 11:41:43 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-29 11:41:43 -0700 |
commit | 635de956a7f5a6ffcb04f29d70630c64c717b56b (patch) | |
tree | e438ed6f1a39f2873f6a7c7d70399eed08d40d3f /kernel | |
parent | d0cc7ecacba8a5b6bbdd5aa6ba3d1bc2fe59b580 (diff) | |
parent | a500fc918f7b8dc3dff2e6c74f3e73e856c18248 (diff) |
Merge tag 'x86-mm-2021-04-29' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 tlb updates from Ingo Molnar:
"The x86 MM changes in this cycle were:
- Implement concurrent TLB flushes, which overlaps the local TLB
flush with the remote TLB flush.
In testing this improved sysbench performance measurably by a
couple of percentage points, especially if TLB-heavy security
mitigations are active.
- Further micro-optimizations to improve the performance of TLB
flushes"
* tag 'x86-mm-2021-04-29' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
smp: Micro-optimize smp_call_function_many_cond()
smp: Inline on_each_cpu_cond() and on_each_cpu()
x86/mm/tlb: Remove unnecessary uses of the inline keyword
cpumask: Mark functions as pure
x86/mm/tlb: Do not make is_lazy dirty for no reason
x86/mm/tlb: Privatize cpu_tlbstate
x86/mm/tlb: Flush remote and local TLBs concurrently
x86/mm/tlb: Open-code on_each_cpu_cond_mask() for tlb_is_not_lazy()
x86/mm/tlb: Unify flush_tlb_func_local() and flush_tlb_func_remote()
smp: Run functions concurrently in smp_call_function_many_cond()
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/smp.c | 212 | ||||
-rw-r--r-- | kernel/up.c | 38 |
2 files changed, 89 insertions, 161 deletions
diff --git a/kernel/smp.c b/kernel/smp.c index f472ef623956..e21074900006 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -850,12 +850,28 @@ call: } EXPORT_SYMBOL_GPL(smp_call_function_any); +/* + * Flags to be used as scf_flags argument of smp_call_function_many_cond(). + * + * %SCF_WAIT: Wait until function execution is completed + * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask + */ +#define SCF_WAIT (1U << 0) +#define SCF_RUN_LOCAL (1U << 1) + static void smp_call_function_many_cond(const struct cpumask *mask, smp_call_func_t func, void *info, - bool wait, smp_cond_func_t cond_func) + unsigned int scf_flags, + smp_cond_func_t cond_func) { + int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; - int cpu, next_cpu, this_cpu = smp_processor_id(); + bool wait = scf_flags & SCF_WAIT; + bool run_remote = false; + bool run_local = false; + int nr_cpus = 0; + + lockdep_assert_preemption_disabled(); /* * Can deadlock when called with interrupts disabled. @@ -863,8 +879,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ - WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() - && !oops_in_progress && !early_boot_irqs_disabled); + if (cpu_online(this_cpu) && !oops_in_progress && + !early_boot_irqs_disabled) + lockdep_assert_irqs_enabled(); /* * When @wait we can deadlock when we interrupt between llist_add() and @@ -874,70 +891,75 @@ static void smp_call_function_many_cond(const struct cpumask *mask, */ WARN_ON_ONCE(!in_task()); - /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ + /* Check if we need local execution. */ + if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask)) + run_local = true; + + /* Check if we need remote execution, i.e., any CPU excluding this one. */ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + if (cpu < nr_cpu_ids) + run_remote = true; - /* No online cpus? We're done. */ - if (cpu >= nr_cpu_ids) - return; + if (run_remote) { + cfd = this_cpu_ptr(&cfd_data); + cpumask_and(cfd->cpumask, mask, cpu_online_mask); + __cpumask_clear_cpu(this_cpu, cfd->cpumask); - /* Do we have another CPU which isn't us? */ - next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); - if (next_cpu == this_cpu) - next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); - - /* Fastpath: do that cpu by itself. */ - if (next_cpu >= nr_cpu_ids) { - if (!cond_func || cond_func(cpu, info)) - smp_call_function_single(cpu, func, info, wait); - return; - } + cpumask_clear(cfd->cpumask_ipi); + for_each_cpu(cpu, cfd->cpumask) { + struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); + call_single_data_t *csd = &pcpu->csd; - cfd = this_cpu_ptr(&cfd_data); + if (cond_func && !cond_func(cpu, info)) + continue; - cpumask_and(cfd->cpumask, mask, cpu_online_mask); - __cpumask_clear_cpu(this_cpu, cfd->cpumask); + csd_lock(csd); + if (wait) + csd->node.u_flags |= CSD_TYPE_SYNC; + csd->func = func; + csd->info = info; +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->node.src = smp_processor_id(); + csd->node.dst = cpu; +#endif + cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { + __cpumask_set_cpu(cpu, cfd->cpumask_ipi); + nr_cpus++; + last_cpu = cpu; - /* Some callers race with other cpus changing the passed mask */ - if (unlikely(!cpumask_weight(cfd->cpumask))) - return; + cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); + } else { + cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); + } + } - cpumask_clear(cfd->cpumask_ipi); - for_each_cpu(cpu, cfd->cpumask) { - struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); - call_single_data_t *csd = &pcpu->csd; + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PING); - if (cond_func && !cond_func(cpu, info)) - continue; + /* + * Choose the most efficient way to send an IPI. Note that the + * number of CPUs might be zero due to concurrent changes to the + * provided mask. + */ + if (nr_cpus == 1) + send_call_function_single_ipi(last_cpu); + else if (likely(nr_cpus > 1)) + arch_send_call_function_ipi_mask(cfd->cpumask_ipi); - csd_lock(csd); - if (wait) - csd->node.u_flags |= CSD_TYPE_SYNC; - csd->func = func; - csd->info = info; -#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->node.src = smp_processor_id(); - csd->node.dst = cpu; -#endif - cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); - if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { - __cpumask_set_cpu(cpu, cfd->cpumask_ipi); - cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); - } else { - cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); - } + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED); } - /* Send a message to all CPUs in the map */ - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, - CFD_SEQ_NOCPU, CFD_SEQ_PING); - arch_send_call_function_ipi_mask(cfd->cpumask_ipi); - cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, - CFD_SEQ_NOCPU, CFD_SEQ_PINGED); + if (run_local && (!cond_func || cond_func(this_cpu, info))) { + unsigned long flags; - if (wait) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } + + if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; @@ -948,12 +970,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask, } /** - * smp_call_function_many(): Run a function on a set of other CPUs. + * smp_call_function_many(): Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed - * on other CPUs. + * @flags: Bitmask that controls the operation. If %SCF_WAIT is set, wait + * (atomically) until function has completed on other CPUs. If + * %SCF_RUN_LOCAL is set, the function will also be run locally + * if the local CPU is set in the @cpumask. * * If @wait is true, then returns once @func has returned. * @@ -964,7 +988,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask, void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { - smp_call_function_many_cond(mask, func, info, wait, NULL); + smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL); } EXPORT_SYMBOL(smp_call_function_many); @@ -1076,56 +1100,6 @@ void __init smp_init(void) } /* - * Call a function on all processors. May be used during early boot while - * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead - * of local_irq_disable/enable(). - */ -void on_each_cpu(smp_call_func_t func, void *info, int wait) -{ - unsigned long flags; - - preempt_disable(); - smp_call_function(func, info, wait); - local_irq_save(flags); - func(info); - local_irq_restore(flags); - preempt_enable(); -} -EXPORT_SYMBOL(on_each_cpu); - -/** - * on_each_cpu_mask(): Run a function on processors specified by - * cpumask, which may include the local processor. - * @mask: The set of cpus to run on (only runs on online subset). - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed - * on other CPUs. - * - * If @wait is true, then returns once @func has returned. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. The - * exception is that it may be used during early boot while - * early_boot_irqs_disabled is set. - */ -void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, - void *info, bool wait) -{ - int cpu = get_cpu(); - - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(cpu, mask)) { - unsigned long flags; - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } - put_cpu(); -} -EXPORT_SYMBOL(on_each_cpu_mask); - -/* * on_each_cpu_cond(): Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local @@ -1150,27 +1124,17 @@ EXPORT_SYMBOL(on_each_cpu_mask); void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) { - int cpu = get_cpu(); + unsigned int scf_flags = SCF_RUN_LOCAL; - smp_call_function_many_cond(mask, func, info, wait, cond_func); - if (cpumask_test_cpu(cpu, mask) && cond_func(cpu, info)) { - unsigned long flags; + if (wait) + scf_flags |= SCF_WAIT; - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } - put_cpu(); + preempt_disable(); + smp_call_function_many_cond(mask, func, info, scf_flags, cond_func); + preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); -void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait) -{ - on_each_cpu_cond_mask(cond_func, func, info, wait, cpu_online_mask); -} -EXPORT_SYMBOL(on_each_cpu_cond); - static void do_nothing(void *unused) { } diff --git a/kernel/up.c b/kernel/up.c index c6f323dcd45b..bf20b4a9af60 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -36,35 +36,6 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) } EXPORT_SYMBOL(smp_call_function_single_async); -void on_each_cpu(smp_call_func_t func, void *info, int wait) -{ - unsigned long flags; - - local_irq_save(flags); - func(info); - local_irq_restore(flags); -} -EXPORT_SYMBOL(on_each_cpu); - -/* - * Note we still need to test the mask even for UP - * because we actually can get an empty mask from - * code that on SMP might call us without the local - * CPU in the mask. - */ -void on_each_cpu_mask(const struct cpumask *mask, - smp_call_func_t func, void *info, bool wait) -{ - unsigned long flags; - - if (cpumask_test_cpu(0, mask)) { - local_irq_save(flags); - func(info); - local_irq_restore(flags); - } -} -EXPORT_SYMBOL(on_each_cpu_mask); - /* * Preemption is disabled here to make sure the cond_func is called under the * same condtions in UP and SMP. @@ -75,7 +46,7 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, unsigned long flags; preempt_disable(); - if (cond_func(0, info)) { + if ((!cond_func || cond_func(0, info)) && cpumask_test_cpu(0, mask)) { local_irq_save(flags); func(info); local_irq_restore(flags); @@ -84,13 +55,6 @@ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, } EXPORT_SYMBOL(on_each_cpu_cond_mask); -void on_each_cpu_cond(smp_cond_func_t cond_func, smp_call_func_t func, - void *info, bool wait) -{ - on_each_cpu_cond_mask(cond_func, func, info, wait, NULL); -} -EXPORT_SYMBOL(on_each_cpu_cond); - int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { int ret; |