diff options
Diffstat (limited to 'arch/s390/kernel/hiperdispatch.c')
-rw-r--r-- | arch/s390/kernel/hiperdispatch.c | 430 |
1 files changed, 430 insertions, 0 deletions
diff --git a/arch/s390/kernel/hiperdispatch.c b/arch/s390/kernel/hiperdispatch.c new file mode 100644 index 000000000000..2a99a216ab62 --- /dev/null +++ b/arch/s390/kernel/hiperdispatch.c @@ -0,0 +1,430 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2024 + */ + +#define KMSG_COMPONENT "hd" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +/* + * Hiperdispatch: + * Dynamically calculates the optimum number of high capacity COREs + * by considering the state the system is in. When hiperdispatch decides + * that a capacity update is necessary, it schedules a topology update. + * During topology updates the CPU capacities are always re-adjusted. + * + * There is two places where CPU capacities are being accessed within + * hiperdispatch. + * -> hiperdispatch's reoccuring work function reads CPU capacities to + * determine high capacity CPU count. + * -> during a topology update hiperdispatch's adjustment function + * updates CPU capacities. + * These two can run on different CPUs in parallel which can cause + * hiperdispatch to make wrong decisions. This can potentially cause + * some overhead by leading to extra rebuild_sched_domains() calls + * for correction. Access to capacities within hiperdispatch has to be + * serialized to prevent the overhead. + * + * Hiperdispatch decision making revolves around steal time. + * HD_STEAL_THRESHOLD value is taken as reference. Whenever steal time + * crosses the threshold value hiperdispatch falls back to giving high + * capacities to entitled CPUs. When steal time drops below the + * threshold boundary, hiperdispatch utilizes all CPUs by giving all + * of them high capacity. + * + * The theory behind HD_STEAL_THRESHOLD is related to the SMP thread + * performance. Comparing the throughput of; + * - single CORE, with N threads, running N tasks + * - N separate COREs running N tasks, + * using individual COREs for individual tasks yield better + * performance. This performance difference is roughly ~30% (can change + * between machine generations) + * + * Hiperdispatch tries to hint scheduler to use individual COREs for + * each task, as long as steal time on those COREs are less than 30%, + * therefore delaying the throughput loss caused by using SMP threads. + */ + +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/device.h> +#include <linux/kernel_stat.h> +#include <linux/kstrtox.h> +#include <linux/ktime.h> +#include <linux/sysctl.h> +#include <linux/types.h> +#include <linux/workqueue.h> +#include <asm/hiperdispatch.h> +#include <asm/setup.h> +#include <asm/smp.h> +#include <asm/topology.h> + +#define CREATE_TRACE_POINTS +#include <asm/trace/hiperdispatch.h> + +#define HD_DELAY_FACTOR (4) +#define HD_DELAY_INTERVAL (HZ / 4) +#define HD_STEAL_THRESHOLD 30 +#define HD_STEAL_AVG_WEIGHT 16 + +static cpumask_t hd_vl_coremask; /* Mask containing all vertical low COREs */ +static cpumask_t hd_vmvl_cpumask; /* Mask containing vertical medium and low CPUs */ +static int hd_high_capacity_cores; /* Current CORE count with high capacity */ +static int hd_entitled_cores; /* Total vertical high and medium CORE count */ +static int hd_online_cores; /* Current online CORE count */ + +static unsigned long hd_previous_steal; /* Previous iteration's CPU steal timer total */ +static unsigned long hd_high_time; /* Total time spent while all cpus have high capacity */ +static unsigned long hd_low_time; /* Total time spent while vl cpus have low capacity */ +static atomic64_t hd_adjustments; /* Total occurrence count of hiperdispatch adjustments */ + +static unsigned int hd_steal_threshold = HD_STEAL_THRESHOLD; +static unsigned int hd_delay_factor = HD_DELAY_FACTOR; +static int hd_enabled; + +static void hd_capacity_work_fn(struct work_struct *work); +static DECLARE_DELAYED_WORK(hd_capacity_work, hd_capacity_work_fn); + +static int hd_set_hiperdispatch_mode(int enable) +{ + if (!MACHINE_HAS_TOPOLOGY) + enable = 0; + if (hd_enabled == enable) + return 0; + hd_enabled = enable; + return 1; +} + +void hd_reset_state(void) +{ + cpumask_clear(&hd_vl_coremask); + cpumask_clear(&hd_vmvl_cpumask); + hd_entitled_cores = 0; + hd_online_cores = 0; +} + +void hd_add_core(int cpu) +{ + const struct cpumask *siblings; + int polarization; + + hd_online_cores++; + polarization = smp_cpu_get_polarization(cpu); + siblings = topology_sibling_cpumask(cpu); + switch (polarization) { + case POLARIZATION_VH: + hd_entitled_cores++; + break; + case POLARIZATION_VM: + hd_entitled_cores++; + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + case POLARIZATION_VL: + cpumask_set_cpu(cpu, &hd_vl_coremask); + cpumask_or(&hd_vmvl_cpumask, &hd_vmvl_cpumask, siblings); + break; + } +} + +/* Serialize update and read operations of debug counters. */ +static DEFINE_MUTEX(hd_counter_mutex); + +static void hd_update_times(void) +{ + static ktime_t prev; + ktime_t now; + + /* + * Check if hiperdispatch is active, if not set the prev to 0. + * This way it is possible to differentiate the first update iteration after + * enabling hiperdispatch. + */ + if (hd_entitled_cores == 0 || hd_enabled == 0) { + prev = ktime_set(0, 0); + return; + } + now = ktime_get(); + if (ktime_after(prev, 0)) { + if (hd_high_capacity_cores == hd_online_cores) + hd_high_time += ktime_ms_delta(now, prev); + else + hd_low_time += ktime_ms_delta(now, prev); + } + prev = now; +} + +static void hd_update_capacities(void) +{ + int cpu, upscaling_cores; + unsigned long capacity; + + upscaling_cores = hd_high_capacity_cores - hd_entitled_cores; + capacity = upscaling_cores > 0 ? CPU_CAPACITY_HIGH : CPU_CAPACITY_LOW; + hd_high_capacity_cores = hd_entitled_cores; + for_each_cpu(cpu, &hd_vl_coremask) { + smp_set_core_capacity(cpu, capacity); + if (capacity != CPU_CAPACITY_HIGH) + continue; + hd_high_capacity_cores++; + upscaling_cores--; + if (upscaling_cores == 0) + capacity = CPU_CAPACITY_LOW; + } +} + +void hd_disable_hiperdispatch(void) +{ + cancel_delayed_work_sync(&hd_capacity_work); + hd_high_capacity_cores = hd_online_cores; + hd_previous_steal = 0; +} + +int hd_enable_hiperdispatch(void) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + mutex_unlock(&hd_counter_mutex); + if (hd_enabled == 0) + return 0; + if (hd_entitled_cores == 0) + return 0; + if (hd_online_cores <= hd_entitled_cores) + return 0; + mod_delayed_work(system_wq, &hd_capacity_work, HD_DELAY_INTERVAL * hd_delay_factor); + hd_update_capacities(); + return 1; +} + +static unsigned long hd_steal_avg(unsigned long new) +{ + static unsigned long steal; + + steal = (steal * (HD_STEAL_AVG_WEIGHT - 1) + new) / HD_STEAL_AVG_WEIGHT; + return steal; +} + +static unsigned long hd_calculate_steal_percentage(void) +{ + unsigned long time_delta, steal_delta, steal, percentage; + static ktime_t prev; + int cpus, cpu; + ktime_t now; + + cpus = 0; + steal = 0; + percentage = 0; + for_each_cpu(cpu, &hd_vmvl_cpumask) { + steal += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL]; + cpus++; + } + /* + * If there is no vertical medium and low CPUs steal time + * is 0 as vertical high CPUs shouldn't experience steal time. + */ + if (cpus == 0) + return percentage; + now = ktime_get(); + time_delta = ktime_to_ns(ktime_sub(now, prev)); + if (steal > hd_previous_steal && hd_previous_steal != 0) { + steal_delta = (steal - hd_previous_steal) * 100 / time_delta; + percentage = steal_delta / cpus; + } + hd_previous_steal = steal; + prev = now; + return percentage; +} + +static void hd_capacity_work_fn(struct work_struct *work) +{ + unsigned long steal_percentage, new_cores; + + mutex_lock(&smp_cpu_state_mutex); + /* + * If online cores are less or equal to entitled cores hiperdispatch + * does not need to make any adjustments, call a topology update to + * disable hiperdispatch. + * Normally this check is handled on topology update, but during cpu + * unhotplug, topology and cpu mask updates are done in reverse + * order, causing hd_enable_hiperdispatch() to get stale data. + */ + if (hd_online_cores <= hd_entitled_cores) { + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return; + } + steal_percentage = hd_steal_avg(hd_calculate_steal_percentage()); + if (steal_percentage < hd_steal_threshold) + new_cores = hd_online_cores; + else + new_cores = hd_entitled_cores; + if (hd_high_capacity_cores != new_cores) { + trace_s390_hd_rebuild_domains(hd_high_capacity_cores, new_cores); + hd_high_capacity_cores = new_cores; + atomic64_inc(&hd_adjustments); + topology_schedule_update(); + } + trace_s390_hd_work_fn(steal_percentage, hd_entitled_cores, hd_high_capacity_cores); + mutex_unlock(&smp_cpu_state_mutex); + schedule_delayed_work(&hd_capacity_work, HD_DELAY_INTERVAL); +} + +static int hiperdispatch_ctl_handler(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int hiperdispatch; + int rc; + struct ctl_table ctl_entry = { + .procname = ctl->procname, + .data = &hiperdispatch, + .maxlen = sizeof(int), + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }; + + hiperdispatch = hd_enabled; + rc = proc_douintvec_minmax(&ctl_entry, write, buffer, lenp, ppos); + if (rc < 0 || !write) + return rc; + mutex_lock(&smp_cpu_state_mutex); + if (hd_set_hiperdispatch_mode(hiperdispatch)) + topology_schedule_update(); + mutex_unlock(&smp_cpu_state_mutex); + return 0; +} + +static struct ctl_table hiperdispatch_ctl_table[] = { + { + .procname = "hiperdispatch", + .mode = 0644, + .proc_handler = hiperdispatch_ctl_handler, + }, +}; + +static ssize_t hd_steal_threshold_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_steal_threshold); +} + +static ssize_t hd_steal_threshold_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (val > 100) + return -ERANGE; + hd_steal_threshold = val; + return count; +} + +static DEVICE_ATTR_RW(hd_steal_threshold); + +static ssize_t hd_delay_factor_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return sysfs_emit(buf, "%u\n", hd_delay_factor); +} + +static ssize_t hd_delay_factor_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + unsigned int val; + int rc; + + rc = kstrtouint(buf, 0, &val); + if (rc) + return rc; + if (!val) + return -ERANGE; + hd_delay_factor = val; + return count; +} + +static DEVICE_ATTR_RW(hd_delay_factor); + +static struct attribute *hd_attrs[] = { + &dev_attr_hd_steal_threshold.attr, + &dev_attr_hd_delay_factor.attr, + NULL, +}; + +static const struct attribute_group hd_attr_group = { + .name = "hiperdispatch", + .attrs = hd_attrs, +}; + +static int hd_greedy_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_high_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_greedy_time_fops, hd_greedy_time_get, NULL, "%llu\n"); + +static int hd_conservative_time_get(void *unused, u64 *val) +{ + mutex_lock(&hd_counter_mutex); + hd_update_times(); + *val = hd_low_time; + mutex_unlock(&hd_counter_mutex); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_conservative_time_fops, hd_conservative_time_get, NULL, "%llu\n"); + +static int hd_adjustment_count_get(void *unused, u64 *val) +{ + *val = atomic64_read(&hd_adjustments); + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(hd_adjustments_fops, hd_adjustment_count_get, NULL, "%llu\n"); + +static void __init hd_create_debugfs_counters(void) +{ + struct dentry *dir; + + dir = debugfs_create_dir("hiperdispatch", arch_debugfs_dir); + debugfs_create_file("conservative_time_ms", 0400, dir, NULL, &hd_conservative_time_fops); + debugfs_create_file("greedy_time_ms", 0400, dir, NULL, &hd_greedy_time_fops); + debugfs_create_file("adjustment_count", 0400, dir, NULL, &hd_adjustments_fops); +} + +static void __init hd_create_attributes(void) +{ + struct device *dev; + + dev = bus_get_dev_root(&cpu_subsys); + if (!dev) + return; + if (sysfs_create_group(&dev->kobj, &hd_attr_group)) + pr_warn("Unable to create hiperdispatch attribute group\n"); + put_device(dev); +} + +static int __init hd_init(void) +{ + if (IS_ENABLED(CONFIG_HIPERDISPATCH_ON)) { + hd_set_hiperdispatch_mode(1); + topology_schedule_update(); + } + if (!register_sysctl("s390", hiperdispatch_ctl_table)) + pr_warn("Failed to register s390.hiperdispatch sysctl attribute\n"); + hd_create_debugfs_counters(); + hd_create_attributes(); + return 0; +} +late_initcall(hd_init); |