Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r-- | kernel/cgroup/cpuset.c | 197
1 file changed, 142 insertions, 55 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index c12b9fdb22a4..40ec4abaf440 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -21,6 +21,7 @@
  * License.  See the file COPYING in the main directory of the Linux
  * distribution for more details.
  */
+#include "cgroup-internal.h"
 
 #include <linux/cpu.h>
 #include <linux/cpumask.h>
@@ -87,7 +88,7 @@ static const char * const perr_strings[] = {
 	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
 	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
 	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
-	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+	[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
 	[PERR_HKEEPING]  = "partition config conflicts with housekeeping setup",
 };
 
@@ -127,19 +128,28 @@ struct cpuset {
 	/*
 	 * Exclusive CPUs dedicated to current cgroup (default hierarchy only)
 	 *
-	 * This exclusive CPUs must be a subset of cpus_allowed. A parent
-	 * cgroup can only grant exclusive CPUs to one of its children.
+	 * The effective_cpus of a valid partition root comes solely from its
+	 * effective_xcpus and some of the effective_xcpus may be distributed
+	 * to sub-partitions below & hence excluded from its effective_cpus.
+	 * For a valid partition root, its effective_cpus have no relationship
+	 * with cpus_allowed unless its exclusive_cpus isn't set.
 	 *
-	 * When the cgroup becomes a valid partition root, effective_xcpus
-	 * defaults to cpus_allowed if not set. The effective_cpus of a valid
-	 * partition root comes solely from its effective_xcpus and some of the
-	 * effective_xcpus may be distributed to sub-partitions below & hence
-	 * excluded from its effective_cpus.
+	 * This value will only be set if either exclusive_cpus is set or
+	 * when this cpuset becomes a local partition root.
 	 */
 	cpumask_var_t effective_xcpus;
 
 	/*
 	 * Exclusive CPUs as requested by the user (default hierarchy only)
+	 *
+	 * Its value is independent of cpus_allowed and designates the set of
+	 * CPUs that can be granted to the current cpuset or its children when
+	 * it becomes a valid partition root. The effective set of exclusive
+	 * CPUs granted (effective_xcpus) depends on whether those exclusive
+	 * CPUs are passed down by its ancestors and not yet taken up by
+	 * another sibling partition root along the way.
+	 *
+	 * If its value isn't set, it defaults to cpus_allowed.
 	 */
 	cpumask_var_t exclusive_cpus;
 
@@ -169,7 +179,7 @@ struct cpuset {
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* number of valid sub-partitions */
+	/* number of valid local child partitions */
 	int nr_subparts;
 
 	/* partition root state */
@@ -230,6 +240,17 @@ static struct list_head remote_children;
  *   2 - partition root without load balancing (isolated)
  *  -1 - invalid partition root
  *  -2 - invalid isolated partition root
+ *
+ * There are 2 types of partitions - local or remote. Local partitions are
+ * those whose parents are partition root themselves. Setting of
+ * cpuset.cpus.exclusive are optional in setting up local partitions.
+ * Remote partitions are those whose parents are not partition roots. Passing
+ * down exclusive CPUs by setting cpuset.cpus.exclusive along its ancestor
+ * nodes are mandatory in creating a remote partition.
+ *
+ * For simplicity, a local partition can be created under a local or remote
+ * partition but a remote partition cannot have any partition root in its
+ * ancestor chain except the cgroup root.
  */
 #define PRS_MEMBER		0
 #define PRS_ROOT		1
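
The comment block just above the PRS_* definitions distinguishes local from remote partitions. As an illustration only (not part of this patch), here is a minimal userspace sketch of creating a remote partition: cpuset.cpus.exclusive is written on every node along the ancestor chain before the leaf is switched to a partition root. It assumes cgroup v2 mounted at /sys/fs/cgroup; the group names "mid" and "rpart" are hypothetical and error handling is trimmed.

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/stat.h>

    static void xwrite(const char *path, const char *val)
    {
        FILE *f = fopen(path, "w");

        if (!f || fputs(val, f) == EOF) {
            perror(path);
            exit(1);
        }
        fclose(f);
    }

    int main(void)
    {
        /* make the cpuset controller available below the root */
        xwrite("/sys/fs/cgroup/cgroup.subtree_control", "+cpuset");
        mkdir("/sys/fs/cgroup/mid", 0755);
        xwrite("/sys/fs/cgroup/mid/cgroup.subtree_control", "+cpuset");
        mkdir("/sys/fs/cgroup/mid/rpart", 0755);

        /* pass CPUs 2-3 down the ancestor chain; "mid" stays a member */
        xwrite("/sys/fs/cgroup/mid/cpuset.cpus.exclusive", "2-3");
        xwrite("/sys/fs/cgroup/mid/rpart/cpuset.cpus.exclusive", "2-3");

        /* "rpart" now becomes a remote partition root */
        xwrite("/sys/fs/cgroup/mid/rpart/cpuset.cpus.partition", "root");
        return 0;
    }

Because "mid" itself stays an ordinary member, "rpart" has no partition root in its ancestor chain except the cgroup root, which is exactly the remote-partition shape the comment describes.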
@@ -434,7 +455,7 @@ static struct cpuset top_cpuset = {
  * by other task, we use alloc_lock in the task_struct fields to protect
  * them.
  *
- * The cpuset_common_file_read() handlers only hold callback_lock across
+ * The cpuset_common_seq_show() handlers only hold callback_lock across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
  *
@@ -709,6 +730,19 @@ static inline void free_cpuset(struct cpuset *cs)
 	kfree(cs);
 }
 
+/* Return user specified exclusive CPUs */
+static inline struct cpumask *user_xcpus(struct cpuset *cs)
+{
+	return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
+						 : cs->exclusive_cpus;
+}
+
+static inline bool xcpus_empty(struct cpuset *cs)
+{
+	return cpumask_empty(cs->cpus_allowed) &&
+	       cpumask_empty(cs->exclusive_cpus);
+}
+
 static inline struct cpumask *fetch_xcpus(struct cpuset *cs)
 {
 	return !cpumask_empty(cs->exclusive_cpus) ? cs->exclusive_cpus :
@@ -825,17 +859,41 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 
 	/*
 	 * If either I or some sibling (!= me) is exclusive, we can't
-	 * overlap
+	 * overlap. exclusive_cpus cannot overlap with each other if set.
 	 */
 	ret = -EINVAL;
 	cpuset_for_each_child(c, css, par) {
-		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
-		    c != cur) {
+		bool txset, cxset;	/* Are exclusive_cpus set? */
+
+		if (c == cur)
+			continue;
+
+		txset = !cpumask_empty(trial->exclusive_cpus);
+		cxset = !cpumask_empty(c->exclusive_cpus);
+		if (is_cpu_exclusive(trial) || is_cpu_exclusive(c) ||
+		    (txset && cxset)) {
 			if (!cpusets_are_exclusive(trial, c))
 				goto out;
+		} else if (txset || cxset) {
+			struct cpumask *xcpus, *acpus;
+
+			/*
+			 * When just one of the exclusive_cpus's is set,
+			 * cpus_allowed of the other cpuset, if set, cannot be
+			 * a subset of it or none of those CPUs will be
+			 * available if these exclusive CPUs are activated.
+			 */
+			if (txset) {
+				xcpus = trial->exclusive_cpus;
+				acpus = c->cpus_allowed;
+			} else {
+				xcpus = c->exclusive_cpus;
+				acpus = trial->cpus_allowed;
+			}
+			if (!cpumask_empty(acpus) && cpumask_subset(acpus, xcpus))
+				goto out;
 		}
 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
-		    c != cur &&
 		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
 			goto out;
 	}
@@ -957,13 +1015,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int nslot;		/* next empty doms[] struct cpumask slot */
 	struct cgroup_subsys_state *pos_css;
 	bool root_load_balance = is_sched_load_balance(&top_cpuset);
+	bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
 
 	doms = NULL;
 	dattr = NULL;
 	csa = NULL;
 
 	/* Special case for the 99% of systems with one, full, sched domain */
-	if (root_load_balance && !top_cpuset.nr_subparts) {
+	if (root_load_balance && cpumask_empty(subpartitions_cpus)) {
+single_root_domain:
 		ndoms = 1;
 		doms = alloc_sched_domains(ndoms);
 		if (!doms)
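
The user_xcpus()/xcpus_empty() helpers added above, together with the reworked sibling loop in validate_change(), encode two rules: siblings that both set exclusive_cpus must not overlap, and when only one side sets exclusive_cpus, the other sibling's non-empty cpus_allowed must not be entirely swallowed by it. A standalone model of just that check (my simplification, not kernel code: 8-bit masks stand in for cpumasks and the CS_CPU_EXCLUSIVE flag path is ignored):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    typedef uint8_t mask_t;    /* one bit per CPU; 0 means "not set" */

    /* trial/sibling cpus_allowed (t_cpus/c_cpus) and exclusive_cpus */
    static bool siblings_ok(mask_t t_cpus, mask_t t_xcpus,
                            mask_t c_cpus, mask_t c_xcpus)
    {
        if (t_xcpus && c_xcpus)         /* both set: must not overlap */
            return !(t_xcpus & c_xcpus);
        if (t_xcpus || c_xcpus) {       /* exactly one side is set */
            mask_t xcpus = t_xcpus ? t_xcpus : c_xcpus;
            mask_t acpus = t_xcpus ? c_cpus : t_cpus;

            /* other side's cpus_allowed must not be a subset of xcpus */
            return !(acpus && (acpus & ~xcpus) == 0);
        }
        return true;                    /* neither set: no rule here */
    }

    int main(void)
    {
        assert(siblings_ok(0x0f, 0x03, 0x30, 0x30));  /* disjoint xcpus */
        assert(!siblings_ok(0x0f, 0x03, 0x03, 0x00)); /* 0x03 swallowed */
        assert(siblings_ok(0x0f, 0x03, 0x07, 0x00));  /* CPU 2 survives */
        return 0;
    }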
@@ -991,16 +1051,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
 		if (cp == &top_cpuset)
 			continue;
+
+		if (cgrpv2)
+			goto v2;
+
 		/*
+		 * v1:
 		 * Continue traversing beyond @cp iff @cp has some CPUs and
 		 * isn't load balancing.  The former is obvious.  The
 		 * latter: All child cpusets contain a subset of the
 		 * parent's cpus, so just skip them, and then we call
 		 * update_domain_attr_tree() to calc relax_domain_level of
 		 * the corresponding sched domain.
-		 *
-		 * If root is load-balancing, we can skip @cp if it
-		 * is a subset of the root's effective_cpus.
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
@@ -1008,20 +1070,39 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		      housekeeping_cpumask(HK_TYPE_DOMAIN))))
 			continue;
 
-		if (root_load_balance &&
-		    cpumask_subset(cp->cpus_allowed, top_cpuset.effective_cpus))
-			continue;
-
 		if (is_sched_load_balance(cp) &&
 		    !cpumask_empty(cp->effective_cpus))
 			csa[csn++] = cp;
 
-		/* skip @cp's subtree if not a partition root */
-		if (!is_partition_valid(cp))
+		/* skip @cp's subtree */
+		pos_css = css_rightmost_descendant(pos_css);
+		continue;
+
+v2:
+		/*
+		 * Only valid partition roots that are not isolated and with
+		 * non-empty effective_cpus will be saved into csn[].
+		 */
+		if ((cp->partition_root_state == PRS_ROOT) &&
+		    !cpumask_empty(cp->effective_cpus))
+			csa[csn++] = cp;
+
+		/*
+		 * Skip @cp's subtree if not a partition root and has no
+		 * exclusive CPUs to be granted to child cpusets.
+		 */
+		if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
			pos_css = css_rightmost_descendant(pos_css);
 	}
 	rcu_read_unlock();
 
+	/*
+	 * If there are only isolated partitions underneath the cgroup root,
+	 * we can optimize out unneeded sched domains scanning.
+	 */
+	if (root_load_balance && (csn == 1))
+		goto single_root_domain;
+
 	for (i = 0; i < csn; i++)
 		csa[i]->pn = i;
 	ndoms = csn;
@@ -1064,6 +1145,20 @@ restart:
 		dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr),
 				      GFP_KERNEL);
 
+	/*
+	 * Cgroup v2 doesn't support domain attributes, just set all of them
+	 * to SD_ATTR_INIT. Also non-isolating partition root CPUs are a
+	 * subset of HK_TYPE_DOMAIN housekeeping CPUs.
+	 */
+	if (cgrpv2) {
+		for (i = 0; i < ndoms; i++) {
+			cpumask_copy(doms[i], csa[i]->effective_cpus);
+			if (dattr)
+				dattr[i] = SD_ATTR_INIT;
+		}
+		goto done;
+	}
+
 	for (nslot = 0, i = 0; i < csn; i++) {
 		struct cpuset *a = csa[i];
 		struct cpumask *dp;
@@ -1223,7 +1318,7 @@ static void rebuild_sched_domains_locked(void)
 	 * root should be only a subset of the active CPUs.  Since a CPU in any
 	 * partition root could be offlined, all must be checked.
 	 */
-	if (top_cpuset.nr_subparts) {
+	if (!cpumask_empty(subpartitions_cpus)) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
 			if (!is_partition_valid(cs)) {
@@ -1338,7 +1433,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
  */
 static int update_partition_exclusive(struct cpuset *cs, int new_prs)
 {
-	bool exclusive = (new_prs > 0);
+	bool exclusive = (new_prs > PRS_MEMBER);
 
 	if (exclusive && !is_cpu_exclusive(cs)) {
 		if (update_flag(CS_CPU_EXCLUSIVE, cs, 1))
@@ -1532,7 +1627,7 @@ EXPORT_SYMBOL_GPL(cpuset_cpu_is_isolated);
  * Return: true if xcpus is not empty, false otherwise.
  *
  * Starting with exclusive_cpus (cpus_allowed if exclusive_cpus is not set),
- * it must be a subset of cpus_allowed and parent's effective_xcpus.
+ * it must be a subset of parent's effective_xcpus.
  */
 static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
 						struct cpumask *xcpus)
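
The v2 branch added to generate_sched_domains() above makes domain generation much simpler than the v1 path: every valid, load-balanced (PRS_ROOT) partition root with non-empty effective_cpus contributes exactly one domain, its mask copied verbatim and its attributes left at SD_ATTR_INIT. A toy model of that selection rule (illustrative only; plain integers stand in for cpumasks):

    #include <stdio.h>
    #include <stdint.h>

    struct part {
        uint64_t effective_cpus;    /* bit n == CPU n */
        int      isolated;          /* stand-in for PRS_ISOLATED */
    };

    int main(void)
    {
        const struct part parts[] = {
            { 0x0f, 0 },   /* balanced partition, CPUs 0-3: gets a domain */
            { 0x30, 0 },   /* balanced partition, CPUs 4-5: gets a domain */
            { 0xc0, 1 },   /* isolated, CPUs 6-7: no sched domain at all */
        };
        int i, ndoms = 0;

        for (i = 0; i < 3; i++) {
            if (parts[i].isolated || !parts[i].effective_cpus)
                continue;  /* mirrors the PRS_ROOT/non-empty test above */
            printf("domain %d: %#llx\n", ndoms++,
                   (unsigned long long)parts[i].effective_cpus);
        }
        return 0;
    }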
@@ -1542,12 +1637,7 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs,
 	if (!xcpus)
 		xcpus = cs->effective_xcpus;
 
-	if (!cpumask_empty(cs->exclusive_cpus))
-		cpumask_and(xcpus, cs->exclusive_cpus, cs->cpus_allowed);
-	else
-		cpumask_copy(xcpus, cs->cpus_allowed);
-
-	return cpumask_and(xcpus, xcpus, parent->effective_xcpus);
+	return cpumask_and(xcpus, user_xcpus(cs), parent->effective_xcpus);
 }
 
 static inline bool is_remote_partition(struct cpuset *cs)
@@ -1826,8 +1916,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 	 */
 	adding = deleting = false;
 	old_prs = new_prs = cs->partition_root_state;
-	xcpus = !cpumask_empty(cs->exclusive_cpus)
-	      ? cs->effective_xcpus : cs->cpus_allowed;
+	xcpus = user_xcpus(cs);
 
 	if (cmd == partcmd_invalidate) {
 		if (is_prs_invalid(old_prs))
@@ -1855,7 +1944,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
 			return is_partition_invalid(parent)
 			       ? PERR_INVPARENT : PERR_NOTPART;
 	}
-	if (!newmask && cpumask_empty(cs->cpus_allowed))
+	if (!newmask && xcpus_empty(cs))
 		return PERR_CPUSEMPTY;
 
 	nocpu = tasks_nocpu_error(parent, cs, xcpus);
@@ -2583,8 +2672,6 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 		retval = cpulist_parse(buf, trialcs->exclusive_cpus);
 		if (retval < 0)
 			return retval;
-		if (!is_cpu_exclusive(cs))
-			set_bit(CS_CPU_EXCLUSIVE, &trialcs->flags);
 	}
 
 	/* Nothing to do if the CPUs didn't change */
@@ -3071,9 +3158,9 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 				      ? partcmd_enable : partcmd_enablei;
 
 		/*
-		 * cpus_allowed cannot be empty.
+		 * cpus_allowed and exclusive_cpus cannot be both empty.
 		 */
-		if (cpumask_empty(cs->cpus_allowed)) {
+		if (xcpus_empty(cs)) {
 			err = PERR_CPUSEMPTY;
 			goto out;
 		}
@@ -4009,8 +4096,6 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 	}
 
 	__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-	nodes_clear(cs->mems_allowed);
-	nodes_clear(cs->effective_mems);
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 	INIT_LIST_HEAD(&cs->remote_sibling);
@@ -4040,6 +4125,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
 	if (is_spread_slab(parent))
 		set_bit(CS_SPREAD_SLAB, &cs->flags);
+	/*
+	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
+	 */
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+	    !is_sched_load_balance(parent))
+		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 
 	cpuset_inc();
 
@@ -4050,14 +4141,6 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 		cs->use_parent_ecpus = true;
 		parent->child_ecpus_count++;
 	}
-
-	/*
-	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
-	 */
-	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-	    !is_sched_load_balance(parent))
-		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
-
 	spin_unlock_irq(&callback_lock);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
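
The compute_effective_exclusive_cpumask() simplification at the top of this span also changes the semantics: the user's request (exclusive_cpus, falling back to cpus_allowed via user_xcpus()) is now constrained only by the parent's effective_xcpus, no longer ANDed with the cpuset's own cpus_allowed. A small standalone model of the visible behaviour (my mask-based approximation, not kernel code):

    #include <assert.h>
    #include <stdint.h>

    typedef uint64_t mask_t;

    struct cs {
        mask_t cpus_allowed;
        mask_t exclusive_cpus;      /* 0 means "not set by the user" */
        mask_t parent_exclusive;    /* parent's effective_xcpus */
    };

    /* mirrors user_xcpus(): exclusive_cpus if set, else cpus_allowed */
    static mask_t user_xcpus_m(const struct cs *cs)
    {
        return cs->exclusive_cpus ? cs->exclusive_cpus : cs->cpus_allowed;
    }

    /* mirrors the one-liner: user request ANDed with the parent grant */
    static mask_t effective_xcpus_m(const struct cs *cs)
    {
        return user_xcpus_m(cs) & cs->parent_exclusive;
    }

    int main(void)
    {
        /* exclusive_cpus unset: falls back to cpus_allowed */
        struct cs a = { .cpus_allowed = 0x0f, .parent_exclusive = 0x3f };
        assert(effective_xcpus_m(&a) == 0x0f);

        /* exclusive_cpus set: cpus_allowed no longer constrains it */
        struct cs b = { .cpus_allowed = 0x03, .exclusive_cpus = 0x0c,
                        .parent_exclusive = 0x3f };
        assert(effective_xcpus_m(&b) == 0x0c);
        return 0;
    }

The second case is what the new struct comment means by exclusive_cpus being "independent of cpus_allowed".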
@@ -4571,7 +4654,7 @@ static void cpuset_handle_hotplug(void)
 	 * In the rare case that hotplug removes all the cpus in
 	 * subpartitions_cpus, we assumed that cpus are updated.
 	 */
-	if (!cpus_updated && top_cpuset.nr_subparts)
+	if (!cpus_updated && !cpumask_empty(subpartitions_cpus))
 		cpus_updated = true;
 
 	/* For v1, synchronize cpus_allowed to cpu_active_mask */
@@ -5051,10 +5134,14 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
 	if (!buf)
 		goto out;
 
-	css = task_get_css(tsk, cpuset_cgrp_id);
-	retval = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
-				current->nsproxy->cgroup_ns);
-	css_put(css);
+	rcu_read_lock();
+	spin_lock_irq(&css_set_lock);
+	css = task_css(tsk, cpuset_cgrp_id);
+	retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
+				       current->nsproxy->cgroup_ns);
+	spin_unlock_irq(&css_set_lock);
+	rcu_read_unlock();
+
 	if (retval == -E2BIG)
 		retval = -ENAMETOOLONG;
 	if (retval < 0)
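
proc_cpuset_show() backs /proc/<pid>/cpuset. The rewritten body above looks up the task's css under css_set_lock inside an RCU read section and formats the path with cgroup_path_ns_locked(), instead of pinning the css with task_get_css()/css_put() and calling cgroup_path_ns(), so the path is generated while the task-to-css association cannot change underneath it. From userspace the interface is unchanged; a minimal consumer (assumes CONFIG_PROC_PID_CPUSET is enabled):

    #include <stdio.h>

    int main(void)
    {
        char line[4096];
        FILE *f = fopen("/proc/self/cpuset", "r");

        /* prints this task's cpuset cgroup path, e.g. "/mid/rpart" */
        if (f && fgets(line, sizeof(line), f))
            fputs(line, stdout);
        if (f)
            fclose(f);
        return f ? 0 : 1;
    }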