diff options
-rw-r--r-- | kernel/cgroup/cpuset.c | 335 |
1 files changed, 306 insertions, 29 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0419654f3004..7ac320e079b8 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -192,6 +192,9 @@ struct cpuset { /* Handle for cpuset.cpus.partition */ struct cgroup_file partition_file; + + /* Remote partition silbling list anchored at remote_children */ + struct list_head remote_sibling; }; /* @@ -199,6 +202,9 @@ struct cpuset { */ static cpumask_var_t subpartitions_cpus; +/* List of remote partition root children */ +static struct list_head remote_children; + /* * Partition root states: * @@ -348,6 +354,7 @@ static struct cpuset top_cpuset = { .flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), .partition_root_state = PRS_ROOT, + .remote_sibling = LIST_HEAD_INIT(top_cpuset.remote_sibling), }; /** @@ -1434,6 +1441,211 @@ static bool compute_effective_exclusive_cpumask(struct cpuset *cs, return cpumask_and(xcpus, xcpus, parent->effective_xcpus); } +static inline bool is_remote_partition(struct cpuset *cs) +{ + return !list_empty(&cs->remote_sibling); +} + +static inline bool is_local_partition(struct cpuset *cs) +{ + return is_partition_valid(cs) && !is_remote_partition(cs); +} + +/* + * remote_partition_enable - Enable current cpuset as a remote partition root + * @cs: the cpuset to update + * @tmp: temparary masks + * Return: 1 if successful, 0 if error + * + * Enable the current cpuset to become a remote partition root taking CPUs + * directly from the top cpuset. cpuset_mutex must be held by the caller. + */ +static int remote_partition_enable(struct cpuset *cs, struct tmpmasks *tmp) +{ + /* + * The user must have sysadmin privilege. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + /* + * The requested exclusive_cpus must not be allocated to other + * partitions and it can't use up all the root's effective_cpus. + * + * Note that if there is any local partition root above it or + * remote partition root underneath it, its exclusive_cpus must + * have overlapped with subpartitions_cpus. + */ + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + if (cpumask_empty(tmp->new_cpus) || + cpumask_intersects(tmp->new_cpus, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus)) + return 0; + + spin_lock_irq(&callback_lock); + cpumask_andnot(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->new_cpus); + cpumask_or(subpartitions_cpus, + subpartitions_cpus, tmp->new_cpus); + + if (cs->use_parent_ecpus) { + struct cpuset *parent = parent_cs(cs); + + cs->use_parent_ecpus = false; + parent->child_ecpus_count--; + } + list_add(&cs->remote_sibling, &remote_children); + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + + return 1; +} + +/* + * remote_partition_disable - Remove current cpuset from remote partition list + * @cs: the cpuset to update + * @tmp: temparary masks + * + * The effective_cpus is also updated. + * + * cpuset_mutex must be held by the caller. + */ +static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp) +{ + compute_effective_exclusive_cpumask(cs, tmp->new_cpus); + WARN_ON_ONCE(!is_remote_partition(cs)); + WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, subpartitions_cpus)); + + spin_lock_irq(&callback_lock); + cpumask_andnot(subpartitions_cpus, + subpartitions_cpus, tmp->new_cpus); + cpumask_and(tmp->new_cpus, + tmp->new_cpus, cpu_active_mask); + cpumask_or(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->new_cpus); + list_del_init(&cs->remote_sibling); + cs->partition_root_state = -cs->partition_root_state; + if (!cs->prs_err) + cs->prs_err = PERR_INVCPUS; + reset_partition_data(cs); + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); +} + +/* + * remote_cpus_update - cpus_exclusive change of remote partition + * @cs: the cpuset to be updated + * @newmask: the new effective_xcpus mask + * @tmp: temparary masks + * + * top_cpuset and subpartitions_cpus will be updated or partition can be + * invalidated. + */ +static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask, + struct tmpmasks *tmp) +{ + bool adding, deleting; + + if (WARN_ON_ONCE(!is_remote_partition(cs))) + return; + + WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus)); + + if (cpumask_empty(newmask)) + goto invalidate; + + adding = cpumask_andnot(tmp->addmask, newmask, cs->effective_xcpus); + deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, newmask); + + /* + * Additions of remote CPUs is only allowed if those CPUs are + * not allocated to other partitions and there are effective_cpus + * left in the top cpuset. + */ + if (adding && (!capable(CAP_SYS_ADMIN) || + cpumask_intersects(tmp->addmask, subpartitions_cpus) || + cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))) + goto invalidate; + + spin_lock_irq(&callback_lock); + if (adding) { + cpumask_or(subpartitions_cpus, + subpartitions_cpus, tmp->addmask); + cpumask_andnot(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->addmask); + } + if (deleting) { + cpumask_andnot(subpartitions_cpus, + subpartitions_cpus, tmp->delmask); + cpumask_and(tmp->delmask, + tmp->delmask, cpu_active_mask); + cpumask_or(top_cpuset.effective_cpus, + top_cpuset.effective_cpus, tmp->delmask); + } + spin_unlock_irq(&callback_lock); + + /* + * Proprogate changes in top_cpuset's effective_cpus down the hierarchy. + */ + update_tasks_cpumask(&top_cpuset, tmp->new_cpus); + update_sibling_cpumasks(&top_cpuset, NULL, tmp); + return; + +invalidate: + remote_partition_disable(cs, tmp); +} + +/* + * remote_partition_check - check if a child remote partition needs update + * @cs: the cpuset to be updated + * @newmask: the new effective_xcpus mask + * @delmask: temporary mask for deletion (not in tmp) + * @tmp: temparary masks + * + * This should be called before the given cs has updated its cpus_allowed + * and/or effective_xcpus. + */ +static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask, + struct cpumask *delmask, struct tmpmasks *tmp) +{ + struct cpuset *child, *next; + int disable_cnt = 0; + + /* + * Compute the effective exclusive CPUs that will be deleted. + */ + if (!cpumask_andnot(delmask, cs->effective_xcpus, newmask) || + !cpumask_intersects(delmask, subpartitions_cpus)) + return; /* No deletion of exclusive CPUs in partitions */ + + /* + * Searching the remote children list to look for those that will + * be impacted by the deletion of exclusive CPUs. + * + * Since a cpuset must be removed from the remote children list + * before it can go offline and holding cpuset_mutex will prevent + * any change in cpuset status. RCU read lock isn't needed. + */ + lockdep_assert_held(&cpuset_mutex); + list_for_each_entry_safe(child, next, &remote_children, remote_sibling) + if (cpumask_intersects(child->effective_cpus, delmask)) { + remote_partition_disable(child, tmp); + disable_cnt++; + } + if (disable_cnt) + rebuild_sched_domains_locked(); +} + /** * update_parent_effective_cpumask - update effective_cpus mask of parent cpuset * @cs: The cpuset that requests change in partition root state @@ -1548,7 +1760,7 @@ static int update_parent_effective_cpumask(struct cpuset *cs, int cmd, subparts_delta++; } else if (cmd == partcmd_disable) { /* - n* May need to add cpus to parent's effective_cpus for + * May need to add cpus to parent's effective_cpus for * valid partition root. */ adding = !is_prs_invalid(old_prs) && @@ -1749,7 +1961,7 @@ write_error: * @new_ecpus: previously computed effective_cpus to be updated * * Compute the effective_cpus of a partition root by scanning effective_xcpus - * of child partition roots and exclusing their effective_xcpus. + * of child partition roots and excluding their effective_xcpus. * * This has the side effect of invalidating valid child partition roots, * if necessary. Since it is called from either cpuset_hotplug_update_tasks() @@ -1840,9 +2052,17 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, cs) { struct cpuset *parent = parent_cs(cp); + bool remote = is_remote_partition(cp); bool update_parent = false; - compute_effective_cpumask(tmp->new_cpus, cp, parent); + /* + * Skip descendent remote partition that acquires CPUs + * directly from top cpuset unless it is cs. + */ + if (remote && (cp != cs)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } /* * Update effective_xcpus if exclusive_cpus set. @@ -1854,8 +2074,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, spin_unlock_irq(&callback_lock); } - if (is_partition_valid(parent) && is_partition_valid(cp)) + old_prs = new_prs = cp->partition_root_state; + if (remote || (is_partition_valid(parent) && + is_partition_valid(cp))) compute_partition_effective_cpumask(cp, tmp->new_cpus); + else + compute_effective_cpumask(tmp->new_cpus, cp, parent); /* * A partition with no effective_cpus is allowed as long as @@ -1873,7 +2097,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, * it is a partition root that has explicitly distributed * out all its CPUs. */ - if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) { + if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus)) { cpumask_copy(tmp->new_cpus, parent->effective_cpus); if (!cp->use_parent_ecpus) { cp->use_parent_ecpus = true; @@ -1885,6 +2109,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, parent->child_ecpus_count--; } + if (remote) + goto get_css; + /* * Skip the whole subtree if * 1) the cpumask remains the same, @@ -1907,7 +2134,6 @@ update_parent_effective: * update_tasks_cpumask() again for tasks in the parent * cpuset if the parent's effective_cpus changes. */ - old_prs = new_prs = cp->partition_root_state; if ((cp != cs) && old_prs) { switch (parent->partition_root_state) { case PRS_ROOT: @@ -1929,7 +2155,7 @@ update_parent_effective: break; } } - +get_css: if (!css_tryget_online(&cp->css)) continue; rcu_read_unlock(); @@ -1953,13 +2179,8 @@ update_parent_effective: if ((new_prs > 0) && cpumask_empty(cp->exclusive_cpus)) cpumask_and(cp->effective_xcpus, cp->cpus_allowed, parent->effective_xcpus); - if (new_prs < 0) { - /* Reset partition data */ - cp->nr_subparts = 0; - cpumask_clear(cp->effective_xcpus); - if (is_cpu_exclusive(cp)) - clear_bit(CS_CPU_EXCLUSIVE, &cp->flags); - } + else if (new_prs < 0) + reset_partition_data(cp); spin_unlock_irq(&callback_lock); notify_partition_change(cp, old_prs); @@ -2157,12 +2378,23 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, goto out_free; if (is_partition_valid(cs)) { - if (invalidate) + /* + * Call remote_cpus_update() to handle valid remote partition + */ + if (is_remote_partition(cs)) + remote_cpus_update(cs, trialcs->effective_xcpus, &tmp); + else if (invalidate) update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); else update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); + } else if (!cpumask_empty(cs->exclusive_cpus)) { + /* + * Use trialcs->effective_cpus as a temp cpumask + */ + remote_partition_check(cs, trialcs->effective_xcpus, + trialcs->effective_cpus, &tmp); } spin_lock_irq(&callback_lock); @@ -2203,6 +2435,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (!*buf) { cpumask_clear(trialcs->exclusive_cpus); + cpumask_clear(trialcs->effective_xcpus); } else { retval = cpulist_parse(buf, trialcs->exclusive_cpus); if (retval < 0) @@ -2218,7 +2451,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, if (alloc_cpumasks(NULL, &tmp)) return -ENOMEM; - compute_effective_exclusive_cpumask(trialcs, NULL); + if (*buf) + compute_effective_exclusive_cpumask(trialcs, NULL); /* * Check all the descendants in update_cpumasks_hier() if @@ -2240,14 +2474,26 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs, cs->prs_err = PERR_NOCPUS; } - if (invalidate) + if (is_remote_partition(cs)) { + if (invalidate) + remote_partition_disable(cs, &tmp); + else + remote_cpus_update(cs, trialcs->effective_xcpus, + &tmp); + } else if (invalidate) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, &tmp); - else + } else { update_parent_effective_cpumask(cs, partcmd_update, trialcs->effective_xcpus, &tmp); + } + } else if (!cpumask_empty(trialcs->exclusive_cpus)) { + /* + * Use trialcs->effective_cpus as a temp cpumask + */ + remote_partition_check(cs, trialcs->effective_xcpus, + trialcs->effective_cpus, &tmp); } - spin_lock_irq(&callback_lock); cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus); cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus); @@ -2643,18 +2889,25 @@ out: static int update_prstate(struct cpuset *cs, int new_prs) { int err = PERR_NONE, old_prs = cs->partition_root_state; + struct cpuset *parent = parent_cs(cs); struct tmpmasks tmpmask; if (old_prs == new_prs) return 0; /* - * For a previously invalid partition root, leave it at being - * invalid if new_prs is not "member". + * For a previously invalid partition root with valid partition root + * parent, treat it as if it is a "member". Otherwise, reject it as + * remote partition cannot currently self-recover from an invalid + * state. */ if (new_prs && is_prs_invalid(old_prs)) { - cs->partition_root_state = -new_prs; - return 0; + if (is_partition_valid(parent)) { + old_prs = PRS_MEMBER; + } else { + cs->partition_root_state = -new_prs; + return 0; + } } if (alloc_cpumasks(NULL, &tmpmask)) @@ -2665,8 +2918,6 @@ static int update_prstate(struct cpuset *cs, int new_prs) * later if partition becomes invalid. */ if ((new_prs > 0) && cpumask_empty(cs->exclusive_cpus)) { - struct cpuset *parent = parent_cs(cs); - spin_lock_irq(&callback_lock); cpumask_and(cs->effective_xcpus, cs->cpus_allowed, parent->effective_xcpus); @@ -2688,6 +2939,12 @@ static int update_prstate(struct cpuset *cs, int new_prs) err = update_parent_effective_cpumask(cs, partcmd_enable, NULL, &tmpmask); + /* + * If an attempt to become local partition root fails, + * try to become a remote partition root instead. + */ + if (err && remote_partition_enable(cs, &tmpmask)) + err = 0; } else if (old_prs && new_prs) { /* * A change in load balance state only, no change in cpumasks. @@ -2698,8 +2955,11 @@ static int update_prstate(struct cpuset *cs, int new_prs) * Switching back to member is always allowed even if it * disables child partitions. */ - update_parent_effective_cpumask(cs, partcmd_disable, NULL, - &tmpmask); + if (is_remote_partition(cs)) + remote_partition_disable(cs, &tmpmask); + else + update_parent_effective_cpumask(cs, partcmd_disable, + NULL, &tmpmask); /* * Invalidation of child partitions will be done in @@ -3602,6 +3862,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css) nodes_clear(cs->effective_mems); fmeter_init(&cs->fmeter); cs->relax_domain_level = -1; + INIT_LIST_HEAD(&cs->remote_sibling); /* Set CS_MEMORY_MIGRATE for default hierarchy */ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) @@ -3637,6 +3898,11 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; + /* + * Clear CS_SCHED_LOAD_BALANCE if parent is isolated + */ + if (!is_sched_load_balance(parent)) + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); } /* @@ -3891,6 +4157,7 @@ int __init cpuset_init(void) fmeter_init(&top_cpuset.fmeter); set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); top_cpuset.relax_domain_level = -1; + INIT_LIST_HEAD(&remote_children); BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); @@ -4006,6 +4273,7 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) static nodemask_t new_mems; bool cpus_updated; bool mems_updated; + bool remote; struct cpuset *parent; retry: wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); @@ -4032,9 +4300,18 @@ retry: * Compute effective_cpus for valid partition root, may invalidate * child partition roots if necessary. */ - if (is_partition_valid(cs) && is_partition_valid(parent)) + remote = is_remote_partition(cs); + if (remote || (is_partition_valid(cs) && is_partition_valid(parent))) compute_partition_effective_cpumask(cs, &new_cpus); + if (remote && cpumask_empty(&new_cpus) && + partition_is_populated(cs, NULL)) { + remote_partition_disable(cs, tmp); + compute_effective_cpumask(&new_cpus, cs, parent); + remote = false; + cpuset_force_rebuild(); + } + /* * Force the partition to become invalid if either one of * the following conditions hold: @@ -4042,7 +4319,7 @@ retry: * 2) parent is invalid or doesn't grant any cpus to child * partitions. */ - if (is_partition_valid(cs) && (!is_partition_valid(parent) || + if (is_local_partition(cs) && (!is_partition_valid(parent) || tasks_nocpu_error(parent, cs, &new_cpus))) { update_parent_effective_cpumask(cs, partcmd_invalidate, NULL, tmp); compute_effective_cpumask(&new_cpus, cs, parent); |