diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-16 14:47:16 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-16 14:47:16 -0700 |
commit | 825a3b2605c3aa193e0075d0f9c72e33c17ab16a (patch) | |
tree | e8665c4cc20076ae53165475839d36b4bc641cd3 /include | |
parent | cf6ed9a6682d3f171cf9550d4bbe0ef31b768a7e (diff) | |
parent | ef0491ea17f8019821c7e9c8e801184ecf17f85a (diff) |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- massive CPU hotplug rework (Thomas Gleixner)
- improve migration fairness (Peter Zijlstra)
- CPU load calculation updates/cleanups (Yuyang Du)
- cpufreq updates (Steve Muckle)
- nohz optimizations (Frederic Weisbecker)
- switch_mm() micro-optimization on x86 (Andy Lutomirski)
- ... lots of other enhancements, fixes and cleanups.
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (66 commits)
ARM: Hide finish_arch_post_lock_switch() from modules
sched/core: Provide a tsk_nr_cpus_allowed() helper
sched/core: Use tsk_cpus_allowed() instead of accessing ->cpus_allowed
sched/loadavg: Fix loadavg artifacts on fully idle and on fully loaded systems
sched/fair: Correct unit of load_above_capacity
sched/fair: Clean up scale confusion
sched/nohz: Fix affine unpinned timers mess
sched/fair: Fix fairness issue on migration
sched/core: Kill sched_class::task_waking to clean up the migration logic
sched/fair: Prepare to fix fairness problems on migration
sched/fair: Move record_wakee()
sched/core: Fix comment typo in wake_q_add()
sched/core: Remove unused variable
sched: Make hrtick_notifier an explicit call
sched/fair: Make ilb_notifier an explicit call
sched/hotplug: Make activate() the last hotplug step
sched/hotplug: Move migration CPU_DYING to sched_cpu_dying()
sched/migration: Move CPU_ONLINE into scheduler state
sched/migration: Move calc_load_migrate() into CPU_DYING
sched/migration: Move prepare transition to SCHED_STARTING state
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/cpu.h | 18 | ||||
-rw-r--r-- | include/linux/cpuhotplug.h | 2 | ||||
-rw-r--r-- | include/linux/cpumask.h | 6 | ||||
-rw-r--r-- | include/linux/lockdep.h | 23 | ||||
-rw-r--r-- | include/linux/mmu_context.h | 7 | ||||
-rw-r--r-- | include/linux/sched.h | 124 |
6 files changed, 136 insertions, 44 deletions
diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f9b1fab4388a..21597dcac0e2 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -59,25 +59,7 @@ struct notifier_block; * CPU notifier priorities. */ enum { - /* - * SCHED_ACTIVE marks a cpu which is coming up active during - * CPU_ONLINE and CPU_DOWN_FAILED and must be the first - * notifier. CPUSET_ACTIVE adjusts cpuset according to - * cpu_active mask right after SCHED_ACTIVE. During - * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are - * ordered in the similar way. - * - * This ordering guarantees consistent cpu_active mask and - * migration behavior to all cpu notifiers. - */ - CPU_PRI_SCHED_ACTIVE = INT_MAX, - CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, - CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, - CPU_PRI_CPUSET_INACTIVE = INT_MIN, - - /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, - CPU_PRI_MIGRATION = 10, /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 5d68e15e46b7..386374d19987 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -8,6 +8,7 @@ enum cpuhp_state { CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, + CPUHP_AP_SCHED_STARTING, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, @@ -16,6 +17,7 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 40cee6b77a93..e828cf65d7df 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -743,12 +743,10 @@ set_cpu_present(unsigned int cpu, bool present) static inline void set_cpu_online(unsigned int cpu, bool online) { - if (online) { + if (online) cpumask_set_cpu(cpu, &__cpu_online_mask); - cpumask_set_cpu(cpu, &__cpu_active_mask); - } else { + else cpumask_clear_cpu(cpu, &__cpu_online_mask); - } } static inline void diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f75222ea7f16..eabe0138eb06 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -356,8 +356,13 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); extern void lockdep_clear_current_reclaim_state(void); extern void lockdep_trace_alloc(gfp_t mask); -extern void lock_pin_lock(struct lockdep_map *lock); -extern void lock_unpin_lock(struct lockdep_map *lock); +struct pin_cookie { unsigned int val; }; + +#define NIL_COOKIE (struct pin_cookie){ .val = 0U, } + +extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); +extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); +extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); # define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, @@ -373,8 +378,9 @@ extern void lock_unpin_lock(struct lockdep_map *lock); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) -#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) -#define lockdep_unpin_lock(l) lock_unpin_lock(&(l)->dep_map) +#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) +#define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) +#define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) #else /* !CONFIG_LOCKDEP */ @@ -427,8 +433,13 @@ struct lock_class_key { }; #define lockdep_recursing(tsk) (0) -#define lockdep_pin_lock(l) do { (void)(l); } while (0) -#define lockdep_unpin_lock(l) do { (void)(l); } while (0) +struct pin_cookie { }; + +#define NIL_COOKIE (struct pin_cookie){ } + +#define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; }) +#define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) +#define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #endif /* !LOCKDEP */ diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index 70fffeba7495..a4441784503b 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -1,9 +1,16 @@ #ifndef _LINUX_MMU_CONTEXT_H #define _LINUX_MMU_CONTEXT_H +#include <asm/mmu_context.h> + struct mm_struct; void use_mm(struct mm_struct *mm); void unuse_mm(struct mm_struct *mm); +/* Architectures that care about IRQ state in switch_mm can override this. */ +#ifndef switch_mm_irqs_off +# define switch_mm_irqs_off switch_mm +#endif + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index e8dfa6f0d843..6cc0df970f1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -177,9 +177,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void update_cpu_load_nohz(int active); +extern void cpu_load_update_nohz_start(void); +extern void cpu_load_update_nohz_stop(void); #else -static inline void update_cpu_load_nohz(int active) { } +static inline void cpu_load_update_nohz_start(void) { } +static inline void cpu_load_update_nohz_stop(void) { } #endif extern void dump_cpu_task(int cpu); @@ -371,6 +373,15 @@ extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); +extern int sched_cpu_starting(unsigned int cpu); +extern int sched_cpu_activate(unsigned int cpu); +extern int sched_cpu_deactivate(unsigned int cpu); + +#ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_dying(unsigned int cpu); +#else +# define sched_cpu_dying NULL +#endif extern void sched_show_task(struct task_struct *p); @@ -934,9 +945,19 @@ enum cpu_idle_type { }; /* + * Integer metrics need fixed point arithmetic, e.g., sched/fair + * has a few: load, load_avg, util_avg, freq, and capacity. + * + * We define a basic fixed point arithmetic range, and then formalize + * all these metrics based on that basic range. + */ +# define SCHED_FIXEDPOINT_SHIFT 10 +# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) + +/* * Increase resolution of cpu_capacity calculations */ -#define SCHED_CAPACITY_SHIFT 10 +#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) /* @@ -1198,18 +1219,56 @@ struct load_weight { }; /* - * The load_avg/util_avg accumulates an infinite geometric series. - * 1) load_avg factors frequency scaling into the amount of time that a - * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the - * aggregated such weights of all runnable and blocked sched_entities. - * 2) util_avg factors frequency and cpu scaling into the amount of time - * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. - * For cfs_rq, it is the aggregated such times of all runnable and + * The load_avg/util_avg accumulates an infinite geometric series + * (see __update_load_avg() in kernel/sched/fair.c). + * + * [load_avg definition] + * + * load_avg = runnable% * scale_load_down(load) + * + * where runnable% is the time ratio that a sched_entity is runnable. + * For cfs_rq, it is the aggregated load_avg of all runnable and * blocked sched_entities. - * The 64 bit load_sum can: - * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with - * the highest weight (=88761) always runnable, we should not overflow - * 2) for entity, support any load.weight always runnable + * + * load_avg may also take frequency scaling into account: + * + * load_avg = runnable% * scale_load_down(load) * freq% + * + * where freq% is the CPU frequency normalized to the highest frequency. + * + * [util_avg definition] + * + * util_avg = running% * SCHED_CAPACITY_SCALE + * + * where running% is the time ratio that a sched_entity is running on + * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable + * and blocked sched_entities. + * + * util_avg may also factor frequency scaling and CPU capacity scaling: + * + * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% + * + * where freq% is the same as above, and capacity% is the CPU capacity + * normalized to the greatest capacity (due to uarch differences, etc). + * + * N.B., the above ratios (runnable%, running%, freq%, and capacity%) + * themselves are in the range of [0, 1]. To do fixed point arithmetics, + * we therefore scale them to as large a range as necessary. This is for + * example reflected by util_avg's SCHED_CAPACITY_SCALE. + * + * [Overflow issue] + * + * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities + * with the highest load (=88761), always runnable on a single cfs_rq, + * and should not overflow as the number already hits PID_MAX_LIMIT. + * + * For all other cases (including 32-bit kernels), struct load_weight's + * weight will overflow first before we do, because: + * + * Max(load_avg) <= Max(load.weight) + * + * Then it is the load_weight's responsibility to consider overflow + * issues. */ struct sched_avg { u64 last_update_time, load_sum; @@ -1871,6 +1930,11 @@ extern int arch_task_struct_size __read_mostly; /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +static inline int tsk_nr_cpus_allowed(struct task_struct *p) +{ + return p->nr_cpus_allowed; +} + #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 @@ -2303,8 +2367,6 @@ extern unsigned long long notrace sched_clock(void); /* * See the comment in kernel/sched/clock.c */ -extern u64 cpu_clock(int cpu); -extern u64 local_clock(void); extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); @@ -2323,6 +2385,16 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } + +static inline u64 cpu_clock(int cpu) +{ + return sched_clock(); +} + +static inline u64 local_clock(void) +{ + return sched_clock(); +} #else /* * Architectures can set this to 1 if they have specified @@ -2337,6 +2409,26 @@ extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); + +/* + * As outlined in clock.c, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +static inline u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +static inline u64 local_clock(void) +{ + return sched_clock_cpu(raw_smp_processor_id()); +} #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING |