From 3fc498f165304dc913f1d13b5ac9ab4c758ee7ab Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:43 -0700 Subject: smp: introduce a generic on_each_cpu_mask() function We have lots of infrastructure in place to partition multi-core systems such that we have a group of CPUs that are dedicated to specific task: cgroups, scheduler and interrupt affinity, and cpuisol= boot parameter. Still, kernel code will at times interrupt all CPUs in the system via IPIs for various needs. These IPIs are useful and cannot be avoided altogether, but in certain cases it is possible to interrupt only specific CPUs that have useful work to do and not the entire system. This patch set, inspired by discussions with Peter Zijlstra and Frederic Weisbecker when testing the nohz task patch set, is a first stab at trying to explore doing this by locating the places where such global IPI calls are being made and turning the global IPI into an IPI for a specific group of CPUs. The purpose of the patch set is to get feedback if this is the right way to go for dealing with this issue and indeed, if the issue is even worth dealing with at all. Based on the feedback from this patch set I plan to offer further patches that address similar issue in other code paths. This patch creates an on_each_cpu_mask() and on_each_cpu_cond() infrastructure API (the former derived from existing arch specific versions in Tile and Arm) and uses them to turn several global IPI invocation to per CPU group invocations. Core kernel: on_each_cpu_mask() calls a function on processors specified by cpumask, which may or may not include the local processor. You must not call this function with disabled interrupts or from a hardware interrupt handler or from a bottom half handler. arch/arm: Note that the generic version is a little different then the Arm one: 1. It has the mask as first parameter 2. It calls the function on the calling CPU with interrupts disabled, but this should be OK since the function is called on the other CPUs with interrupts disabled anyway. arch/tile: The API is the same as the tile private one, but the generic version also calls the function on the with interrupts disabled in UP case This is OK since the function is called on the other CPUs with interrupts disabled. Signed-off-by: Gilad Ben-Yossef Reviewed-by: Christoph Lameter Acked-by: Chris Metcalf Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Russell King Cc: Pekka Enberg Cc: Matt Mackall Cc: Rik van Riel Cc: Andi Kleen Cc: Sasha Levin Cc: Mel Gorman Cc: Alexander Viro Cc: Avi Kivity Acked-by: Michal Nazarewicz Cc: Kosaki Motohiro Cc: Milton Miller Cc: Russell King Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index db197d60489b..a081e6ce0e0a 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -701,3 +701,32 @@ int on_each_cpu(void (*func) (void *info), void *info, int wait) return ret; } EXPORT_SYMBOL(on_each_cpu); + +/** + * on_each_cpu_mask(): Run a function on processors specified by + * cpumask, which may include the local processor. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait) +{ + int cpu = get_cpu(); + + smp_call_function_many(mask, func, info, wait); + if (cpumask_test_cpu(cpu, mask)) { + local_irq_disable(); + func(info); + local_irq_enable(); + } + put_cpu(); +} +EXPORT_SYMBOL(on_each_cpu_mask); -- cgit v1.2.3-58-ga151 From b3a7e98e024ffa9f7e4554dd720c508015c4a831 Mon Sep 17 00:00:00 2001 From: Gilad Ben-Yossef Date: Wed, 28 Mar 2012 14:42:43 -0700 Subject: smp: add func to IPI cpus based on parameter func Add the on_each_cpu_cond() function that wraps on_each_cpu_mask() and calculates the cpumask of cpus to IPI by calling a function supplied as a parameter in order to determine whether to IPI each specific cpu. The function works around allocation failure of cpumask variable in CONFIG_CPUMASK_OFFSTACK=y by itereating over cpus sending an IPI a time via smp_call_function_single(). The function is useful since it allows to seperate the specific code that decided in each case whether to IPI a specific cpu for a specific request from the common boilerplate code of handling creating the mask, handling failures etc. [akpm@linux-foundation.org: s/gfpflags/gfp_flags/] [akpm@linux-foundation.org: avoid double-evaluation of `info' (per Michal), parenthesise evaluation of `cond_func'] [akpm@linux-foundation.org: s/CPU/CPUs, use all 80 cols in comment] Signed-off-by: Gilad Ben-Yossef Cc: Chris Metcalf Cc: Christoph Lameter Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Russell King Cc: Pekka Enberg Cc: Matt Mackall Cc: Sasha Levin Cc: Rik van Riel Cc: Andi Kleen Cc: Alexander Viro Cc: Avi Kivity Acked-by: Michal Nazarewicz Cc: Kosaki Motohiro Cc: Milton Miller Reviewed-by: "Srivatsa S. Bhat" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 24 +++++++++++++++++++++ kernel/smp.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index d0adb7898d54..10530d92c04b 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -108,6 +108,15 @@ int on_each_cpu(smp_call_func_t func, void *info, int wait); void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); +/* + * Call a function on each processor for which the supplied function + * cond_func returns a positive value. This may include the local + * processor. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -153,6 +162,21 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) local_irq_enable(); \ } \ } while (0) +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. + */ +#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ + do { \ + void *__info = (info); \ + preempt_disable(); \ + if ((cond_func)(0, __info)) { \ + local_irq_disable(); \ + (func)(__info); \ + local_irq_enable(); \ + } \ + preempt_enable(); \ + } while (0) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 diff --git a/kernel/smp.c b/kernel/smp.c index a081e6ce0e0a..2f8b10ecf759 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -730,3 +730,64 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, put_cpu(); } EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * on_each_cpu_cond(): Call a function on each processor for which + * the supplied function cond_func returns true, optionally waiting + * for all the required CPUs to finish. This may include the local + * processor. + * @cond_func: A callback function that is passed a cpu id and + * the the info parameter. The function is called + * with preemption disabled. The function should + * return a blooean value indicating whether to IPI + * the specified CPU. + * @func: The function to run on all applicable CPUs. + * This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to both functions. + * @wait: If true, wait (atomically) until function has + * completed on other CPUs. + * @gfp_flags: GFP flags to use when allocating the cpumask + * used internally by the function. + * + * The function might sleep if the GFP flags indicates a non + * atomic allocation is allowed. + * + * Preemption is disabled to protect against CPUs going offline but not online. + * CPUs going online during the call will not be seen or sent an IPI. + * + * You must not call this function with disabled interrupts or + * from a hardware interrupt handler or from a bottom half handler. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + cpumask_var_t cpus; + int cpu, ret; + + might_sleep_if(gfp_flags & __GFP_WAIT); + + if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) + cpumask_set_cpu(cpu, cpus); + on_each_cpu_mask(cpus, func, info, wait); + preempt_enable(); + free_cpumask_var(cpus); + } else { + /* + * No free cpumask, bother. No matter, we'll + * just have to IPI them one by one. + */ + preempt_disable(); + for_each_online_cpu(cpu) + if (cond_func(cpu, info)) { + ret = smp_call_function_single(cpu, func, + info, wait); + WARN_ON_ONCE(!ret); + } + preempt_enable(); + } +} +EXPORT_SYMBOL(on_each_cpu_cond); -- cgit v1.2.3-58-ga151 From d034cfab4f7b9e768c5c1caaa56c5bd4805d2b92 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: kexec: crash: don't save swapper_pg_dir for !CONFIG_MMU configurations nommu platforms don't have very interesting swapper_pg_dir pointers and usually just #define them to NULL, meaning that we can't include them in the vmcoreinfo on the kexec crash path. This patch only saves the swapper_pg_dir if we have an MMU. Signed-off-by: Will Deacon Reviewed-by: Simon Horman Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index a6a675cb9818..769e347c5196 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1462,7 +1462,9 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(init_uts_ns); VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif VMCOREINFO_SYMBOL(_stext); VMCOREINFO_SYMBOL(vmlist); -- cgit v1.2.3-58-ga151 From eaa3be6add6f327ab0a633e4fee8e6f2cc8c8a4c Mon Sep 17 00:00:00 2001 From: Zhenzhong Duan Date: Wed, 28 Mar 2012 14:42:47 -0700 Subject: kexec: add further check to crashkernel When using crashkernel=2M-256M, the kernel doesn't give any warning. This is misleading sometimes. Signed-off-by: Zhenzhong Duan Acked-by: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 769e347c5196..3288c9b29bae 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1359,6 +1359,10 @@ static int __init parse_crashkernel_simple(char *cmdline, if (*cur == '@') *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warning("crashkernel: unrecognized char\n"); + return -EINVAL; + } return 0; } -- cgit v1.2.3-58-ga151 From 5a04cca6c39cdd0b8c75b0628da634248f381b62 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 28 Mar 2012 14:42:50 -0700 Subject: sysctl: use bitmap library functions Use bitmap_set() instead of using set_bit() for each bit. This conversion is valid because the bitmap is private in the function call and atomic bitops were unnecessary. This also includes minor change. - Use bitmap_copy() for shorter typing Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d48ff4fd44c3..dbd70bdc1765 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -2393,9 +2394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } } - while (val_a <= val_b) - set_bit(val_a++, tmp_bitmap); - + bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; proc_skip_char(&kbuf, &left, '\n'); } @@ -2438,8 +2437,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else - memcpy(bitmap, tmp_bitmap, - BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } kfree(tmp_bitmap); *lenp -= left; -- cgit v1.2.3-58-ga151 From cf3f89214ef6a33fad60856bc5ffd7bb2fc4709b Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 28 Mar 2012 14:42:51 -0700 Subject: pidns: add reboot_pid_ns() to handle the reboot syscall In the case of a child pid namespace, rebooting the system does not really makes sense. When the pid namespace is used in conjunction with the other namespaces in order to create a linux container, the reboot syscall leads to some problems. A container can reboot the host. That can be fixed by dropping the sys_reboot capability but we are unable to correctly to poweroff/ halt/reboot a container and the container stays stuck at the shutdown time with the container's init process waiting indefinitively. After several attempts, no solution from userspace was found to reliabily handle the shutdown from a container. This patch propose to make the init process of the child pid namespace to exit with a signal status set to : SIGINT if the child pid namespace called "halt/poweroff" and SIGHUP if the child pid namespace called "reboot". When the reboot syscall is called and we are not in the initial pid namespace, we kill the pid namespace for "HALT", "POWEROFF", "RESTART", and "RESTART2". Otherwise we return EINVAL. Returning EINVAL is also an easy way to check if this feature is supported by the kernel when invoking another 'reboot' option like CAD. By this way the parent process of the child pid namespace knows if it rebooted or not and can take the right decision. Test case: ========== #include #include #include #include #include #include #include #include #include static int do_reboot(void *arg) { int *cmd = arg; if (reboot(*cmd)) printf("failed to reboot(%d): %m\n", *cmd); } int test_reboot(int cmd, int sig) { long stack_size = 4096; void *stack = alloca(stack_size) + stack_size; int status; pid_t ret; ret = clone(do_reboot, stack, CLONE_NEWPID | SIGCHLD, &cmd); if (ret < 0) { printf("failed to clone: %m\n"); return -1; } if (wait(&status) < 0) { printf("unexpected wait error: %m\n"); return -1; } if (!WIFSIGNALED(status)) { printf("child process exited but was not signaled\n"); return -1; } if (WTERMSIG(status) != sig) { printf("signal termination is not the one expected\n"); return -1; } return 0; } int main(int argc, char *argv[]) { int status; status = test_reboot(LINUX_REBOOT_CMD_RESTART, SIGHUP); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_RESTART) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_RESTART2, SIGHUP); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_RESTART2) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_HALT, SIGINT); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_HALT) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_POWER_OFF, SIGINT); if (status < 0) return 1; printf("reboot(LINUX_REBOOT_CMD_POWERR_OFF) succeed\n"); status = test_reboot(LINUX_REBOOT_CMD_CAD_ON, -1); if (status >= 0) { printf("reboot(LINUX_REBOOT_CMD_CAD_ON) should have failed\n"); return 1; } printf("reboot(LINUX_REBOOT_CMD_CAD_ON) has failed as expected\n"); return 0; } [akpm@linux-foundation.org: tweak and add comments] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Daniel Lezcano Acked-by: Serge Hallyn Tested-by: Serge Hallyn Reviewed-by: Oleg Nesterov Cc: Michael Kerrisk Cc: "Eric W. Biederman" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pid_namespace.h | 8 +++++++- kernel/pid_namespace.c | 33 +++++++++++++++++++++++++++++++++ kernel/sys.c | 9 +++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index f5bd679be46b..b067bd8c49d0 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -33,6 +33,7 @@ struct pid_namespace { #endif gid_t pid_gid; int hide_pid; + int reboot; /* group exit code if this pidns was rebooted */ }; extern struct pid_namespace init_pid_ns; @@ -48,6 +49,7 @@ static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) extern struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *ns); extern void free_pid_ns(struct kref *kref); extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); +extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); static inline void put_pid_ns(struct pid_namespace *ns) { @@ -75,11 +77,15 @@ static inline void put_pid_ns(struct pid_namespace *ns) { } - static inline void zap_pid_ns_processes(struct pid_namespace *ns) { BUG(); } + +static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ + return 0; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 17b232869a04..57bc1fd35b3c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -15,6 +15,7 @@ #include #include #include +#include #define BITS_PER_PAGE (PAGE_SIZE*8) @@ -183,6 +184,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rc = sys_wait4(-1, NULL, __WALL, NULL); } while (rc != -ECHILD); + if (pid_ns->reboot) + current->signal->group_exit_code = pid_ns->reboot; + acct_exit_ns(pid_ns); return; } @@ -217,6 +221,35 @@ static struct ctl_table pid_ns_ctl_table[] = { static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } }; +int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) +{ + if (pid_ns == &init_pid_ns) + return 0; + + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART2: + case LINUX_REBOOT_CMD_RESTART: + pid_ns->reboot = SIGHUP; + break; + + case LINUX_REBOOT_CMD_POWER_OFF: + case LINUX_REBOOT_CMD_HALT: + pid_ns->reboot = SIGINT; + break; + default: + return -EINVAL; + } + + read_lock(&tasklist_lock); + force_sig(SIGKILL, pid_ns->child_reaper); + read_unlock(&tasklist_lock); + + do_exit(0); + + /* Not reached */ + return 0; +} + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/sys.c b/kernel/sys.c index 9eb7fcab8df6..e7006eb6c1e4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -444,6 +444,15 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, magic2 != LINUX_REBOOT_MAGIC2C)) return -EINVAL; + /* + * If pid namespaces are enabled and the current task is in a child + * pid_namespace, the command is handled by reboot_pid_ns() which will + * call do_exit(). + */ + ret = reboot_pid_ns(task_active_pid_ns(current), cmd); + if (ret) + return ret; + /* Instead of trying to make the power_off code look like * halt when pm_power_off is not set do it the easy way. */ -- cgit v1.2.3-58-ga151