diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/acct.c | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 129 | ||||
-rw-r--r-- | kernel/exit.c | 26 | ||||
-rw-r--r-- | kernel/fork.c | 104 | ||||
-rw-r--r-- | kernel/futex.c | 137 | ||||
-rw-r--r-- | kernel/intermodule.c | 3 | ||||
-rw-r--r-- | kernel/irq/handle.c | 2 | ||||
-rw-r--r-- | kernel/irq/manage.c | 4 | ||||
-rw-r--r-- | kernel/irq/proc.c | 14 | ||||
-rw-r--r-- | kernel/kprobes.c | 94 | ||||
-rw-r--r-- | kernel/module.c | 33 | ||||
-rw-r--r-- | kernel/params.c | 4 | ||||
-rw-r--r-- | kernel/posix-timers.c | 28 | ||||
-rw-r--r-- | kernel/power/Kconfig | 15 | ||||
-rw-r--r-- | kernel/power/disk.c | 55 | ||||
-rw-r--r-- | kernel/power/main.c | 5 | ||||
-rw-r--r-- | kernel/power/pm.c | 3 | ||||
-rw-r--r-- | kernel/power/process.c | 29 | ||||
-rw-r--r-- | kernel/power/swsusp.c | 202 | ||||
-rw-r--r-- | kernel/printk.c | 13 | ||||
-rw-r--r-- | kernel/ptrace.c | 41 | ||||
-rw-r--r-- | kernel/rcupdate.c | 14 | ||||
-rw-r--r-- | kernel/resource.c | 3 | ||||
-rw-r--r-- | kernel/sched.c | 341 | ||||
-rw-r--r-- | kernel/signal.c | 83 | ||||
-rw-r--r-- | kernel/softlockup.c | 151 | ||||
-rw-r--r-- | kernel/sys.c | 6 | ||||
-rw-r--r-- | kernel/timer.c | 18 | ||||
-rw-r--r-- | kernel/workqueue.c | 5 |
30 files changed, 1231 insertions, 334 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index cb05cd05d237..8d57a2f1226b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -27,6 +27,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_SYSFS) += ksysfs.o +obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_SECCOMP) += seccomp.o diff --git a/kernel/acct.c b/kernel/acct.c index 4168f631868e..f70e6027cca9 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -220,7 +220,7 @@ asmlinkage long sys_acct(const char __user *name) return (PTR_ERR(tmp)); } /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(tmp, O_WRONLY|O_APPEND, 0); + file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); putname(tmp); if (IS_ERR(file)) { return (PTR_ERR(file)); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8ab1b4e518b8..712d02029971 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -628,13 +628,6 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ -/* - * Hack to avoid 2.6.13 partial node dynamic sched domain bug. - * Disable letting 'cpu_exclusive' cpusets define dynamic sched - * domains, until the sched domain can handle partial nodes. - * Remove this #if hackery when sched domains fixed. - */ -#if 0 static void update_cpu_domains(struct cpuset *cur) { struct cpuset *c, *par = cur->parent; @@ -675,11 +668,6 @@ static void update_cpu_domains(struct cpuset *cur) partition_sched_domains(&pspan, &cspan); unlock_cpu_hotplug(); } -#else -static void update_cpu_domains(struct cpuset *cur) -{ -} -#endif static int update_cpumask(struct cpuset *cs, char *buf) { @@ -984,6 +972,10 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, *s++ = '\n'; *s = '\0'; + /* Do nothing if *ppos is at the eof or beyond the eof. */ + if (s - page <= *ppos) + return 0; + start = page + *ppos; n = s - start; retval = n - copy_to_user(buf, start, min(n, nbytes)); @@ -1611,17 +1603,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 0; } +/* + * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive + * ancestor to the specified cpuset. Call while holding cpuset_sem. + * If no ancestor is mem_exclusive (an unusual configuration), then + * returns the root cpuset. + */ +static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) +{ + while (!is_mem_exclusive(cs) && cs->parent) + cs = cs->parent; + return cs; +} + /** - * cpuset_zone_allowed - is zone z allowed in current->mems_allowed - * @z: zone in question + * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * @z: is this zone on an allowed node? + * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) * - * Is zone z allowed in current->mems_allowed, or is - * the CPU in interrupt context? (zone is always allowed in this case) - */ -int cpuset_zone_allowed(struct zone *z) + * If we're in interrupt, yes, we can always allocate. If zone + * z's node is in our tasks mems_allowed, yes. If it's not a + * __GFP_HARDWALL request and this zone's nodes is in the nearest + * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * Otherwise, no. + * + * GFP_USER allocations are marked with the __GFP_HARDWALL bit, + * and do not allow allocations outside the current tasks cpuset. + * GFP_KERNEL allocations are not so marked, so can escape to the + * nearest mem_exclusive ancestor cpuset. + * + * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() + * routine only calls here with __GFP_HARDWALL bit _not_ set if + * it's a GFP_KERNEL allocation, and all nodes in the current tasks + * mems_allowed came up empty on the first pass over the zonelist. + * So only GFP_KERNEL allocations, if all nodes in the cpuset are + * short of memory, might require taking the cpuset_sem semaphore. + * + * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() + * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing + * hardwall cpusets - no allocation on a node outside the cpuset is + * allowed (unless in interrupt, of course). + * + * The second loop doesn't even call here for GFP_ATOMIC requests + * (if the __alloc_pages() local variable 'wait' is set). That check + * and the checks below have the combined affect in the second loop of + * the __alloc_pages() routine that: + * in_interrupt - any node ok (current task context irrelevant) + * GFP_ATOMIC - any node ok + * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok + * GFP_USER - only nodes in current tasks mems allowed ok. + **/ + +int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) +{ + int node; /* node that zone z is on */ + const struct cpuset *cs; /* current cpuset ancestors */ + int allowed = 1; /* is allocation in zone z allowed? */ + + if (in_interrupt()) + return 1; + node = z->zone_pgdat->node_id; + if (node_isset(node, current->mems_allowed)) + return 1; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return 0; + + /* Not hardwall and node outside mems_allowed: scan up cpusets */ + down(&cpuset_sem); + cs = current->cpuset; + if (!cs) + goto done; /* current task exiting */ + cs = nearest_exclusive_ancestor(cs); + allowed = node_isset(node, cs->mems_allowed); +done: + up(&cpuset_sem); + return allowed; +} + +/** + * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? + * @p: pointer to task_struct of some other task. + * + * Description: Return true if the nearest mem_exclusive ancestor + * cpusets of tasks @p and current overlap. Used by oom killer to + * determine if task @p's memory usage might impact the memory + * available to the current task. + * + * Acquires cpuset_sem - not suitable for calling from a fast path. + **/ + +int cpuset_excl_nodes_overlap(const struct task_struct *p) { - return in_interrupt() || - node_isset(z->zone_pgdat->node_id, current->mems_allowed); + const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ + int overlap = 0; /* do cpusets overlap? */ + + down(&cpuset_sem); + cs1 = current->cpuset; + if (!cs1) + goto done; /* current task exiting */ + cs2 = p->cpuset; + if (!cs2) + goto done; /* task p is exiting */ + cs1 = nearest_exclusive_ancestor(cs1); + cs2 = nearest_exclusive_ancestor(cs2); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); +done: + up(&cpuset_sem); + + return overlap; } /* diff --git a/kernel/exit.c b/kernel/exit.c index 5b0fb9f09f21..6d2089a1bce7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -368,17 +368,19 @@ EXPORT_SYMBOL(daemonize); static inline void close_files(struct files_struct * files) { int i, j; + struct fdtable *fdt; j = 0; + fdt = files_fdtable(files); for (;;) { unsigned long set; i = j * __NFDBITS; - if (i >= files->max_fdset || i >= files->max_fds) + if (i >= fdt->max_fdset || i >= fdt->max_fds) break; - set = files->open_fds->fds_bits[j++]; + set = fdt->open_fds->fds_bits[j++]; while (set) { if (set & 1) { - struct file * file = xchg(&files->fd[i], NULL); + struct file * file = xchg(&fdt->fd[i], NULL); if (file) filp_close(file, files); } @@ -403,18 +405,22 @@ struct files_struct *get_files_struct(struct task_struct *task) void fastcall put_files_struct(struct files_struct *files) { + struct fdtable *fdt; + if (atomic_dec_and_test(&files->count)) { close_files(files); /* * Free the fd and fdset arrays if we expanded them. + * If the fdtable was embedded, pass files for freeing + * at the end of the RCU grace period. Otherwise, + * you can free files immediately. */ - if (files->fd != &files->fd_array[0]) - free_fd_array(files->fd, files->max_fds); - if (files->max_fdset > __FD_SETSIZE) { - free_fdset(files->open_fds, files->max_fdset); - free_fdset(files->close_on_exec, files->max_fdset); - } - kmem_cache_free(files_cachep, files); + fdt = files_fdtable(files); + if (fdt == &files->fdtab) + fdt->free_files = files; + else + kmem_cache_free(files_cachep, files); + free_fdtable(fdt); } } diff --git a/kernel/fork.c b/kernel/fork.c index b65187f0c74e..8149f3602881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -35,6 +35,7 @@ #include <linux/syscalls.h> #include <linux/jiffies.h> #include <linux/futex.h> +#include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> @@ -176,6 +177,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) /* One for us, one for whoever does the "release_task()" (usually parent) */ atomic_set(&tsk->usage,2); + atomic_set(&tsk->fs_excl, 0); return tsk; } @@ -564,24 +566,53 @@ static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) return 0; } -static int count_open_files(struct files_struct *files, int size) +static int count_open_files(struct fdtable *fdt) { + int size = fdt->max_fdset; int i; /* Find the last open fd */ for (i = size/(8*sizeof(long)); i > 0; ) { - if (files->open_fds->fds_bits[--i]) + if (fdt->open_fds->fds_bits[--i]) break; } i = (i+1) * 8 * sizeof(long); return i; } +static struct files_struct *alloc_files(void) +{ + struct files_struct *newf; + struct fdtable *fdt; + + newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); + if (!newf) + goto out; + + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + fdt = &newf->fdtab; + fdt->next_fd = 0; + fdt->max_fds = NR_OPEN_DEFAULT; + fdt->max_fdset = __FD_SETSIZE; + fdt->close_on_exec = &newf->close_on_exec_init; + fdt->open_fds = &newf->open_fds_init; + fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&fdt->rcu); + fdt->free_files = NULL; + fdt->next = NULL; + rcu_assign_pointer(newf->fdt, fdt); +out: + return newf; +} + static int copy_files(unsigned long clone_flags, struct task_struct * tsk) { struct files_struct *oldf, *newf; struct file **old_fds, **new_fds; int open_files, size, i, error = 0, expand; + struct fdtable *old_fdt, *new_fdt; /* * A background process may not have any files ... @@ -602,35 +633,27 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) */ tsk->files = NULL; error = -ENOMEM; - newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); - if (!newf) + newf = alloc_files(); + if (!newf) goto out; - atomic_set(&newf->count, 1); - - spin_lock_init(&newf->file_lock); - newf->next_fd = 0; - newf->max_fds = NR_OPEN_DEFAULT; - newf->max_fdset = __FD_SETSIZE; - newf->close_on_exec = &newf->close_on_exec_init; - newf->open_fds = &newf->open_fds_init; - newf->fd = &newf->fd_array[0]; - spin_lock(&oldf->file_lock); - - open_files = count_open_files(oldf, oldf->max_fdset); + old_fdt = files_fdtable(oldf); + new_fdt = files_fdtable(newf); + size = old_fdt->max_fdset; + open_files = count_open_files(old_fdt); expand = 0; /* * Check whether we need to allocate a larger fd array or fd set. * Note: we're not a clone task, so the open count won't change. */ - if (open_files > newf->max_fdset) { - newf->max_fdset = 0; + if (open_files > new_fdt->max_fdset) { + new_fdt->max_fdset = 0; expand = 1; } - if (open_files > newf->max_fds) { - newf->max_fds = 0; + if (open_files > new_fdt->max_fds) { + new_fdt->max_fds = 0; expand = 1; } @@ -642,14 +665,21 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) spin_unlock(&newf->file_lock); if (error < 0) goto out_release; + new_fdt = files_fdtable(newf); + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. + */ spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); } - old_fds = oldf->fd; - new_fds = newf->fd; + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; - memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); - memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); + memcpy(new_fdt->open_fds->fds_bits, old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, old_fdt->close_on_exec->fds_bits, open_files/8); for (i = open_files; i != 0; i--) { struct file *f = *old_fds++; @@ -662,24 +692,24 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk) * is partway through open(). So make sure that this * fd is available to the new process. */ - FD_CLR(open_files - i, newf->open_fds); + FD_CLR(open_files - i, new_fdt->open_fds); } - *new_fds++ = f; + rcu_assign_pointer(*new_fds++, f); } spin_unlock(&oldf->file_lock); /* compute the remainder to be cleared */ - size = (newf->max_fds - open_files) * sizeof(struct file *); + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); /* This is long word aligned thus could use a optimized version */ memset(new_fds, 0, size); - if (newf->max_fdset > open_files) { - int left = (newf->max_fdset-open_files)/8; + if (new_fdt->max_fdset > open_files) { + int left = (new_fdt->max_fdset-open_files)/8; int start = open_files / (8 * sizeof(unsigned long)); - memset(&newf->open_fds->fds_bits[start], 0, left); - memset(&newf->close_on_exec->fds_bits[start], 0, left); + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); } tsk->files = newf; @@ -688,9 +718,9 @@ out: return error; out_release: - free_fdset (newf->close_on_exec, newf->max_fdset); - free_fdset (newf->open_fds, newf->max_fdset); - free_fd_array(newf->fd, newf->max_fds); + free_fdset (new_fdt->close_on_exec, new_fdt->max_fdset); + free_fdset (new_fdt->open_fds, new_fdt->max_fdset); + free_fd_array(new_fdt->fd, new_fdt->max_fds); kmem_cache_free(files_cachep, newf); goto out; } @@ -994,6 +1024,9 @@ static task_t *copy_process(unsigned long clone_flags, * of CLONE_PTRACE. */ clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#endif /* Our parent execution domain becomes current domain These must match for thread signalling to apply */ @@ -1112,6 +1145,9 @@ static task_t *copy_process(unsigned long clone_flags, __get_cpu_var(process_counts)++; } + if (!current->signal->tty && p->signal->tty) + p->signal->tty = NULL; + nr_threads++; total_forks++; write_unlock_irq(&tasklist_lock); diff --git a/kernel/futex.c b/kernel/futex.c index c7130f86106c..ca05fe6a70b2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -40,6 +40,7 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/signal.h> +#include <asm/futex.h> #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) @@ -327,6 +328,118 @@ out: } /* + * Wake up all waiters hashed on the physical page that is mapped + * to this virtual address: + */ +static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) +{ + union futex_key key1, key2; + struct futex_hash_bucket *bh1, *bh2; + struct list_head *head; + struct futex_q *this, *next; + int ret, op_ret, attempt = 0; + +retryfull: + down_read(¤t->mm->mmap_sem); + + ret = get_futex_key(uaddr1, &key1); + if (unlikely(ret != 0)) + goto out; + ret = get_futex_key(uaddr2, &key2); + if (unlikely(ret != 0)) + goto out; + + bh1 = hash_futex(&key1); + bh2 = hash_futex(&key2); + +retry: + if (bh1 < bh2) + spin_lock(&bh1->lock); + spin_lock(&bh2->lock); + if (bh1 > bh2) + spin_lock(&bh1->lock); + + op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); + if (unlikely(op_ret < 0)) { + int dummy; + + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); + + /* futex_atomic_op_inuser needs to both read and write + * *(int __user *)uaddr2, but we can't modify it + * non-atomically. Therefore, if get_user below is not + * enough, we need to handle the fault ourselves, while + * still holding the mmap_sem. */ + if (attempt++) { + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + + ret = -EFAULT; + if (attempt >= 2 || + !(vma = find_vma(mm, uaddr2)) || + vma->vm_start > uaddr2 || + !(vma->vm_flags & VM_WRITE)) + goto out; + + switch (handle_mm_fault(mm, vma, uaddr2, 1)) { + case VM_FAULT_MINOR: + current->min_flt++; + break; + case VM_FAULT_MAJOR: + current->maj_flt++; + break; + default: + goto out; + } + goto retry; + } + + /* If we would have faulted, release mmap_sem, + * fault it in and start all over again. */ + up_read(¤t->mm->mmap_sem); + + ret = get_user(dummy, (int __user *)uaddr2); + if (ret) + return ret; + + goto retryfull; + } + + head = &bh1->chain; + + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key1)) { + wake_futex(this); + if (++ret >= nr_wake) + break; + } + } + + if (op_ret > 0) { + head = &bh2->chain; + + op_ret = 0; + list_for_each_entry_safe(this, next, head, list) { + if (match_futex (&this->key, &key2)) { + wake_futex(this); + if (++op_ret >= nr_wake2) + break; + } + } + ret += op_ret; + } + + spin_unlock(&bh1->lock); + if (bh1 != bh2) + spin_unlock(&bh2->lock); +out: + up_read(¤t->mm->mmap_sem); + return ret; +} + +/* * Requeue all waiters hashed on one physical page to another * physical page. */ @@ -673,23 +786,17 @@ static int futex_fd(unsigned long uaddr, int signal) filp->f_mapping = filp->f_dentry->d_inode->i_mapping; if (signal) { - int err; err = f_setown(filp, current->pid, 1); if (err < 0) { - put_unused_fd(ret); - put_filp(filp); - ret = err; - goto out; + goto error; } filp->f_owner.signum = signal; } q = kmalloc(sizeof(*q), GFP_KERNEL); if (!q) { - put_unused_fd(ret); - put_filp(filp); - ret = -ENOMEM; - goto out; + err = -ENOMEM; + goto error; } down_read(¤t->mm->mmap_sem); @@ -697,10 +804,8 @@ static int futex_fd(unsigned long uaddr, int signal) if (unlikely(err != 0)) { up_read(¤t->mm->mmap_sem); - put_unused_fd(ret); - put_filp(filp); kfree(q); - return err; + goto error; } /* @@ -716,6 +821,11 @@ static int futex_fd(unsigned long uaddr, int signal) fd_install(ret, filp); out: return ret; +error: + put_unused_fd(ret); + put_filp(filp); + ret = err; + goto out; } long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, @@ -740,6 +850,9 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, case FUTEX_CMP_REQUEUE: ret = futex_requeue(uaddr, uaddr2, val, val2, &val3); break; + case FUTEX_WAKE_OP: + ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); + break; default: ret = -ENOSYS; } diff --git a/kernel/intermodule.c b/kernel/intermodule.c index 388977f3e9b7..0cbe633420fb 100644 --- a/kernel/intermodule.c +++ b/kernel/intermodule.c @@ -39,7 +39,7 @@ void inter_module_register(const char *im_name, struct module *owner, const void struct list_head *tmp; struct inter_module_entry *ime, *ime_new; - if (!(ime_new = kmalloc(sizeof(*ime), GFP_KERNEL))) { + if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) { /* Overloaded kernel, not fatal */ printk(KERN_ERR "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n", @@ -47,7 +47,6 @@ void inter_module_register(const char *im_name, struct module *owner, const void kmalloc_failed = 1; return; } - memset(ime_new, 0, sizeof(*ime_new)); ime_new->im_name = im_name; ime_new->owner = owner; ime_new->userdata = userdata; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c29f83c16497..3ff7b925c387 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -111,7 +111,7 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) unsigned int status; kstat_this_cpu.irqs[irq]++; - if (desc->status & IRQ_PER_CPU) { + if (CHECK_IRQ_PER_CPU(desc->status)) { irqreturn_t action_ret; /* diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ac6700985705..1cfdb08ddf20 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -18,6 +18,10 @@ cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL }; +#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE) +cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS]; +#endif + /** * synchronize_irq - wait for pending IRQ handlers (on other CPUs) * diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 85d08daa6600..f26e534c6585 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -19,12 +19,22 @@ static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; */ static struct proc_dir_entry *smp_affinity_entry[NR_IRQS]; -void __attribute__((weak)) -proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +#ifdef CONFIG_GENERIC_PENDING_IRQ +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) +{ + /* + * Save these away for later use. Re-progam when the + * interrupt is pending + */ + set_pending_irq(irq, mask_val); +} +#else +void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) { irq_affinity[irq] = mask_val; irq_desc[irq].handler->set_affinity(irq, mask_val); } +#endif static int irq_affinity_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b0237122b24e..f3ea492ab44d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -37,6 +37,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/moduleloader.h> +#include <asm-generic/sections.h> #include <asm/cacheflush.h> #include <asm/errno.h> #include <asm/kdebug.h> @@ -72,7 +73,7 @@ static struct hlist_head kprobe_insn_pages; * get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -kprobe_opcode_t *get_insn_slot(void) +kprobe_opcode_t __kprobes *get_insn_slot(void) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -117,7 +118,7 @@ kprobe_opcode_t *get_insn_slot(void) return kip->insns; } -void free_insn_slot(kprobe_opcode_t *slot) +void __kprobes free_insn_slot(kprobe_opcode_t *slot) { struct kprobe_insn_page *kip; struct hlist_node *pos; @@ -152,20 +153,42 @@ void free_insn_slot(kprobe_opcode_t *slot) } /* Locks kprobe: irqs must be disabled */ -void lock_kprobes(void) +void __kprobes lock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we take the kprobe_lock + * and before we get a chance to update kprobe_cpu, this to prevent + * deadlock when we have a kprobe on ISR routine and a kprobe on task + * routine + */ + local_irq_save(flags); + spin_lock(&kprobe_lock); kprobe_cpu = smp_processor_id(); + + local_irq_restore(flags); } -void unlock_kprobes(void) +void __kprobes unlock_kprobes(void) { + unsigned long flags = 0; + + /* Avoiding local interrupts to happen right after we update + * kprobe_cpu and before we get a a chance to release kprobe_lock, + * this to prevent deadlock when we have a kprobe on ISR routine and + * a kprobe on task routine + */ + local_irq_save(flags); + kprobe_cpu = NR_CPUS; spin_unlock(&kprobe_lock); + + local_irq_restore(flags); } /* You have to be holding the kprobe_lock */ -struct kprobe *get_kprobe(void *addr) +struct kprobe __kprobes *get_kprobe(void *addr) { struct hlist_head *head; struct hlist_node *node; @@ -183,7 +206,7 @@ struct kprobe *get_kprobe(void *addr) * Aggregate handlers for multiple kprobes support - these handlers * take care of invoking the individual kprobe handlers on p->list */ -static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp; @@ -198,8 +221,8 @@ static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, - unsigned long flags) +static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) { struct kprobe *kp; @@ -213,8 +236,8 @@ static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs, return; } -static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, - int trapnr) +static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, + int trapnr) { /* * if we faulted "during" the execution of a user specified @@ -227,7 +250,7 @@ static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, return 0; } -static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) +static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) { struct kprobe *kp = curr_kprobe; if (curr_kprobe && kp->break_handler) { @@ -240,7 +263,7 @@ static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs) return 0; } -struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) +struct kretprobe_instance __kprobes *get_free_rp_inst(struct kretprobe *rp) { struct hlist_node *node; struct kretprobe_instance *ri; @@ -249,7 +272,8 @@ struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp) return NULL; } -static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) +static struct kretprobe_instance __kprobes *get_used_rp_inst(struct kretprobe + *rp) { struct hlist_node *node; struct kretprobe_instance *ri; @@ -258,7 +282,7 @@ static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp) return NULL; } -void add_rp_inst(struct kretprobe_instance *ri) +void __kprobes add_rp_inst(struct kretprobe_instance *ri) { /* * Remove rp inst off the free list - @@ -276,7 +300,7 @@ void add_rp_inst(struct kretprobe_instance *ri) hlist_add_head(&ri->uflist, &ri->rp->used_instances); } -void recycle_rp_inst(struct kretprobe_instance *ri) +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri) { /* remove rp inst off the rprobe_inst_table */ hlist_del(&ri->hlist); @@ -291,7 +315,7 @@ void recycle_rp_inst(struct kretprobe_instance *ri) kfree(ri); } -struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) +struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) { return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; } @@ -302,7 +326,7 @@ struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk) * instances associated with this task. These left over instances represent * probed functions that have been called but will never return. */ -void kprobe_flush_task(struct task_struct *tk) +void __kprobes kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; struct hlist_head *head; @@ -322,7 +346,8 @@ void kprobe_flush_task(struct task_struct *tk) * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. */ -static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) +static int __kprobes pre_handler_kretprobe(struct kprobe *p, + struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); @@ -353,7 +378,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) * Add the new probe to old_p->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist */ -static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) { struct kprobe *kp; @@ -395,7 +420,8 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) * the intricacies * TODO: Move kcalloc outside the spinlock */ -static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p) +static int __kprobes register_aggr_kprobe(struct kprobe *old_p, + struct kprobe *p) { int ret = 0; struct kprobe *ap; @@ -434,15 +460,25 @@ static inline void cleanup_aggr_kprobe(struct kprobe *old_p, spin_unlock_irqrestore(&kprobe_lock, flags); } -int register_kprobe(struct kprobe *p) +static int __kprobes in_kprobes_functions(unsigned long addr) +{ + if (addr >= (unsigned long)__kprobes_text_start + && addr < (unsigned long)__kprobes_text_end) + return -EINVAL; + return 0; +} + +int __kprobes register_kprobe(struct kprobe *p) { int ret = 0; unsigned long flags = 0; struct kprobe *old_p; - if ((ret = arch_prepare_kprobe(p)) != 0) { + if ((ret = in_kprobes_functions((unsigned long) p->addr)) != 0) + return ret; + if ((ret = arch_prepare_kprobe(p)) != 0) goto rm_kprobe; - } + spin_lock_irqsave(&kprobe_lock, flags); old_p = get_kprobe(p->addr); p->nmissed = 0; @@ -466,7 +502,7 @@ rm_kprobe: return ret; } -void unregister_kprobe(struct kprobe *p) +void __kprobes unregister_kprobe(struct kprobe *p) { unsigned long flags; struct kprobe *old_p; @@ -487,7 +523,7 @@ static struct notifier_block kprobe_exceptions_nb = { .priority = 0x7fffffff /* we need to notified first */ }; -int register_jprobe(struct jprobe *jp) +int __kprobes register_jprobe(struct jprobe *jp) { /* Todo: Verify probepoint is a function entry point */ jp->kp.pre_handler = setjmp_pre_handler; @@ -496,14 +532,14 @@ int register_jprobe(struct jprobe *jp) return register_kprobe(&jp->kp); } -void unregister_jprobe(struct jprobe *jp) +void __kprobes unregister_jprobe(struct jprobe *jp) { unregister_kprobe(&jp->kp); } #ifdef ARCH_SUPPORTS_KRETPROBES -int register_kretprobe(struct kretprobe *rp) +int __kprobes register_kretprobe(struct kretprobe *rp) { int ret = 0; struct kretprobe_instance *inst; @@ -540,14 +576,14 @@ int register_kretprobe(struct kretprobe *rp) #else /* ARCH_SUPPORTS_KRETPROBES */ -int register_kretprobe(struct kretprobe *rp) +int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; } #endif /* ARCH_SUPPORTS_KRETPROBES */ -void unregister_kretprobe(struct kretprobe *rp) +void __kprobes unregister_kretprobe(struct kretprobe *rp) { unsigned long flags; struct kretprobe_instance *ri; diff --git a/kernel/module.c b/kernel/module.c index c32995fbd8fd..4b39d3793c72 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1509,6 +1509,7 @@ static struct module *load_module(void __user *umod, long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ struct exception_table_entry *extable; + mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", umod, len, uargs); @@ -1779,6 +1780,24 @@ static struct module *load_module(void __user *umod, if (err < 0) goto cleanup; + /* flush the icache in correct context */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Flush the instruction cache, since we've played with text. + * Do it before processing of module parameters, so the module + * can provide parameter accessor functions of its own. + */ + if (mod->module_init) + flush_icache_range((unsigned long)mod->module_init, + (unsigned long)mod->module_init + + mod->init_size); + flush_icache_range((unsigned long)mod->module_core, + (unsigned long)mod->module_core + mod->core_size); + + set_fs(old_fs); + mod->args = args; if (obsparmindex) { err = obsolete_params(mod->name, mod->args, @@ -1860,7 +1879,6 @@ sys_init_module(void __user *umod, const char __user *uargs) { struct module *mod; - mm_segment_t old_fs = get_fs(); int ret = 0; /* Must have permission */ @@ -1878,19 +1896,6 @@ sys_init_module(void __user *umod, return PTR_ERR(mod); } - /* flush the icache in correct context */ - set_fs(KERNEL_DS); - - /* Flush the instruction cache, since we've played with text */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); - - set_fs(old_fs); - /* Now sew it into the lists. They won't access us, since strong_try_module_get() will fail. */ stop_machine_run(__link_module, mod, NR_CPUS); diff --git a/kernel/params.c b/kernel/params.c index d586c35ef8fc..fbf173215fd2 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -542,8 +542,8 @@ static void __init kernel_param_sysfs_setup(const char *name, { struct module_kobject *mk; - mk = kmalloc(sizeof(struct module_kobject), GFP_KERNEL); - memset(mk, 0, sizeof(struct module_kobject)); + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + BUG_ON(!mk); mk->mod = THIS_MODULE; kobj_set_kset_s(mk, module_subsys); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 38798a2ff994..b7b532acd9fc 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -427,21 +427,23 @@ int posix_timer_event(struct k_itimer *timr,int si_private) timr->sigq->info.si_code = SI_TIMER; timr->sigq->info.si_tid = timr->it_id; timr->sigq->info.si_value = timr->it_sigev_value; + if (timr->it_sigev_notify & SIGEV_THREAD_ID) { - if (unlikely(timr->it_process->flags & PF_EXITING)) { - timr->it_sigev_notify = SIGEV_SIGNAL; - put_task_struct(timr->it_process); - timr->it_process = timr->it_process->group_leader; - goto group; - } - return send_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); - } - else { - group: - return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, - timr->it_process); + struct task_struct *leader; + int ret = send_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); + + if (likely(ret >= 0)) + return ret; + + timr->it_sigev_notify = SIGEV_SIGNAL; + leader = timr->it_process->group_leader; + put_task_struct(timr->it_process); + timr->it_process = leader; } + + return send_group_sigqueue(timr->it_sigev_signo, timr->sigq, + timr->it_process); } EXPORT_SYMBOL_GPL(posix_timer_event); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 2c7121d9bff1..396c7873e804 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -1,5 +1,6 @@ config PM bool "Power Management support" + depends on !IA64_HP_SIM ---help--- "Power Management" means that parts of your computer are shut off or put into a power conserving "sleep" mode if they are not @@ -28,7 +29,7 @@ config PM_DEBUG config SOFTWARE_SUSPEND bool "Software Suspend" - depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP)) + depends on PM && SWAP && (X86 || ((FVR || PPC32) && !SMP)) ---help--- Enable the possibility of suspending the machine. It doesn't need APM. @@ -72,6 +73,18 @@ config PM_STD_PARTITION suspended image to. It will simply pick the first available swap device. +config SWSUSP_ENCRYPT + bool "Encrypt suspend image" + depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y) + default "" + ---help--- + To prevent data gathering from swap after resume you can encrypt + the suspend image with a temporary key that is deleted on + resume. + + Note that the temporary key is stored unencrypted on disk while the + system is suspended. + config SUSPEND_SMP bool depends on HOTPLUG_CPU && X86 && PM diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 664eb0469b6e..2d8bf054d036 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -112,24 +112,12 @@ static inline void platform_finish(void) } } -static void finish(void) -{ - device_resume(); - platform_finish(); - thaw_processes(); - enable_nonboot_cpus(); - pm_restore_console(); -} - - static int prepare_processes(void) { int error; pm_prepare_console(); - sys_sync(); - disable_nonboot_cpus(); if (freeze_processes()) { @@ -162,15 +150,6 @@ static void unprepare_processes(void) pm_restore_console(); } -static int prepare_devices(void) -{ - int error; - - if ((error = device_suspend(PMSG_FREEZE))) - printk("Some devices failed to suspend\n"); - return error; -} - /** * pm_suspend_disk - The granpappy of power management. * @@ -187,17 +166,14 @@ int pm_suspend_disk(void) error = prepare_processes(); if (error) return error; - error = prepare_devices(); + error = device_suspend(PMSG_FREEZE); if (error) { + printk("Some devices failed to suspend\n"); unprepare_processes(); return error; } - pr_debug("PM: Attempting to suspend to disk.\n"); - if (pm_disk_mode == PM_DISK_FIRMWARE) - return pm_ops->enter(PM_SUSPEND_DISK); - pr_debug("PM: snapshotting memory.\n"); in_suspend = 1; if ((error = swsusp_suspend())) @@ -208,11 +184,20 @@ int pm_suspend_disk(void) error = swsusp_write(); if (!error) power_down(pm_disk_mode); + else { + /* swsusp_write can not fail in device_resume, + no need to do second device_resume */ + swsusp_free(); + unprepare_processes(); + return error; + } } else pr_debug("PM: Image restored successfully.\n"); + swsusp_free(); Done: - finish(); + device_resume(); + unprepare_processes(); return error; } @@ -233,9 +218,12 @@ static int software_resume(void) { int error; + down(&pm_sem); if (!swsusp_resume_device) { - if (!strlen(resume_file)) + if (!strlen(resume_file)) { + up(&pm_sem); return -ENOENT; + } swsusp_resume_device = name_to_dev_t(resume_file); pr_debug("swsusp: Resume From Partition %s\n", resume_file); } else { @@ -248,6 +236,7 @@ static int software_resume(void) * FIXME: If noresume is specified, we need to find the partition * and reset it back to normal swap space. */ + up(&pm_sem); return 0; } @@ -270,20 +259,24 @@ static int software_resume(void) pr_debug("PM: Preparing devices for restore.\n"); - if ((error = prepare_devices())) + if ((error = device_suspend(PMSG_FREEZE))) { + printk("Some devices failed to suspend\n"); goto Free; + } mb(); pr_debug("PM: Restoring saved image.\n"); swsusp_resume(); pr_debug("PM: Restore failed, recovering.n"); - finish(); + device_resume(); Free: swsusp_free(); Cleanup: unprepare_processes(); Done: + /* For success case, the suspend path will release the lock */ + up(&pm_sem); pr_debug("PM: Resume from disk failed.\n"); return 0; } @@ -390,7 +383,9 @@ static ssize_t resume_store(struct subsystem * subsys, const char * buf, size_t if (sscanf(buf, "%u:%u", &maj, &min) == 2) { res = MKDEV(maj,min); if (maj == MAJOR(res) && min == MINOR(res)) { + down(&pm_sem); swsusp_resume_device = res; + up(&pm_sem); printk("Attempting manual resume\n"); noresume = 0; software_resume(); diff --git a/kernel/power/main.c b/kernel/power/main.c index 71aa0fd22007..22bdc93cc038 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -143,11 +143,12 @@ static void suspend_finish(suspend_state_t state) -static char * pm_states[] = { +static char *pm_states[PM_SUSPEND_MAX] = { [PM_SUSPEND_STANDBY] = "standby", [PM_SUSPEND_MEM] = "mem", +#ifdef CONFIG_SOFTWARE_SUSPEND [PM_SUSPEND_DISK] = "disk", - NULL, +#endif }; diff --git a/kernel/power/pm.c b/kernel/power/pm.c index 61deda04e39e..159149321b3c 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c @@ -60,9 +60,8 @@ struct pm_dev *pm_register(pm_dev_t type, unsigned long id, pm_callback callback) { - struct pm_dev *dev = kmalloc(sizeof(struct pm_dev), GFP_KERNEL); + struct pm_dev *dev = kzalloc(sizeof(struct pm_dev), GFP_KERNEL); if (dev) { - memset(dev, 0, sizeof(*dev)); dev->type = type; dev->id = id; dev->callback = callback; diff --git a/kernel/power/process.c b/kernel/power/process.c index 3bd0d261818f..28de118f7a0b 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -38,7 +38,6 @@ void refrigerator(void) processes around? */ long save; save = current->state; - current->state = TASK_UNINTERRUPTIBLE; pr_debug("%s entered refrigerator\n", current->comm); printk("="); @@ -47,8 +46,10 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); - while (frozen(current)) + while (frozen(current)) { + current->state = TASK_UNINTERRUPTIBLE; schedule(); + } pr_debug("%s left refrigerator\n", current->comm); current->state = save; } @@ -80,13 +81,33 @@ int freeze_processes(void) } while_each_thread(g, p); read_unlock(&tasklist_lock); yield(); /* Yield is okay here */ - if (time_after(jiffies, start_time + TIMEOUT)) { + if (todo && time_after(jiffies, start_time + TIMEOUT)) { printk( "\n" ); printk(KERN_ERR " stopping tasks failed (%d tasks remaining)\n", todo ); - return todo; + break; } } while(todo); + /* This does not unfreeze processes that are already frozen + * (we have slightly ugly calling convention in that respect, + * and caller must call thaw_processes() if something fails), + * but it cleans up leftover PF_FREEZE requests. + */ + if (todo) { + read_lock(&tasklist_lock); + do_each_thread(g, p) + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + p->flags &= ~PF_FREEZE; + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_tsk(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } + while_each_thread(g, p); + read_unlock(&tasklist_lock); + return todo; + } + printk( "|\n" ); BUG_ON(in_atomic()); return 0; diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index f2bc71b9fe8b..d967e875ee82 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -31,6 +31,9 @@ * Alex Badea <vampire@go.ro>: * Fixed runaway init * + * Andreas Steinmetz <ast@domdv.de>: + * Added encrypted suspend option + * * More state savers are welcome. Especially for the scsi layer... * * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt @@ -71,8 +74,16 @@ #include <asm/tlbflush.h> #include <asm/io.h> +#include <linux/random.h> +#include <linux/crypto.h> +#include <asm/scatterlist.h> + #include "power.h" +#define CIPHER "aes" +#define MAXKEY 32 +#define MAXIV 32 + /* References to section boundaries */ extern const void __nosave_begin, __nosave_end; @@ -103,7 +114,8 @@ static suspend_pagedir_t *pagedir_save; #define SWSUSP_SIG "S1SUSPEND" static struct swsusp_header { - char reserved[PAGE_SIZE - 20 - sizeof(swp_entry_t)]; + char reserved[PAGE_SIZE - 20 - MAXKEY - MAXIV - sizeof(swp_entry_t)]; + u8 key_iv[MAXKEY+MAXIV]; swp_entry_t swsusp_info; char orig_sig[10]; char sig[10]; @@ -129,6 +141,131 @@ static struct swsusp_info swsusp_info; static unsigned short swapfile_used[MAX_SWAPFILES]; static unsigned short root_swap; +static int write_page(unsigned long addr, swp_entry_t * loc); +static int bio_read_page(pgoff_t page_off, void * page); + +static u8 key_iv[MAXKEY+MAXIV]; + +#ifdef CONFIG_SWSUSP_ENCRYPT + +static int crypto_init(int mode, void **mem) +{ + int error = 0; + int len; + char *modemsg; + struct crypto_tfm *tfm; + + modemsg = mode ? "suspend not possible" : "resume not possible"; + + tfm = crypto_alloc_tfm(CIPHER, CRYPTO_TFM_MODE_CBC); + if(!tfm) { + printk(KERN_ERR "swsusp: no tfm, %s\n", modemsg); + error = -EINVAL; + goto out; + } + + if(MAXKEY < crypto_tfm_alg_min_keysize(tfm)) { + printk(KERN_ERR "swsusp: key buffer too small, %s\n", modemsg); + error = -ENOKEY; + goto fail; + } + + if (mode) + get_random_bytes(key_iv, MAXKEY+MAXIV); + + len = crypto_tfm_alg_max_keysize(tfm); + if (len > MAXKEY) + len = MAXKEY; + + if (crypto_cipher_setkey(tfm, key_iv, len)) { + printk(KERN_ERR "swsusp: key setup failure, %s\n", modemsg); + error = -EKEYREJECTED; + goto fail; + } + + len = crypto_tfm_alg_ivsize(tfm); + + if (MAXIV < len) { + printk(KERN_ERR "swsusp: iv buffer too small, %s\n", modemsg); + error = -EOVERFLOW; + goto fail; + } + + crypto_cipher_set_iv(tfm, key_iv+MAXKEY, len); + + *mem=(void *)tfm; + + goto out; + +fail: crypto_free_tfm(tfm); +out: return error; +} + +static __inline__ void crypto_exit(void *mem) +{ + crypto_free_tfm((struct crypto_tfm *)mem); +} + +static __inline__ int crypto_write(struct pbe *p, void *mem) +{ + int error = 0; + struct scatterlist src, dst; + + src.page = virt_to_page(p->address); + src.offset = 0; + src.length = PAGE_SIZE; + dst.page = virt_to_page((void *)&swsusp_header); + dst.offset = 0; + dst.length = PAGE_SIZE; + + error = crypto_cipher_encrypt((struct crypto_tfm *)mem, &dst, &src, + PAGE_SIZE); + + if (!error) + error = write_page((unsigned long)&swsusp_header, + &(p->swap_address)); + return error; +} + +static __inline__ int crypto_read(struct pbe *p, void *mem) +{ + int error = 0; + struct scatterlist src, dst; + + error = bio_read_page(swp_offset(p->swap_address), (void *)p->address); + if (!error) { + src.offset = 0; + src.length = PAGE_SIZE; + dst.offset = 0; + dst.length = PAGE_SIZE; + src.page = dst.page = virt_to_page((void *)p->address); + + error = crypto_cipher_decrypt((struct crypto_tfm *)mem, &dst, + &src, PAGE_SIZE); + } + return error; +} +#else +static __inline__ int crypto_init(int mode, void *mem) +{ + return 0; +} + +static __inline__ void crypto_exit(void *mem) +{ +} + +static __inline__ int crypto_write(struct pbe *p, void *mem) +{ + return write_page(p->address, &(p->swap_address)); +} + +static __inline__ int crypto_read(struct pbe *p, void *mem) +{ + return bio_read_page(swp_offset(p->swap_address), (void *)p->address); +} +#endif + static int mark_swapfiles(swp_entry_t prev) { int error; @@ -140,6 +277,7 @@ static int mark_swapfiles(swp_entry_t prev) !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); memcpy(swsusp_header.sig,SWSUSP_SIG, 10); + memcpy(swsusp_header.key_iv, key_iv, MAXKEY+MAXIV); swsusp_header.swsusp_info = prev; error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0), @@ -179,9 +317,9 @@ static int swsusp_swap_check(void) /* This is called before saving image */ len=strlen(resume_file); root_swap = 0xFFFF; - swap_list_lock(); + spin_lock(&swap_lock); for (i=0; i<MAX_SWAPFILES; i++) { - if (swap_info[i].flags == 0) { + if (!(swap_info[i].flags & SWP_WRITEOK)) { swapfile_used[i]=SWAPFILE_UNUSED; } else { if (!len) { @@ -202,7 +340,7 @@ static int swsusp_swap_check(void) /* This is called before saving image */ } } } - swap_list_unlock(); + spin_unlock(&swap_lock); return (root_swap != 0xffff) ? 0 : -ENODEV; } @@ -216,12 +354,12 @@ static void lock_swapdevices(void) { int i; - swap_list_lock(); + spin_lock(&swap_lock); for (i = 0; i< MAX_SWAPFILES; i++) if (swapfile_used[i] == SWAPFILE_IGNORED) { - swap_info[i].flags ^= 0xFF; + swap_info[i].flags ^= SWP_WRITEOK; } - swap_list_unlock(); + spin_unlock(&swap_lock); } /** @@ -286,6 +424,10 @@ static int data_write(void) int error = 0, i = 0; unsigned int mod = nr_copy_pages / 100; struct pbe *p; + void *tfm; + + if ((error = crypto_init(1, &tfm))) + return error; if (!mod) mod = 1; @@ -294,11 +436,14 @@ static int data_write(void) for_each_pbe (p, pagedir_nosave) { if (!(i%mod)) printk( "\b\b\b\b%3d%%", i / mod ); - if ((error = write_page(p->address, &(p->swap_address)))) + if ((error = crypto_write(p, tfm))) { + crypto_exit(tfm); return error; + } i++; } printk("\b\b\b\bdone\n"); + crypto_exit(tfm); return error; } @@ -385,7 +530,6 @@ static int write_pagedir(void) * write_suspend_image - Write entire image and metadata. * */ - static int write_suspend_image(void) { int error; @@ -400,6 +544,7 @@ static int write_suspend_image(void) if ((error = close_swap())) goto FreePagedir; Done: + memset(key_iv, 0, MAXKEY+MAXIV); return error; FreePagedir: free_pagedir_entries(); @@ -591,18 +736,7 @@ static void copy_data_pages(void) static int calc_nr(int nr_copy) { - int extra = 0; - int mod = !!(nr_copy % PBES_PER_PAGE); - int diff = (nr_copy / PBES_PER_PAGE) + mod; - - do { - extra += diff; - nr_copy += diff; - mod = !!(nr_copy % PBES_PER_PAGE); - diff = (nr_copy / PBES_PER_PAGE) + mod - extra; - } while (diff > 0); - - return nr_copy; + return nr_copy + (nr_copy+PBES_PER_PAGE-2)/(PBES_PER_PAGE-1); } /** @@ -886,20 +1020,21 @@ int swsusp_suspend(void) * at resume time, and evil weirdness ensues. */ if ((error = device_power_down(PMSG_FREEZE))) { + printk(KERN_ERR "Some devices failed to power down, aborting suspend\n"); local_irq_enable(); return error; } if ((error = swsusp_swap_check())) { - printk(KERN_ERR "swsusp: FATAL: cannot find swap device, try " - "swapon -a!\n"); + printk(KERN_ERR "swsusp: cannot find swap device, try swapon -a.\n"); + device_power_up(); local_irq_enable(); return error; } save_processor_state(); if ((error = swsusp_arch_suspend())) - printk("Error %d suspending\n", error); + printk(KERN_ERR "Error %d suspending\n", error); /* Restore control flow magically appears here */ restore_processor_state(); BUG_ON (nr_copy_pages_check != nr_copy_pages); @@ -924,6 +1059,7 @@ int swsusp_resume(void) BUG_ON(!error); restore_processor_state(); restore_highmem(); + touch_softlockup_watchdog(); device_power_up(); local_irq_enable(); return error; @@ -1179,7 +1315,8 @@ static const char * sanity_check(void) if (strcmp(swsusp_info.uts.machine,system_utsname.machine)) return "machine"; #if 0 - if(swsusp_info.cpus != num_online_cpus()) + /* We can't use number of online CPUs when we use hotplug to remove them ;-))) */ + if (swsusp_info.cpus != num_possible_cpus()) return "number of cpus"; #endif return NULL; @@ -1212,13 +1349,14 @@ static int check_sig(void) return error; if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); + memcpy(key_iv, swsusp_header.key_iv, MAXKEY+MAXIV); + memset(swsusp_header.key_iv, 0, MAXKEY+MAXIV); /* * Reset swap signature now. */ error = bio_write_page(0, &swsusp_header); } else { - printk(KERN_ERR "swsusp: Suspend partition has wrong signature?\n"); return -EINVAL; } if (!error) @@ -1239,6 +1377,10 @@ static int data_read(struct pbe *pblist) int error = 0; int i = 0; int mod = swsusp_info.image_pages / 100; + void *tfm; + + if ((error = crypto_init(0, &tfm))) + return error; if (!mod) mod = 1; @@ -1250,14 +1392,15 @@ static int data_read(struct pbe *pblist) if (!(i % mod)) printk("\b\b\b\b%3d%%", i / mod); - error = bio_read_page(swp_offset(p->swap_address), - (void *)p->address); - if (error) + if ((error = crypto_read(p, tfm))) { + crypto_exit(tfm); return error; + } i++; } printk("\b\b\b\bdone\n"); + crypto_exit(tfm); return error; } @@ -1385,6 +1528,7 @@ int swsusp_read(void) error = read_suspend_image(); blkdev_put(resume_bdev); + memset(key_iv, 0, MAXKEY+MAXIV); if (!error) pr_debug("swsusp: Reading resume file was successful\n"); diff --git a/kernel/printk.c b/kernel/printk.c index 5092397fac29..a967605bc2e3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -514,6 +514,9 @@ asmlinkage int printk(const char *fmt, ...) return r; } +/* cpu currently holding logbuf_lock */ +static volatile unsigned int printk_cpu = UINT_MAX; + asmlinkage int vprintk(const char *fmt, va_list args) { unsigned long flags; @@ -522,11 +525,15 @@ asmlinkage int vprintk(const char *fmt, va_list args) static char printk_buf[1024]; static int log_level_unknown = 1; - if (unlikely(oops_in_progress)) + preempt_disable(); + if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) + /* If a crash is occurring during printk() on this CPU, + * make sure we can't deadlock */ zap_locks(); /* This stops the holder of console_sem just where we want him */ spin_lock_irqsave(&logbuf_lock, flags); + printk_cpu = smp_processor_id(); /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); @@ -595,6 +602,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * CPU until it is officially up. We shouldn't be calling into * random console drivers on a CPU which doesn't exist yet.. */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); goto out; } @@ -604,6 +612,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * We own the drivers. We can drop the spinlock and let * release_console_sem() print the text */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); console_may_schedule = 0; release_console_sem(); @@ -613,9 +622,11 @@ asmlinkage int vprintk(const char *fmt, va_list args) * allows the semaphore holder to proceed and to call the * console drivers with the output which we just produced. */ + printk_cpu = UINT_MAX; spin_unlock_irqrestore(&logbuf_lock, flags); } out: + preempt_enable(); return printed_len; } EXPORT_SYMBOL(printk); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 8dcb8f6288bc..019e04ec065a 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,6 +118,33 @@ int ptrace_check_attach(struct task_struct *child, int kill) return ret; } +static int may_attach(struct task_struct *task) +{ + if (!task->mm) + return -EPERM; + if (((current->uid != task->euid) || + (current->uid != task->suid) || + (current->uid != task->uid) || + (current->gid != task->egid) || + (current->gid != task->sgid) || + (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) + return -EPERM; + smp_rmb(); + if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) + return -EPERM; + + return security_ptrace(current, task); +} + +int ptrace_may_attach(struct task_struct *task) +{ + int err; + task_lock(task); + err = may_attach(task); + task_unlock(task); + return !err; +} + int ptrace_attach(struct task_struct *task) { int retval; @@ -127,22 +154,10 @@ int ptrace_attach(struct task_struct *task) goto bad; if (task == current) goto bad; - if (!task->mm) - goto bad; - if(((current->uid != task->euid) || - (current->uid != task->suid) || - (current->uid != task->uid) || - (current->gid != task->egid) || - (current->gid != task->sgid) || - (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) - goto bad; - smp_rmb(); - if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) - goto bad; /* the same process cannot be attached many times */ if (task->ptrace & PT_PTRACED) goto bad; - retval = security_ptrace(current, task); + retval = may_attach(task); if (retval) goto bad; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f436993bd590..bef3b6901b76 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/rcupdate.h> +#include <linux/rcuref.h> #include <linux/cpu.h> /* Definition for rcupdate control block. */ @@ -72,6 +73,19 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; static int maxbatch = 10; +#ifndef __HAVE_ARCH_CMPXCHG +/* + * We use an array of spinlocks for the rcurefs -- similar to ones in sparc + * 32 bit atomic_t implementations, and a hash function similar to that + * for our refcounting needs. + * Can't help multiprocessors which donot have cmpxchg :( + */ + +spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = { + [0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED +}; +#endif + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. diff --git a/kernel/resource.c b/kernel/resource.c index 26967e042201..92285d822de6 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -430,10 +430,9 @@ EXPORT_SYMBOL(adjust_resource); */ struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) { - struct resource *res = kmalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); if (res) { - memset(res, 0, sizeof(*res)); res->name = name; res->start = start; res->end = start + n - 1; diff --git a/kernel/sched.c b/kernel/sched.c index 5f889d0cbfcc..2632b812cf24 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1478,6 +1478,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next) /** * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch * @prev: the thread we just switched away from. * * finish_task_switch must be called after the context switch, paired @@ -2887,6 +2888,7 @@ switch_tasks: if (next == rq->idle) schedstat_inc(rq, sched_goidle); prefetch(next); + prefetch_stack(next); clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); @@ -4779,7 +4781,7 @@ static int sd_parent_degenerate(struct sched_domain *sd, * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void cpu_attach_domain(struct sched_domain *sd, int cpu) +static void cpu_attach_domain(struct sched_domain *sd, int cpu) { runqueue_t *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -4802,7 +4804,7 @@ void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -4830,8 +4832,8 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -void init_sched_build_groups(struct sched_group groups[], - cpumask_t span, int (*group_fn)(int cpu)) +static void init_sched_build_groups(struct sched_group groups[], cpumask_t span, + int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; cpumask_t covered = CPU_MASK_NONE; @@ -4864,12 +4866,85 @@ void init_sched_build_groups(struct sched_group groups[], last->next = first; } +#define SD_NODES_PER_DOMAIN 16 -#ifdef ARCH_HAS_SCHED_DOMAIN -extern void build_sched_domains(const cpumask_t *cpu_map); -extern void arch_init_sched_domains(const cpumask_t *cpu_map); -extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); -#else +#ifdef CONFIG_NUMA +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int find_next_best_node(int node, unsigned long *used_nodes) +{ + int i, n, val, min_val, best_node = 0; + + min_val = INT_MAX; + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Start at @node */ + n = (node + i) % MAX_NUMNODES; + + if (!nr_cpus_node(n)) + continue; + + /* Skip already used nodes */ + if (test_bit(n, used_nodes)) + continue; + + /* Simple min distance search */ + val = node_distance(node, n); + + if (val < min_val) { + min_val = val; + best_node = n; + } + } + + set_bit(best_node, used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @size: number of nodes to include in this span + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. + */ +static cpumask_t sched_domain_node_span(int node) +{ + int i; + cpumask_t span, nodemask; + DECLARE_BITMAP(used_nodes, MAX_NUMNODES); + + cpus_clear(span); + bitmap_zero(used_nodes, MAX_NUMNODES); + + nodemask = node_to_cpumask(node); + cpus_or(span, span, nodemask); + set_bit(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, used_nodes); + nodemask = node_to_cpumask(next_node); + cpus_or(span, span, nodemask); + } + + return span; +} +#endif + +/* + * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we + * can switch it on easily if needed. + */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; @@ -4891,36 +4966,20 @@ static int cpu_to_phys_group(int cpu) } #ifdef CONFIG_NUMA - -static DEFINE_PER_CPU(struct sched_domain, node_domains); -static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int cpu_to_node_group(int cpu) -{ - return cpu_to_node(cpu); -} -#endif - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) /* - * The domains setup code relies on siblings not spanning - * multiple nodes. Make sure the architecture has a proper - * siblings map: + * The init_sched_build_groups can't handle what we want to do with node + * groups, so roll our own. Now each node has its own list of groups which + * gets dynamically allocated. */ -static void check_sibling_maps(void) -{ - int i, j; +static DEFINE_PER_CPU(struct sched_domain, node_domains); +static struct sched_group **sched_group_nodes_bycpu[NR_CPUS]; - for_each_online_cpu(i) { - for_each_cpu_mask(j, cpu_sibling_map[i]) { - if (cpu_to_node(i) != cpu_to_node(j)) { - printk(KERN_INFO "warning: CPU %d siblings map " - "to different node - isolating " - "them.\n", i); - cpu_sibling_map[i] = cpumask_of_cpu(i); - break; - } - } - } +static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); +static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS]; + +static int cpu_to_allnodes_group(int cpu) +{ + return cpu_to_node(cpu); } #endif @@ -4928,9 +4987,24 @@ static void check_sibling_maps(void) * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ -static void build_sched_domains(const cpumask_t *cpu_map) +void build_sched_domains(const cpumask_t *cpu_map) { int i; +#ifdef CONFIG_NUMA + struct sched_group **sched_group_nodes = NULL; + struct sched_group *sched_group_allnodes = NULL; + + /* + * Allocate the per-node list of sched groups + */ + sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, + GFP_ATOMIC); + if (!sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return; + } + sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; +#endif /* * Set up domains for cpus specified by the cpu_map. @@ -4943,11 +5017,35 @@ static void build_sched_domains(const cpumask_t *cpu_map) cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA + if (cpus_weight(*cpu_map) + > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { + if (!sched_group_allnodes) { + sched_group_allnodes + = kmalloc(sizeof(struct sched_group) + * MAX_NUMNODES, + GFP_KERNEL); + if (!sched_group_allnodes) { + printk(KERN_WARNING + "Can not alloc allnodes sched group\n"); + break; + } + sched_group_allnodes_bycpu[i] + = sched_group_allnodes; + } + sd = &per_cpu(allnodes_domains, i); + *sd = SD_ALLNODES_INIT; + sd->span = *cpu_map; + group = cpu_to_allnodes_group(i); + sd->groups = &sched_group_allnodes[group]; + p = sd; + } else + p = NULL; + sd = &per_cpu(node_domains, i); - group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = *cpu_map; - sd->groups = &sched_group_nodes[group]; + sd->span = sched_domain_node_span(cpu_to_node(i)); + sd->parent = p; + cpus_and(sd->span, sd->span, *cpu_map); #endif p = sd; @@ -4972,7 +5070,7 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) @@ -4997,8 +5095,77 @@ static void build_sched_domains(const cpumask_t *cpu_map) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, *cpu_map, - &cpu_to_node_group); + if (sched_group_allnodes) + init_sched_build_groups(sched_group_allnodes, *cpu_map, + &cpu_to_allnodes_group); + + for (i = 0; i < MAX_NUMNODES; i++) { + /* Set up node groups */ + struct sched_group *sg, *prev; + cpumask_t nodemask = node_to_cpumask(i); + cpumask_t domainspan; + cpumask_t covered = CPU_MASK_NONE; + int j; + + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) { + sched_group_nodes[i] = NULL; + continue; + } + + domainspan = sched_domain_node_span(i); + cpus_and(domainspan, domainspan, *cpu_map); + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + sched_group_nodes[i] = sg; + for_each_cpu_mask(j, nodemask) { + struct sched_domain *sd; + sd = &per_cpu(node_domains, j); + sd->groups = sg; + if (sd->groups == NULL) { + /* Turn off balancing if we have no groups */ + sd->flags = 0; + } + } + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", i); + continue; + } + sg->cpu_power = 0; + sg->cpumask = nodemask; + cpus_or(covered, covered, nodemask); + prev = sg; + + for (j = 0; j < MAX_NUMNODES; j++) { + cpumask_t tmp, notcovered; + int n = (i + j) % MAX_NUMNODES; + + cpus_complement(notcovered, covered); + cpus_and(tmp, notcovered, *cpu_map); + cpus_and(tmp, tmp, domainspan); + if (cpus_empty(tmp)) + break; + + nodemask = node_to_cpumask(n); + cpus_and(tmp, tmp, nodemask); + if (cpus_empty(tmp)) + continue; + + sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + break; + } + sg->cpu_power = 0; + sg->cpumask = tmp; + cpus_or(covered, covered, tmp); + prev->next = sg; + prev = sg; + } + prev->next = sched_group_nodes[i]; + } #endif /* Calculate CPU power for physical packages and nodes */ @@ -5017,14 +5184,46 @@ static void build_sched_domains(const cpumask_t *cpu_map) sd->groups->cpu_power = power; #ifdef CONFIG_NUMA - if (i == first_cpu(sd->groups->cpumask)) { - /* Only add "power" once for each physical package. */ - sd = &per_cpu(node_domains, i); - sd->groups->cpu_power += power; + sd = &per_cpu(allnodes_domains, i); + if (sd->groups) { + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + sd->groups->cpu_power = power; } #endif } +#ifdef CONFIG_NUMA + for (i = 0; i < MAX_NUMNODES; i++) { + struct sched_group *sg = sched_group_nodes[i]; + int j; + + if (sg == NULL) + continue; +next_sg: + for_each_cpu_mask(j, sg->cpumask) { + struct sched_domain *sd; + int power; + + sd = &per_cpu(phys_domains, j); + if (j != first_cpu(sd->groups->cpumask)) { + /* + * Only add "power" once for each + * physical package. + */ + continue; + } + power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * + (cpus_weight(sd->groups->cpumask)-1) / 10; + + sg->cpu_power += power; + } + sg = sg->next; + if (sg != sched_group_nodes[i]) + goto next_sg; + } +#endif + /* Attach the domains */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; @@ -5039,13 +5238,10 @@ static void build_sched_domains(const cpumask_t *cpu_map) /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ -static void arch_init_sched_domains(cpumask_t *cpu_map) +static void arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to @@ -5058,10 +5254,47 @@ static void arch_init_sched_domains(cpumask_t *cpu_map) static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { - /* Do nothing: everything is statically allocated. */ -} +#ifdef CONFIG_NUMA + int i; + int cpu; + + for_each_cpu_mask(cpu, *cpu_map) { + struct sched_group *sched_group_allnodes + = sched_group_allnodes_bycpu[cpu]; + struct sched_group **sched_group_nodes + = sched_group_nodes_bycpu[cpu]; + + if (sched_group_allnodes) { + kfree(sched_group_allnodes); + sched_group_allnodes_bycpu[cpu] = NULL; + } + + if (!sched_group_nodes) + continue; + + for (i = 0; i < MAX_NUMNODES; i++) { + cpumask_t nodemask = node_to_cpumask(i); + struct sched_group *oldsg, *sg = sched_group_nodes[i]; -#endif /* ARCH_HAS_SCHED_DOMAIN */ + cpus_and(nodemask, nodemask, *cpu_map); + if (cpus_empty(nodemask)) + continue; + + if (sg == NULL) + continue; + sg = sg->next; +next_sg: + oldsg = sg; + sg = sg->next; + kfree(oldsg); + if (oldsg != sched_group_nodes[i]) + goto next_sg; + } + kfree(sched_group_nodes); + sched_group_nodes_bycpu[cpu] = NULL; + } +#endif +} /* * Detach sched domains from a group of cpus specified in cpu_map diff --git a/kernel/signal.c b/kernel/signal.c index d282fea81138..4980a073237f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -678,7 +678,7 @@ static int check_kill_permission(int sig, struct siginfo *info, /* forward decl */ static void do_notify_parent_cldstop(struct task_struct *tsk, - struct task_struct *parent, + int to_self, int why); /* @@ -729,14 +729,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->group_stop_count = 0; p->signal->flags = SIGNAL_STOP_CONTINUED; spin_unlock(&p->sighand->siglock); - if (p->ptrace & PT_PTRACED) - do_notify_parent_cldstop(p, p->parent, - CLD_STOPPED); - else - do_notify_parent_cldstop( - p->group_leader, - p->group_leader->real_parent, - CLD_STOPPED); + do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_STOPPED); spin_lock(&p->sighand->siglock); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); @@ -777,14 +770,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) p->signal->flags = SIGNAL_STOP_CONTINUED; p->signal->group_exit_code = 0; spin_unlock(&p->sighand->siglock); - if (p->ptrace & PT_PTRACED) - do_notify_parent_cldstop(p, p->parent, - CLD_CONTINUED); - else - do_notify_parent_cldstop( - p->group_leader, - p->group_leader->real_parent, - CLD_CONTINUED); + do_notify_parent_cldstop(p, (p->ptrace & PT_PTRACED), CLD_CONTINUED); spin_lock(&p->sighand->siglock); } else { /* @@ -1380,16 +1366,16 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) unsigned long flags; int ret = 0; - /* - * We need the tasklist lock even for the specific - * thread case (when we don't need to follow the group - * lists) in order to avoid races with "p->sighand" - * going away or changing from under us. - */ BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); - read_lock(&tasklist_lock); + read_lock(&tasklist_lock); + + if (unlikely(p->flags & PF_EXITING)) { + ret = -1; + goto out_err; + } + spin_lock_irqsave(&p->sighand->siglock, flags); - + if (unlikely(!list_empty(&q->list))) { /* * If an SI_TIMER entry is already queue just increment @@ -1399,7 +1385,7 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) BUG(); q->info.si_overrun++; goto out; - } + } /* Short-circuit ignored signals. */ if (sig_ignored(p, sig)) { ret = 1; @@ -1414,8 +1400,10 @@ send_sigqueue(int sig, struct sigqueue *q, struct task_struct *p) out: spin_unlock_irqrestore(&p->sighand->siglock, flags); +out_err: read_unlock(&tasklist_lock); - return(ret); + + return ret; } int @@ -1542,14 +1530,20 @@ void do_notify_parent(struct task_struct *tsk, int sig) spin_unlock_irqrestore(&psig->siglock, flags); } -static void -do_notify_parent_cldstop(struct task_struct *tsk, struct task_struct *parent, - int why) +static void do_notify_parent_cldstop(struct task_struct *tsk, int to_self, int why) { struct siginfo info; unsigned long flags; + struct task_struct *parent; struct sighand_struct *sighand; + if (to_self) + parent = tsk->parent; + else { + tsk = tsk->group_leader; + parent = tsk->real_parent; + } + info.si_signo = SIGCHLD; info.si_errno = 0; info.si_pid = tsk->pid; @@ -1618,8 +1612,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) !(current->ptrace & PT_ATTACHED)) && (likely(current->parent->signal != current->signal) || !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { - do_notify_parent_cldstop(current, current->parent, - CLD_TRAPPED); + do_notify_parent_cldstop(current, 1, CLD_TRAPPED); read_unlock(&tasklist_lock); schedule(); } else { @@ -1668,25 +1661,25 @@ void ptrace_notify(int exit_code) static void finish_stop(int stop_count) { + int to_self; + /* * If there are no other threads in the group, or if there is * a group stop in progress and we are the last to stop, * report to the parent. When ptraced, every thread reports itself. */ - if (stop_count < 0 || (current->ptrace & PT_PTRACED)) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current, current->parent, - CLD_STOPPED); - read_unlock(&tasklist_lock); - } - else if (stop_count == 0) { - read_lock(&tasklist_lock); - do_notify_parent_cldstop(current->group_leader, - current->group_leader->real_parent, - CLD_STOPPED); - read_unlock(&tasklist_lock); - } + if (stop_count < 0 || (current->ptrace & PT_PTRACED)) + to_self = 1; + else if (stop_count == 0) + to_self = 0; + else + goto out; + read_lock(&tasklist_lock); + do_notify_parent_cldstop(current, to_self, CLD_STOPPED); + read_unlock(&tasklist_lock); + +out: schedule(); /* * Now we don't run again until continued. diff --git a/kernel/softlockup.c b/kernel/softlockup.c new file mode 100644 index 000000000000..75976209cea7 --- /dev/null +++ b/kernel/softlockup.c @@ -0,0 +1,151 @@ +/* + * Detect Soft Lockups + * + * started by Ingo Molnar, (C) 2005, Red Hat + * + * this code detects soft lockups: incidents in where on a CPU + * the kernel does not reschedule for 10 seconds or more. + */ + +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/notifier.h> +#include <linux/module.h> + +static DEFINE_SPINLOCK(print_lock); + +static DEFINE_PER_CPU(unsigned long, timestamp) = 0; +static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0; +static DEFINE_PER_CPU(struct task_struct *, watchdog_task); + +static int did_panic = 0; +static int softlock_panic(struct notifier_block *this, unsigned long event, + void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = softlock_panic, +}; + +void touch_softlockup_watchdog(void) +{ + per_cpu(timestamp, raw_smp_processor_id()) = jiffies; +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +/* + * This callback runs from the timer interrupt, and checks + * whether the watchdog thread has hung or not: + */ +void softlockup_tick(struct pt_regs *regs) +{ + int this_cpu = smp_processor_id(); + unsigned long timestamp = per_cpu(timestamp, this_cpu); + + if (per_cpu(print_timestamp, this_cpu) == timestamp) + return; + + /* Do not cause a second panic when there already was one */ + if (did_panic) + return; + + if (time_after(jiffies, timestamp + 10*HZ)) { + per_cpu(print_timestamp, this_cpu) = timestamp; + + spin_lock(&print_lock); + printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n", + this_cpu); + show_regs(regs); + spin_unlock(&print_lock); + } +} + +/* + * The watchdog thread - runs every second and touches the timestamp. + */ +static int watchdog(void * __bind_cpu) +{ + struct sched_param param = { .sched_priority = 99 }; + int this_cpu = (long) __bind_cpu; + + printk("softlockup thread %d started up.\n", this_cpu); + + sched_setscheduler(current, SCHED_FIFO, ¶m); + current->flags |= PF_NOFREEZE; + + set_current_state(TASK_INTERRUPTIBLE); + + /* + * Run briefly once per second - if this gets delayed for + * more than 10 seconds then the debug-printout triggers + * in softlockup_tick(): + */ + while (!kthread_should_stop()) { + msleep_interruptible(1000); + touch_softlockup_watchdog(); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __devinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + BUG_ON(per_cpu(watchdog_task, hotcpu)); + p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); + if (IS_ERR(p)) { + printk("watchdog for %i failed\n", hotcpu); + return NOTIFY_BAD; + } + per_cpu(watchdog_task, hotcpu) = p; + kthread_bind(p, hotcpu); + break; + case CPU_ONLINE: + + wake_up_process(per_cpu(watchdog_task, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + /* Unbind so it can run. Fall thru. */ + kthread_bind(per_cpu(watchdog_task, hotcpu), smp_processor_id()); + case CPU_DEAD: + p = per_cpu(watchdog_task, hotcpu); + per_cpu(watchdog_task, hotcpu) = NULL; + kthread_stop(p); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __devinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +__init void spawn_softlockup_task(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + + cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + notifier_chain_register(&panic_notifier_list, &panic_block); +} + diff --git a/kernel/sys.c b/kernel/sys.c index 0bcaed6560ac..c80412be2302 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1711,7 +1711,6 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { long error; - int sig; error = security_task_prctl(option, arg2, arg3, arg4, arg5); if (error) @@ -1719,12 +1718,11 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, switch (option) { case PR_SET_PDEATHSIG: - sig = arg2; - if (!valid_signal(sig)) { + if (!valid_signal(arg2)) { error = -EINVAL; break; } - current->pdeath_signal = sig; + current->pdeath_signal = arg2; break; case PR_GET_PDEATHSIG: error = put_user(current->pdeath_signal, (int __user *)arg2); diff --git a/kernel/timer.c b/kernel/timer.c index 5377f40723ff..13e2b513be01 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -950,6 +950,7 @@ void do_timer(struct pt_regs *regs) { jiffies_64++; update_times(); + softlockup_tick(regs); } #ifdef __ARCH_WANT_SYS_ALARM @@ -1428,7 +1429,7 @@ static inline u64 time_interpolator_get_cycles(unsigned int src) } } -static inline u64 time_interpolator_get_counter(void) +static inline u64 time_interpolator_get_counter(int writelock) { unsigned int src = time_interpolator->source; @@ -1442,6 +1443,15 @@ static inline u64 time_interpolator_get_counter(void) now = time_interpolator_get_cycles(src); if (lcycle && time_after(lcycle, now)) return lcycle; + + /* When holding the xtime write lock, there's no need + * to add the overhead of the cmpxchg. Readers are + * force to retry until the write lock is released. + */ + if (writelock) { + time_interpolator->last_cycle = now; + return now; + } /* Keep track of the last timer value returned. The use of cmpxchg here * will cause contention in an SMP environment. */ @@ -1455,7 +1465,7 @@ static inline u64 time_interpolator_get_counter(void) void time_interpolator_reset(void) { time_interpolator->offset = 0; - time_interpolator->last_counter = time_interpolator_get_counter(); + time_interpolator->last_counter = time_interpolator_get_counter(1); } #define GET_TI_NSECS(count,i) (((((count) - i->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift) @@ -1467,7 +1477,7 @@ unsigned long time_interpolator_get_offset(void) return 0; return time_interpolator->offset + - GET_TI_NSECS(time_interpolator_get_counter(), time_interpolator); + GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator); } #define INTERPOLATOR_ADJUST 65536 @@ -1490,7 +1500,7 @@ static void time_interpolator_update(long delta_nsec) * and the tuning logic insures that. */ - counter = time_interpolator_get_counter(); + counter = time_interpolator_get_counter(1); offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator); if (delta_nsec < 0 || (unsigned long) delta_nsec < offset) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c7e36d4a70ca..91bacb13a7e2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -308,10 +308,9 @@ struct workqueue_struct *__create_workqueue(const char *name, struct workqueue_struct *wq; struct task_struct *p; - wq = kmalloc(sizeof(*wq), GFP_KERNEL); + wq = kzalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; - memset(wq, 0, sizeof(*wq)); wq->name = name; /* We don't need the distraction of CPUs appearing and vanishing. */ @@ -499,7 +498,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, case CPU_UP_PREPARE: /* Create a new workqueue thread for it. */ list_for_each_entry(wq, &workqueues, list) { - if (create_workqueue_thread(wq, hotcpu) < 0) { + if (!create_workqueue_thread(wq, hotcpu)) { printk("workqueue for %i failed\n", hotcpu); return NOTIFY_BAD; } |