summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.hz46
-rw-r--r--kernel/Kconfig.preempt65
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/audit.c587
-rw-r--r--kernel/auditsc.c259
-rw-r--r--kernel/cpu.c14
-rw-r--r--kernel/cpuset.c121
-rw-r--r--kernel/crash_dump.c52
-rw-r--r--kernel/exit.c20
-rw-r--r--kernel/fork.c28
-rw-r--r--kernel/irq/handle.c3
-rw-r--r--kernel/irq/manage.c8
-rw-r--r--kernel/irq/spurious.c2
-rw-r--r--kernel/kexec.c1063
-rw-r--r--kernel/kmod.c17
-rw-r--r--kernel/kprobes.c288
-rw-r--r--kernel/ksysfs.c13
-rw-r--r--kernel/module.c105
-rw-r--r--kernel/panic.c23
-rw-r--r--kernel/params.c4
-rw-r--r--kernel/posix-timers.c35
-rw-r--r--kernel/power/Kconfig8
-rw-r--r--kernel/power/Makefile6
-rw-r--r--kernel/power/disk.c35
-rw-r--r--kernel/power/main.c22
-rw-r--r--kernel/power/process.c26
-rw-r--r--kernel/power/smp.c89
-rw-r--r--kernel/power/swsusp.c95
-rw-r--r--kernel/printk.c87
-rw-r--r--kernel/profile.c16
-rw-r--r--kernel/resource.c2
-rw-r--r--kernel/sched.c1065
-rw-r--r--kernel/signal.c22
-rw-r--r--kernel/spinlock.c8
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sys.c133
-rw-r--r--kernel/sys_ni.c3
-rw-r--r--kernel/sysctl.c12
-rw-r--r--kernel/timer.c353
39 files changed, 3422 insertions, 1319 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
new file mode 100644
index 000000000000..248e1c396f8b
--- /dev/null
+++ b/kernel/Kconfig.hz
@@ -0,0 +1,46 @@
+#
+# Timer Interrupt Frequency Configuration
+#
+
+choice
+ prompt "Timer frequency"
+ default HZ_250
+ help
+ Allows the configuration of the timer frequency. It is customary
+ to have the timer interrupt run at 1000 HZ but 100 HZ may be more
+ beneficial for servers and NUMA systems that do not need to have
+ a fast response for user interaction and that may experience bus
+ contention and cacheline bounces as a result of timer interrupts.
+ Note that the timer interrupt occurs on each processor in an SMP
+ environment leading to NR_CPUS * HZ number of timer interrupts
+ per second.
+
+
+ config HZ_100
+ bool "100 HZ"
+ help
+ 100 HZ is a typical choice for servers, SMP and NUMA systems
+ with lots of processors that may show reduced performance if
+ too many timer interrupts are occurring.
+
+ config HZ_250
+ bool "250 HZ"
+ help
+ 250 HZ is a good compromise choice allowing server performance
+ while also showing good interactive responsiveness even
+ on SMP and NUMA systems.
+
+ config HZ_1000
+ bool "1000 HZ"
+ help
+ 1000 HZ is the preferred choice for desktop systems and other
+ systems requiring fast interactive responses to events.
+
+endchoice
+
+config HZ
+ int
+ default 100 if HZ_100
+ default 250 if HZ_250
+ default 1000 if HZ_1000
+
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
new file mode 100644
index 000000000000..0b46a5dff4c0
--- /dev/null
+++ b/kernel/Kconfig.preempt
@@ -0,0 +1,65 @@
+
+choice
+ prompt "Preemption Model"
+ default PREEMPT_NONE
+
+config PREEMPT_NONE
+ bool "No Forced Preemption (Server)"
+ help
+ This is the traditional Linux preemption model, geared towards
+ throughput. It will still provide good latencies most of the
+ time, but there are no guarantees and occasional longer delays
+ are possible.
+
+ Select this option if you are building a kernel for a server or
+ scientific/computation system, or if you want to maximize the
+ raw processing power of the kernel, irrespective of scheduling
+ latencies.
+
+config PREEMPT_VOLUNTARY
+ bool "Voluntary Kernel Preemption (Desktop)"
+ help
+ This option reduces the latency of the kernel by adding more
+ "explicit preemption points" to the kernel code. These new
+ preemption points have been selected to reduce the maximum
+ latency of rescheduling, providing faster application reactions,
+ at the cost of slighly lower throughput.
+
+ This allows reaction to interactive events by allowing a
+ low priority process to voluntarily preempt itself even if it
+ is in kernel mode executing a system call. This allows
+ applications to run more 'smoothly' even when the system is
+ under load.
+
+ Select this if you are building a kernel for a desktop system.
+
+config PREEMPT
+ bool "Preemptible Kernel (Low-Latency Desktop)"
+ help
+ This option reduces the latency of the kernel by making
+ all kernel code (that is not executing in a critical section)
+ preemptible. This allows reaction to interactive events by
+ permitting a low priority process to be preempted involuntarily
+ even if it is in kernel mode executing a system call and would
+ otherwise not be about to reach a natural preemption point.
+ This allows applications to run more 'smoothly' even when the
+ system is under load, at the cost of slighly lower throughput
+ and a slight runtime overhead to kernel code.
+
+ Select this if you are building a kernel for a desktop or
+ embedded system with latency requirements in the milliseconds
+ range.
+
+endchoice
+
+config PREEMPT_BKL
+ bool "Preempt The Big Kernel Lock"
+ depends on SMP || PREEMPT
+ default y
+ help
+ This option reduces the latency of the kernel by making the
+ big kernel lock preemptible.
+
+ Say Y here if you are building a kernel for a desktop system.
+ Say N if you are unsure.
+
diff --git a/kernel/Makefile b/kernel/Makefile
index b01d26fe8db7..cb05cd05d237 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -17,6 +17,7 @@ obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
obj-$(CONFIG_PM) += power/
obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
+obj-$(CONFIG_KEXEC) += kexec.o
obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CPUSETS) += cpuset.o
obj-$(CONFIG_IKCONFIG) += configs.o
@@ -27,6 +28,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_SYSFS) += ksysfs.o
obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_SECCOMP) += seccomp.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
diff --git a/kernel/audit.c b/kernel/audit.c
index 9c4f1af0c794..ef35166fdc29 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,8 @@
#include <asm/types.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/kthread.h>
#include <linux/audit.h>
@@ -68,7 +70,7 @@ static int audit_failure = AUDIT_FAIL_PRINTK;
/* If audit records are to be written to the netlink socket, audit_pid
* contains the (non-zero) pid. */
-static int audit_pid;
+int audit_pid;
/* If audit_limit is non-zero, limit the rate of sending audit records
* to that number per second. This prevents DoS attacks, but results in
@@ -77,7 +79,10 @@ static int audit_rate_limit;
/* Number of outstanding audit_buffers allowed. */
static int audit_backlog_limit = 64;
-static atomic_t audit_backlog = ATOMIC_INIT(0);
+
+/* The identity of the user shutting down the audit system. */
+uid_t audit_sig_uid = -1;
+pid_t audit_sig_pid = -1;
/* Records can be lost in several ways:
0) [suppressed in audit_alloc]
@@ -91,19 +96,17 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
/* The netlink socket. */
static struct sock *audit_sock;
-/* There are two lists of audit buffers. The txlist contains audit
- * buffers that cannot be sent immediately to the netlink device because
- * we are in an irq context (these are sent later in a tasklet).
- *
- * The second list is a list of pre-allocated audit buffers (if more
+/* The audit_freelist is a list of pre-allocated audit buffers (if more
* than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
* being placed on the freelist). */
-static DEFINE_SPINLOCK(audit_txlist_lock);
static DEFINE_SPINLOCK(audit_freelist_lock);
static int audit_freelist_count = 0;
-static LIST_HEAD(audit_txlist);
static LIST_HEAD(audit_freelist);
+static struct sk_buff_head audit_skb_queue;
+static struct task_struct *kauditd_task;
+static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
+
/* There are three lists of rules -- one to search at task creation
* time, one to search at syscall entry time, and another to search at
* syscall exit time. */
@@ -112,7 +115,7 @@ static LIST_HEAD(audit_entlist);
static LIST_HEAD(audit_extlist);
/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneiously in
+ * that list additions and deletions never happen simultaneously in
* auditsc.c */
static DECLARE_MUTEX(audit_netlink_sem);
@@ -132,21 +135,14 @@ static DECLARE_MUTEX(audit_netlink_sem);
* use simultaneously. */
struct audit_buffer {
struct list_head list;
- struct sk_buff_head sklist; /* formatted skbs ready to send */
+ struct sk_buff *skb; /* formatted skb ready to send */
struct audit_context *ctx; /* NULL or associated context */
- int len; /* used area of tmp */
- char tmp[AUDIT_BUFSIZ];
-
- /* Pointer to header and contents */
- struct nlmsghdr *nlh;
- int total;
- int type;
- int pid;
};
-void audit_set_type(struct audit_buffer *ab, int type)
+static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
{
- ab->type = type;
+ struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+ nlh->nlmsg_pid = pid;
}
struct audit_entry {
@@ -154,9 +150,6 @@ struct audit_entry {
struct audit_rule rule;
};
-static void audit_log_end_irq(struct audit_buffer *ab);
-static void audit_log_end_fast(struct audit_buffer *ab);
-
static void audit_panic(const char *message)
{
switch (audit_failure)
@@ -227,10 +220,8 @@ void audit_log_lost(const char *message)
if (print) {
printk(KERN_WARNING
- "audit: audit_lost=%d audit_backlog=%d"
- " audit_rate_limit=%d audit_backlog_limit=%d\n",
+ "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n",
atomic_read(&audit_lost),
- atomic_read(&audit_backlog),
audit_rate_limit,
audit_backlog_limit);
audit_panic(message);
@@ -242,7 +233,8 @@ static int audit_set_rate_limit(int limit, uid_t loginuid)
{
int old = audit_rate_limit;
audit_rate_limit = limit;
- audit_log(NULL, "audit_rate_limit=%d old=%d by auid %u",
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_rate_limit=%d old=%d by auid=%u",
audit_rate_limit, old, loginuid);
return old;
}
@@ -251,7 +243,8 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid)
{
int old = audit_backlog_limit;
audit_backlog_limit = limit;
- audit_log(NULL, "audit_backlog_limit=%d old=%d by auid %u",
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_backlog_limit=%d old=%d by auid=%u",
audit_backlog_limit, old, loginuid);
return old;
}
@@ -262,8 +255,9 @@ static int audit_set_enabled(int state, uid_t loginuid)
if (state != 0 && state != 1)
return -EINVAL;
audit_enabled = state;
- audit_log(NULL, "audit_enabled=%d old=%d by auid %u",
- audit_enabled, old, loginuid);
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_enabled=%d old=%d by auid=%u",
+ audit_enabled, old, loginuid);
return old;
}
@@ -275,12 +269,44 @@ static int audit_set_failure(int state, uid_t loginuid)
&& state != AUDIT_FAIL_PANIC)
return -EINVAL;
audit_failure = state;
- audit_log(NULL, "audit_failure=%d old=%d by auid %u",
- audit_failure, old, loginuid);
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_failure=%d old=%d by auid=%u",
+ audit_failure, old, loginuid);
return old;
}
-#ifdef CONFIG_NET
+int kauditd_thread(void *dummy)
+{
+ struct sk_buff *skb;
+
+ while (1) {
+ skb = skb_dequeue(&audit_skb_queue);
+ if (skb) {
+ if (audit_pid) {
+ int err = netlink_unicast(audit_sock, skb, audit_pid, 0);
+ if (err < 0) {
+ BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */
+ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+ audit_pid = 0;
+ }
+ } else {
+ printk(KERN_ERR "%s\n", skb->data + NLMSG_SPACE(0));
+ kfree_skb(skb);
+ }
+ } else {
+ DECLARE_WAITQUEUE(wait, current);
+ set_current_state(TASK_INTERRUPTIBLE);
+ add_wait_queue(&kauditd_wait, &wait);
+
+ if (!skb_queue_len(&audit_skb_queue))
+ schedule();
+
+ __set_current_state(TASK_RUNNING);
+ remove_wait_queue(&kauditd_wait, &wait);
+ }
+ }
+}
+
void audit_send_reply(int pid, int seq, int type, int done, int multi,
void *payload, int size)
{
@@ -293,13 +319,16 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
skb = alloc_skb(len, GFP_KERNEL);
if (!skb)
- goto nlmsg_failure;
+ return;
- nlh = NLMSG_PUT(skb, pid, seq, t, len - sizeof(*nlh));
+ nlh = NLMSG_PUT(skb, pid, seq, t, size);
nlh->nlmsg_flags = flags;
data = NLMSG_DATA(nlh);
memcpy(data, payload, size);
- netlink_unicast(audit_sock, skb, pid, MSG_DONTWAIT);
+
+ /* Ignore failure. It'll only happen if the sender goes away,
+ because our timeout is set to infinite. */
+ netlink_unicast(audit_sock, skb, pid, 0);
return;
nlmsg_failure: /* Used by NLMSG_PUT */
@@ -321,10 +350,12 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
case AUDIT_SET:
case AUDIT_ADD:
case AUDIT_DEL:
+ case AUDIT_SIGNAL_INFO:
if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
err = -EPERM;
break;
case AUDIT_USER:
+ case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
err = -EPERM;
break;
@@ -344,11 +375,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
uid_t loginuid; /* loginuid of sender */
+ struct audit_sig_info sig_data;
err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
if (err)
return err;
+ /* As soon as there's any sign of userspace auditd, start kauditd to talk to it */
+ if (!kauditd_task)
+ kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
+
pid = NETLINK_CREDS(skb)->pid;
uid = NETLINK_CREDS(skb)->uid;
loginuid = NETLINK_CB(skb).loginuid;
@@ -363,7 +404,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
status_set.rate_limit = audit_rate_limit;
status_set.backlog_limit = audit_backlog_limit;
status_set.lost = atomic_read(&audit_lost);
- status_set.backlog = atomic_read(&audit_backlog);
+ status_set.backlog = skb_queue_len(&audit_skb_queue);
audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
&status_set, sizeof(status_set));
break;
@@ -382,7 +423,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (status_get->mask & AUDIT_STATUS_PID) {
int old = audit_pid;
audit_pid = status_get->pid;
- audit_log(NULL, "audit_pid=%d old=%d by auid %u",
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "audit_pid=%d old=%d by auid=%u",
audit_pid, old, loginuid);
}
if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
@@ -392,18 +434,15 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
loginuid);
break;
case AUDIT_USER:
- ab = audit_log_start(NULL);
+ case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
+ ab = audit_log_start(NULL, msg_type);
if (!ab)
break; /* audit_panic has been called */
audit_log_format(ab,
- "user pid=%d uid=%d length=%d loginuid=%u"
+ "user pid=%d uid=%u auid=%u"
" msg='%.1024s'",
- pid, uid,
- (int)(nlh->nlmsg_len
- - ((char *)data - (char *)nlh)),
- loginuid, (char *)data);
- ab->type = AUDIT_USER;
- ab->pid = pid;
+ pid, uid, loginuid, (char *)data);
+ audit_set_pid(ab, pid);
audit_log_end(ab);
break;
case AUDIT_ADD:
@@ -412,12 +451,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return -EINVAL;
/* fallthrough */
case AUDIT_LIST:
-#ifdef CONFIG_AUDITSYSCALL
err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
uid, seq, data, loginuid);
-#else
- err = -EOPNOTSUPP;
-#endif
+ break;
+ case AUDIT_SIGNAL_INFO:
+ sig_data.uid = audit_sig_uid;
+ sig_data.pid = audit_sig_pid;
+ audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
+ 0, 0, &sig_data, sizeof(sig_data));
break;
default:
err = -EINVAL;
@@ -467,87 +508,6 @@ static void audit_receive(struct sock *sk, int length)
up(&audit_netlink_sem);
}
-/* Move data from tmp buffer into an skb. This is an extra copy, and
- * that is unfortunate. However, the copy will only occur when a record
- * is being written to user space, which is already a high-overhead
- * operation. (Elimination of the copy is possible, for example, by
- * writing directly into a pre-allocated skb, at the cost of wasting
- * memory. */
-static void audit_log_move(struct audit_buffer *ab)
-{
- struct sk_buff *skb;
- char *start;
- int extra = ab->nlh ? 0 : NLMSG_SPACE(0);
-
- /* possible resubmission */
- if (ab->len == 0)
- return;
-
- skb = skb_peek_tail(&ab->sklist);
- if (!skb || skb_tailroom(skb) <= ab->len + extra) {
- skb = alloc_skb(2 * ab->len + extra, GFP_ATOMIC);
- if (!skb) {
- ab->len = 0; /* Lose information in ab->tmp */
- audit_log_lost("out of memory in audit_log_move");
- return;
- }
- __skb_queue_tail(&ab->sklist, skb);
- if (!ab->nlh)
- ab->nlh = (struct nlmsghdr *)skb_put(skb,
- NLMSG_SPACE(0));
- }
- start = skb_put(skb, ab->len);
- memcpy(start, ab->tmp, ab->len);
- ab->len = 0;
-}
-
-/* Iterate over the skbuff in the audit_buffer, sending their contents
- * to user space. */
-static inline int audit_log_drain(struct audit_buffer *ab)
-{
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&ab->sklist))) {
- int retval = 0;
-
- if (audit_pid) {
- if (ab->nlh) {
- ab->nlh->nlmsg_len = ab->total;
- ab->nlh->nlmsg_type = ab->type;
- ab->nlh->nlmsg_flags = 0;
- ab->nlh->nlmsg_seq = 0;
- ab->nlh->nlmsg_pid = ab->pid;
- }
- skb_get(skb); /* because netlink_* frees */
- retval = netlink_unicast(audit_sock, skb, audit_pid,
- MSG_DONTWAIT);
- }
- if (retval == -EAGAIN &&
- (atomic_read(&audit_backlog)) < audit_backlog_limit) {
- skb_queue_head(&ab->sklist, skb);
- audit_log_end_irq(ab);
- return 1;
- }
- if (retval < 0) {
- if (retval == -ECONNREFUSED) {
- printk(KERN_ERR
- "audit: *NO* daemon at audit_pid=%d\n",
- audit_pid);
- audit_pid = 0;
- } else
- audit_log_lost("netlink socket too busy");
- }
- if (!audit_pid) { /* No daemon */
- int offset = ab->nlh ? NLMSG_SPACE(0) : 0;
- int len = skb->len - offset;
- skb->data[offset + len] = '\0';
- printk(KERN_ERR "%s\n", skb->data + offset);
- }
- kfree_skb(skb);
- ab->nlh = NULL;
- }
- return 0;
-}
/* Initialize audit support at boot time. */
static int __init audit_init(void)
@@ -558,40 +518,13 @@ static int __init audit_init(void)
if (!audit_sock)
audit_panic("cannot initialize netlink socket");
+ audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ skb_queue_head_init(&audit_skb_queue);
audit_initialized = 1;
audit_enabled = audit_default;
- audit_log(NULL, "initialized");
- return 0;
-}
-
-#else
-/* Without CONFIG_NET, we have no skbuffs. For now, print what we have
- * in the buffer. */
-static void audit_log_move(struct audit_buffer *ab)
-{
- printk(KERN_ERR "%*.*s\n", ab->len, ab->len, ab->tmp);
- ab->len = 0;
-}
-
-static inline int audit_log_drain(struct audit_buffer *ab)
-{
- return 0;
-}
-
-/* Initialize audit support at boot time. */
-int __init audit_init(void)
-{
- printk(KERN_INFO "audit: initializing WITHOUT netlink support\n");
- audit_sock = NULL;
- audit_pid = 0;
-
- audit_initialized = 1;
- audit_enabled = audit_default;
- audit_log(NULL, "initialized");
+ audit_log(NULL, AUDIT_KERNEL, "initialized");
return 0;
}
-#endif
-
__initcall(audit_init);
/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
@@ -608,6 +541,102 @@ static int __init audit_enable(char *str)
__setup("audit=", audit_enable);
+static void audit_buffer_free(struct audit_buffer *ab)
+{
+ unsigned long flags;
+
+ if (!ab)
+ return;
+
+ if (ab->skb)
+ kfree_skb(ab->skb);
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (++audit_freelist_count > AUDIT_MAXFREE)
+ kfree(ab);
+ else
+ list_add(&ab->list, &audit_freelist);
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+ int gfp_mask, int type)
+{
+ unsigned long flags;
+ struct audit_buffer *ab = NULL;
+ struct nlmsghdr *nlh;
+
+ spin_lock_irqsave(&audit_freelist_lock, flags);
+ if (!list_empty(&audit_freelist)) {
+ ab = list_entry(audit_freelist.next,
+ struct audit_buffer, list);
+ list_del(&ab->list);
+ --audit_freelist_count;
+ }
+ spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+ if (!ab) {
+ ab = kmalloc(sizeof(*ab), gfp_mask);
+ if (!ab)
+ goto err;
+ }
+
+ ab->skb = alloc_skb(AUDIT_BUFSIZ, gfp_mask);
+ if (!ab->skb)
+ goto err;
+
+ ab->ctx = ctx;
+ nlh = (struct nlmsghdr *)skb_put(ab->skb, NLMSG_SPACE(0));
+ nlh->nlmsg_type = type;
+ nlh->nlmsg_flags = 0;
+ nlh->nlmsg_pid = 0;
+ nlh->nlmsg_seq = 0;
+ return ab;
+err:
+ audit_buffer_free(ab);
+ return NULL;
+}
+
+/* Compute a serial number for the audit record. Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces. The timestamp of the
+ * record and this serial number are used by the user-space tools to
+ * determine which pieces belong to the same audit record. The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * Atomic values are only guaranteed to be 24-bit, so we count down.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit. However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts). */
+unsigned int audit_serial(void)
+{
+ static atomic_t serial = ATOMIC_INIT(0xffffff);
+ unsigned int a, b;
+
+ do {
+ a = atomic_read(&serial);
+ if (atomic_dec_and_test(&serial))
+ atomic_set(&serial, 0xffffff);
+ b = atomic_read(&serial);
+ } while (b != a - 1);
+
+ return 0xffffff - b;
+}
+
+static inline void audit_get_stamp(struct audit_context *ctx,
+ struct timespec *t, unsigned int *serial)
+{
+ if (ctx)
+ auditsc_get_stamp(ctx, t, serial);
+ else {
+ *t = CURRENT_TIME;
+ *serial = audit_serial();
+ }
+}
/* Obtain an audit buffer. This routine does locking to obtain the
* audit buffer, but then no locking is required for calls to
@@ -615,10 +644,9 @@ __setup("audit=", audit_enable);
* syscall, then the syscall is marked as auditable and an audit record
* will be written at syscall exit. If there is no associated task, tsk
* should be NULL. */
-struct audit_buffer *audit_log_start(struct audit_context *ctx)
+struct audit_buffer *audit_log_start(struct audit_context *ctx, int type)
{
struct audit_buffer *ab = NULL;
- unsigned long flags;
struct timespec t;
unsigned int serial;
@@ -626,57 +654,48 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx)
return NULL;
if (audit_backlog_limit
- && atomic_read(&audit_backlog) > audit_backlog_limit) {
+ && skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
if (audit_rate_check())
printk(KERN_WARNING
"audit: audit_backlog=%d > "
"audit_backlog_limit=%d\n",
- atomic_read(&audit_backlog),
+ skb_queue_len(&audit_skb_queue),
audit_backlog_limit);
audit_log_lost("backlog limit exceeded");
return NULL;
}
- spin_lock_irqsave(&audit_freelist_lock, flags);
- if (!list_empty(&audit_freelist)) {
- ab = list_entry(audit_freelist.next,
- struct audit_buffer, list);
- list_del(&ab->list);
- --audit_freelist_count;
- }
- spin_unlock_irqrestore(&audit_freelist_lock, flags);
-
- if (!ab)
- ab = kmalloc(sizeof(*ab), GFP_ATOMIC);
+ ab = audit_buffer_alloc(ctx, GFP_ATOMIC, type);
if (!ab) {
audit_log_lost("out of memory in audit_log_start");
return NULL;
}
- atomic_inc(&audit_backlog);
- skb_queue_head_init(&ab->sklist);
-
- ab->ctx = ctx;
- ab->len = 0;
- ab->nlh = NULL;
- ab->total = 0;
- ab->type = AUDIT_KERNEL;
- ab->pid = 0;
+ audit_get_stamp(ab->ctx, &t, &serial);
-#ifdef CONFIG_AUDITSYSCALL
- if (ab->ctx)
- audit_get_stamp(ab->ctx, &t, &serial);
- else
-#endif
- {
- t = CURRENT_TIME;
- serial = 0;
- }
audit_log_format(ab, "audit(%lu.%03lu:%u): ",
t.tv_sec, t.tv_nsec/1000000, serial);
return ab;
}
+/**
+ * audit_expand - expand skb in the audit buffer
+ * @ab: audit_buffer
+ *
+ * Returns 0 (no space) on failed expansion, or available space if
+ * successful.
+ */
+static inline int audit_expand(struct audit_buffer *ab, int extra)
+{
+ struct sk_buff *skb = ab->skb;
+ int ret = pskb_expand_head(skb, skb_headroom(skb), extra,
+ GFP_ATOMIC);
+ if (ret < 0) {
+ audit_log_lost("out of memory in audit_expand");
+ return 0;
+ }
+ return skb_tailroom(skb);
+}
/* Format an audit message into the audit buffer. If there isn't enough
* room in the audit buffer, more room will be allocated and vsnprint
@@ -686,26 +705,35 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
va_list args)
{
int len, avail;
+ struct sk_buff *skb;
+ va_list args2;
if (!ab)
return;
- avail = sizeof(ab->tmp) - ab->len;
- if (avail <= 0) {
- audit_log_move(ab);
- avail = sizeof(ab->tmp) - ab->len;
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ if (avail == 0) {
+ avail = audit_expand(ab, AUDIT_BUFSIZ);
+ if (!avail)
+ goto out;
}
- len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+ va_copy(args2, args);
+ len = vsnprintf(skb->tail, avail, fmt, args);
if (len >= avail) {
/* The printk buffer is 1024 bytes long, so if we get
* here and AUDIT_BUFSIZ is at least 1024, then we can
* log everything that printk could have logged. */
- audit_log_move(ab);
- avail = sizeof(ab->tmp) - ab->len;
- len = vsnprintf(ab->tmp + ab->len, avail, fmt, args);
+ avail = audit_expand(ab, max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+ if (!avail)
+ goto out;
+ len = vsnprintf(skb->tail, avail, fmt, args2);
}
- ab->len += (len < avail) ? len : avail;
- ab->total += (len < avail) ? len : avail;
+ if (len > 0)
+ skb_put(skb, len);
+out:
+ return;
}
/* Format a message into the audit buffer. All the work is done in
@@ -721,20 +749,47 @@ void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
va_end(args);
}
-void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len)
+/* This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb. */
+void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
+ size_t len)
{
- int i;
+ int i, avail, new_len;
+ unsigned char *ptr;
+ struct sk_buff *skb;
+ static const unsigned char *hex = "0123456789ABCDEF";
+
+ BUG_ON(!ab->skb);
+ skb = ab->skb;
+ avail = skb_tailroom(skb);
+ new_len = len<<1;
+ if (new_len >= avail) {
+ /* Round the buffer request up to the next multiple */
+ new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
+ avail = audit_expand(ab, new_len);
+ if (!avail)
+ return;
+ }
- for (i=0; i<len; i++)
- audit_log_format(ab, "%02x", buf[i]);
+ ptr = skb->tail;
+ for (i=0; i<len; i++) {
+ *ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
+ *ptr++ = hex[buf[i] & 0x0F]; /* Lower nibble */
+ }
+ *ptr = 0;
+ skb_put(skb, len << 1); /* new string is twice the old string */
}
+/* This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark,
+ * or a space. Unescaped strings will start and end with a double quote mark.
+ * Strings that are escaped are printed in hex (2 digits per char). */
void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
{
const unsigned char *p = string;
while (*p) {
- if (*p == '"' || *p == ' ' || *p < 0x20 || *p > 0x7f) {
+ if (*p == '"' || *p < 0x21 || *p > 0x7f) {
audit_log_hex(ab, string, strlen(string));
return;
}
@@ -743,117 +798,63 @@ void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
audit_log_format(ab, "\"%s\"", string);
}
-
-/* This is a helper-function to print the d_path without using a static
- * buffer or allocating another buffer in addition to the one in
- * audit_buffer. */
+/* This is a helper-function to print the escaped d_path */
void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
struct dentry *dentry, struct vfsmount *vfsmnt)
{
- char *p;
- int len, avail;
+ char *p, *path;
- if (prefix) audit_log_format(ab, " %s", prefix);
-
- if (ab->len > 128)
- audit_log_move(ab);
- avail = sizeof(ab->tmp) - ab->len;
- p = d_path(dentry, vfsmnt, ab->tmp + ab->len, avail);
- if (IS_ERR(p)) {
- /* FIXME: can we save some information here? */
- audit_log_format(ab, "<toolong>");
- } else {
- /* path isn't at start of buffer */
- len = (ab->tmp + sizeof(ab->tmp) - 1) - p;
- memmove(ab->tmp + ab->len, p, len);
- ab->len += len;
- ab->total += len;
- }
-}
-
-/* Remove queued messages from the audit_txlist and send them to userspace. */
-static void audit_tasklet_handler(unsigned long arg)
-{
- LIST_HEAD(list);
- struct audit_buffer *ab;
- unsigned long flags;
+ if (prefix)
+ audit_log_format(ab, " %s", prefix);
- spin_lock_irqsave(&audit_txlist_lock, flags);
- list_splice_init(&audit_txlist, &list);
- spin_unlock_irqrestore(&audit_txlist_lock, flags);
-
- while (!list_empty(&list)) {
- ab = list_entry(list.next, struct audit_buffer, list);
- list_del(&ab->list);
- audit_log_end_fast(ab);
+ /* We will allow 11 spaces for ' (deleted)' to be appended */
+ path = kmalloc(PATH_MAX+11, GFP_KERNEL);
+ if (!path) {
+ audit_log_format(ab, "<no memory>");
+ return;
}
+ p = d_path(dentry, vfsmnt, path, PATH_MAX+11);
+ if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
+ /* FIXME: can we save some information here? */
+ audit_log_format(ab, "<too long>");
+ } else
+ audit_log_untrustedstring(ab, p);
+ kfree(path);
}
-static DECLARE_TASKLET(audit_tasklet, audit_tasklet_handler, 0);
-
/* The netlink_* functions cannot be called inside an irq context, so
* the audit buffer is places on a queue and a tasklet is scheduled to
* remove them from the queue outside the irq context. May be called in
* any context. */
-static void audit_log_end_irq(struct audit_buffer *ab)
-{
- unsigned long flags;
-
- if (!ab)
- return;
- spin_lock_irqsave(&audit_txlist_lock, flags);
- list_add_tail(&ab->list, &audit_txlist);
- spin_unlock_irqrestore(&audit_txlist_lock, flags);
-
- tasklet_schedule(&audit_tasklet);
-}
-
-/* Send the message in the audit buffer directly to user space. May not
- * be called in an irq context. */
-static void audit_log_end_fast(struct audit_buffer *ab)
+void audit_log_end(struct audit_buffer *ab)
{
- unsigned long flags;
-
- BUG_ON(in_irq());
if (!ab)
return;
if (!audit_rate_check()) {
audit_log_lost("rate limit exceeded");
} else {
- audit_log_move(ab);
- if (audit_log_drain(ab))
- return;
+ if (audit_pid) {
+ struct nlmsghdr *nlh = (struct nlmsghdr *)ab->skb->data;
+ nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+ skb_queue_tail(&audit_skb_queue, ab->skb);
+ ab->skb = NULL;
+ wake_up_interruptible(&kauditd_wait);
+ } else {
+ printk("%s\n", ab->skb->data + NLMSG_SPACE(0));
+ }
}
-
- atomic_dec(&audit_backlog);
- spin_lock_irqsave(&audit_freelist_lock, flags);
- if (++audit_freelist_count > AUDIT_MAXFREE)
- kfree(ab);
- else
- list_add(&ab->list, &audit_freelist);
- spin_unlock_irqrestore(&audit_freelist_lock, flags);
-}
-
-/* Send or queue the message in the audit buffer, depending on the
- * current context. (A convenience function that may be called in any
- * context.) */
-void audit_log_end(struct audit_buffer *ab)
-{
- if (in_irq())
- audit_log_end_irq(ab);
- else
- audit_log_end_fast(ab);
+ audit_buffer_free(ab);
}
/* Log an audit record. This is a convenience function that calls
* audit_log_start, audit_log_vformat, and audit_log_end. It may be
* called in any context. */
-void audit_log(struct audit_context *ctx, const char *fmt, ...)
+void audit_log(struct audit_context *ctx, int type, const char *fmt, ...)
{
struct audit_buffer *ab;
va_list args;
- ab = audit_log_start(ctx);
+ ab = audit_log_start(ctx, type);
if (ab) {
va_start(args, fmt);
audit_log_vformat(ab, fmt, args);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 37b3ac94bc47..e75f84e1a1a0 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -34,7 +34,8 @@
#include <asm/types.h>
#include <linux/mm.h>
#include <linux/module.h>
-
+#include <linux/mount.h>
+#include <linux/socket.h>
#include <linux/audit.h>
#include <linux/personality.h>
#include <linux/time.h>
@@ -112,6 +113,23 @@ struct audit_aux_data_ipcctl {
mode_t mode;
};
+struct audit_aux_data_socketcall {
+ struct audit_aux_data d;
+ int nargs;
+ unsigned long args[0];
+};
+
+struct audit_aux_data_sockaddr {
+ struct audit_aux_data d;
+ int len;
+ char a[0];
+};
+
+struct audit_aux_data_path {
+ struct audit_aux_data d;
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+};
/* The per-task audit context. */
struct audit_context {
@@ -127,6 +145,8 @@ struct audit_context {
int auditable; /* 1 if record should be written */
int name_count;
struct audit_names names[AUDIT_NAMES];
+ struct dentry * pwd;
+ struct vfsmount * pwdmnt;
struct audit_context *previous; /* For nested syscalls */
struct audit_aux_data *aux;
@@ -157,6 +177,8 @@ struct audit_entry {
struct audit_rule rule;
};
+extern int audit_pid;
+
/* Check to see if two rules are identical. It is called from
* audit_del_rule during AUDIT_DEL. */
static int audit_compare_rule(struct audit_rule *a, struct audit_rule *b)
@@ -226,7 +248,6 @@ static inline int audit_del_rule(struct audit_rule *rule,
return -EFAULT; /* No matching rule */
}
-#ifdef CONFIG_NET
/* Copy rule from user-space to kernel-space. Called during
* AUDIT_ADD. */
static int audit_copy_rule(struct audit_rule *d, struct audit_rule *s)
@@ -287,7 +308,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
err = audit_add_rule(entry, &audit_entlist);
if (!err && (flags & AUDIT_AT_EXIT))
err = audit_add_rule(entry, &audit_extlist);
- audit_log(NULL, "auid %u added an audit rule\n", loginuid);
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "auid=%u added an audit rule\n", loginuid);
break;
case AUDIT_DEL:
flags =((struct audit_rule *)data)->flags;
@@ -297,7 +319,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
err = audit_del_rule(data, &audit_entlist);
if (!err && (flags & AUDIT_AT_EXIT))
err = audit_del_rule(data, &audit_extlist);
- audit_log(NULL, "auid %u removed an audit rule\n", loginuid);
+ audit_log(NULL, AUDIT_CONFIG_CHANGE,
+ "auid=%u removed an audit rule\n", loginuid);
break;
default:
return -EINVAL;
@@ -305,7 +328,6 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
return err;
}
-#endif
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
* otherwise. */
@@ -444,7 +466,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
/* At syscall entry and exit time, this filter is called if the
* audit_state is not low enough that auditing cannot take place, but is
- * also not high enough that we already know we have to write and audit
+ * also not high enough that we already know we have to write an audit
* record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
*/
static enum audit_state audit_filter_syscall(struct task_struct *tsk,
@@ -532,6 +554,12 @@ static inline void audit_free_names(struct audit_context *context)
if (context->names[i].name)
__putname(context->names[i].name);
context->name_count = 0;
+ if (context->pwd)
+ dput(context->pwd);
+ if (context->pwdmnt)
+ mntput(context->pwdmnt);
+ context->pwd = NULL;
+ context->pwdmnt = NULL;
}
static inline void audit_free_aux(struct audit_context *context)
@@ -539,6 +567,11 @@ static inline void audit_free_aux(struct audit_context *context)
struct audit_aux_data *aux;
while ((aux = context->aux)) {
+ if (aux->type == AUDIT_AVC_PATH) {
+ struct audit_aux_data_path *axi = (void *)aux;
+ dput(axi->dentry);
+ mntput(axi->mnt);
+ }
context->aux = aux->next;
kfree(aux);
}
@@ -625,7 +658,8 @@ static void audit_log_task_info(struct audit_buffer *ab)
struct vm_area_struct *vma;
get_task_comm(name, current);
- audit_log_format(ab, " comm=%s", name);
+ audit_log_format(ab, " comm=");
+ audit_log_untrustedstring(ab, name);
if (!mm)
return;
@@ -649,23 +683,24 @@ static void audit_log_exit(struct audit_context *context)
{
int i;
struct audit_buffer *ab;
+ struct audit_aux_data *aux;
- ab = audit_log_start(context);
+ ab = audit_log_start(context, AUDIT_SYSCALL);
if (!ab)
return; /* audit_panic has been called */
- audit_log_format(ab, "syscall=%d", context->major);
+ audit_log_format(ab, "arch=%x syscall=%d",
+ context->arch, context->major);
if (context->personality != PER_LINUX)
audit_log_format(ab, " per=%lx", context->personality);
- audit_log_format(ab, " arch=%x", context->arch);
if (context->return_valid)
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
context->return_code);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
- " pid=%d loginuid=%d uid=%d gid=%d"
- " euid=%d suid=%d fsuid=%d"
- " egid=%d sgid=%d fsgid=%d",
+ " pid=%d auid=%u uid=%u gid=%u"
+ " euid=%u suid=%u fsuid=%u"
+ " egid=%u sgid=%u fsgid=%u",
context->argv[0],
context->argv[1],
context->argv[2],
@@ -679,33 +714,57 @@ static void audit_log_exit(struct audit_context *context)
context->egid, context->sgid, context->fsgid);
audit_log_task_info(ab);
audit_log_end(ab);
- while (context->aux) {
- struct audit_aux_data *aux;
- ab = audit_log_start(context);
+ for (aux = context->aux; aux; aux = aux->next) {
+
+ ab = audit_log_start(context, aux->type);
if (!ab)
continue; /* audit_panic has been called */
- aux = context->aux;
- context->aux = aux->next;
-
- audit_log_format(ab, "auxitem=%d", aux->type);
switch (aux->type) {
- case AUDIT_AUX_IPCPERM: {
+ case AUDIT_IPC: {
struct audit_aux_data_ipcctl *axi = (void *)aux;
audit_log_format(ab,
- " qbytes=%lx uid=%d gid=%d mode=%x",
+ " qbytes=%lx iuid=%u igid=%u mode=%x",
axi->qbytes, axi->uid, axi->gid, axi->mode);
- }
+ break; }
+
+ case AUDIT_SOCKETCALL: {
+ int i;
+ struct audit_aux_data_socketcall *axs = (void *)aux;
+ audit_log_format(ab, "nargs=%d", axs->nargs);
+ for (i=0; i<axs->nargs; i++)
+ audit_log_format(ab, " a%d=%lx", i, axs->args[i]);
+ break; }
+
+ case AUDIT_SOCKADDR: {
+ struct audit_aux_data_sockaddr *axs = (void *)aux;
+
+ audit_log_format(ab, "saddr=");
+ audit_log_hex(ab, axs->a, axs->len);
+ break; }
+
+ case AUDIT_AVC_PATH: {
+ struct audit_aux_data_path *axi = (void *)aux;
+ audit_log_d_path(ab, "path=", axi->dentry, axi->mnt);
+ break; }
+
}
audit_log_end(ab);
- kfree(aux);
}
+ if (context->pwd && context->pwdmnt) {
+ ab = audit_log_start(context, AUDIT_CWD);
+ if (ab) {
+ audit_log_d_path(ab, "cwd=", context->pwd, context->pwdmnt);
+ audit_log_end(ab);
+ }
+ }
for (i = 0; i < context->name_count; i++) {
- ab = audit_log_start(context);
+ ab = audit_log_start(context, AUDIT_PATH);
if (!ab)
continue; /* audit_panic has been called */
+
audit_log_format(ab, "item=%d", i);
if (context->names[i].name) {
audit_log_format(ab, " name=");
@@ -713,7 +772,7 @@ static void audit_log_exit(struct audit_context *context)
}
if (context->names[i].ino != (unsigned long)-1)
audit_log_format(ab, " inode=%lu dev=%02x:%02x mode=%#o"
- " uid=%d gid=%d rdev=%02x:%02x",
+ " ouid=%u ogid=%u rdev=%02x:%02x",
context->names[i].ino,
MAJOR(context->names[i].dev),
MINOR(context->names[i].dev),
@@ -741,42 +800,12 @@ void audit_free(struct task_struct *tsk)
/* Check for system calls that do not go through the exit
* function (e.g., exit_group), then free context block. */
- if (context->in_syscall && context->auditable)
+ if (context->in_syscall && context->auditable && context->pid != audit_pid)
audit_log_exit(context);
audit_free_context(context);
}
-/* Compute a serial number for the audit record. Audit records are
- * written to user-space as soon as they are generated, so a complete
- * audit record may be written in several pieces. The timestamp of the
- * record and this serial number are used by the user-space daemon to
- * determine which pieces belong to the same audit record. The
- * (timestamp,serial) tuple is unique for each syscall and is live from
- * syscall entry to syscall exit.
- *
- * Atomic values are only guaranteed to be 24-bit, so we count down.
- *
- * NOTE: Another possibility is to store the formatted records off the
- * audit context (for those records that have a context), and emit them
- * all at syscall exit. However, this could delay the reporting of
- * significant errors until syscall exit (or never, if the system
- * halts). */
-static inline unsigned int audit_serial(void)
-{
- static atomic_t serial = ATOMIC_INIT(0xffffff);
- unsigned int a, b;
-
- do {
- a = atomic_read(&serial);
- if (atomic_dec_and_test(&serial))
- atomic_set(&serial, 0xffffff);
- b = atomic_read(&serial);
- } while (b != a - 1);
-
- return 0xffffff - b;
-}
-
/* Fill in audit context at syscall entry. This only happens if the
* audit context was created when the task was created and the state or
* filters demand the audit context be built. If the state from the
@@ -876,7 +905,7 @@ void audit_syscall_exit(struct task_struct *tsk, int valid, long return_code)
if (likely(!context))
return;
- if (context->in_syscall && context->auditable)
+ if (context->in_syscall && context->auditable && context->pid != audit_pid)
audit_log_exit(context);
context->in_syscall = 0;
@@ -916,6 +945,13 @@ void audit_getname(const char *name)
context->names[context->name_count].name = name;
context->names[context->name_count].ino = (unsigned long)-1;
++context->name_count;
+ if (!context->pwd) {
+ read_lock(&current->fs->lock);
+ context->pwd = dget(current->fs->pwd);
+ context->pwdmnt = mntget(current->fs->pwdmnt);
+ read_unlock(&current->fs->lock);
+ }
+
}
/* Intercept a putname request. Called from
@@ -994,34 +1030,26 @@ void audit_inode(const char *name, const struct inode *inode)
context->names[idx].rdev = inode->i_rdev;
}
-void audit_get_stamp(struct audit_context *ctx,
- struct timespec *t, unsigned int *serial)
+void auditsc_get_stamp(struct audit_context *ctx,
+ struct timespec *t, unsigned int *serial)
{
- if (ctx) {
- t->tv_sec = ctx->ctime.tv_sec;
- t->tv_nsec = ctx->ctime.tv_nsec;
- *serial = ctx->serial;
- ctx->auditable = 1;
- } else {
- *t = CURRENT_TIME;
- *serial = 0;
- }
+ t->tv_sec = ctx->ctime.tv_sec;
+ t->tv_nsec = ctx->ctime.tv_nsec;
+ *serial = ctx->serial;
+ ctx->auditable = 1;
}
-extern int audit_set_type(struct audit_buffer *ab, int type);
-
int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
{
if (task->audit_context) {
struct audit_buffer *ab;
- ab = audit_log_start(NULL);
+ ab = audit_log_start(NULL, AUDIT_LOGIN);
if (ab) {
audit_log_format(ab, "login pid=%d uid=%u "
- "old loginuid=%u new loginuid=%u",
+ "old auid=%u new auid=%u",
task->pid, task->uid,
task->audit_context->loginuid, loginuid);
- audit_set_type(ab, AUDIT_LOGIN);
audit_log_end(ab);
}
task->audit_context->loginuid = loginuid;
@@ -1051,8 +1079,89 @@ int audit_ipc_perms(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
ax->gid = gid;
ax->mode = mode;
- ax->d.type = AUDIT_AUX_IPCPERM;
+ ax->d.type = AUDIT_IPC;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+int audit_socketcall(int nargs, unsigned long *args)
+{
+ struct audit_aux_data_socketcall *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->nargs = nargs;
+ memcpy(ax->args, args, nargs * sizeof(unsigned long));
+
+ ax->d.type = AUDIT_SOCKETCALL;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+int audit_sockaddr(int len, void *a)
+{
+ struct audit_aux_data_sockaddr *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->len = len;
+ memcpy(ax->a, a, len);
+
+ ax->d.type = AUDIT_SOCKADDR;
ax->d.next = context->aux;
context->aux = (void *)ax;
return 0;
}
+
+int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
+{
+ struct audit_aux_data_path *ax;
+ struct audit_context *context = current->audit_context;
+
+ if (likely(!context))
+ return 0;
+
+ ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
+ if (!ax)
+ return -ENOMEM;
+
+ ax->dentry = dget(dentry);
+ ax->mnt = mntget(mnt);
+
+ ax->d.type = AUDIT_AVC_PATH;
+ ax->d.next = context->aux;
+ context->aux = (void *)ax;
+ return 0;
+}
+
+void audit_signal_info(int sig, struct task_struct *t)
+{
+ extern pid_t audit_sig_pid;
+ extern uid_t audit_sig_uid;
+
+ if (unlikely(audit_pid && t->pid == audit_pid)) {
+ if (sig == SIGTERM || sig == SIGHUP) {
+ struct audit_context *ctx = current->audit_context;
+ audit_sig_pid = current->pid;
+ if (ctx)
+ audit_sig_uid = ctx->loginuid;
+ else
+ audit_sig_uid = current->uid;
+ }
+ }
+}
+
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 628f4ccda127..53d8263ae12e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -63,19 +63,15 @@ static int take_cpu_down(void *unused)
{
int err;
- /* Take offline: makes arch_cpu_down somewhat easier. */
- cpu_clear(smp_processor_id(), cpu_online_map);
-
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
- cpu_set(smp_processor_id(), cpu_online_map);
- else
- /* Force idle task to run as soon as we yield: it should
- immediately notice cpu is offline and die quickly. */
- sched_idle_next();
+ return err;
- return err;
+ /* Force idle task to run as soon as we yield: it should
+ immediately notice cpu is offline and die quickly. */
+ sched_idle_next();
+ return 0;
}
int cpu_down(unsigned int cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 961d74044deb..984c0bf3807f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -166,9 +166,8 @@ static struct super_block *cpuset_sb = NULL;
* The hooks from fork and exit, cpuset_fork() and cpuset_exit(), don't
* (usually) grab cpuset_sem. These are the two most performance
* critical pieces of code here. The exception occurs on exit(),
- * if the last task using a cpuset exits, and the cpuset was marked
- * notify_on_release. In that case, the cpuset_sem is taken, the
- * path to the released cpuset calculated, and a usermode call made
+ * when a task in a notify_on_release cpuset exits. Then cpuset_sem
+ * is taken, and if the cpuset count is zero, a usermode call made
* to /sbin/cpuset_release_agent with the name of the cpuset (path
* relative to the root of cpuset file system) as the argument.
*
@@ -229,13 +228,7 @@ static struct dentry_operations cpuset_dops = {
static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
{
- struct qstr qstr;
- struct dentry *d;
-
- qstr.name = name;
- qstr.len = strlen(name);
- qstr.hash = full_name_hash(name, qstr.len);
- d = lookup_hash(&qstr, parent);
+ struct dentry *d = lookup_one_len(name, parent, strlen(name));
if (!IS_ERR(d))
d->d_op = &cpuset_dops;
return d;
@@ -602,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
return 0;
}
+/*
+ * For a given cpuset cur, partition the system as follows
+ * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
+ * exclusive child cpusets
+ * b. All cpus in the current cpuset's cpus_allowed that are not part of any
+ * exclusive child cpusets
+ * Build these two partitions by calling partition_sched_domains
+ *
+ * Call with cpuset_sem held. May nest a call to the
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ */
+static void update_cpu_domains(struct cpuset *cur)
+{
+ struct cpuset *c, *par = cur->parent;
+ cpumask_t pspan, cspan;
+
+ if (par == NULL || cpus_empty(cur->cpus_allowed))
+ return;
+
+ /*
+ * Get all cpus from parent's cpus_allowed not part of exclusive
+ * children
+ */
+ pspan = par->cpus_allowed;
+ list_for_each_entry(c, &par->children, sibling) {
+ if (is_cpu_exclusive(c))
+ cpus_andnot(pspan, pspan, c->cpus_allowed);
+ }
+ if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+ cpus_or(pspan, pspan, cur->cpus_allowed);
+ if (cpus_equal(pspan, cur->cpus_allowed))
+ return;
+ cspan = CPU_MASK_NONE;
+ } else {
+ if (cpus_empty(pspan))
+ return;
+ cspan = cur->cpus_allowed;
+ /*
+ * Get all cpus from current cpuset's cpus_allowed not part
+ * of exclusive children
+ */
+ list_for_each_entry(c, &cur->children, sibling) {
+ if (is_cpu_exclusive(c))
+ cpus_andnot(cspan, cspan, c->cpus_allowed);
+ }
+ }
+
+ lock_cpu_hotplug();
+ partition_sched_domains(&pspan, &cspan);
+ unlock_cpu_hotplug();
+}
+
static int update_cpumask(struct cpuset *cs, char *buf)
{
struct cpuset trialcs;
- int retval;
+ int retval, cpus_unchanged;
trialcs = *cs;
retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -615,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
if (cpus_empty(trialcs.cpus_allowed))
return -ENOSPC;
retval = validate_change(cs, &trialcs);
- if (retval == 0)
- cs->cpus_allowed = trialcs.cpus_allowed;
- return retval;
+ if (retval < 0)
+ return retval;
+ cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+ cs->cpus_allowed = trialcs.cpus_allowed;
+ if (is_cpu_exclusive(cs) && !cpus_unchanged)
+ update_cpu_domains(cs);
+ return 0;
}
static int update_nodemask(struct cpuset *cs, char *buf)
@@ -653,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
{
int turning_on;
struct cpuset trialcs;
- int err;
+ int err, cpu_exclusive_changed;
turning_on = (simple_strtoul(buf, NULL, 10) != 0);
@@ -664,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
clear_bit(bit, &trialcs.flags);
err = validate_change(cs, &trialcs);
- if (err == 0) {
- if (turning_on)
- set_bit(bit, &cs->flags);
- else
- clear_bit(bit, &cs->flags);
- }
- return err;
+ if (err < 0)
+ return err;
+ cpu_exclusive_changed =
+ (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+ if (turning_on)
+ set_bit(bit, &cs->flags);
+ else
+ clear_bit(bit, &cs->flags);
+
+ if (cpu_exclusive_changed)
+ update_cpu_domains(cs);
+ return 0;
}
static int attach_task(struct cpuset *cs, char *buf)
@@ -1316,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
up(&cpuset_sem);
return -EBUSY;
}
- spin_lock(&cs->dentry->d_lock);
parent = cs->parent;
set_bit(CS_REMOVED, &cs->flags);
+ if (is_cpu_exclusive(cs))
+ update_cpu_domains(cs);
list_del(&cs->sibling); /* delete my sibling from parent->children */
if (list_empty(&parent->children))
check_for_release(parent);
+ spin_lock(&cs->dentry->d_lock);
d = dget(cs->dentry);
cs->dentry = NULL;
spin_unlock(&d->d_lock);
@@ -1404,6 +1460,18 @@ void cpuset_fork(struct task_struct *tsk)
*
* Description: Detach cpuset from @tsk and release it.
*
+ * Note that cpusets marked notify_on_release force every task
+ * in them to take the global cpuset_sem semaphore when exiting.
+ * This could impact scaling on very large systems. Be reluctant
+ * to use notify_on_release cpusets where very high task exit
+ * scaling is required on large systems.
+ *
+ * Don't even think about derefencing 'cs' after the cpuset use
+ * count goes to zero, except inside a critical section guarded
+ * by the cpuset_sem semaphore. If you don't hold cpuset_sem,
+ * then a zero cpuset use count is a license to any other task to
+ * nuke the cpuset immediately.
+ *
**/
void cpuset_exit(struct task_struct *tsk)
@@ -1415,10 +1483,13 @@ void cpuset_exit(struct task_struct *tsk)
tsk->cpuset = NULL;
task_unlock(tsk);
- if (atomic_dec_and_test(&cs->count)) {
+ if (notify_on_release(cs)) {
down(&cpuset_sem);
- check_for_release(cs);
+ if (atomic_dec_and_test(&cs->count))
+ check_for_release(cs);
up(&cpuset_sem);
+ } else {
+ atomic_dec(&cs->count);
}
}
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..459ba49e376a
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,52 @@
+/*
+ * kernel/crash_dump.c - Memory preserving reboot related code.
+ *
+ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com)
+ * Copyright (C) IBM Corporation, 2004. All rights reserved
+ */
+
+#include <linux/smp_lock.h>
+#include <linux/errno.h>
+#include <linux/proc_fs.h>
+#include <linux/bootmem.h>
+#include <linux/highmem.h>
+#include <linux/crash_dump.h>
+
+#include <asm/io.h>
+#include <asm/uaccess.h>
+
+/* Stores the physical address of elf header of crash image. */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * Copy a page from "oldmem". For this page, there is no pte mapped
+ * in the current kernel. We stitch up a pte, similar to kmap_atomic.
+ */
+ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
+ size_t csize, unsigned long offset, int userbuf)
+{
+ void *page, *vaddr;
+
+ if (!csize)
+ return 0;
+
+ page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ vaddr = kmap_atomic_pfn(pfn, KM_PTE0);
+ copy_page(page, vaddr);
+ kunmap_atomic(vaddr, KM_PTE0);
+
+ if (userbuf) {
+ if (copy_to_user(buf, (page + offset), csize)) {
+ kfree(page);
+ return -EFAULT;
+ }
+ } else {
+ memcpy(buf, (page + offset), csize);
+ }
+
+ kfree(page);
+ return csize;
+}
diff --git a/kernel/exit.c b/kernel/exit.c
index edaa50b5bbfa..3ebcd60a19c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -72,6 +72,11 @@ repeat:
BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
__exit_signal(p);
__exit_sighand(p);
+ /*
+ * Note that the fastpath in sys_times depends on __exit_signal having
+ * updated the counters before a task is removed from the tasklist of
+ * the process by __unhash_process.
+ */
__unhash_process(p);
/*
@@ -793,6 +798,17 @@ fastcall NORET_TYPE void do_exit(long code)
ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
}
+ /*
+ * We're taking recursive faults here in do_exit. Safest is to just
+ * leave this task alone and wait for reboot.
+ */
+ if (unlikely(tsk->flags & PF_EXITING)) {
+ printk(KERN_ALERT
+ "Fixing recursive fault but reboot is needed!\n");
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ }
+
tsk->flags |= PF_EXITING;
/*
@@ -811,10 +827,8 @@ fastcall NORET_TYPE void do_exit(long code)
acct_update_integrals(tsk);
update_mem_hiwater(tsk);
group_dead = atomic_dec_and_test(&tsk->signal->live);
- if (group_dead) {
- del_timer_sync(&tsk->signal->real_timer);
+ if (group_dead)
acct_process(code);
- }
exit_mm(tsk);
exit_sem(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index f42a17f88699..2c7806873bfd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -194,6 +194,7 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
mm->mmap = NULL;
mm->mmap_cache = NULL;
mm->free_area_cache = oldmm->mmap_base;
+ mm->cached_hole_size = ~0UL;
mm->map_count = 0;
set_mm_counter(mm, rss, 0);
set_mm_counter(mm, anon_rss, 0);
@@ -249,8 +250,9 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
/*
* Link in the new vma and copy the page table entries:
- * link in first so that swapoff can see swap entries,
- * and try_to_unmap_one's find_vma find the new vma.
+ * link in first so that swapoff can see swap entries.
+ * Note that, exceptionally, here the vma is inserted
+ * without holding mm->mmap_sem.
*/
spin_lock(&mm->page_table_lock);
*pprev = tmp;
@@ -322,6 +324,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
mm->ioctx_list = NULL;
mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
mm->free_area_cache = TASK_UNMAPPED_BASE;
+ mm->cached_hole_size = ~0UL;
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -1000,9 +1003,6 @@ static task_t *copy_process(unsigned long clone_flags,
p->pdeath_signal = 0;
p->exit_state = 0;
- /* Perform scheduler related setup */
- sched_fork(p);
-
/*
* Ok, make it visible to the rest of the system.
* We dont wake it up yet.
@@ -1011,18 +1011,24 @@ static task_t *copy_process(unsigned long clone_flags,
INIT_LIST_HEAD(&p->ptrace_children);
INIT_LIST_HEAD(&p->ptrace_list);
+ /* Perform scheduler related setup. Assign this task to a CPU. */
+ sched_fork(p, clone_flags);
+
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
/*
- * The task hasn't been attached yet, so cpus_allowed mask cannot
- * have changed. The cpus_allowed mask of the parent may have
- * changed after it was copied first time, and it may then move to
- * another CPU - so we re-copy it here and set the child's CPU to
- * the parent's CPU. This avoids alot of nasty races.
+ * The task hasn't been attached yet, so its cpus_allowed mask will
+ * not be changed, nor will its assigned CPU.
+ *
+ * The cpus_allowed mask of the parent may have changed after it was
+ * copied first time - so re-copy it here, then check the child's CPU
+ * to ensure it is on a valid CPU (and if not, just force it back to
+ * parent's CPU). This avoids alot of nasty races.
*/
p->cpus_allowed = current->cpus_allowed;
- set_task_cpu(p, smp_processor_id());
+ if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed)))
+ set_task_cpu(p, smp_processor_id());
/*
* Check for pending SIGKILL! The new thread should not be allowed
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 2fb0e46e11f3..436c7d93c00a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,6 +30,7 @@
*/
irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
[0 ... NR_IRQS-1] = {
+ .status = IRQ_DISABLED,
.handler = &no_irq_type,
.lock = SPIN_LOCK_UNLOCKED
}
@@ -118,8 +119,6 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
*/
desc->handler->ack(irq);
action_ret = handle_IRQ_event(irq, regs, desc->action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
desc->handler->end(irq);
return 1;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5202e4c4a5b6..ac6700985705 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -6,6 +6,7 @@
* This file contains driver APIs to the irq subsystem.
*/
+#include <linux/config.h>
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/random.h>
@@ -255,6 +256,13 @@ void free_irq(unsigned int irq, void *dev_id)
/* Found it - now remove it from the list of entries */
*pp = action->next;
+
+ /* Currently used only by UML, might disappear one day.*/
+#ifdef CONFIG_IRQ_RELEASE_METHOD
+ if (desc->handler->release)
+ desc->handler->release(irq, dev_id);
+#endif
+
if (!desc->action) {
desc->status |= IRQ_DISABLED;
if (desc->handler->shutdown)
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index f6297c306905..ba039e827d58 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -45,7 +45,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
}
}
-void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
+static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
{
static int count = 100;
diff --git a/kernel/kexec.c b/kernel/kexec.c
new file mode 100644
index 000000000000..7843548cf2d9
--- /dev/null
+++ b/kernel/kexec.c
@@ -0,0 +1,1063 @@
+/*
+ * kexec.c - kexec system call
+ * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/kexec.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/syscalls.h>
+#include <linux/reboot.h>
+#include <linux/syscalls.h>
+#include <linux/ioport.h>
+#include <linux/hardirq.h>
+
+#include <asm/page.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/system.h>
+#include <asm/semaphore.h>
+
+/* Location of the reserved area for the crash kernel */
+struct resource crashk_res = {
+ .name = "Crash kernel",
+ .start = 0,
+ .end = 0,
+ .flags = IORESOURCE_BUSY | IORESOURCE_MEM
+};
+
+int kexec_should_crash(struct task_struct *p)
+{
+ if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+ return 1;
+ return 0;
+}
+
+/*
+ * When kexec transitions to the new kernel there is a one-to-one
+ * mapping between physical and virtual addresses. On processors
+ * where you can disable the MMU this is trivial, and easy. For
+ * others it is still a simple predictable page table to setup.
+ *
+ * In that environment kexec copies the new kernel to its final
+ * resting place. This means I can only support memory whose
+ * physical address can fit in an unsigned long. In particular
+ * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
+ * If the assembly stub has more restrictive requirements
+ * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
+ * defined more restrictively in <asm/kexec.h>.
+ *
+ * The code for the transition from the current kernel to the
+ * the new kernel is placed in the control_code_buffer, whose size
+ * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
+ * page of memory is necessary, but some architectures require more.
+ * Because this memory must be identity mapped in the transition from
+ * virtual to physical addresses it must live in the range
+ * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
+ * modifiable.
+ *
+ * The assembly stub in the control code buffer is passed a linked list
+ * of descriptor pages detailing the source pages of the new kernel,
+ * and the destination addresses of those source pages. As this data
+ * structure is not used in the context of the current OS, it must
+ * be self-contained.
+ *
+ * The code has been made to work with highmem pages and will use a
+ * destination page in its final resting place (if it happens
+ * to allocate it). The end product of this is that most of the
+ * physical address space, and most of RAM can be used.
+ *
+ * Future directions include:
+ * - allocating a page table with the control code buffer identity
+ * mapped, to simplify machine_kexec and make kexec_on_panic more
+ * reliable.
+ */
+
+/*
+ * KIMAGE_NO_DEST is an impossible destination address..., for
+ * allocating pages whose destination address we do not care about.
+ */
+#define KIMAGE_NO_DEST (-1UL)
+
+static int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image,
+ unsigned int gfp_mask,
+ unsigned long dest);
+
+static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
+{
+ size_t segment_bytes;
+ struct kimage *image;
+ unsigned long i;
+ int result;
+
+ /* Allocate a controlling structure */
+ result = -ENOMEM;
+ image = kmalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
+ goto out;
+
+ memset(image, 0, sizeof(*image));
+ image->head = 0;
+ image->entry = &image->head;
+ image->last_entry = &image->head;
+ image->control_page = ~0; /* By default this does not apply */
+ image->start = entry;
+ image->type = KEXEC_TYPE_DEFAULT;
+
+ /* Initialize the list of control pages */
+ INIT_LIST_HEAD(&image->control_pages);
+
+ /* Initialize the list of destination pages */
+ INIT_LIST_HEAD(&image->dest_pages);
+
+ /* Initialize the list of unuseable pages */
+ INIT_LIST_HEAD(&image->unuseable_pages);
+
+ /* Read in the segments */
+ image->nr_segments = nr_segments;
+ segment_bytes = nr_segments * sizeof(*segments);
+ result = copy_from_user(image->segment, segments, segment_bytes);
+ if (result)
+ goto out;
+
+ /*
+ * Verify we have good destination addresses. The caller is
+ * responsible for making certain we don't attempt to load
+ * the new image into invalid or reserved areas of RAM. This
+ * just verifies it is an address we can use.
+ *
+ * Since the kernel does everything in page size chunks ensure
+ * the destination addreses are page aligned. Too many
+ * special cases crop of when we don't do this. The most
+ * insidious is getting overlapping destination addresses
+ * simply because addresses are changed to page size
+ * granularity.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
+ goto out;
+ if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
+ goto out;
+ }
+
+ /* Verify our destination addresses do not overlap.
+ * If we alloed overlapping destination addresses
+ * through very weird things can happen with no
+ * easy explanation as one segment stops on another.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+ unsigned long j;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ for (j = 0; j < i; j++) {
+ unsigned long pstart, pend;
+ pstart = image->segment[j].mem;
+ pend = pstart + image->segment[j].memsz;
+ /* Do the segments overlap ? */
+ if ((mend > pstart) && (mstart < pend))
+ goto out;
+ }
+ }
+
+ /* Ensure our buffer sizes are strictly less than
+ * our memory sizes. This should always be the case,
+ * and it is easier to check up front than to be surprised
+ * later on.
+ */
+ result = -EINVAL;
+ for (i = 0; i < nr_segments; i++) {
+ if (image->segment[i].bufsz > image->segment[i].memsz)
+ goto out;
+ }
+
+ result = 0;
+out:
+ if (result == 0)
+ *rimage = image;
+ else
+ kfree(image);
+
+ return result;
+
+}
+
+static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
+{
+ int result;
+ struct kimage *image;
+
+ /* Allocate and initialize a controlling structure */
+ image = NULL;
+ result = do_kimage_alloc(&image, entry, nr_segments, segments);
+ if (result)
+ goto out;
+
+ *rimage = image;
+
+ /*
+ * Find a location for the control code buffer, and add it
+ * the vector of segments so that it's pages will also be
+ * counted as destination pages.
+ */
+ result = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_CODE_SIZE));
+ if (!image->control_code_page) {
+ printk(KERN_ERR "Could not allocate control_code_buffer\n");
+ goto out;
+ }
+
+ result = 0;
+ out:
+ if (result == 0)
+ *rimage = image;
+ else
+ kfree(image);
+
+ return result;
+}
+
+static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
+ unsigned long nr_segments,
+ struct kexec_segment *segments)
+{
+ int result;
+ struct kimage *image;
+ unsigned long i;
+
+ image = NULL;
+ /* Verify we have a valid entry point */
+ if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
+ result = -EADDRNOTAVAIL;
+ goto out;
+ }
+
+ /* Allocate and initialize a controlling structure */
+ result = do_kimage_alloc(&image, entry, nr_segments, segments);
+ if (result)
+ goto out;
+
+ /* Enable the special crash kernel control page
+ * allocation policy.
+ */
+ image->control_page = crashk_res.start;
+ image->type = KEXEC_TYPE_CRASH;
+
+ /*
+ * Verify we have good destination addresses. Normally
+ * the caller is responsible for making certain we don't
+ * attempt to load the new image into invalid or reserved
+ * areas of RAM. But crash kernels are preloaded into a
+ * reserved area of ram. We must ensure the addresses
+ * are in the reserved area otherwise preloading the
+ * kernel could corrupt things.
+ */
+ result = -EADDRNOTAVAIL;
+ for (i = 0; i < nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ /* Ensure we are within the crash kernel limits */
+ if ((mstart < crashk_res.start) || (mend > crashk_res.end))
+ goto out;
+ }
+
+ /*
+ * Find a location for the control code buffer, and add
+ * the vector of segments so that it's pages will also be
+ * counted as destination pages.
+ */
+ result = -ENOMEM;
+ image->control_code_page = kimage_alloc_control_pages(image,
+ get_order(KEXEC_CONTROL_CODE_SIZE));
+ if (!image->control_code_page) {
+ printk(KERN_ERR "Could not allocate control_code_buffer\n");
+ goto out;
+ }
+
+ result = 0;
+out:
+ if (result == 0)
+ *rimage = image;
+ else
+ kfree(image);
+
+ return result;
+}
+
+static int kimage_is_destination_range(struct kimage *image,
+ unsigned long start,
+ unsigned long end)
+{
+ unsigned long i;
+
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz;
+ if ((end > mstart) && (start < mend))
+ return 1;
+ }
+
+ return 0;
+}
+
+static struct page *kimage_alloc_pages(unsigned int gfp_mask,
+ unsigned int order)
+{
+ struct page *pages;
+
+ pages = alloc_pages(gfp_mask, order);
+ if (pages) {
+ unsigned int count, i;
+ pages->mapping = NULL;
+ pages->private = order;
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ SetPageReserved(pages + i);
+ }
+
+ return pages;
+}
+
+static void kimage_free_pages(struct page *page)
+{
+ unsigned int order, count, i;
+
+ order = page->private;
+ count = 1 << order;
+ for (i = 0; i < count; i++)
+ ClearPageReserved(page + i);
+ __free_pages(page, order);
+}
+
+static void kimage_free_page_list(struct list_head *list)
+{
+ struct list_head *pos, *next;
+
+ list_for_each_safe(pos, next, list) {
+ struct page *page;
+
+ page = list_entry(pos, struct page, lru);
+ list_del(&page->lru);
+ kimage_free_pages(page);
+ }
+}
+
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * At worst this runs in O(N) of the image size.
+ */
+ struct list_head extra_pages;
+ struct page *pages;
+ unsigned int count;
+
+ count = 1 << order;
+ INIT_LIST_HEAD(&extra_pages);
+
+ /* Loop while I can allocate a page and the page allocated
+ * is a destination page.
+ */
+ do {
+ unsigned long pfn, epfn, addr, eaddr;
+
+ pages = kimage_alloc_pages(GFP_KERNEL, order);
+ if (!pages)
+ break;
+ pfn = page_to_pfn(pages);
+ epfn = pfn + count;
+ addr = pfn << PAGE_SHIFT;
+ eaddr = epfn << PAGE_SHIFT;
+ if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
+ kimage_is_destination_range(image, addr, eaddr)) {
+ list_add(&pages->lru, &extra_pages);
+ pages = NULL;
+ }
+ } while (!pages);
+
+ if (pages) {
+ /* Remember the allocated page... */
+ list_add(&pages->lru, &image->control_pages);
+
+ /* Because the page is already in it's destination
+ * location we will never allocate another page at
+ * that address. Therefore kimage_alloc_pages
+ * will not return it (again) and we don't need
+ * to give it an entry in image->segment[].
+ */
+ }
+ /* Deal with the destination pages I have inadvertently allocated.
+ *
+ * Ideally I would convert multi-page allocations into single
+ * page allocations, and add everyting to image->dest_pages.
+ *
+ * For now it is simpler to just free the pages.
+ */
+ kimage_free_page_list(&extra_pages);
+
+ return pages;
+}
+
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ /* Control pages are special, they are the intermediaries
+ * that are needed while we copy the rest of the pages
+ * to their final resting place. As such they must
+ * not conflict with either the destination addresses
+ * or memory the kernel is already using.
+ *
+ * Control pages are also the only pags we must allocate
+ * when loading a crash kernel. All of the other pages
+ * are specified by the segments and we just memcpy
+ * into them directly.
+ *
+ * The only case where we really need more than one of
+ * these are for architectures where we cannot disable
+ * the MMU and must instead generate an identity mapped
+ * page table for all of the memory.
+ *
+ * Given the low demand this implements a very simple
+ * allocator that finds the first hole of the appropriate
+ * size in the reserved memory region, and allocates all
+ * of the memory up to and including the hole.
+ */
+ unsigned long hole_start, hole_end, size;
+ struct page *pages;
+
+ pages = NULL;
+ size = (1 << order) << PAGE_SHIFT;
+ hole_start = (image->control_page + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ while (hole_end <= crashk_res.end) {
+ unsigned long i;
+
+ if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
+ break;
+ if (hole_end > crashk_res.end)
+ break;
+ /* See if I overlap any of the segments */
+ for (i = 0; i < image->nr_segments; i++) {
+ unsigned long mstart, mend;
+
+ mstart = image->segment[i].mem;
+ mend = mstart + image->segment[i].memsz - 1;
+ if ((hole_end >= mstart) && (hole_start <= mend)) {
+ /* Advance the hole to the end of the segment */
+ hole_start = (mend + (size - 1)) & ~(size - 1);
+ hole_end = hole_start + size - 1;
+ break;
+ }
+ }
+ /* If I don't overlap any segments I have found my hole! */
+ if (i == image->nr_segments) {
+ pages = pfn_to_page(hole_start >> PAGE_SHIFT);
+ break;
+ }
+ }
+ if (pages)
+ image->control_page = hole_end;
+
+ return pages;
+}
+
+
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
+{
+ struct page *pages = NULL;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ pages = kimage_alloc_normal_control_pages(image, order);
+ break;
+ case KEXEC_TYPE_CRASH:
+ pages = kimage_alloc_crash_control_pages(image, order);
+ break;
+ }
+
+ return pages;
+}
+
+static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ if (image->entry == image->last_entry) {
+ kimage_entry_t *ind_page;
+ struct page *page;
+
+ page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
+ if (!page)
+ return -ENOMEM;
+
+ ind_page = page_address(page);
+ *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
+ image->entry = ind_page;
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ }
+ *image->entry = entry;
+ image->entry++;
+ *image->entry = 0;
+
+ return 0;
+}
+
+static int kimage_set_destination(struct kimage *image,
+ unsigned long destination)
+{
+ int result;
+
+ destination &= PAGE_MASK;
+ result = kimage_add_entry(image, destination | IND_DESTINATION);
+ if (result == 0)
+ image->destination = destination;
+
+ return result;
+}
+
+
+static int kimage_add_page(struct kimage *image, unsigned long page)
+{
+ int result;
+
+ page &= PAGE_MASK;
+ result = kimage_add_entry(image, page | IND_SOURCE);
+ if (result == 0)
+ image->destination += PAGE_SIZE;
+
+ return result;
+}
+
+
+static void kimage_free_extra_pages(struct kimage *image)
+{
+ /* Walk through and free any extra destination pages I may have */
+ kimage_free_page_list(&image->dest_pages);
+
+ /* Walk through and free any unuseable pages I have cached */
+ kimage_free_page_list(&image->unuseable_pages);
+
+}
+static int kimage_terminate(struct kimage *image)
+{
+ if (*image->entry != 0)
+ image->entry++;
+
+ *image->entry = IND_DONE;
+
+ return 0;
+}
+
+#define for_each_kimage_entry(image, ptr, entry) \
+ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
+ ptr = (entry & IND_INDIRECTION)? \
+ phys_to_virt((entry & PAGE_MASK)): ptr +1)
+
+static void kimage_free_entry(kimage_entry_t entry)
+{
+ struct page *page;
+
+ page = pfn_to_page(entry >> PAGE_SHIFT);
+ kimage_free_pages(page);
+}
+
+static void kimage_free(struct kimage *image)
+{
+ kimage_entry_t *ptr, entry;
+ kimage_entry_t ind = 0;
+
+ if (!image)
+ return;
+
+ kimage_free_extra_pages(image);
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_INDIRECTION) {
+ /* Free the previous indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+ /* Save this indirection page until we are
+ * done with it.
+ */
+ ind = entry;
+ }
+ else if (entry & IND_SOURCE)
+ kimage_free_entry(entry);
+ }
+ /* Free the final indirection page */
+ if (ind & IND_INDIRECTION)
+ kimage_free_entry(ind);
+
+ /* Handle any machine specific cleanup */
+ machine_kexec_cleanup(image);
+
+ /* Free the kexec control pages... */
+ kimage_free_page_list(&image->control_pages);
+ kfree(image);
+}
+
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+ unsigned long page)
+{
+ kimage_entry_t *ptr, entry;
+ unsigned long destination = 0;
+
+ for_each_kimage_entry(image, ptr, entry) {
+ if (entry & IND_DESTINATION)
+ destination = entry & PAGE_MASK;
+ else if (entry & IND_SOURCE) {
+ if (page == destination)
+ return ptr;
+ destination += PAGE_SIZE;
+ }
+ }
+
+ return 0;
+}
+
+static struct page *kimage_alloc_page(struct kimage *image,
+ unsigned int gfp_mask,
+ unsigned long destination)
+{
+ /*
+ * Here we implement safeguards to ensure that a source page
+ * is not copied to its destination page before the data on
+ * the destination page is no longer useful.
+ *
+ * To do this we maintain the invariant that a source page is
+ * either its own destination page, or it is not a
+ * destination page at all.
+ *
+ * That is slightly stronger than required, but the proof
+ * that no problems will not occur is trivial, and the
+ * implementation is simply to verify.
+ *
+ * When allocating all pages normally this algorithm will run
+ * in O(N) time, but in the worst case it will run in O(N^2)
+ * time. If the runtime is a problem the data structures can
+ * be fixed.
+ */
+ struct page *page;
+ unsigned long addr;
+
+ /*
+ * Walk through the list of destination pages, and see if I
+ * have a match.
+ */
+ list_for_each_entry(page, &image->dest_pages, lru) {
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+ if (addr == destination) {
+ list_del(&page->lru);
+ return page;
+ }
+ }
+ page = NULL;
+ while (1) {
+ kimage_entry_t *old;
+
+ /* Allocate a page, if we run out of memory give up */
+ page = kimage_alloc_pages(gfp_mask, 0);
+ if (!page)
+ return 0;
+ /* If the page cannot be used file it away */
+ if (page_to_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ list_add(&page->lru, &image->unuseable_pages);
+ continue;
+ }
+ addr = page_to_pfn(page) << PAGE_SHIFT;
+
+ /* If it is the destination page we want use it */
+ if (addr == destination)
+ break;
+
+ /* If the page is not a destination page use it */
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE))
+ break;
+
+ /*
+ * I know that the page is someones destination page.
+ * See if there is already a source page for this
+ * destination page. And if so swap the source pages.
+ */
+ old = kimage_dst_used(image, addr);
+ if (old) {
+ /* If so move it */
+ unsigned long old_addr;
+ struct page *old_page;
+
+ old_addr = *old & PAGE_MASK;
+ old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
+ copy_highpage(page, old_page);
+ *old = addr | (*old & ~PAGE_MASK);
+
+ /* The old page I have found cannot be a
+ * destination page, so return it.
+ */
+ addr = old_addr;
+ page = old_page;
+ break;
+ }
+ else {
+ /* Place the page on the destination list I
+ * will use it later.
+ */
+ list_add(&page->lru, &image->dest_pages);
+ }
+ }
+
+ return page;
+}
+
+static int kimage_load_normal_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ unsigned long maddr;
+ unsigned long ubytes, mbytes;
+ int result;
+ unsigned char *buf;
+
+ result = 0;
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+
+ result = kimage_set_destination(image, maddr);
+ if (result < 0)
+ goto out;
+
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
+ if (page == 0) {
+ result = -ENOMEM;
+ goto out;
+ }
+ result = kimage_add_page(image, page_to_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
+ goto out;
+
+ ptr = kmap(page);
+ /* Start with a clear page */
+ memset(ptr, 0, PAGE_SIZE);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+ if (mchunk > mbytes)
+ mchunk = mbytes;
+
+ uchunk = mchunk;
+ if (uchunk > ubytes)
+ uchunk = ubytes;
+
+ result = copy_from_user(ptr, buf, uchunk);
+ kunmap(page);
+ if (result) {
+ result = (result < 0) ? result : -EIO;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+static int kimage_load_crash_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ /* For crash dumps kernels we simply copy the data from
+ * user space to it's destination.
+ * We do things a page at a time for the sake of kmap.
+ */
+ unsigned long maddr;
+ unsigned long ubytes, mbytes;
+ int result;
+ unsigned char *buf;
+
+ result = 0;
+ buf = segment->buf;
+ ubytes = segment->bufsz;
+ mbytes = segment->memsz;
+ maddr = segment->mem;
+ while (mbytes) {
+ struct page *page;
+ char *ptr;
+ size_t uchunk, mchunk;
+
+ page = pfn_to_page(maddr >> PAGE_SHIFT);
+ if (page == 0) {
+ result = -ENOMEM;
+ goto out;
+ }
+ ptr = kmap(page);
+ ptr += maddr & ~PAGE_MASK;
+ mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
+ if (mchunk > mbytes)
+ mchunk = mbytes;
+
+ uchunk = mchunk;
+ if (uchunk > ubytes) {
+ uchunk = ubytes;
+ /* Zero the trailing part of the page */
+ memset(ptr + uchunk, 0, mchunk - uchunk);
+ }
+ result = copy_from_user(ptr, buf, uchunk);
+ kunmap(page);
+ if (result) {
+ result = (result < 0) ? result : -EIO;
+ goto out;
+ }
+ ubytes -= uchunk;
+ maddr += mchunk;
+ buf += mchunk;
+ mbytes -= mchunk;
+ }
+out:
+ return result;
+}
+
+static int kimage_load_segment(struct kimage *image,
+ struct kexec_segment *segment)
+{
+ int result = -ENOMEM;
+
+ switch (image->type) {
+ case KEXEC_TYPE_DEFAULT:
+ result = kimage_load_normal_segment(image, segment);
+ break;
+ case KEXEC_TYPE_CRASH:
+ result = kimage_load_crash_segment(image, segment);
+ break;
+ }
+
+ return result;
+}
+
+/*
+ * Exec Kernel system call: for obvious reasons only root may call it.
+ *
+ * This call breaks up into three pieces.
+ * - A generic part which loads the new kernel from the current
+ * address space, and very carefully places the data in the
+ * allocated pages.
+ *
+ * - A generic part that interacts with the kernel and tells all of
+ * the devices to shut down. Preventing on-going dmas, and placing
+ * the devices in a consistent state so a later kernel can
+ * reinitialize them.
+ *
+ * - A machine specific part that includes the syscall number
+ * and the copies the image to it's final destination. And
+ * jumps into the image at entry.
+ *
+ * kexec does not sync, or unmount filesystems so if you need
+ * that to happen you need to do that yourself.
+ */
+struct kimage *kexec_image = NULL;
+static struct kimage *kexec_crash_image = NULL;
+/*
+ * A home grown binary mutex.
+ * Nothing can wait so this mutex is safe to use
+ * in interrupt context :)
+ */
+static int kexec_lock = 0;
+
+asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
+ struct kexec_segment __user *segments,
+ unsigned long flags)
+{
+ struct kimage **dest_image, *image;
+ int locked;
+ int result;
+
+ /* We only trust the superuser with rebooting the system. */
+ if (!capable(CAP_SYS_BOOT))
+ return -EPERM;
+
+ /*
+ * Verify we have a legal set of flags
+ * This leaves us room for future extensions.
+ */
+ if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
+ return -EINVAL;
+
+ /* Verify we are on the appropriate architecture */
+ if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
+ ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
+ return -EINVAL;
+
+ /* Put an artificial cap on the number
+ * of segments passed to kexec_load.
+ */
+ if (nr_segments > KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ image = NULL;
+ result = 0;
+
+ /* Because we write directly to the reserved memory
+ * region when loading crash kernels we need a mutex here to
+ * prevent multiple crash kernels from attempting to load
+ * simultaneously, and to prevent a crash kernel from loading
+ * over the top of a in use crash kernel.
+ *
+ * KISS: always take the mutex.
+ */
+ locked = xchg(&kexec_lock, 1);
+ if (locked)
+ return -EBUSY;
+
+ dest_image = &kexec_image;
+ if (flags & KEXEC_ON_CRASH)
+ dest_image = &kexec_crash_image;
+ if (nr_segments > 0) {
+ unsigned long i;
+
+ /* Loading another kernel to reboot into */
+ if ((flags & KEXEC_ON_CRASH) == 0)
+ result = kimage_normal_alloc(&image, entry,
+ nr_segments, segments);
+ /* Loading another kernel to switch to if this one crashes */
+ else if (flags & KEXEC_ON_CRASH) {
+ /* Free any current crash dump kernel before
+ * we corrupt it.
+ */
+ kimage_free(xchg(&kexec_crash_image, NULL));
+ result = kimage_crash_alloc(&image, entry,
+ nr_segments, segments);
+ }
+ if (result)
+ goto out;
+
+ result = machine_kexec_prepare(image);
+ if (result)
+ goto out;
+
+ for (i = 0; i < nr_segments; i++) {
+ result = kimage_load_segment(image, &image->segment[i]);
+ if (result)
+ goto out;
+ }
+ result = kimage_terminate(image);
+ if (result)
+ goto out;
+ }
+ /* Install the new kernel, and Uninstall the old */
+ image = xchg(dest_image, image);
+
+out:
+ xchg(&kexec_lock, 0); /* Release the mutex */
+ kimage_free(image);
+
+ return result;
+}
+
+#ifdef CONFIG_COMPAT
+asmlinkage long compat_sys_kexec_load(unsigned long entry,
+ unsigned long nr_segments,
+ struct compat_kexec_segment __user *segments,
+ unsigned long flags)
+{
+ struct compat_kexec_segment in;
+ struct kexec_segment out, __user *ksegments;
+ unsigned long i, result;
+
+ /* Don't allow clients that don't understand the native
+ * architecture to do anything.
+ */
+ if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
+ return -EINVAL;
+
+ if (nr_segments > KEXEC_SEGMENT_MAX)
+ return -EINVAL;
+
+ ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
+ for (i=0; i < nr_segments; i++) {
+ result = copy_from_user(&in, &segments[i], sizeof(in));
+ if (result)
+ return -EFAULT;
+
+ out.buf = compat_ptr(in.buf);
+ out.bufsz = in.bufsz;
+ out.mem = in.mem;
+ out.memsz = in.memsz;
+
+ result = copy_to_user(&ksegments[i], &out, sizeof(out));
+ if (result)
+ return -EFAULT;
+ }
+
+ return sys_kexec_load(entry, nr_segments, ksegments, flags);
+}
+#endif
+
+void crash_kexec(struct pt_regs *regs)
+{
+ struct kimage *image;
+ int locked;
+
+
+ /* Take the kexec_lock here to prevent sys_kexec_load
+ * running on one cpu from replacing the crash kernel
+ * we are using after a panic on a different cpu.
+ *
+ * If the crash kernel was not located in a fixed area
+ * of memory the xchg(&kexec_crash_image) would be
+ * sufficient. But since I reuse the memory...
+ */
+ locked = xchg(&kexec_lock, 1);
+ if (!locked) {
+ image = xchg(&kexec_crash_image, NULL);
+ if (image) {
+ machine_crash_shutdown(regs);
+ machine_kexec(image);
+ }
+ xchg(&kexec_lock, 0);
+ }
+}
diff --git a/kernel/kmod.c b/kernel/kmod.c
index eed53d4f5230..44166e3bb8af 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -120,6 +120,7 @@ struct subprocess_info {
char *path;
char **argv;
char **envp;
+ struct key *ring;
int wait;
int retval;
};
@@ -130,16 +131,21 @@ struct subprocess_info {
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
+ struct key *old_session;
int retval;
- /* Unblock all signals. */
+ /* Unblock all signals and set the session keyring. */
+ key_get(sub_info->ring);
flush_signals(current);
spin_lock_irq(&current->sighand->siglock);
+ old_session = __install_session_keyring(current, sub_info->ring);
flush_signal_handlers(current, 1);
sigemptyset(&current->blocked);
recalc_sigpending();
spin_unlock_irq(&current->sighand->siglock);
+ key_put(old_session);
+
/* We can run anywhere, unlike our parent keventd(). */
set_cpus_allowed(current, CPU_MASK_ALL);
@@ -211,10 +217,11 @@ static void __call_usermodehelper(void *data)
}
/**
- * call_usermodehelper - start a usermode application
+ * call_usermodehelper_keys - start a usermode application
* @path: pathname for the application
* @argv: null-terminated argument list
* @envp: null-terminated environment list
+ * @session_keyring: session keyring for process (NULL for an empty keyring)
* @wait: wait for the application to finish and return status.
*
* Runs a user-space application. The application is started
@@ -224,7 +231,8 @@ static void __call_usermodehelper(void *data)
* Must be called from process context. Returns a negative error code
* if program was not execed successfully, or 0.
*/
-int call_usermodehelper(char *path, char **argv, char **envp, int wait)
+int call_usermodehelper_keys(char *path, char **argv, char **envp,
+ struct key *session_keyring, int wait)
{
DECLARE_COMPLETION(done);
struct subprocess_info sub_info = {
@@ -232,6 +240,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
.path = path,
.argv = argv,
.envp = envp,
+ .ring = session_keyring,
.wait = wait,
.retval = 0,
};
@@ -247,7 +256,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait)
wait_for_completion(&done);
return sub_info.retval;
}
-EXPORT_SYMBOL(call_usermodehelper);
+EXPORT_SYMBOL(call_usermodehelper_keys);
void __init usermodehelper_init(void)
{
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 037142b72a49..334f37472c56 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -27,6 +27,9 @@
* interface to access function arguments.
* 2004-Sep Prasanna S Panchamukhi <prasanna@in.ibm.com> Changed Kprobes
* exceptions notifier to be first on the priority list.
+ * 2005-May Hien Nguyen <hien@us.ibm.com>, Jim Keniston
+ * <jkenisto@us.ibm.com> and Prasanna S Panchamukhi
+ * <prasanna@in.ibm.com> added function-return probes.
*/
#include <linux/kprobes.h>
#include <linux/spinlock.h>
@@ -41,6 +44,7 @@
#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS)
static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
+static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
unsigned int kprobe_cpu = NR_CPUS;
static DEFINE_SPINLOCK(kprobe_lock);
@@ -78,22 +82,23 @@ struct kprobe *get_kprobe(void *addr)
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p->list
*/
-int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
list_for_each_entry(kp, &p->list, list) {
if (kp->pre_handler) {
curr_kprobe = kp;
- kp->pre_handler(kp, regs);
- curr_kprobe = NULL;
+ if (kp->pre_handler(kp, regs))
+ return 1;
}
+ curr_kprobe = NULL;
}
return 0;
}
-void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
{
struct kprobe *kp;
@@ -107,7 +112,8 @@ void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
return;
}
-int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+ int trapnr)
{
/*
* if we faulted "during" the execution of a user specified
@@ -120,19 +126,191 @@ int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs, int trapnr)
return 0;
}
+static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kprobe *kp = curr_kprobe;
+ if (curr_kprobe && kp->break_handler) {
+ if (kp->break_handler(kp, regs)) {
+ curr_kprobe = NULL;
+ return 1;
+ }
+ }
+ curr_kprobe = NULL;
+ return 0;
+}
+
+struct kprobe trampoline_p = {
+ .addr = (kprobe_opcode_t *) &kretprobe_trampoline,
+ .pre_handler = trampoline_probe_handler,
+ .post_handler = trampoline_post_handler
+};
+
+struct kretprobe_instance *get_free_rp_inst(struct kretprobe *rp)
+{
+ struct hlist_node *node;
+ struct kretprobe_instance *ri;
+ hlist_for_each_entry(ri, node, &rp->free_instances, uflist)
+ return ri;
+ return NULL;
+}
+
+static struct kretprobe_instance *get_used_rp_inst(struct kretprobe *rp)
+{
+ struct hlist_node *node;
+ struct kretprobe_instance *ri;
+ hlist_for_each_entry(ri, node, &rp->used_instances, uflist)
+ return ri;
+ return NULL;
+}
+
+struct kretprobe_instance *get_rp_inst(void *sara)
+{
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct task_struct *tsk;
+ struct kretprobe_instance *ri;
+
+ tsk = arch_get_kprobe_task(sara);
+ head = &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+ hlist_for_each_entry(ri, node, head, hlist) {
+ if (ri->stack_addr == sara)
+ return ri;
+ }
+ return NULL;
+}
+
+void add_rp_inst(struct kretprobe_instance *ri)
+{
+ struct task_struct *tsk;
+ /*
+ * Remove rp inst off the free list -
+ * Add it back when probed function returns
+ */
+ hlist_del(&ri->uflist);
+ tsk = arch_get_kprobe_task(ri->stack_addr);
+ /* Add rp inst onto table */
+ INIT_HLIST_NODE(&ri->hlist);
+ hlist_add_head(&ri->hlist,
+ &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]);
+
+ /* Also add this rp inst to the used list. */
+ INIT_HLIST_NODE(&ri->uflist);
+ hlist_add_head(&ri->uflist, &ri->rp->used_instances);
+}
+
+void recycle_rp_inst(struct kretprobe_instance *ri)
+{
+ /* remove rp inst off the rprobe_inst_table */
+ hlist_del(&ri->hlist);
+ if (ri->rp) {
+ /* remove rp inst off the used list */
+ hlist_del(&ri->uflist);
+ /* put rp inst back onto the free list */
+ INIT_HLIST_NODE(&ri->uflist);
+ hlist_add_head(&ri->uflist, &ri->rp->free_instances);
+ } else
+ /* Unregistering */
+ kfree(ri);
+}
+
+struct hlist_head * kretprobe_inst_table_head(struct task_struct *tsk)
+{
+ return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)];
+}
+
+struct kretprobe_instance *get_rp_inst_tsk(struct task_struct *tk)
+{
+ struct task_struct *tsk;
+ struct hlist_head *head;
+ struct hlist_node *node;
+ struct kretprobe_instance *ri;
+
+ head = &kretprobe_inst_table[hash_ptr(tk, KPROBE_HASH_BITS)];
+
+ hlist_for_each_entry(ri, node, head, hlist) {
+ tsk = arch_get_kprobe_task(ri->stack_addr);
+ if (tsk == tk)
+ return ri;
+ }
+ return NULL;
+}
+
+/*
+ * This function is called from do_exit or do_execv when task tk's stack is
+ * about to be recycled. Recycle any function-return probe instances
+ * associated with this task. These represent probed functions that have
+ * been called but may never return.
+ */
+void kprobe_flush_task(struct task_struct *tk)
+{
+ unsigned long flags = 0;
+ spin_lock_irqsave(&kprobe_lock, flags);
+ arch_kprobe_flush_task(tk);
+ spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe.
+ */
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
+{
+ struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+
+ /*TODO: consider to only swap the RA after the last pre_handler fired */
+ arch_prepare_kretprobe(rp, regs);
+ return 0;
+}
+
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+ struct kretprobe_instance *ri;
+ while ((ri = get_free_rp_inst(rp)) != NULL) {
+ hlist_del(&ri->uflist);
+ kfree(ri);
+ }
+}
+
+/*
+ * Keep all fields in the kprobe consistent
+ */
+static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+ memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t));
+ memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn));
+}
+
+/*
+* Add the new probe to old_p->list. Fail if this is the
+* second jprobe at the address - two jprobes can't coexist
+*/
+static int add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+{
+ struct kprobe *kp;
+
+ if (p->break_handler) {
+ list_for_each_entry(kp, &old_p->list, list) {
+ if (kp->break_handler)
+ return -EEXIST;
+ }
+ list_add_tail(&p->list, &old_p->list);
+ } else
+ list_add(&p->list, &old_p->list);
+ return 0;
+}
+
/*
* Fill in the required fields of the "manager kprobe". Replace the
* earlier kprobe in the hlist with the manager kprobe
*/
static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
+ copy_kprobe(p, ap);
ap->addr = p->addr;
- ap->opcode = p->opcode;
- memcpy(&ap->ainsn, &p->ainsn, sizeof(struct arch_specific_insn));
-
ap->pre_handler = aggr_pre_handler;
ap->post_handler = aggr_post_handler;
ap->fault_handler = aggr_fault_handler;
+ ap->break_handler = aggr_break_handler;
INIT_LIST_HEAD(&ap->list);
list_add(&p->list, &ap->list);
@@ -153,16 +331,16 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
int ret = 0;
struct kprobe *ap;
- if (old_p->break_handler || p->break_handler) {
- ret = -EEXIST; /* kprobe and jprobe can't (yet) coexist */
- } else if (old_p->pre_handler == aggr_pre_handler) {
- list_add(&p->list, &old_p->list);
+ if (old_p->pre_handler == aggr_pre_handler) {
+ copy_kprobe(old_p, p);
+ ret = add_new_kprobe(old_p, p);
} else {
ap = kcalloc(1, sizeof(struct kprobe), GFP_ATOMIC);
if (!ap)
return -ENOMEM;
add_aggr_kprobe(ap, old_p);
- list_add(&p->list, &ap->list);
+ copy_kprobe(ap, p);
+ ret = add_new_kprobe(ap, p);
}
return ret;
}
@@ -170,10 +348,8 @@ static int register_aggr_kprobe(struct kprobe *old_p, struct kprobe *p)
/* kprobe removal house-keeping routines */
static inline void cleanup_kprobe(struct kprobe *p, unsigned long flags)
{
- *p->addr = p->opcode;
+ arch_disarm_kprobe(p);
hlist_del(&p->hlist);
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
spin_unlock_irqrestore(&kprobe_lock, flags);
arch_remove_kprobe(p);
}
@@ -200,6 +376,7 @@ int register_kprobe(struct kprobe *p)
}
spin_lock_irqsave(&kprobe_lock, flags);
old_p = get_kprobe(p->addr);
+ p->nmissed = 0;
if (old_p) {
ret = register_aggr_kprobe(old_p, p);
goto out;
@@ -210,10 +387,8 @@ int register_kprobe(struct kprobe *p)
hlist_add_head(&p->hlist,
&kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
- p->opcode = *p->addr;
- *p->addr = BREAKPOINT_INSTRUCTION;
- flush_icache_range((unsigned long) p->addr,
- (unsigned long) p->addr + sizeof(kprobe_opcode_t));
+ arch_arm_kprobe(p);
+
out:
spin_unlock_irqrestore(&kprobe_lock, flags);
rm_kprobe:
@@ -257,16 +432,82 @@ void unregister_jprobe(struct jprobe *jp)
unregister_kprobe(&jp->kp);
}
+#ifdef ARCH_SUPPORTS_KRETPROBES
+
+int register_kretprobe(struct kretprobe *rp)
+{
+ int ret = 0;
+ struct kretprobe_instance *inst;
+ int i;
+
+ rp->kp.pre_handler = pre_handler_kretprobe;
+
+ /* Pre-allocate memory for max kretprobe instances */
+ if (rp->maxactive <= 0) {
+#ifdef CONFIG_PREEMPT
+ rp->maxactive = max(10, 2 * NR_CPUS);
+#else
+ rp->maxactive = NR_CPUS;
+#endif
+ }
+ INIT_HLIST_HEAD(&rp->used_instances);
+ INIT_HLIST_HEAD(&rp->free_instances);
+ for (i = 0; i < rp->maxactive; i++) {
+ inst = kmalloc(sizeof(struct kretprobe_instance), GFP_KERNEL);
+ if (inst == NULL) {
+ free_rp_inst(rp);
+ return -ENOMEM;
+ }
+ INIT_HLIST_NODE(&inst->uflist);
+ hlist_add_head(&inst->uflist, &rp->free_instances);
+ }
+
+ rp->nmissed = 0;
+ /* Establish function entry probe point */
+ if ((ret = register_kprobe(&rp->kp)) != 0)
+ free_rp_inst(rp);
+ return ret;
+}
+
+#else /* ARCH_SUPPORTS_KRETPROBES */
+
+int register_kretprobe(struct kretprobe *rp)
+{
+ return -ENOSYS;
+}
+
+#endif /* ARCH_SUPPORTS_KRETPROBES */
+
+void unregister_kretprobe(struct kretprobe *rp)
+{
+ unsigned long flags;
+ struct kretprobe_instance *ri;
+
+ unregister_kprobe(&rp->kp);
+ /* No race here */
+ spin_lock_irqsave(&kprobe_lock, flags);
+ free_rp_inst(rp);
+ while ((ri = get_used_rp_inst(rp)) != NULL) {
+ ri->rp = NULL;
+ hlist_del(&ri->uflist);
+ }
+ spin_unlock_irqrestore(&kprobe_lock, flags);
+}
+
static int __init init_kprobes(void)
{
int i, err = 0;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
- for (i = 0; i < KPROBE_TABLE_SIZE; i++)
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
INIT_HLIST_HEAD(&kprobe_table[i]);
+ INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+ }
err = register_die_notifier(&kprobe_exceptions_nb);
+ /* Register the trampoline probe for return probe */
+ register_kprobe(&trampoline_p);
return err;
}
@@ -277,3 +518,6 @@ EXPORT_SYMBOL_GPL(unregister_kprobe);
EXPORT_SYMBOL_GPL(register_jprobe);
EXPORT_SYMBOL_GPL(unregister_jprobe);
EXPORT_SYMBOL_GPL(jprobe_return);
+EXPORT_SYMBOL_GPL(register_kretprobe);
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 1f064a63f8cf..015fb69ad94d 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -30,6 +30,16 @@ static ssize_t hotplug_seqnum_show(struct subsystem *subsys, char *page)
KERNEL_ATTR_RO(hotplug_seqnum);
#endif
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+static ssize_t crash_notes_show(struct subsystem *subsys, char *page)
+{
+ return sprintf(page, "%p\n", (void *)crash_notes);
+}
+KERNEL_ATTR_RO(crash_notes);
+#endif
+
decl_subsys(kernel, NULL, NULL);
EXPORT_SYMBOL_GPL(kernel_subsys);
@@ -37,6 +47,9 @@ static struct attribute * kernel_attrs[] = {
#ifdef CONFIG_HOTPLUG
&hotplug_seqnum_attr.attr,
#endif
+#ifdef CONFIG_KEXEC
+ &crash_notes_attr.attr,
+#endif
NULL
};
diff --git a/kernel/module.c b/kernel/module.c
index 5734ab09d3f9..068e271ab3a5 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -35,6 +35,7 @@
#include <linux/notifier.h>
#include <linux/stop_machine.h>
#include <linux/device.h>
+#include <linux/string.h>
#include <asm/uaccess.h>
#include <asm/semaphore.h>
#include <asm/cacheflush.h>
@@ -370,6 +371,43 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
#endif /* CONFIG_SMP */
#ifdef CONFIG_MODULE_UNLOAD
+#define MODINFO_ATTR(field) \
+static void setup_modinfo_##field(struct module *mod, const char *s) \
+{ \
+ mod->field = kstrdup(s, GFP_KERNEL); \
+} \
+static ssize_t show_modinfo_##field(struct module_attribute *mattr, \
+ struct module *mod, char *buffer) \
+{ \
+ return sprintf(buffer, "%s\n", mod->field); \
+} \
+static int modinfo_##field##_exists(struct module *mod) \
+{ \
+ return mod->field != NULL; \
+} \
+static void free_modinfo_##field(struct module *mod) \
+{ \
+ kfree(mod->field); \
+ mod->field = NULL; \
+} \
+static struct module_attribute modinfo_##field = { \
+ .attr = { .name = __stringify(field), .mode = 0444, \
+ .owner = THIS_MODULE }, \
+ .show = show_modinfo_##field, \
+ .setup = setup_modinfo_##field, \
+ .test = modinfo_##field##_exists, \
+ .free = free_modinfo_##field, \
+};
+
+MODINFO_ATTR(version);
+MODINFO_ATTR(srcversion);
+
+static struct module_attribute *modinfo_attrs[] = {
+ &modinfo_version,
+ &modinfo_srcversion,
+ NULL,
+};
+
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
@@ -379,7 +417,7 @@ static void module_unload_init(struct module *mod)
for (i = 0; i < NR_CPUS; i++)
local_set(&mod->ref[i].count, 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[_smp_processor_id()].count, 1);
+ local_set(&mod->ref[raw_smp_processor_id()].count, 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -692,7 +730,7 @@ static int obsparm_copy_string(const char *val, struct kernel_param *kp)
return 0;
}
-int set_obsolete(const char *val, struct kernel_param *kp)
+static int set_obsolete(const char *val, struct kernel_param *kp)
{
unsigned int min, max;
unsigned int size, maxsize;
@@ -1031,6 +1069,32 @@ static void module_remove_refcnt_attr(struct module *mod)
}
#endif
+#ifdef CONFIG_MODULE_UNLOAD
+static int module_add_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ int error = 0;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]) && !error; i++) {
+ if (!attr->test ||
+ (attr->test && attr->test(mod)))
+ error = sysfs_create_file(&mod->mkobj.kobj,&attr->attr);
+ }
+ return error;
+}
+
+static void module_remove_modinfo_attrs(struct module *mod)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+ attr->free(mod);
+ }
+}
+#endif
static int mod_sysfs_setup(struct module *mod,
struct kernel_param *kparam,
@@ -1056,6 +1120,12 @@ static int mod_sysfs_setup(struct module *mod,
if (err)
goto out_unreg;
+#ifdef CONFIG_MODULE_UNLOAD
+ err = module_add_modinfo_attrs(mod);
+ if (err)
+ goto out_unreg;
+#endif
+
return 0;
out_unreg:
@@ -1066,6 +1136,9 @@ out:
static void mod_kobject_remove(struct module *mod)
{
+#ifdef CONFIG_MODULE_UNLOAD
+ module_remove_modinfo_attrs(mod);
+#endif
module_remove_refcnt_attr(mod);
module_param_sysfs_remove(mod);
@@ -1311,6 +1384,23 @@ static char *get_modinfo(Elf_Shdr *sechdrs,
return NULL;
}
+#ifdef CONFIG_MODULE_UNLOAD
+static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
+ unsigned int infoindex)
+{
+ struct module_attribute *attr;
+ int i;
+
+ for (i = 0; (attr = modinfo_attrs[i]); i++) {
+ if (attr->setup)
+ attr->setup(mod,
+ get_modinfo(sechdrs,
+ infoindex,
+ attr->attr.name));
+ }
+}
+#endif
+
#ifdef CONFIG_KALLSYMS
int is_exported(const char *name, const struct module *mod)
{
@@ -1615,6 +1705,11 @@ static struct module *load_module(void __user *umod,
/* Set up license info based on the info section */
set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
+#ifdef CONFIG_MODULE_UNLOAD
+ /* Set up MODINFO_ATTR fields */
+ setup_modinfo(mod, sechdrs, infoindex);
+#endif
+
/* Fix up syms, so that st_value is a pointer to location. */
err = simplify_symbols(sechdrs, symindex, strtab, versindex, pcpuindex,
mod);
@@ -1758,6 +1853,7 @@ sys_init_module(void __user *umod,
const char __user *uargs)
{
struct module *mod;
+ mm_segment_t old_fs = get_fs();
int ret = 0;
/* Must have permission */
@@ -1775,6 +1871,9 @@ sys_init_module(void __user *umod,
return PTR_ERR(mod);
}
+ /* flush the icache in correct context */
+ set_fs(KERNEL_DS);
+
/* Flush the instruction cache, since we've played with text */
if (mod->module_init)
flush_icache_range((unsigned long)mod->module_init,
@@ -1783,6 +1882,8 @@ sys_init_module(void __user *umod,
flush_icache_range((unsigned long)mod->module_core,
(unsigned long)mod->module_core + mod->core_size);
+ set_fs(old_fs);
+
/* Now sew it into the lists. They won't access us, since
strong_try_module_get() will fail. */
stop_machine_run(__link_module, mod, NR_CPUS);
diff --git a/kernel/panic.c b/kernel/panic.c
index 081f7465fc8d..74ba5f3e46c7 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -18,6 +18,7 @@
#include <linux/sysrq.h>
#include <linux/interrupt.h>
#include <linux/nmi.h>
+#include <linux/kexec.h>
int panic_timeout;
int panic_on_oops;
@@ -63,6 +64,13 @@ NORET_TYPE void panic(const char * fmt, ...)
unsigned long caller = (unsigned long) __builtin_return_address(0);
#endif
+ /*
+ * It's possible to come here directly from a panic-assertion and not
+ * have preempt disabled. Some functions called from here want
+ * preempt to be disabled. No point enabling it later though...
+ */
+ preempt_disable();
+
bust_spinlocks(1);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
@@ -70,7 +78,19 @@ NORET_TYPE void panic(const char * fmt, ...)
printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
bust_spinlocks(0);
+ /*
+ * If we have crashed and we have a crash kernel loaded let it handle
+ * everything else.
+ * Do we want to call this before we try to display a message?
+ */
+ crash_kexec(NULL);
+
#ifdef CONFIG_SMP
+ /*
+ * Note smp_send_stop is the usual smp shutdown function, which
+ * unfortunately means it may not be hardened to work in a panic
+ * situation.
+ */
smp_send_stop();
#endif
@@ -79,8 +99,7 @@ NORET_TYPE void panic(const char * fmt, ...)
if (!panic_blink)
panic_blink = no_blink;
- if (panic_timeout > 0)
- {
+ if (panic_timeout > 0) {
/*
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked..
diff --git a/kernel/params.c b/kernel/params.c
index 5513844bec13..d586c35ef8fc 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -629,7 +629,7 @@ static ssize_t module_attr_show(struct kobject *kobj,
mk = to_module_kobject(kobj);
if (!attribute->show)
- return -EPERM;
+ return -EIO;
if (!try_module_get(mk->mod))
return -ENODEV;
@@ -653,7 +653,7 @@ static ssize_t module_attr_store(struct kobject *kobj,
mk = to_module_kobject(kobj);
if (!attribute->store)
- return -EPERM;
+ return -EIO;
if (!try_module_get(mk->mod))
return -ENODEV;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index fd316c272260..5b7b4736d82b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -89,23 +89,6 @@ static struct idr posix_timers_id;
static DEFINE_SPINLOCK(idr_lock);
/*
- * Just because the timer is not in the timer list does NOT mean it is
- * inactive. It could be in the "fire" routine getting a new expire time.
- */
-#define TIMER_INACTIVE 1
-
-#ifdef CONFIG_SMP
-# define timer_active(tmr) \
- ((tmr)->it.real.timer.entry.prev != (void *)TIMER_INACTIVE)
-# define set_timer_inactive(tmr) \
- do { \
- (tmr)->it.real.timer.entry.prev = (void *)TIMER_INACTIVE; \
- } while (0)
-#else
-# define timer_active(tmr) BARFY // error to use outside of SMP
-# define set_timer_inactive(tmr) do { } while (0)
-#endif
-/*
* we assume that the new SIGEV_THREAD_ID shares no bits with the other
* SIGEV values. Here we put out an error if this assumption fails.
*/
@@ -226,7 +209,6 @@ static inline int common_timer_create(struct k_itimer *new_timer)
init_timer(&new_timer->it.real.timer);
new_timer->it.real.timer.data = (unsigned long) new_timer;
new_timer->it.real.timer.function = posix_timer_fn;
- set_timer_inactive(new_timer);
return 0;
}
@@ -480,7 +462,6 @@ static void posix_timer_fn(unsigned long __data)
int do_notify = 1;
spin_lock_irqsave(&timr->it_lock, flags);
- set_timer_inactive(timr);
if (!list_empty(&timr->it.real.abs_timer_entry)) {
spin_lock(&abs_list.lock);
do {
@@ -983,8 +964,8 @@ common_timer_set(struct k_itimer *timr, int flags,
* careful here. If smp we could be in the "fire" routine which will
* be spinning as we hold the lock. But this is ONLY an SMP issue.
*/
+ if (try_to_del_timer_sync(&timr->it.real.timer) < 0) {
#ifdef CONFIG_SMP
- if (timer_active(timr) && !del_timer(&timr->it.real.timer))
/*
* It can only be active if on an other cpu. Since
* we have cleared the interval stuff above, it should
@@ -994,11 +975,9 @@ common_timer_set(struct k_itimer *timr, int flags,
* a "retry" exit status.
*/
return TIMER_RETRY;
-
- set_timer_inactive(timr);
-#else
- del_timer(&timr->it.real.timer);
#endif
+ }
+
remove_from_abslist(timr);
timr->it_requeue_pending = (timr->it_requeue_pending + 2) &
@@ -1083,8 +1062,9 @@ retry:
static inline int common_timer_del(struct k_itimer *timer)
{
timer->it.real.incr = 0;
+
+ if (try_to_del_timer_sync(&timer->it.real.timer) < 0) {
#ifdef CONFIG_SMP
- if (timer_active(timer) && !del_timer(&timer->it.real.timer))
/*
* It can only be active if on an other cpu. Since
* we have cleared the interval stuff above, it should
@@ -1094,9 +1074,9 @@ static inline int common_timer_del(struct k_itimer *timer)
* a "retry" exit status.
*/
return TIMER_RETRY;
-#else
- del_timer(&timer->it.real.timer);
#endif
+ }
+
remove_from_abslist(timer);
return 0;
@@ -1197,6 +1177,7 @@ void exit_itimers(struct signal_struct *sig)
tmr = list_entry(sig->posix_timers.next, struct k_itimer, list);
itimer_delete(tmr);
}
+ del_timer_sync(&sig->real_timer);
}
/*
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 696387ffe49c..2c7121d9bff1 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,8 +27,8 @@ config PM_DEBUG
like suspend support.
config SOFTWARE_SUSPEND
- bool "Software Suspend (EXPERIMENTAL)"
- depends on EXPERIMENTAL && PM && SWAP
+ bool "Software Suspend"
+ depends on EXPERIMENTAL && PM && SWAP && ((X86 && SMP) || ((FVR || PPC32 || X86) && !SMP))
---help---
Enable the possibility of suspending the machine.
It doesn't need APM.
@@ -72,3 +72,7 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.
+config SUSPEND_SMP
+ bool
+ depends on HOTPLUG_CPU && X86 && PM
+ default y
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index fbdc634135a7..2f438d0eaa13 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,9 +3,9 @@ ifeq ($(CONFIG_PM_DEBUG),y)
EXTRA_CFLAGS += -DDEBUG
endif
-swsusp-smp-$(CONFIG_SMP) += smp.o
-
obj-y := main.o process.o console.o pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o $(swsusp-smp-y) disk.o
+obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o
+
+obj-$(CONFIG_SUSPEND_SMP) += smp.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 02b6764034dc..fb8de63c2919 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -117,8 +117,8 @@ static void finish(void)
{
device_resume();
platform_finish();
- enable_nonboot_cpus();
thaw_processes();
+ enable_nonboot_cpus();
pm_restore_console();
}
@@ -131,28 +131,35 @@ static int prepare_processes(void)
sys_sync();
+ disable_nonboot_cpus();
+
if (freeze_processes()) {
error = -EBUSY;
- return error;
+ goto thaw;
}
if (pm_disk_mode == PM_DISK_PLATFORM) {
if (pm_ops && pm_ops->prepare) {
if ((error = pm_ops->prepare(PM_SUSPEND_DISK)))
- return error;
+ goto thaw;
}
}
/* Free memory before shutting down devices. */
free_some_memory();
-
return 0;
+thaw:
+ thaw_processes();
+ enable_nonboot_cpus();
+ pm_restore_console();
+ return error;
}
static void unprepare_processes(void)
{
- enable_nonboot_cpus();
+ platform_finish();
thaw_processes();
+ enable_nonboot_cpus();
pm_restore_console();
}
@@ -160,15 +167,9 @@ static int prepare_devices(void)
{
int error;
- disable_nonboot_cpus();
- if ((error = device_suspend(PMSG_FREEZE))) {
+ if ((error = device_suspend(PMSG_FREEZE)))
printk("Some devices failed to suspend\n");
- platform_finish();
- enable_nonboot_cpus();
- return error;
- }
-
- return 0;
+ return error;
}
/**
@@ -185,9 +186,9 @@ int pm_suspend_disk(void)
int error;
error = prepare_processes();
- if (!error) {
- error = prepare_devices();
- }
+ if (error)
+ return error;
+ error = prepare_devices();
if (error) {
unprepare_processes();
@@ -250,7 +251,7 @@ static int software_resume(void)
if ((error = prepare_processes())) {
swsusp_close();
- goto Cleanup;
+ goto Done;
}
pr_debug("PM: Reading swsusp image.\n");
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 7960ddf04a57..c94cb9e95090 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -55,6 +55,13 @@ static int suspend_prepare(suspend_state_t state)
pm_prepare_console();
+ disable_nonboot_cpus();
+
+ if (num_online_cpus() != 1) {
+ error = -EPERM;
+ goto Enable_cpu;
+ }
+
if (freeze_processes()) {
error = -EAGAIN;
goto Thaw;
@@ -75,6 +82,8 @@ static int suspend_prepare(suspend_state_t state)
pm_ops->finish(state);
Thaw:
thaw_processes();
+ Enable_cpu:
+ enable_nonboot_cpus();
pm_restore_console();
return error;
}
@@ -113,6 +122,7 @@ static void suspend_finish(suspend_state_t state)
if (pm_ops && pm_ops->finish)
pm_ops->finish(state);
thaw_processes();
+ enable_nonboot_cpus();
pm_restore_console();
}
@@ -150,20 +160,14 @@ static int enter_state(suspend_state_t state)
goto Unlock;
}
- /* Suspend is hard to get right on SMP. */
- if (num_online_cpus() != 1) {
- error = -EPERM;
- goto Unlock;
- }
-
- pr_debug("PM: Preparing system for suspend\n");
+ pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
if ((error = suspend_prepare(state)))
goto Unlock;
- pr_debug("PM: Entering state.\n");
+ pr_debug("PM: Entering %s sleep\n", pm_states[state]);
error = suspend_enter(state);
- pr_debug("PM: Finishing up.\n");
+ pr_debug("PM: Finishing wakeup.\n");
suspend_finish(state);
Unlock:
up(&pm_sem);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 78d92dc6a1ed..0a086640bcfc 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -32,7 +32,7 @@ static inline int freezeable(struct task_struct * p)
}
/* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(unsigned long flag)
+void refrigerator(void)
{
/* Hmm, should we be allowed to suspend when there are realtime
processes around? */
@@ -41,14 +41,13 @@ void refrigerator(unsigned long flag)
current->state = TASK_UNINTERRUPTIBLE;
pr_debug("%s entered refrigerator\n", current->comm);
printk("=");
- current->flags &= ~PF_FREEZE;
+ frozen_process(current);
spin_lock_irq(&current->sighand->siglock);
recalc_sigpending(); /* We sent fake signal, clean it up */
spin_unlock_irq(&current->sighand->siglock);
- current->flags |= PF_FROZEN;
- while (current->flags & PF_FROZEN)
+ while (frozen(current))
schedule();
pr_debug("%s left refrigerator\n", current->comm);
current->state = save;
@@ -57,10 +56,10 @@ void refrigerator(unsigned long flag)
/* 0 = success, else # of processes that we failed to stop */
int freeze_processes(void)
{
- int todo;
- unsigned long start_time;
+ int todo;
+ unsigned long start_time;
struct task_struct *g, *p;
-
+
printk( "Stopping tasks: " );
start_time = jiffies;
do {
@@ -70,14 +69,12 @@ int freeze_processes(void)
unsigned long flags;
if (!freezeable(p))
continue;
- if ((p->flags & PF_FROZEN) ||
+ if ((frozen(p)) ||
(p->state == TASK_TRACED) ||
(p->state == TASK_STOPPED))
continue;
- /* FIXME: smp problem here: we may not access other process' flags
- without locking */
- p->flags |= PF_FREEZE;
+ freeze(p);
spin_lock_irqsave(&p->sighand->siglock, flags);
signal_wake_up(p, 0);
spin_unlock_irqrestore(&p->sighand->siglock, flags);
@@ -91,7 +88,7 @@ int freeze_processes(void)
return todo;
}
} while(todo);
-
+
printk( "|\n" );
BUG_ON(in_atomic());
return 0;
@@ -106,10 +103,7 @@ void thaw_processes(void)
do_each_thread(g, p) {
if (!freezeable(p))
continue;
- if (p->flags & PF_FROZEN) {
- p->flags &= ~PF_FROZEN;
- wake_up_process(p);
- } else
+ if (!thaw_process(p))
printk(KERN_INFO " Strange, %s not stopped\n", p->comm );
} while_each_thread(g, p);
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
index cba3584b80fe..bbe23079c62c 100644
--- a/kernel/power/smp.c
+++ b/kernel/power/smp.c
@@ -13,73 +13,52 @@
#include <linux/interrupt.h>
#include <linux/suspend.h>
#include <linux/module.h>
+#include <linux/cpu.h>
#include <asm/atomic.h>
#include <asm/tlbflush.h>
-static atomic_t cpu_counter, freeze;
-
-
-static void smp_pause(void * data)
-{
- struct saved_context ctxt;
- __save_processor_state(&ctxt);
- printk("Sleeping in:\n");
- dump_stack();
- atomic_inc(&cpu_counter);
- while (atomic_read(&freeze)) {
- /* FIXME: restore takes place at random piece inside this.
- This should probably be written in assembly, and
- preserve general-purpose registers, too
-
- What about stack? We may need to move to new stack here.
-
- This should better be ran with interrupts disabled.
- */
- cpu_relax();
- barrier();
- }
- atomic_dec(&cpu_counter);
- __restore_processor_state(&ctxt);
-}
-
-static cpumask_t oldmask;
+/* This is protected by pm_sem semaphore */
+static cpumask_t frozen_cpus;
void disable_nonboot_cpus(void)
{
- oldmask = current->cpus_allowed;
- set_cpus_allowed(current, cpumask_of_cpu(0));
- printk("Freezing CPUs (at %d)", _smp_processor_id());
- current->state = TASK_INTERRUPTIBLE;
- schedule_timeout(HZ);
- printk("...");
- BUG_ON(_smp_processor_id() != 0);
-
- /* FIXME: for this to work, all the CPUs must be running
- * "idle" thread (or we deadlock). Is that guaranteed? */
+ int cpu, error;
- atomic_set(&cpu_counter, 0);
- atomic_set(&freeze, 1);
- smp_call_function(smp_pause, NULL, 0, 0);
- while (atomic_read(&cpu_counter) < (num_online_cpus() - 1)) {
- cpu_relax();
- barrier();
+ error = 0;
+ cpus_clear(frozen_cpus);
+ printk("Freezing cpus ...\n");
+ for_each_online_cpu(cpu) {
+ if (cpu == 0)
+ continue;
+ error = cpu_down(cpu);
+ if (!error) {
+ cpu_set(cpu, frozen_cpus);
+ printk("CPU%d is down\n", cpu);
+ continue;
+ }
+ printk("Error taking cpu %d down: %d\n", cpu, error);
}
- printk("ok\n");
+ BUG_ON(smp_processor_id() != 0);
+ if (error)
+ panic("cpus not sleeping");
}
void enable_nonboot_cpus(void)
{
- printk("Restarting CPUs");
- atomic_set(&freeze, 0);
- while (atomic_read(&cpu_counter)) {
- cpu_relax();
- barrier();
- }
- printk("...");
- set_cpus_allowed(current, oldmask);
- schedule();
- printk("ok\n");
+ int cpu, error;
+ printk("Thawing cpus ...\n");
+ for_each_cpu_mask(cpu, frozen_cpus) {
+ error = smp_prepare_cpu(cpu);
+ if (!error)
+ error = cpu_up(cpu);
+ if (!error) {
+ printk("CPU%d is up\n", cpu);
+ continue;
+ }
+ printk("Error taking cpu %d up: %d\n", cpu, error);
+ panic("Not enough cpus");
+ }
+ cpus_clear(frozen_cpus);
}
-
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 90b3b68dee3f..c285fc5a2320 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -10,12 +10,12 @@
* This file is released under the GPLv2.
*
* I'd like to thank the following people for their work:
- *
+ *
* Pavel Machek <pavel@ucw.cz>:
* Modifications, defectiveness pointing, being with me at the very beginning,
* suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
*
- * Steve Doddi <dirk@loth.demon.co.uk>:
+ * Steve Doddi <dirk@loth.demon.co.uk>:
* Support the possibility of hardware state restoring.
*
* Raph <grey.havens@earthling.net>:
@@ -81,14 +81,14 @@ static int nr_copy_pages_check;
extern char resume_file[];
/* Local variables that should not be affected by save */
-unsigned int nr_copy_pages __nosavedata = 0;
+static unsigned int nr_copy_pages __nosavedata = 0;
/* Suspend pagedir is allocated before final copy, therefore it
- must be freed after resume
+ must be freed after resume
Warning: this is evil. There are actually two pagedirs at time of
resume. One is "pagedir_save", which is empty frame allocated at
- time of suspend, that must be freed. Second is "pagedir_nosave",
+ time of suspend, that must be freed. Second is "pagedir_nosave",
allocated at time of resume, that travels through memory not to
collide with anything.
@@ -132,7 +132,7 @@ static int mark_swapfiles(swp_entry_t prev)
{
int error;
- rw_swap_page_sync(READ,
+ rw_swap_page_sync(READ,
swp_entry(root_swap, 0),
virt_to_page((unsigned long)&swsusp_header));
if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
@@ -140,7 +140,7 @@ static int mark_swapfiles(swp_entry_t prev)
memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
swsusp_header.swsusp_info = prev;
- error = rw_swap_page_sync(WRITE,
+ error = rw_swap_page_sync(WRITE,
swp_entry(root_swap, 0),
virt_to_page((unsigned long)
&swsusp_header));
@@ -174,22 +174,22 @@ static int is_resume_device(const struct swap_info_struct *swap_info)
static int swsusp_swap_check(void) /* This is called before saving image */
{
int i, len;
-
+
len=strlen(resume_file);
root_swap = 0xFFFF;
-
+
swap_list_lock();
- for(i=0; i<MAX_SWAPFILES; i++) {
+ for (i=0; i<MAX_SWAPFILES; i++) {
if (swap_info[i].flags == 0) {
swapfile_used[i]=SWAPFILE_UNUSED;
} else {
- if(!len) {
+ if (!len) {
printk(KERN_WARNING "resume= option should be used to set suspend device" );
- if(root_swap == 0xFFFF) {
+ if (root_swap == 0xFFFF) {
swapfile_used[i] = SWAPFILE_SUSPEND;
root_swap = i;
} else
- swapfile_used[i] = SWAPFILE_IGNORED;
+ swapfile_used[i] = SWAPFILE_IGNORED;
} else {
/* we ignore all swap devices that are not the resume_file */
if (is_resume_device(&swap_info[i])) {
@@ -209,15 +209,15 @@ static int swsusp_swap_check(void) /* This is called before saving image */
* This is called after saving image so modification
* will be lost after resume... and that's what we want.
* we make the device unusable. A new call to
- * lock_swapdevices can unlock the devices.
+ * lock_swapdevices can unlock the devices.
*/
static void lock_swapdevices(void)
{
int i;
swap_list_lock();
- for(i = 0; i< MAX_SWAPFILES; i++)
- if(swapfile_used[i] == SWAPFILE_IGNORED) {
+ for (i = 0; i< MAX_SWAPFILES; i++)
+ if (swapfile_used[i] == SWAPFILE_IGNORED) {
swap_info[i].flags ^= 0xFF;
}
swap_list_unlock();
@@ -229,7 +229,7 @@ static void lock_swapdevices(void)
* @loc: Place to store the entry we used.
*
* Allocate a new swap entry and 'sync' it. Note we discard -EIO
- * errors. That is an artifact left over from swsusp. It did not
+ * errors. That is an artifact left over from swsusp. It did not
* check the return of rw_swap_page_sync() at all, since most pages
* written back to swap would return -EIO.
* This is a partial improvement, since we will at least return other
@@ -241,7 +241,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
int error = 0;
entry = get_swap_page();
- if (swp_offset(entry) &&
+ if (swp_offset(entry) &&
swapfile_used[swp_type(entry)] == SWAPFILE_SUSPEND) {
error = rw_swap_page_sync(WRITE, entry,
virt_to_page(addr));
@@ -257,7 +257,7 @@ static int write_page(unsigned long addr, swp_entry_t * loc)
/**
* data_free - Free the swap entries used by the saved image.
*
- * Walk the list of used swap entries and free each one.
+ * Walk the list of used swap entries and free each one.
* This is only used for cleanup when suspend fails.
*/
static void data_free(void)
@@ -290,7 +290,7 @@ static int data_write(void)
mod = 1;
printk( "Writing data to swap (%d pages)... ", nr_copy_pages );
- for_each_pbe(p, pagedir_nosave) {
+ for_each_pbe (p, pagedir_nosave) {
if (!(i%mod))
printk( "\b\b\b\b%3d%%", i / mod );
if ((error = write_page(p->address, &(p->swap_address))))
@@ -335,7 +335,7 @@ static int close_swap(void)
dump_info();
error = write_page((unsigned long)&swsusp_info, &entry);
- if (!error) {
+ if (!error) {
printk( "S" );
error = mark_swapfiles(entry);
printk( "|\n" );
@@ -370,7 +370,7 @@ static int write_pagedir(void)
struct pbe * pbe;
printk( "Writing pagedir...");
- for_each_pb_page(pbe, pagedir_nosave) {
+ for_each_pb_page (pbe, pagedir_nosave) {
if ((error = write_page((unsigned long)pbe, &swsusp_info.pagedir[n++])))
return error;
}
@@ -472,7 +472,7 @@ static int save_highmem(void)
int res = 0;
pr_debug("swsusp: Saving Highmem\n");
- for_each_zone(zone) {
+ for_each_zone (zone) {
if (is_highmem(zone))
res = save_highmem_zone(zone);
if (res)
@@ -547,7 +547,7 @@ static void count_data_pages(void)
nr_copy_pages = 0;
- for_each_zone(zone) {
+ for_each_zone (zone) {
if (is_highmem(zone))
continue;
mark_free_pages(zone);
@@ -562,9 +562,9 @@ static void copy_data_pages(void)
struct zone *zone;
unsigned long zone_pfn;
struct pbe * pbe = pagedir_nosave;
-
+
pr_debug("copy_data_pages(): pages to copy: %d\n", nr_copy_pages);
- for_each_zone(zone) {
+ for_each_zone (zone) {
if (is_highmem(zone))
continue;
mark_free_pages(zone);
@@ -702,7 +702,7 @@ static void free_image_pages(void)
{
struct pbe * p;
- for_each_pbe(p, pagedir_save) {
+ for_each_pbe (p, pagedir_save) {
if (p->address) {
ClearPageNosave(virt_to_page(p->address));
free_page(p->address);
@@ -719,7 +719,7 @@ static int alloc_image_pages(void)
{
struct pbe * p;
- for_each_pbe(p, pagedir_save) {
+ for_each_pbe (p, pagedir_save) {
p->address = get_zeroed_page(GFP_ATOMIC | __GFP_COLD);
if (!p->address)
return -ENOMEM;
@@ -740,7 +740,7 @@ void swsusp_free(void)
/**
* enough_free_mem - Make sure we enough free memory to snapshot.
*
- * Returns TRUE or FALSE after checking the number of available
+ * Returns TRUE or FALSE after checking the number of available
* free pages.
*/
@@ -758,11 +758,11 @@ static int enough_free_mem(void)
/**
* enough_swap - Make sure we have enough swap to save the image.
*
- * Returns TRUE or FALSE after checking the total amount of swap
+ * Returns TRUE or FALSE after checking the total amount of swap
* space avaiable.
*
* FIXME: si_swapinfo(&i) returns all swap devices information.
- * We should only consider resume_device.
+ * We should only consider resume_device.
*/
static int enough_swap(void)
@@ -781,18 +781,18 @@ static int swsusp_alloc(void)
{
int error;
+ pagedir_nosave = NULL;
+ nr_copy_pages = calc_nr(nr_copy_pages);
+
pr_debug("suspend: (pages needed: %d + %d free: %d)\n",
nr_copy_pages, PAGES_FOR_IO, nr_free_pages());
- pagedir_nosave = NULL;
if (!enough_free_mem())
return -ENOMEM;
if (!enough_swap())
return -ENOSPC;
- nr_copy_pages = calc_nr(nr_copy_pages);
-
if (!(pagedir_save = alloc_pagedir(nr_copy_pages))) {
printk(KERN_ERR "suspend: Allocating pagedir failed.\n");
return -ENOMEM;
@@ -827,8 +827,8 @@ static int suspend_prepare_image(void)
error = swsusp_alloc();
if (error)
return error;
-
- /* During allocating of suspend pagedir, new cold pages may appear.
+
+ /* During allocating of suspend pagedir, new cold pages may appear.
* Kill them.
*/
drain_local_pages();
@@ -929,21 +929,6 @@ int swsusp_resume(void)
return error;
}
-/* More restore stuff */
-
-/*
- * Returns true if given address/order collides with any orig_address
- */
-static int does_collide_order(unsigned long addr, int order)
-{
- int i;
-
- for (i=0; i < (1<<order); i++)
- if (!PageNosaveFree(virt_to_page(addr + i * PAGE_SIZE)))
- return 1;
- return 0;
-}
-
/**
* On resume, for storing the PBE list and the image,
* we can only use memory pages that do not conflict with the pages
@@ -973,7 +958,7 @@ static unsigned long get_usable_page(unsigned gfp_mask)
unsigned long m;
m = get_zeroed_page(gfp_mask);
- while (does_collide_order(m, 0)) {
+ while (!PageNosaveFree(virt_to_page(m))) {
eat_page((void *)m);
m = get_zeroed_page(gfp_mask);
if (!m)
@@ -1045,7 +1030,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
/* Set page flags */
- for_each_zone(zone) {
+ for_each_zone (zone) {
for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
SetPageNosaveFree(pfn_to_page(zone_pfn +
zone->zone_start_pfn));
@@ -1061,7 +1046,7 @@ static struct pbe * swsusp_pagedir_relocate(struct pbe *pblist)
/* Relocate colliding pages */
for_each_pb_page (pbpage, pblist) {
- if (does_collide_order((unsigned long)pbpage, 0)) {
+ if (!PageNosaveFree(virt_to_page((unsigned long)pbpage))) {
m = (void *)get_usable_page(GFP_ATOMIC | __GFP_COLD);
if (!m) {
error = -ENOMEM;
@@ -1193,8 +1178,10 @@ static const char * sanity_check(void)
return "version";
if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
return "machine";
+#if 0
if(swsusp_info.cpus != num_online_cpus())
return "number of cpus";
+#endif
return NULL;
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 290a07ce2c8a..5092397fac29 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -160,42 +160,6 @@ static int __init console_setup(char *str)
__setup("console=", console_setup);
-/**
- * add_preferred_console - add a device to the list of preferred consoles.
- *
- * The last preferred console added will be used for kernel messages
- * and stdin/out/err for init. Normally this is used by console_setup
- * above to handle user-supplied console arguments; however it can also
- * be used by arch-specific code either to override the user or more
- * commonly to provide a default console (ie from PROM variables) when
- * the user has not supplied one.
- */
-int __init add_preferred_console(char *name, int idx, char *options)
-{
- struct console_cmdline *c;
- int i;
-
- /*
- * See if this tty is not yet registered, and
- * if we have a slot free.
- */
- for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
- if (strcmp(console_cmdline[i].name, name) == 0 &&
- console_cmdline[i].index == idx) {
- selected_console = i;
- return 0;
- }
- if (i == MAX_CMDLINECONSOLES)
- return -E2BIG;
- selected_console = i;
- c = &console_cmdline[i];
- memcpy(c->name, name, sizeof(c->name));
- c->name[sizeof(c->name) - 1] = 0;
- c->options = options;
- c->index = idx;
- return 0;
-}
-
static int __init log_buf_len_setup(char *str)
{
unsigned long size = memparse(str, &str);
@@ -624,8 +588,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
log_level_unknown = 1;
}
- if (!cpu_online(smp_processor_id()) &&
- system_state != SYSTEM_RUNNING) {
+ if (!cpu_online(smp_processor_id())) {
/*
* Some console drivers may assume that per-cpu resources have
* been allocated. So don't allow them to be called by this
@@ -671,6 +634,42 @@ static void call_console_drivers(unsigned long start, unsigned long end) {}
#endif
/**
+ * add_preferred_console - add a device to the list of preferred consoles.
+ *
+ * The last preferred console added will be used for kernel messages
+ * and stdin/out/err for init. Normally this is used by console_setup
+ * above to handle user-supplied console arguments; however it can also
+ * be used by arch-specific code either to override the user or more
+ * commonly to provide a default console (ie from PROM variables) when
+ * the user has not supplied one.
+ */
+int __init add_preferred_console(char *name, int idx, char *options)
+{
+ struct console_cmdline *c;
+ int i;
+
+ /*
+ * See if this tty is not yet registered, and
+ * if we have a slot free.
+ */
+ for(i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++)
+ if (strcmp(console_cmdline[i].name, name) == 0 &&
+ console_cmdline[i].index == idx) {
+ selected_console = i;
+ return 0;
+ }
+ if (i == MAX_CMDLINECONSOLES)
+ return -E2BIG;
+ selected_console = i;
+ c = &console_cmdline[i];
+ memcpy(c->name, name, sizeof(c->name));
+ c->name[sizeof(c->name) - 1] = 0;
+ c->options = options;
+ c->index = idx;
+ return 0;
+}
+
+/**
* acquire_console_sem - lock the console system for exclusive use.
*
* Acquires a semaphore which guarantees that the caller has
@@ -876,8 +875,10 @@ void register_console(struct console * console)
break;
console->flags |= CON_ENABLED;
console->index = console_cmdline[i].index;
- if (i == preferred_console)
+ if (i == selected_console) {
console->flags |= CON_CONSDEV;
+ preferred_console = selected_console;
+ }
break;
}
@@ -897,6 +898,8 @@ void register_console(struct console * console)
if ((console->flags & CON_CONSDEV) || console_drivers == NULL) {
console->next = console_drivers;
console_drivers = console;
+ if (console->next)
+ console->next->flags &= ~CON_CONSDEV;
} else {
console->next = console_drivers->next;
console_drivers->next = console;
@@ -937,10 +940,14 @@ int unregister_console(struct console * console)
/* If last console is removed, we re-enable picking the first
* one that gets registered. Without that, pmac early boot console
* would prevent fbcon from taking over.
+ *
+ * If this isn't the last console and it has CON_CONSDEV set, we
+ * need to set it on the next preferred console.
*/
if (console_drivers == NULL)
preferred_console = selected_console;
-
+ else if (console->flags & CON_CONSDEV)
+ console_drivers->flags |= CON_CONSDEV;
release_console_sem();
return res;
diff --git a/kernel/profile.c b/kernel/profile.c
index 0221a50ca867..ad8cbb75ffa2 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -49,15 +49,19 @@ static DECLARE_MUTEX(profile_flip_mutex);
static int __init profile_setup(char * str)
{
+ static char __initdata schedstr[] = "schedule";
int par;
- if (!strncmp(str, "schedule", 8)) {
+ if (!strncmp(str, schedstr, strlen(schedstr))) {
prof_on = SCHED_PROFILING;
- printk(KERN_INFO "kernel schedule profiling enabled\n");
- if (str[7] == ',')
- str += 8;
- }
- if (get_option(&str,&par)) {
+ if (str[strlen(schedstr)] == ',')
+ str += strlen(schedstr) + 1;
+ if (get_option(&str, &par))
+ prof_shift = par;
+ printk(KERN_INFO
+ "kernel schedule profiling enabled (shift: %ld)\n",
+ prof_shift);
+ } else if (get_option(&str, &par)) {
prof_shift = par;
prof_on = CPU_PROFILING;
printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n",
diff --git a/kernel/resource.c b/kernel/resource.c
index 52f696f11adf..26967e042201 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -263,7 +263,7 @@ static int find_resource(struct resource *root, struct resource *new,
new->start = min;
if (new->end > max)
new->end = max;
- new->start = (new->start + align - 1) & ~(align - 1);
+ new->start = ALIGN(new->start, align);
if (alignf)
alignf(alignf_data, new, size, align);
if (new->start < new->end && new->end - new->start >= size - 1) {
diff --git a/kernel/sched.c b/kernel/sched.c
index 0dc3158667a2..a07cff90d849 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -166,7 +166,7 @@
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE)
-static inline unsigned int task_timeslice(task_t *p)
+static unsigned int task_timeslice(task_t *p)
{
if (p->static_prio < NICE_TO_PRIO(0))
return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio);
@@ -206,7 +206,7 @@ struct runqueue {
*/
unsigned long nr_running;
#ifdef CONFIG_SMP
- unsigned long cpu_load;
+ unsigned long cpu_load[3];
#endif
unsigned long long nr_switches;
@@ -260,23 +260,87 @@ struct runqueue {
static DEFINE_PER_CPU(struct runqueue, runqueues);
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
#define for_each_domain(cpu, domain) \
- for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)
+for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
-/*
- * Default context-switch locking:
- */
#ifndef prepare_arch_switch
-# define prepare_arch_switch(rq, next) do { } while (0)
-# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
-# define task_running(rq, p) ((rq)->curr == (p))
+# define prepare_arch_switch(next) do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev) do { } while (0)
#endif
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+ return rq->curr == p;
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+ spin_unlock_irq(&rq->lock);
+}
+
+#else /* __ARCH_WANT_UNLOCKED_CTXSW */
+static inline int task_running(runqueue_t *rq, task_t *p)
+{
+#ifdef CONFIG_SMP
+ return p->oncpu;
+#else
+ return rq->curr == p;
+#endif
+}
+
+static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
+{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->oncpu = 1;
+#endif
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ spin_unlock_irq(&rq->lock);
+#else
+ spin_unlock(&rq->lock);
+#endif
+}
+
+static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
+{
+#ifdef CONFIG_SMP
+ /*
+ * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->oncpu = 0;
+#endif
+#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_enable();
+#endif
+}
+#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
+
/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -309,7 +373,7 @@ static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
-#define SCHEDSTAT_VERSION 11
+#define SCHEDSTAT_VERSION 12
static int show_schedstat(struct seq_file *seq, void *v)
{
@@ -338,6 +402,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
#ifdef CONFIG_SMP
/* domain-specific stats */
+ preempt_disable();
for_each_domain(cpu, sd) {
enum idle_type itype;
char mask_str[NR_CPUS];
@@ -356,11 +421,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
- seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu\n",
+ seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
- sd->sbe_pushed, sd->sbe_attempts,
+ sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance);
}
+ preempt_enable();
#endif
}
return 0;
@@ -414,22 +481,6 @@ static inline runqueue_t *this_rq_lock(void)
return rq;
}
-#ifdef CONFIG_SCHED_SMT
-static int cpu_and_siblings_are_idle(int cpu)
-{
- int sib;
- for_each_cpu_mask(sib, cpu_sibling_map[cpu]) {
- if (idle_cpu(sib))
- continue;
- return 0;
- }
-
- return 1;
-}
-#else
-#define cpu_and_siblings_are_idle(A) idle_cpu(A)
-#endif
-
#ifdef CONFIG_SCHEDSTATS
/*
* Called when a process is dequeued from the active array and given
@@ -622,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
rq->nr_running++;
}
-static void recalc_task_prio(task_t *p, unsigned long long now)
+static int recalc_task_prio(task_t *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
unsigned long long __sleep_time = now - p->timestamp;
@@ -681,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now)
}
}
- p->prio = effective_prio(p);
+ return effective_prio(p);
}
/*
@@ -704,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
}
#endif
- recalc_task_prio(p, now);
+ p->prio = recalc_task_prio(p, now);
/*
* This checks to make sure it's not an uninterruptible task
@@ -782,22 +833,12 @@ inline int task_curr(const task_t *p)
}
#ifdef CONFIG_SMP
-enum request_type {
- REQ_MOVE_TASK,
- REQ_SET_DOMAIN,
-};
-
typedef struct {
struct list_head list;
- enum request_type type;
- /* For REQ_MOVE_TASK */
task_t *task;
int dest_cpu;
- /* For REQ_SET_DOMAIN */
- struct sched_domain *sd;
-
struct completion done;
} migration_req_t;
@@ -819,7 +860,6 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
}
init_completion(&req->done);
- req->type = REQ_MOVE_TASK;
req->task = p;
req->dest_cpu = dest_cpu;
list_add(&req->list, &rq->migration_queue);
@@ -886,26 +926,154 @@ void kick_process(task_t *p)
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
- return min(rq->cpu_load, load_now);
+ return min(rq->cpu_load[type-1], load_now);
}
/*
* Return a high guess at the load of a migration-target cpu
*/
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
- return max(rq->cpu_load, load_now);
+ return max(rq->cpu_load[type-1], load_now);
}
-#endif
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
+{
+ struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+ unsigned long min_load = ULONG_MAX, this_load = 0;
+ int load_idx = sd->forkexec_idx;
+ int imbalance = 100 + (sd->imbalance_pct-100)/2;
+
+ do {
+ unsigned long load, avg_load;
+ int local_group;
+ int i;
+
+ local_group = cpu_isset(this_cpu, group->cpumask);
+ /* XXX: put a cpus allowed check */
+
+ /* Tally up the load of all CPUs in the group */
+ avg_load = 0;
+
+ for_each_cpu_mask(i, group->cpumask) {
+ /* Bias balancing toward cpus of our domain */
+ if (local_group)
+ load = source_load(i, load_idx);
+ else
+ load = target_load(i, load_idx);
+
+ avg_load += load;
+ }
+
+ /* Adjust by relative CPU power of the group */
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+ if (local_group) {
+ this_load = avg_load;
+ this = group;
+ } else if (avg_load < min_load) {
+ min_load = avg_load;
+ idlest = group;
+ }
+ group = group->next;
+ } while (group != sd->groups);
+
+ if (!idlest || 100*this_load < imbalance*min_load)
+ return NULL;
+ return idlest;
+}
+
+/*
+ * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ */
+static int find_idlest_cpu(struct sched_group *group, int this_cpu)
+{
+ unsigned long load, min_load = ULONG_MAX;
+ int idlest = -1;
+ int i;
+
+ for_each_cpu_mask(i, group->cpumask) {
+ load = source_load(i, 0);
+
+ if (load < min_load || (load == min_load && i == this_cpu)) {
+ min_load = load;
+ idlest = i;
+ }
+ }
+
+ return idlest;
+}
+
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int sched_balance_self(int cpu, int flag)
+{
+ struct task_struct *t = current;
+ struct sched_domain *tmp, *sd = NULL;
+
+ for_each_domain(cpu, tmp)
+ if (tmp->flags & flag)
+ sd = tmp;
+
+ while (sd) {
+ cpumask_t span;
+ struct sched_group *group;
+ int new_cpu;
+ int weight;
+
+ span = sd->span;
+ group = find_idlest_group(sd, t, cpu);
+ if (!group)
+ goto nextlevel;
+
+ new_cpu = find_idlest_cpu(group, cpu);
+ if (new_cpu == -1 || new_cpu == cpu)
+ goto nextlevel;
+
+ /* Now try balancing at a lower domain level */
+ cpu = new_cpu;
+nextlevel:
+ sd = NULL;
+ weight = cpus_weight(span);
+ for_each_domain(cpu, tmp) {
+ if (weight <= cpus_weight(tmp->span))
+ break;
+ if (tmp->flags & flag)
+ sd = tmp;
+ }
+ /* while loop will break here if sd == NULL */
+ }
+
+ return cpu;
+}
+
+#endif /* CONFIG_SMP */
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
@@ -927,14 +1095,14 @@ static int wake_idle(int cpu, task_t *p)
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_IDLE) {
- cpus_and(tmp, sd->span, cpu_online_map);
- cpus_and(tmp, tmp, p->cpus_allowed);
+ cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
return i;
}
}
- else break;
+ else
+ break;
}
return cpu;
}
@@ -967,7 +1135,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
runqueue_t *rq;
#ifdef CONFIG_SMP
unsigned long load, this_load;
- struct sched_domain *sd;
+ struct sched_domain *sd, *this_sd = NULL;
int new_cpu;
#endif
@@ -986,70 +1154,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
if (unlikely(task_running(rq, p)))
goto out_activate;
-#ifdef CONFIG_SCHEDSTATS
+ new_cpu = cpu;
+
schedstat_inc(rq, ttwu_cnt);
if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local);
- } else {
- for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
+ goto out_set_cpu;
+ }
+
+ for_each_domain(this_cpu, sd) {
+ if (cpu_isset(cpu, sd->span)) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ this_sd = sd;
+ break;
}
}
-#endif
- new_cpu = cpu;
- if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+ if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
- load = source_load(cpu);
- this_load = target_load(this_cpu);
-
/*
- * If sync wakeup then subtract the (maximum possible) effect of
- * the currently running task from the load of the current CPU:
+ * Check for affine wakeup and passive balancing possibilities.
*/
- if (sync)
- this_load -= SCHED_LOAD_SCALE;
+ if (this_sd) {
+ int idx = this_sd->wake_idx;
+ unsigned int imbalance;
- /* Don't pull the task off an idle CPU to a busy one */
- if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
- goto out_set_cpu;
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
- new_cpu = this_cpu; /* Wake to this CPU if we can */
+ load = source_load(cpu, idx);
+ this_load = target_load(this_cpu, idx);
- /*
- * Scan domains for affine wakeup and passive balancing
- * possibilities.
- */
- for_each_domain(this_cpu, sd) {
- unsigned int imbalance;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+ new_cpu = this_cpu; /* Wake to this CPU if we can */
- if ((sd->flags & SD_WAKE_AFFINE) &&
- !task_hot(p, rq->timestamp_last_tick, sd)) {
+ if (this_sd->flags & SD_WAKE_AFFINE) {
+ unsigned long tl = this_load;
/*
- * This domain has SD_WAKE_AFFINE and p is cache cold
- * in this domain.
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
*/
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_move_affine);
+ if (sync)
+ tl -= SCHED_LOAD_SCALE;
+
+ if ((tl <= load &&
+ tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+ 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+ /*
+ * This domain has SD_WAKE_AFFINE and
+ * p is cache cold in this domain, and
+ * there is no bad imbalance.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
goto out_set_cpu;
}
- } else if ((sd->flags & SD_WAKE_BALANCE) &&
- imbalance*this_load <= 100*load) {
- /*
- * This domain has SD_WAKE_BALANCE and there is
- * an imbalance.
- */
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_move_balance);
+ }
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
goto out_set_cpu;
}
}
@@ -1120,17 +1287,19 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
return try_to_wake_up(p, state, 0);
}
-#ifdef CONFIG_SMP
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
- struct sched_domain *sd);
-#endif
-
/*
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*/
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, int clone_flags)
{
+ int cpu = get_cpu();
+
+#ifdef CONFIG_SMP
+ cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
+#endif
+ set_task_cpu(p, cpu);
+
/*
* We mark the process as running here, but have not actually
* inserted it onto the runqueue yet. This guarantees that
@@ -1140,17 +1309,14 @@ void fastcall sched_fork(task_t *p)
p->state = TASK_RUNNING;
INIT_LIST_HEAD(&p->run_list);
p->array = NULL;
- spin_lock_init(&p->switch_lock);
#ifdef CONFIG_SCHEDSTATS
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ p->oncpu = 0;
+#endif
#ifdef CONFIG_PREEMPT
- /*
- * During context-switch we hold precisely one spinlock, which
- * schedule_tail drops. (in the common case it's this_rq()->lock,
- * but it also can be p->switch_lock.) So we compensate with a count
- * of 1. Also, we want to start with kernel preemption disabled.
- */
+ /* Want to start with kernel preemption disabled. */
p->thread_info->preempt_count = 1;
#endif
/*
@@ -1174,12 +1340,10 @@ void fastcall sched_fork(task_t *p)
* runqueue lock is not a problem.
*/
current->time_slice = 1;
- preempt_disable();
scheduler_tick();
- local_irq_enable();
- preempt_enable();
- } else
- local_irq_enable();
+ }
+ local_irq_enable();
+ put_cpu();
}
/*
@@ -1196,10 +1360,9 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
runqueue_t *rq, *this_rq;
rq = task_rq_lock(p, &flags);
- cpu = task_cpu(p);
- this_cpu = smp_processor_id();
-
BUG_ON(p->state != TASK_RUNNING);
+ this_cpu = smp_processor_id();
+ cpu = task_cpu(p);
/*
* We decrease the sleep average of forking parents
@@ -1296,22 +1459,40 @@ void fastcall sched_exit(task_t * p)
}
/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
+{
+ prepare_lock_switch(rq, next);
+ prepare_arch_switch(next);
+}
+
+/**
* finish_task_switch - clean up after a task-switch
* @prev: the thread we just switched away from.
*
- * We enter this with the runqueue still locked, and finish_arch_switch()
- * will unlock it along with doing any other architecture-specific cleanup
- * actions.
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
*
* Note that we may have delayed dropping an mm in context_switch(). If
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
*/
-static inline void finish_task_switch(task_t *prev)
+static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
__releases(rq->lock)
{
- runqueue_t *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
unsigned long prev_task_flags;
@@ -1329,7 +1510,8 @@ static inline void finish_task_switch(task_t *prev)
* Manfred Spraul <manfred@colorfullife.com>
*/
prev_task_flags = prev->flags;
- finish_arch_switch(rq, prev);
+ finish_arch_switch(prev);
+ finish_lock_switch(rq, prev);
if (mm)
mmdrop(mm);
if (unlikely(prev_task_flags & PF_DEAD))
@@ -1343,8 +1525,12 @@ static inline void finish_task_switch(task_t *prev)
asmlinkage void schedule_tail(task_t *prev)
__releases(rq->lock)
{
- finish_task_switch(prev);
-
+ runqueue_t *rq = this_rq();
+ finish_task_switch(rq, prev);
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+ /* In this case, finish_task_switch does not reenable preemption */
+ preempt_enable();
+#endif
if (current->set_child_tid)
put_user(current->pid, current->set_child_tid);
}
@@ -1494,51 +1680,6 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
}
/*
- * find_idlest_cpu - find the least busy runqueue.
- */
-static int find_idlest_cpu(struct task_struct *p, int this_cpu,
- struct sched_domain *sd)
-{
- unsigned long load, min_load, this_load;
- int i, min_cpu;
- cpumask_t mask;
-
- min_cpu = UINT_MAX;
- min_load = ULONG_MAX;
-
- cpus_and(mask, sd->span, p->cpus_allowed);
-
- for_each_cpu_mask(i, mask) {
- load = target_load(i);
-
- if (load < min_load) {
- min_cpu = i;
- min_load = load;
-
- /* break out early on an idle CPU: */
- if (!min_load)
- break;
- }
- }
-
- /* add +1 to account for the new task */
- this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
-
- /*
- * Would with the addition of the new task to the
- * current CPU there be an imbalance between this
- * CPU and the idlest CPU?
- *
- * Use half of the balancing threshold - new-context is
- * a good opportunity to balance.
- */
- if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
- return min_cpu;
-
- return this_cpu;
-}
-
-/*
* If dest_cpu is allowed for this process, migrate the task to it.
* This is accomplished by forcing the cpu_allowed mask to only
* allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -1571,37 +1712,16 @@ out:
}
/*
- * sched_exec(): find the highest-level, exec-balance-capable
- * domain and try to migrate the task to the least loaded CPU.
- *
- * execve() is a valuable balancing opportunity, because at this point
- * the task has the smallest effective memory and cache footprint.
+ * sched_exec - execve() is a valuable balancing opportunity, because at
+ * this point the task has the smallest effective memory and cache footprint.
*/
void sched_exec(void)
{
- struct sched_domain *tmp, *sd = NULL;
int new_cpu, this_cpu = get_cpu();
-
- /* Prefer the current CPU if there's only this task running */
- if (this_rq()->nr_running <= 1)
- goto out;
-
- for_each_domain(this_cpu, tmp)
- if (tmp->flags & SD_BALANCE_EXEC)
- sd = tmp;
-
- if (sd) {
- schedstat_inc(sd, sbe_attempts);
- new_cpu = find_idlest_cpu(current, this_cpu, sd);
- if (new_cpu != this_cpu) {
- schedstat_inc(sd, sbe_pushed);
- put_cpu();
- sched_migrate_task(current, new_cpu);
- return;
- }
- }
-out:
+ new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
put_cpu();
+ if (new_cpu != this_cpu)
+ sched_migrate_task(current, new_cpu);
}
/*
@@ -1632,7 +1752,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
*/
static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle)
+ struct sched_domain *sd, enum idle_type idle, int *all_pinned)
{
/*
* We do not migrate tasks that are:
@@ -1640,23 +1760,24 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
- if (task_running(rq, p))
- return 0;
if (!cpu_isset(this_cpu, p->cpus_allowed))
return 0;
+ *all_pinned = 0;
+
+ if (task_running(rq, p))
+ return 0;
/*
* Aggressive migration if:
- * 1) the [whole] cpu is idle, or
+ * 1) task is cache cold, or
* 2) too many balance attempts have failed.
*/
- if (cpu_and_siblings_are_idle(this_cpu) || \
- sd->nr_balance_failed > sd->cache_nice_tries)
+ if (sd->nr_balance_failed > sd->cache_nice_tries)
return 1;
if (task_hot(p, rq->timestamp_last_tick, sd))
- return 0;
+ return 0;
return 1;
}
@@ -1669,16 +1790,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
*/
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle)
+ enum idle_type idle, int *all_pinned)
{
prio_array_t *array, *dst_array;
struct list_head *head, *curr;
- int idx, pulled = 0;
+ int idx, pulled = 0, pinned = 0;
task_t *tmp;
- if (max_nr_move <= 0 || busiest->nr_running <= 1)
+ if (max_nr_move == 0)
goto out;
+ pinned = 1;
+
/*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
@@ -1717,7 +1840,7 @@ skip_queue:
curr = curr->prev;
- if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
if (curr != head)
goto skip_queue;
idx++;
@@ -1746,6 +1869,9 @@ out:
* inside pull_task().
*/
schedstat_add(sd, lb_gained[idle], pulled);
+
+ if (all_pinned)
+ *all_pinned = pinned;
return pulled;
}
@@ -1760,8 +1886,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ int load_idx;
max_load = this_load = total_load = total_pwr = 0;
+ if (idle == NOT_IDLE)
+ load_idx = sd->busy_idx;
+ else if (idle == NEWLY_IDLE)
+ load_idx = sd->newidle_idx;
+ else
+ load_idx = sd->idle_idx;
do {
unsigned long load;
@@ -1776,9 +1909,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
for_each_cpu_mask(i, group->cpumask) {
/* Bias balancing toward cpus of our domain */
if (local_group)
- load = target_load(i);
+ load = target_load(i, load_idx);
else
- load = source_load(i);
+ load = source_load(i, load_idx);
avg_load += load;
}
@@ -1792,12 +1925,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (local_group) {
this_load = avg_load;
this = group;
- goto nextgroup;
} else if (avg_load > max_load) {
max_load = avg_load;
busiest = group;
}
-nextgroup:
group = group->next;
} while (group != sd->groups);
@@ -1870,15 +2001,9 @@ nextgroup:
/* Get rid of the scaling factor, rounding down as we divide */
*imbalance = *imbalance / SCHED_LOAD_SCALE;
-
return busiest;
out_balanced:
- if (busiest && (idle == NEWLY_IDLE ||
- (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
- *imbalance = 1;
- return busiest;
- }
*imbalance = 0;
return NULL;
@@ -1894,7 +2019,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
int i;
for_each_cpu_mask(i, group->cpumask) {
- load = source_load(i);
+ load = source_load(i, 0);
if (load > max_load) {
max_load = load;
@@ -1906,6 +2031,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
}
/*
+ * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
+ * so long as it is large enough.
+ */
+#define MAX_PINNED_INTERVAL 512
+
+/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*
@@ -1917,7 +2048,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
struct sched_group *group;
runqueue_t *busiest;
unsigned long imbalance;
- int nr_moved;
+ int nr_moved, all_pinned = 0;
+ int active_balance = 0;
spin_lock(&this_rq->lock);
schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +2066,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
goto out_balanced;
}
- /*
- * This should be "impossible", but since load
- * balancing is inherently racy and statistical,
- * it could happen in theory.
- */
- if (unlikely(busiest == this_rq)) {
- WARN_ON(1);
- goto out_balanced;
- }
+ BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[idle], imbalance);
@@ -1956,9 +2080,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
*/
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle);
+ imbalance, sd, idle,
+ &all_pinned);
spin_unlock(&busiest->lock);
+
+ /* All tasks on this runqueue were pinned by CPU affinity */
+ if (unlikely(all_pinned))
+ goto out_balanced;
}
+
spin_unlock(&this_rq->lock);
if (!nr_moved) {
@@ -1966,36 +2096,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
sd->nr_balance_failed++;
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- int wake = 0;
spin_lock(&busiest->lock);
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
- wake = 1;
+ active_balance = 1;
}
spin_unlock(&busiest->lock);
- if (wake)
+ if (active_balance)
wake_up_process(busiest->migration_thread);
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries;
+ sd->nr_balance_failed = sd->cache_nice_tries+1;
}
-
- /*
- * We were unbalanced, but unsuccessful in move_tasks(),
- * so bump the balance_interval to lessen the lock contention.
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval++;
- } else {
+ } else
sd->nr_balance_failed = 0;
+ if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
+ } else {
+ /*
+ * If we've begun active balancing, start to back off. This
+ * case may not be covered by the all_pinned logic if there
+ * is only 1 task on the busy runqueue (because we don't call
+ * move_tasks).
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
}
return nr_moved;
@@ -2005,8 +2137,10 @@ out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
+ sd->nr_balance_failed = 0;
/* tune up the balancing interval */
- if (sd->balance_interval < sd->max_interval)
+ if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ (sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
return 0;
@@ -2030,31 +2164,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
if (!group) {
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
- goto out;
+ goto out_balanced;
}
busiest = find_busiest_queue(group);
- if (!busiest || busiest == this_rq) {
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
- goto out;
+ goto out_balanced;
}
+ BUG_ON(busiest == this_rq);
+
/* Attempt to move tasks */
double_lock_balance(this_rq, busiest);
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, NEWLY_IDLE);
+ imbalance, sd, NEWLY_IDLE, NULL);
if (!nr_moved)
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ else
+ sd->nr_balance_failed = 0;
spin_unlock(&busiest->lock);
-
-out:
return nr_moved;
+
+out_balanced:
+ schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ sd->nr_balance_failed = 0;
+ return 0;
}
/*
@@ -2086,56 +2225,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
{
struct sched_domain *sd;
- struct sched_group *cpu_group;
runqueue_t *target_rq;
- cpumask_t visited_cpus;
- int cpu;
+ int target_cpu = busiest_rq->push_cpu;
+
+ if (busiest_rq->nr_running <= 1)
+ /* no task to move */
+ return;
+
+ target_rq = cpu_rq(target_cpu);
/*
- * Search for suitable CPUs to push tasks to in successively higher
- * domains with SD_LOAD_BALANCE set.
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
*/
- visited_cpus = CPU_MASK_NONE;
- for_each_domain(busiest_cpu, sd) {
- if (!(sd->flags & SD_LOAD_BALANCE))
- /* no more domains to search */
- break;
+ BUG_ON(busiest_rq == target_rq);
- schedstat_inc(sd, alb_cnt);
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
- cpu_group = sd->groups;
- do {
- for_each_cpu_mask(cpu, cpu_group->cpumask) {
- if (busiest_rq->nr_running <= 1)
- /* no more tasks left to move */
- return;
- if (cpu_isset(cpu, visited_cpus))
- continue;
- cpu_set(cpu, visited_cpus);
- if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
- continue;
-
- target_rq = cpu_rq(cpu);
- /*
- * This condition is "impossible", if it occurs
- * we need to fix it. Originally reported by
- * Bjorn Helgaas on a 128-cpu setup.
- */
- BUG_ON(busiest_rq == target_rq);
-
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
- if (move_tasks(target_rq, cpu, busiest_rq,
- 1, sd, SCHED_IDLE)) {
- schedstat_inc(sd, alb_pushed);
- } else {
- schedstat_inc(sd, alb_failed);
- }
- spin_unlock(&target_rq->lock);
- }
- cpu_group = cpu_group->next;
- } while (cpu_group != sd->groups);
- }
+ /* Search for an sd spanning us and the target CPU. */
+ for_each_domain(target_cpu, sd)
+ if ((sd->flags & SD_LOAD_BALANCE) &&
+ cpu_isset(busiest_cpu, sd->span))
+ break;
+
+ if (unlikely(sd == NULL))
+ goto out;
+
+ schedstat_inc(sd, alb_cnt);
+
+ if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+out:
+ spin_unlock(&target_rq->lock);
}
/*
@@ -2156,18 +2281,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
unsigned long old_load, this_load;
unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *sd;
+ int i;
- /* Update our load */
- old_load = this_rq->cpu_load;
this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (this_load > old_load)
- old_load++;
- this_rq->cpu_load = (old_load + this_load) / 2;
+ /* Update our load */
+ for (i = 0; i < 3; i++) {
+ unsigned long new_load = this_load;
+ int scale = 1 << i;
+ old_load = this_rq->cpu_load[i];
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
+ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+ }
for_each_domain(this_cpu, sd) {
unsigned long interval;
@@ -2447,11 +2577,15 @@ out:
#ifdef CONFIG_SCHED_SMT
static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
{
- struct sched_domain *sd = this_rq->sd;
+ struct sched_domain *tmp, *sd = NULL;
cpumask_t sibling_map;
int i;
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ for_each_domain(this_cpu, tmp)
+ if (tmp->flags & SD_SHARE_CPUPOWER)
+ sd = tmp;
+
+ if (!sd)
return;
/*
@@ -2492,13 +2626,17 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
{
- struct sched_domain *sd = this_rq->sd;
+ struct sched_domain *tmp, *sd = NULL;
cpumask_t sibling_map;
prio_array_t *array;
int ret = 0, i;
task_t *p;
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ for_each_domain(this_cpu, tmp)
+ if (tmp->flags & SD_SHARE_CPUPOWER)
+ sd = tmp;
+
+ if (!sd)
return 0;
/*
@@ -2576,7 +2714,7 @@ void fastcall add_preempt_count(int val)
/*
* Underflow?
*/
- BUG_ON(((int)preempt_count() < 0));
+ BUG_ON((preempt_count() < 0));
preempt_count() += val;
/*
* Spinlock count overflowing soon?
@@ -2613,7 +2751,7 @@ asmlinkage void __sched schedule(void)
struct list_head *queue;
unsigned long long now;
unsigned long run_time;
- int cpu, idx;
+ int cpu, idx, new_prio;
/*
* Test if we are atomic. Since do_exit() needs to call into
@@ -2735,9 +2873,14 @@ go_idle:
delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128;
array = next->array;
- dequeue_task(next, array);
- recalc_task_prio(next, next->timestamp + delta);
- enqueue_task(next, array);
+ new_prio = recalc_task_prio(next, next->timestamp + delta);
+
+ if (unlikely(next->prio != new_prio)) {
+ dequeue_task(next, array);
+ next->prio = new_prio;
+ enqueue_task(next, array);
+ } else
+ requeue_task(next, array);
}
next->activated = 0;
switch_tasks:
@@ -2761,11 +2904,15 @@ switch_tasks:
rq->curr = next;
++*switch_count;
- prepare_arch_switch(rq, next);
+ prepare_task_switch(rq, next);
prev = context_switch(rq, prev, next);
barrier();
-
- finish_task_switch(prev);
+ /*
+ * this_rq must be evaluated again because prev may have moved
+ * CPUs since it called schedule(), thus the 'rq' on its stack
+ * frame will be invalid.
+ */
+ finish_task_switch(this_rq(), prev);
} else
spin_unlock_irq(&rq->lock);
@@ -2869,7 +3016,7 @@ need_resched:
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
- task_t *p = curr->task;
+ task_t *p = curr->private;
return try_to_wake_up(p, mode, sync);
}
@@ -3384,13 +3531,24 @@ recheck:
if ((policy == SCHED_NORMAL) != (param->sched_priority == 0))
return -EINVAL;
- if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
- param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur &&
- !capable(CAP_SYS_NICE))
- return -EPERM;
- if ((current->euid != p->euid) && (current->euid != p->uid) &&
- !capable(CAP_SYS_NICE))
- return -EPERM;
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+ */
+ if (!capable(CAP_SYS_NICE)) {
+ /* can't change policy */
+ if (policy != p->policy)
+ return -EPERM;
+ /* can't increase priority */
+ if (policy != SCHED_NORMAL &&
+ param->sched_priority > p->rt_priority &&
+ param->sched_priority >
+ p->signal->rlim[RLIMIT_RTPRIO].rlim_cur)
+ return -EPERM;
+ /* can't change other user's priorities */
+ if ((current->euid != p->euid) &&
+ (current->euid != p->uid))
+ return -EPERM;
+ }
retval = security_task_setscheduler(p, policy, param);
if (retval)
@@ -3755,19 +3913,22 @@ EXPORT_SYMBOL(cond_resched);
*/
int cond_resched_lock(spinlock_t * lock)
{
+ int ret = 0;
+
if (need_lockbreak(lock)) {
spin_unlock(lock);
cpu_relax();
+ ret = 1;
spin_lock(lock);
}
if (need_resched()) {
_raw_spin_unlock(lock);
preempt_enable_no_resched();
__cond_resched();
+ ret = 1;
spin_lock(lock);
- return 1;
}
- return 0;
+ return ret;
}
EXPORT_SYMBOL(cond_resched_lock);
@@ -3811,7 +3972,7 @@ EXPORT_SYMBOL(yield);
*/
void __sched io_schedule(void)
{
- struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+ struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
atomic_inc(&rq->nr_iowait);
schedule();
@@ -3822,7 +3983,7 @@ EXPORT_SYMBOL(io_schedule);
long __sched io_schedule_timeout(long timeout)
{
- struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+ struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
long ret;
atomic_inc(&rq->nr_iowait);
@@ -4027,6 +4188,9 @@ void __devinit init_idle(task_t *idle, int cpu)
spin_lock_irqsave(&rq->lock, flags);
rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
+ idle->oncpu = 1;
+#endif
set_tsk_need_resched(idle);
spin_unlock_irqrestore(&rq->lock, flags);
@@ -4171,8 +4335,7 @@ static int migration_thread(void * data)
struct list_head *head;
migration_req_t *req;
- if (current->flags & PF_FREEZE)
- refrigerator(PF_FREEZE);
+ try_to_freeze();
spin_lock_irq(&rq->lock);
@@ -4197,17 +4360,9 @@ static int migration_thread(void * data)
req = list_entry(head->next, migration_req_t, list);
list_del_init(head->next);
- if (req->type == REQ_MOVE_TASK) {
- spin_unlock(&rq->lock);
- __migrate_task(req->task, cpu, req->dest_cpu);
- local_irq_enable();
- } else if (req->type == REQ_SET_DOMAIN) {
- rq->sd = req->sd;
- spin_unlock_irq(&rq->lock);
- } else {
- spin_unlock_irq(&rq->lock);
- WARN_ON(1);
- }
+ spin_unlock(&rq->lock);
+ __migrate_task(req->task, cpu, req->dest_cpu);
+ local_irq_enable();
complete(&req->done);
}
@@ -4243,7 +4398,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
/* No more Mr. Nice Guy. */
if (dest_cpu == NR_CPUS) {
- tsk->cpus_allowed = cpuset_cpus_allowed(tsk);
+ cpus_setall(tsk->cpus_allowed);
dest_cpu = any_online_cpu(tsk->cpus_allowed);
/*
@@ -4438,7 +4593,6 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
migration_req_t *req;
req = list_entry(rq->migration_queue.next,
migration_req_t, list);
- BUG_ON(req->type != REQ_MOVE_TASK);
list_del_init(&req->list);
complete(&req->done);
}
@@ -4469,12 +4623,17 @@ int __init migration_init(void)
#endif
#ifdef CONFIG_SMP
-#define SCHED_DOMAIN_DEBUG
+#undef SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
int level = 0;
+ if (!sd) {
+ printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+ return;
+ }
+
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
do {
@@ -4557,37 +4716,81 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
#define sched_domain_debug(sd, cpu) {}
#endif
+static int sd_degenerate(struct sched_domain *sd)
+{
+ if (cpus_weight(sd->span) == 1)
+ return 1;
+
+ /* Following flags need at least 2 groups */
+ if (sd->flags & (SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC)) {
+ if (sd->groups != sd->groups->next)
+ return 0;
+ }
+
+ /* Following flags don't use groups */
+ if (sd->flags & (SD_WAKE_IDLE |
+ SD_WAKE_AFFINE |
+ SD_WAKE_BALANCE))
+ return 0;
+
+ return 1;
+}
+
+static int sd_parent_degenerate(struct sched_domain *sd,
+ struct sched_domain *parent)
+{
+ unsigned long cflags = sd->flags, pflags = parent->flags;
+
+ if (sd_degenerate(parent))
+ return 1;
+
+ if (!cpus_equal(sd->span, parent->span))
+ return 0;
+
+ /* Does parent contain flags not in child? */
+ /* WAKE_BALANCE is a subset of WAKE_AFFINE */
+ if (cflags & SD_WAKE_AFFINE)
+ pflags &= ~SD_WAKE_BALANCE;
+ /* Flags needing groups don't count if only 1 group in parent */
+ if (parent->groups == parent->groups->next) {
+ pflags &= ~(SD_LOAD_BALANCE |
+ SD_BALANCE_NEWIDLE |
+ SD_BALANCE_FORK |
+ SD_BALANCE_EXEC);
+ }
+ if (~cflags & pflags)
+ return 0;
+
+ return 1;
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
*/
-void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu)
+void cpu_attach_domain(struct sched_domain *sd, int cpu)
{
- migration_req_t req;
- unsigned long flags;
runqueue_t *rq = cpu_rq(cpu);
- int local = 1;
-
- sched_domain_debug(sd, cpu);
+ struct sched_domain *tmp;
- spin_lock_irqsave(&rq->lock, flags);
-
- if (cpu == smp_processor_id() || !cpu_online(cpu)) {
- rq->sd = sd;
- } else {
- init_completion(&req.done);
- req.type = REQ_SET_DOMAIN;
- req.sd = sd;
- list_add(&req.list, &rq->migration_queue);
- local = 0;
+ /* Remove the sched domains which do not contribute to scheduling. */
+ for (tmp = sd; tmp; tmp = tmp->parent) {
+ struct sched_domain *parent = tmp->parent;
+ if (!parent)
+ break;
+ if (sd_parent_degenerate(tmp, parent))
+ tmp->parent = parent->parent;
}
- spin_unlock_irqrestore(&rq->lock, flags);
+ if (sd && sd_degenerate(sd))
+ sd = sd->parent;
- if (!local) {
- wake_up_process(rq->migration_thread);
- wait_for_completion(&req.done);
- }
+ sched_domain_debug(sd, cpu);
+
+ rcu_assign_pointer(rq->sd, sd);
}
/* cpus with isolated domains */
@@ -4619,7 +4822,7 @@ __setup ("isolcpus=", isolated_cpu_setup);
* covered by the given span, and will set each group's ->cpumask correctly,
* and ->cpu_power to 0.
*/
-void __devinit init_sched_build_groups(struct sched_group groups[],
+void init_sched_build_groups(struct sched_group groups[],
cpumask_t span, int (*group_fn)(int cpu))
{
struct sched_group *first = NULL, *last = NULL;
@@ -4655,13 +4858,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[],
#ifdef ARCH_HAS_SCHED_DOMAIN
-extern void __devinit arch_init_sched_domains(void);
-extern void __devinit arch_destroy_sched_domains(void);
+extern void build_sched_domains(const cpumask_t *cpu_map);
+extern void arch_init_sched_domains(const cpumask_t *cpu_map);
+extern void arch_destroy_sched_domains(const cpumask_t *cpu_map);
#else
#ifdef CONFIG_SCHED_SMT
static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
static struct sched_group sched_group_cpus[NR_CPUS];
-static int __devinit cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu)
{
return cpu;
}
@@ -4669,7 +4873,7 @@ static int __devinit cpu_to_cpu_group(int cpu)
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
-static int __devinit cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu)
{
#ifdef CONFIG_SCHED_SMT
return first_cpu(cpu_sibling_map[cpu]);
@@ -4682,7 +4886,7 @@ static int __devinit cpu_to_phys_group(int cpu)
static DEFINE_PER_CPU(struct sched_domain, node_domains);
static struct sched_group sched_group_nodes[MAX_NUMNODES];
-static int __devinit cpu_to_node_group(int cpu)
+static int cpu_to_node_group(int cpu)
{
return cpu_to_node(cpu);
}
@@ -4713,39 +4917,28 @@ static void check_sibling_maps(void)
#endif
/*
- * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
*/
-static void __devinit arch_init_sched_domains(void)
+static void build_sched_domains(const cpumask_t *cpu_map)
{
int i;
- cpumask_t cpu_default_map;
-
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
- check_sibling_maps();
-#endif
- /*
- * Setup mask for cpus without special case scheduling requirements.
- * For now this just excludes isolated cpus, but could be used to
- * exclude other special cases in the future.
- */
- cpus_complement(cpu_default_map, cpu_isolated_map);
- cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
/*
- * Set up domains. Isolated domains just stay on the dummy domain.
+ * Set up domains for cpus specified by the cpu_map.
*/
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int group;
struct sched_domain *sd = NULL, *p;
cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
#ifdef CONFIG_NUMA
sd = &per_cpu(node_domains, i);
group = cpu_to_node_group(i);
*sd = SD_NODE_INIT;
- sd->span = cpu_default_map;
+ sd->span = *cpu_map;
sd->groups = &sched_group_nodes[group];
#endif
@@ -4763,7 +4956,7 @@ static void __devinit arch_init_sched_domains(void)
group = cpu_to_cpu_group(i);
*sd = SD_SIBLING_INIT;
sd->span = cpu_sibling_map[i];
- cpus_and(sd->span, sd->span, cpu_default_map);
+ cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
sd->groups = &sched_group_cpus[group];
#endif
@@ -4773,7 +4966,7 @@ static void __devinit arch_init_sched_domains(void)
/* Set up CPU (sibling) groups */
for_each_online_cpu(i) {
cpumask_t this_sibling_map = cpu_sibling_map[i];
- cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+ cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
if (i != first_cpu(this_sibling_map))
continue;
@@ -4786,7 +4979,7 @@ static void __devinit arch_init_sched_domains(void)
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
- cpus_and(nodemask, nodemask, cpu_default_map);
+ cpus_and(nodemask, nodemask, *cpu_map);
if (cpus_empty(nodemask))
continue;
@@ -4796,12 +4989,12 @@ static void __devinit arch_init_sched_domains(void)
#ifdef CONFIG_NUMA
/* Set up node groups */
- init_sched_build_groups(sched_group_nodes, cpu_default_map,
+ init_sched_build_groups(sched_group_nodes, *cpu_map,
&cpu_to_node_group);
#endif
/* Calculate CPU power for physical packages and nodes */
- for_each_cpu_mask(i, cpu_default_map) {
+ for_each_cpu_mask(i, *cpu_map) {
int power;
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
@@ -4825,7 +5018,7 @@ static void __devinit arch_init_sched_domains(void)
}
/* Attach the domains */
- for_each_online_cpu(i) {
+ for_each_cpu_mask(i, *cpu_map) {
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
@@ -4835,41 +5028,85 @@ static void __devinit arch_init_sched_domains(void)
cpu_attach_domain(sd, i);
}
}
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ */
+static void arch_init_sched_domains(cpumask_t *cpu_map)
+{
+ cpumask_t cpu_default_map;
+
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+ check_sibling_maps();
+#endif
+ /*
+ * Setup mask for cpus without special case scheduling requirements.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
+
+ build_sched_domains(&cpu_default_map);
+}
-#ifdef CONFIG_HOTPLUG_CPU
-static void __devinit arch_destroy_sched_domains(void)
+static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
{
/* Do nothing: everything is statically allocated. */
}
-#endif
#endif /* ARCH_HAS_SCHED_DOMAIN */
/*
- * Initial dummy domain for early boot and for hotplug cpu. Being static,
- * it is initialized to zero, so all balancing flags are cleared which is
- * what we want.
+ * Detach sched domains from a group of cpus specified in cpu_map
+ * These cpus will now be attached to the NULL domain
*/
-static struct sched_domain sched_domain_dummy;
+static inline void detach_destroy_domains(const cpumask_t *cpu_map)
+{
+ int i;
+
+ for_each_cpu_mask(i, *cpu_map)
+ cpu_attach_domain(NULL, i);
+ synchronize_sched();
+ arch_destroy_sched_domains(cpu_map);
+}
+
+/*
+ * Partition sched domains as specified by the cpumasks below.
+ * This attaches all cpus from the cpumasks to the NULL domain,
+ * waits for a RCU quiescent period, recalculates sched
+ * domain information and then attaches them back to the
+ * correct sched domains
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+{
+ cpumask_t change_map;
+
+ cpus_and(*partition1, *partition1, cpu_online_map);
+ cpus_and(*partition2, *partition2, cpu_online_map);
+ cpus_or(change_map, *partition1, *partition2);
+
+ /* Detach sched domains from all of the affected cpus */
+ detach_destroy_domains(&change_map);
+ if (!cpus_empty(*partition1))
+ build_sched_domains(partition1);
+ if (!cpus_empty(*partition2))
+ build_sched_domains(partition2);
+}
#ifdef CONFIG_HOTPLUG_CPU
/*
* Force a reinitialization of the sched domains hierarchy. The domains
* and groups cannot be updated in place without racing with the balancing
- * code, so we temporarily attach all running cpus to a "dummy" domain
+ * code, so we temporarily attach all running cpus to the NULL domain
* which will prevent rebalancing while the sched domains are recalculated.
*/
static int update_sched_domains(struct notifier_block *nfb,
unsigned long action, void *hcpu)
{
- int i;
-
switch (action) {
case CPU_UP_PREPARE:
case CPU_DOWN_PREPARE:
- for_each_online_cpu(i)
- cpu_attach_domain(&sched_domain_dummy, i);
- arch_destroy_sched_domains();
+ detach_destroy_domains(&cpu_online_map);
return NOTIFY_OK;
case CPU_UP_CANCELED:
@@ -4885,7 +5122,7 @@ static int update_sched_domains(struct notifier_block *nfb,
}
/* The hotplug lock is already held by cpu_up/cpu_down */
- arch_init_sched_domains();
+ arch_init_sched_domains(&cpu_online_map);
return NOTIFY_OK;
}
@@ -4894,7 +5131,7 @@ static int update_sched_domains(struct notifier_block *nfb,
void __init sched_init_smp(void)
{
lock_cpu_hotplug();
- arch_init_sched_domains();
+ arch_init_sched_domains(&cpu_online_map);
unlock_cpu_hotplug();
/* XXX: Theoretical race here - CPU may be hotplugged now */
hotcpu_notifier(update_sched_domains, 0);
@@ -4924,13 +5161,15 @@ void __init sched_init(void)
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+ rq->nr_running = 0;
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
- rq->sd = &sched_domain_dummy;
- rq->cpu_load = 0;
+ rq->sd = NULL;
+ for (j = 1; j < 3; j++)
+ rq->cpu_load[j] = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
rq->migration_thread = NULL;
diff --git a/kernel/signal.c b/kernel/signal.c
index 8f3debc77c5b..ca1186eef938 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -24,6 +24,7 @@
#include <linux/ptrace.h>
#include <linux/posix-timers.h>
#include <linux/signal.h>
+#include <linux/audit.h>
#include <asm/param.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -212,6 +213,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
fastcall void recalc_sigpending_tsk(struct task_struct *t)
{
if (t->signal->group_stop_count > 0 ||
+ (freezing(t)) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked))
set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -522,7 +524,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
{
int sig = 0;
- sig = next_signal(pending, mask);
+ /* SIGKILL must have priority, otherwise it is quite easy
+ * to create an unkillable process, sending sig < SIGKILL
+ * to self */
+ if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+ if (!sigismember(mask, SIGKILL))
+ sig = SIGKILL;
+ }
+
+ if (likely(!sig))
+ sig = next_signal(pending, mask);
if (sig) {
if (current->notifier) {
if (sigismember(current->notifier_mask, sig)) {
@@ -658,7 +669,11 @@ static int check_kill_permission(int sig, struct siginfo *info,
&& (current->uid ^ t->suid) && (current->uid ^ t->uid)
&& !capable(CAP_KILL))
return error;
- return security_task_kill(t, info, sig);
+
+ error = security_task_kill(t, info, sig);
+ if (!error)
+ audit_signal_info(sig, t); /* Let audit system see the signal */
+ return error;
}
/* forward decl */
@@ -2216,8 +2231,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
current->state = TASK_INTERRUPTIBLE;
timeout = schedule_timeout(timeout);
- if (current->flags & PF_FREEZE)
- refrigerator(PF_FREEZE);
+ try_to_freeze();
spin_lock_irq(&current->sighand->siglock);
sig = dequeue_signal(current, &these, &info);
current->blocked = current->real_blocked;
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index e15ed17863f1..0c3f9d8bbe17 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -294,7 +294,7 @@ EXPORT_SYMBOL(_spin_unlock_irq);
void __lockfunc _spin_unlock_bh(spinlock_t *lock)
{
_raw_spin_unlock(lock);
- preempt_enable();
+ preempt_enable_no_resched();
local_bh_enable();
}
EXPORT_SYMBOL(_spin_unlock_bh);
@@ -318,7 +318,7 @@ EXPORT_SYMBOL(_read_unlock_irq);
void __lockfunc _read_unlock_bh(rwlock_t *lock)
{
_raw_read_unlock(lock);
- preempt_enable();
+ preempt_enable_no_resched();
local_bh_enable();
}
EXPORT_SYMBOL(_read_unlock_bh);
@@ -342,7 +342,7 @@ EXPORT_SYMBOL(_write_unlock_irq);
void __lockfunc _write_unlock_bh(rwlock_t *lock)
{
_raw_write_unlock(lock);
- preempt_enable();
+ preempt_enable_no_resched();
local_bh_enable();
}
EXPORT_SYMBOL(_write_unlock_bh);
@@ -354,7 +354,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
if (_raw_spin_trylock(lock))
return 1;
- preempt_enable();
+ preempt_enable_no_resched();
local_bh_enable();
return 0;
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 6116b25aa7cf..84a9d18aa8da 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -100,7 +100,7 @@ static int stop_machine(void)
stopmachine_state = STOPMACHINE_WAIT;
for_each_online_cpu(i) {
- if (i == _smp_processor_id())
+ if (i == raw_smp_processor_id())
continue;
ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
if (ret < 0)
@@ -182,7 +182,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
/* If they don't care which CPU fn runs on, bind to any online one. */
if (cpu == NR_CPUS)
- cpu = _smp_processor_id();
+ cpu = raw_smp_processor_id();
p = kthread_create(do_stop, &smdata, "kstopmachine");
if (!IS_ERR(p)) {
diff --git a/kernel/sys.c b/kernel/sys.c
index f006632c2ba7..9a24374c23bc 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,8 @@
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/kexec.h>
#include <linux/workqueue.h>
#include <linux/device.h>
#include <linux/key.h>
@@ -405,6 +407,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
case LINUX_REBOOT_CMD_HALT:
notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL);
system_state = SYSTEM_HALT;
+ device_suspend(PMSG_SUSPEND);
device_shutdown();
printk(KERN_EMERG "System halted.\n");
machine_halt();
@@ -415,6 +418,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
case LINUX_REBOOT_CMD_POWER_OFF:
notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL);
system_state = SYSTEM_POWER_OFF;
+ device_suspend(PMSG_SUSPEND);
device_shutdown();
printk(KERN_EMERG "Power down.\n");
machine_power_off();
@@ -431,11 +435,30 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer);
system_state = SYSTEM_RESTART;
+ device_suspend(PMSG_FREEZE);
device_shutdown();
printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer);
machine_restart(buffer);
break;
+#ifdef CONFIG_KEXEC
+ case LINUX_REBOOT_CMD_KEXEC:
+ {
+ struct kimage *image;
+ image = xchg(&kexec_image, 0);
+ if (!image) {
+ unlock_kernel();
+ return -EINVAL;
+ }
+ notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL);
+ system_state = SYSTEM_RESTART;
+ device_shutdown();
+ printk(KERN_EMERG "Starting new kernel\n");
+ machine_shutdown();
+ machine_kexec(image);
+ break;
+ }
+#endif
#ifdef CONFIG_SOFTWARE_SUSPEND
case LINUX_REBOOT_CMD_SW_SUSPEND:
{
@@ -525,7 +548,7 @@ asmlinkage long sys_setregid(gid_t rgid, gid_t egid)
}
if (new_egid != old_egid)
{
- current->mm->dumpable = 0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
if (rgid != (gid_t) -1 ||
@@ -556,7 +579,7 @@ asmlinkage long sys_setgid(gid_t gid)
{
if(old_egid != gid)
{
- current->mm->dumpable=0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->gid = current->egid = current->sgid = current->fsgid = gid;
@@ -565,7 +588,7 @@ asmlinkage long sys_setgid(gid_t gid)
{
if(old_egid != gid)
{
- current->mm->dumpable=0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->egid = current->fsgid = gid;
@@ -596,7 +619,7 @@ static int set_user(uid_t new_ruid, int dumpclear)
if(dumpclear)
{
- current->mm->dumpable = 0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->uid = new_ruid;
@@ -653,7 +676,7 @@ asmlinkage long sys_setreuid(uid_t ruid, uid_t euid)
if (new_euid != old_euid)
{
- current->mm->dumpable=0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->fsuid = current->euid = new_euid;
@@ -703,7 +726,7 @@ asmlinkage long sys_setuid(uid_t uid)
if (old_euid != uid)
{
- current->mm->dumpable = 0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->fsuid = current->euid = uid;
@@ -748,7 +771,7 @@ asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if (euid != (uid_t) -1) {
if (euid != current->euid)
{
- current->mm->dumpable = 0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->euid = euid;
@@ -798,7 +821,7 @@ asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid)
if (egid != (gid_t) -1) {
if (egid != current->egid)
{
- current->mm->dumpable = 0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->egid = egid;
@@ -845,7 +868,7 @@ asmlinkage long sys_setfsuid(uid_t uid)
{
if (uid != old_fsuid)
{
- current->mm->dumpable = 0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->fsuid = uid;
@@ -875,7 +898,7 @@ asmlinkage long sys_setfsgid(gid_t gid)
{
if (gid != old_fsgid)
{
- current->mm->dumpable = 0;
+ current->mm->dumpable = suid_dumpable;
smp_wmb();
}
current->fsgid = gid;
@@ -894,35 +917,69 @@ asmlinkage long sys_times(struct tms __user * tbuf)
*/
if (tbuf) {
struct tms tmp;
- struct task_struct *tsk = current;
- struct task_struct *t;
cputime_t utime, stime, cutime, cstime;
- read_lock(&tasklist_lock);
- utime = tsk->signal->utime;
- stime = tsk->signal->stime;
- t = tsk;
- do {
- utime = cputime_add(utime, t->utime);
- stime = cputime_add(stime, t->stime);
- t = next_thread(t);
- } while (t != tsk);
-
- /*
- * While we have tasklist_lock read-locked, no dying thread
- * can be updating current->signal->[us]time. Instead,
- * we got their counts included in the live thread loop.
- * However, another thread can come in right now and
- * do a wait call that updates current->signal->c[us]time.
- * To make sure we always see that pair updated atomically,
- * we take the siglock around fetching them.
- */
- spin_lock_irq(&tsk->sighand->siglock);
- cutime = tsk->signal->cutime;
- cstime = tsk->signal->cstime;
- spin_unlock_irq(&tsk->sighand->siglock);
- read_unlock(&tasklist_lock);
+#ifdef CONFIG_SMP
+ if (thread_group_empty(current)) {
+ /*
+ * Single thread case without the use of any locks.
+ *
+ * We may race with release_task if two threads are
+ * executing. However, release task first adds up the
+ * counters (__exit_signal) before removing the task
+ * from the process tasklist (__unhash_process).
+ * __exit_signal also acquires and releases the
+ * siglock which results in the proper memory ordering
+ * so that the list modifications are always visible
+ * after the counters have been updated.
+ *
+ * If the counters have been updated by the second thread
+ * but the thread has not yet been removed from the list
+ * then the other branch will be executing which will
+ * block on tasklist_lock until the exit handling of the
+ * other task is finished.
+ *
+ * This also implies that the sighand->siglock cannot
+ * be held by another processor. So we can also
+ * skip acquiring that lock.
+ */
+ utime = cputime_add(current->signal->utime, current->utime);
+ stime = cputime_add(current->signal->utime, current->stime);
+ cutime = current->signal->cutime;
+ cstime = current->signal->cstime;
+ } else
+#endif
+ {
+ /* Process with multiple threads */
+ struct task_struct *tsk = current;
+ struct task_struct *t;
+
+ read_lock(&tasklist_lock);
+ utime = tsk->signal->utime;
+ stime = tsk->signal->stime;
+ t = tsk;
+ do {
+ utime = cputime_add(utime, t->utime);
+ stime = cputime_add(stime, t->stime);
+ t = next_thread(t);
+ } while (t != tsk);
+
+ /*
+ * While we have tasklist_lock read-locked, no dying thread
+ * can be updating current->signal->[us]time. Instead,
+ * we got their counts included in the live thread loop.
+ * However, another thread can come in right now and
+ * do a wait call that updates current->signal->c[us]time.
+ * To make sure we always see that pair updated atomically,
+ * we take the siglock around fetching them.
+ */
+ spin_lock_irq(&tsk->sighand->siglock);
+ cutime = tsk->signal->cutime;
+ cstime = tsk->signal->cstime;
+ spin_unlock_irq(&tsk->sighand->siglock);
+ read_unlock(&tasklist_lock);
+ }
tmp.tms_utime = cputime_to_clock_t(utime);
tmp.tms_stime = cputime_to_clock_t(stime);
tmp.tms_cutime = cputime_to_clock_t(cutime);
@@ -1225,7 +1282,7 @@ static void groups_sort(struct group_info *group_info)
}
/* a simple bsearch */
-static int groups_search(struct group_info *group_info, gid_t grp)
+int groups_search(struct group_info *group_info, gid_t grp)
{
int left, right;
@@ -1652,7 +1709,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
error = 1;
break;
case PR_SET_DUMPABLE:
- if (arg2 != 0 && arg2 != 1) {
+ if (arg2 < 0 || arg2 > 2) {
error = -EINVAL;
break;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0dda70ed1f98..29196ce9b40f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -18,6 +18,8 @@ cond_syscall(sys_acct);
cond_syscall(sys_lookup_dcookie);
cond_syscall(sys_swapon);
cond_syscall(sys_swapoff);
+cond_syscall(sys_kexec_load);
+cond_syscall(compat_sys_kexec_load);
cond_syscall(sys_init_module);
cond_syscall(sys_delete_module);
cond_syscall(sys_socketpair);
@@ -77,6 +79,7 @@ cond_syscall(sys_request_key);
cond_syscall(sys_keyctl);
cond_syscall(compat_sys_keyctl);
cond_syscall(compat_sys_socketcall);
+cond_syscall(sys_set_zone_reclaim);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 701d12c63068..270ee7fadbd8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -58,6 +58,7 @@ extern int sysctl_overcommit_ratio;
extern int max_threads;
extern int sysrq_enabled;
extern int core_uses_pid;
+extern int suid_dumpable;
extern char core_pattern[];
extern int cad_pid;
extern int pid_max;
@@ -950,6 +951,14 @@ static ctl_table fs_table[] = {
.proc_handler = &proc_dointvec,
},
#endif
+ {
+ .ctl_name = KERN_SETUID_DUMPABLE,
+ .procname = "suid_dumpable",
+ .data = &suid_dumpable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec,
+ },
{ .ctl_name = 0 }
};
@@ -991,8 +1000,7 @@ int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *ol
int error = parse_table(name, nlen, oldval, oldlenp,
newval, newlen, head->ctl_table,
&context);
- if (context)
- kfree(context);
+ kfree(context);
if (error != -ENOTDIR)
return error;
tmp = tmp->next;
diff --git a/kernel/timer.c b/kernel/timer.c
index 207aa4f0aa10..f2a11887a726 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -57,6 +57,11 @@ static void time_interpolator_update(long delta_nsec);
#define TVN_MASK (TVN_SIZE - 1)
#define TVR_MASK (TVR_SIZE - 1)
+struct timer_base_s {
+ spinlock_t lock;
+ struct timer_list *running_timer;
+};
+
typedef struct tvec_s {
struct list_head vec[TVN_SIZE];
} tvec_t;
@@ -66,9 +71,8 @@ typedef struct tvec_root_s {
} tvec_root_t;
struct tvec_t_base_s {
- spinlock_t lock;
+ struct timer_base_s t_base;
unsigned long timer_jiffies;
- struct timer_list *running_timer;
tvec_root_t tv1;
tvec_t tv2;
tvec_t tv3;
@@ -77,18 +81,16 @@ struct tvec_t_base_s {
} ____cacheline_aligned_in_smp;
typedef struct tvec_t_base_s tvec_base_t;
+static DEFINE_PER_CPU(tvec_base_t, tvec_bases);
static inline void set_running_timer(tvec_base_t *base,
struct timer_list *timer)
{
#ifdef CONFIG_SMP
- base->running_timer = timer;
+ base->t_base.running_timer = timer;
#endif
}
-/* Fake initialization */
-static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
-
static void check_timer_failed(struct timer_list *timer)
{
static int whine_count;
@@ -103,7 +105,6 @@ static void check_timer_failed(struct timer_list *timer)
/*
* Now fix it up
*/
- spin_lock_init(&timer->lock);
timer->magic = TIMER_MAGIC;
}
@@ -156,65 +157,113 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
list_add_tail(&timer->entry, vec);
}
+typedef struct timer_base_s timer_base_t;
+/*
+ * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
+ * at compile time, and we need timer->base to lock the timer.
+ */
+timer_base_t __init_timer_base
+ ____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
+EXPORT_SYMBOL(__init_timer_base);
+
+/***
+ * init_timer - initialize a timer.
+ * @timer: the timer to be initialized
+ *
+ * init_timer() must be done to a timer prior calling *any* of the
+ * other timer functions.
+ */
+void fastcall init_timer(struct timer_list *timer)
+{
+ timer->entry.next = NULL;
+ timer->base = &per_cpu(tvec_bases, raw_smp_processor_id()).t_base;
+ timer->magic = TIMER_MAGIC;
+}
+EXPORT_SYMBOL(init_timer);
+
+static inline void detach_timer(struct timer_list *timer,
+ int clear_pending)
+{
+ struct list_head *entry = &timer->entry;
+
+ __list_del(entry->prev, entry->next);
+ if (clear_pending)
+ entry->next = NULL;
+ entry->prev = LIST_POISON2;
+}
+
+/*
+ * We are using hashed locking: holding per_cpu(tvec_bases).t_base.lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on ->tvX lists.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static timer_base_t *lock_timer_base(struct timer_list *timer,
+ unsigned long *flags)
+{
+ timer_base_t *base;
+
+ for (;;) {
+ base = timer->base;
+ if (likely(base != NULL)) {
+ spin_lock_irqsave(&base->lock, *flags);
+ if (likely(base == timer->base))
+ return base;
+ /* The timer has migrated to another CPU */
+ spin_unlock_irqrestore(&base->lock, *flags);
+ }
+ cpu_relax();
+ }
+}
+
int __mod_timer(struct timer_list *timer, unsigned long expires)
{
- tvec_base_t *old_base, *new_base;
+ timer_base_t *base;
+ tvec_base_t *new_base;
unsigned long flags;
int ret = 0;
BUG_ON(!timer->function);
-
check_timer(timer);
- spin_lock_irqsave(&timer->lock, flags);
+ base = lock_timer_base(timer, &flags);
+
+ if (timer_pending(timer)) {
+ detach_timer(timer, 0);
+ ret = 1;
+ }
+
new_base = &__get_cpu_var(tvec_bases);
-repeat:
- old_base = timer->base;
- /*
- * Prevent deadlocks via ordering by old_base < new_base.
- */
- if (old_base && (new_base != old_base)) {
- if (old_base < new_base) {
- spin_lock(&new_base->lock);
- spin_lock(&old_base->lock);
- } else {
- spin_lock(&old_base->lock);
- spin_lock(&new_base->lock);
- }
+ if (base != &new_base->t_base) {
/*
- * The timer base might have been cancelled while we were
- * trying to take the lock(s):
+ * We are trying to schedule the timer on the local CPU.
+ * However we can't change timer's base while it is running,
+ * otherwise del_timer_sync() can't detect that the timer's
+ * handler yet has not finished. This also guarantees that
+ * the timer is serialized wrt itself.
*/
- if (timer->base != old_base) {
- spin_unlock(&new_base->lock);
- spin_unlock(&old_base->lock);
- goto repeat;
- }
- } else {
- spin_lock(&new_base->lock);
- if (timer->base != old_base) {
- spin_unlock(&new_base->lock);
- goto repeat;
+ if (unlikely(base->running_timer == timer)) {
+ /* The timer remains on a former base */
+ new_base = container_of(base, tvec_base_t, t_base);
+ } else {
+ /* See the comment in lock_timer_base() */
+ timer->base = NULL;
+ spin_unlock(&base->lock);
+ spin_lock(&new_base->t_base.lock);
+ timer->base = &new_base->t_base;
}
}
- /*
- * Delete the previous timeout (if there was any), and install
- * the new one:
- */
- if (old_base) {
- list_del(&timer->entry);
- ret = 1;
- }
timer->expires = expires;
internal_add_timer(new_base, timer);
- timer->base = new_base;
-
- if (old_base && (new_base != old_base))
- spin_unlock(&old_base->lock);
- spin_unlock(&new_base->lock);
- spin_unlock_irqrestore(&timer->lock, flags);
+ spin_unlock_irqrestore(&new_base->t_base.lock, flags);
return ret;
}
@@ -232,15 +281,15 @@ void add_timer_on(struct timer_list *timer, int cpu)
{
tvec_base_t *base = &per_cpu(tvec_bases, cpu);
unsigned long flags;
-
+
BUG_ON(timer_pending(timer) || !timer->function);
check_timer(timer);
- spin_lock_irqsave(&base->lock, flags);
+ spin_lock_irqsave(&base->t_base.lock, flags);
+ timer->base = &base->t_base;
internal_add_timer(base, timer);
- timer->base = base;
- spin_unlock_irqrestore(&base->lock, flags);
+ spin_unlock_irqrestore(&base->t_base.lock, flags);
}
@@ -295,109 +344,84 @@ EXPORT_SYMBOL(mod_timer);
*/
int del_timer(struct timer_list *timer)
{
+ timer_base_t *base;
unsigned long flags;
- tvec_base_t *base;
+ int ret = 0;
check_timer(timer);
-repeat:
- base = timer->base;
- if (!base)
- return 0;
- spin_lock_irqsave(&base->lock, flags);
- if (base != timer->base) {
+ if (timer_pending(timer)) {
+ base = lock_timer_base(timer, &flags);
+ if (timer_pending(timer)) {
+ detach_timer(timer, 1);
+ ret = 1;
+ }
spin_unlock_irqrestore(&base->lock, flags);
- goto repeat;
}
- list_del(&timer->entry);
- /* Need to make sure that anybody who sees a NULL base also sees the list ops */
- smp_wmb();
- timer->base = NULL;
- spin_unlock_irqrestore(&base->lock, flags);
- return 1;
+ return ret;
}
EXPORT_SYMBOL(del_timer);
#ifdef CONFIG_SMP
-/***
- * del_timer_sync - deactivate a timer and wait for the handler to finish.
- * @timer: the timer to be deactivated
- *
- * This function only differs from del_timer() on SMP: besides deactivating
- * the timer it also makes sure the handler has finished executing on other
- * CPUs.
- *
- * Synchronization rules: callers must prevent restarting of the timer,
- * otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which would prevent
- * completion of the timer's handler. Upon exit the timer is not queued and
- * the handler is not running on any CPU.
- *
- * The function returns whether it has deactivated a pending timer or not.
+/*
+ * This function tries to deactivate a timer. Upon successful (ret >= 0)
+ * exit the timer is not queued and the handler is not running on any CPU.
*
- * del_timer_sync() is slow and complicated because it copes with timer
- * handlers which re-arm the timer (periodic timers). If the timer handler
- * is known to not do this (a single shot timer) then use
- * del_singleshot_timer_sync() instead.
+ * It must not be called from interrupt contexts.
*/
-int del_timer_sync(struct timer_list *timer)
+int try_to_del_timer_sync(struct timer_list *timer)
{
- tvec_base_t *base;
- int i, ret = 0;
+ timer_base_t *base;
+ unsigned long flags;
+ int ret = -1;
- check_timer(timer);
+ base = lock_timer_base(timer, &flags);
-del_again:
- ret += del_timer(timer);
+ if (base->running_timer == timer)
+ goto out;
- for_each_online_cpu(i) {
- base = &per_cpu(tvec_bases, i);
- if (base->running_timer == timer) {
- while (base->running_timer == timer) {
- cpu_relax();
- preempt_check_resched();
- }
- break;
- }
+ ret = 0;
+ if (timer_pending(timer)) {
+ detach_timer(timer, 1);
+ ret = 1;
}
- smp_rmb();
- if (timer_pending(timer))
- goto del_again;
+out:
+ spin_unlock_irqrestore(&base->lock, flags);
return ret;
}
-EXPORT_SYMBOL(del_timer_sync);
/***
- * del_singleshot_timer_sync - deactivate a non-recursive timer
+ * del_timer_sync - deactivate a timer and wait for the handler to finish.
* @timer: the timer to be deactivated
*
- * This function is an optimization of del_timer_sync for the case where the
- * caller can guarantee the timer does not reschedule itself in its timer
- * function.
+ * This function only differs from del_timer() on SMP: besides deactivating
+ * the timer it also makes sure the handler has finished executing on other
+ * CPUs.
*
* Synchronization rules: callers must prevent restarting of the timer,
* otherwise this function is meaningless. It must not be called from
- * interrupt contexts. The caller must not hold locks which wold prevent
- * completion of the timer's handler. Upon exit the timer is not queued and
- * the handler is not running on any CPU.
+ * interrupt contexts. The caller must not hold locks which would prevent
+ * completion of the timer's handler. The timer's handler must not call
+ * add_timer_on(). Upon exit the timer is not queued and the handler is
+ * not running on any CPU.
*
* The function returns whether it has deactivated a pending timer or not.
*/
-int del_singleshot_timer_sync(struct timer_list *timer)
+int del_timer_sync(struct timer_list *timer)
{
- int ret = del_timer(timer);
+ check_timer(timer);
- if (!ret) {
- ret = del_timer_sync(timer);
- BUG_ON(ret);
+ for (;;) {
+ int ret = try_to_del_timer_sync(timer);
+ if (ret >= 0)
+ return ret;
}
-
- return ret;
}
-EXPORT_SYMBOL(del_singleshot_timer_sync);
+
+EXPORT_SYMBOL(del_timer_sync);
#endif
static int cascade(tvec_base_t *base, tvec_t *tv, int index)
@@ -415,7 +439,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
struct timer_list *tmp;
tmp = list_entry(curr, struct timer_list, entry);
- BUG_ON(tmp->base != base);
+ BUG_ON(tmp->base != &base->t_base);
curr = curr->next;
internal_add_timer(base, tmp);
}
@@ -437,7 +461,7 @@ static inline void __run_timers(tvec_base_t *base)
{
struct timer_list *timer;
- spin_lock_irq(&base->lock);
+ spin_lock_irq(&base->t_base.lock);
while (time_after_eq(jiffies, base->timer_jiffies)) {
struct list_head work_list = LIST_HEAD_INIT(work_list);
struct list_head *head = &work_list;
@@ -453,8 +477,7 @@ static inline void __run_timers(tvec_base_t *base)
cascade(base, &base->tv5, INDEX(3));
++base->timer_jiffies;
list_splice_init(base->tv1.vec + index, &work_list);
-repeat:
- if (!list_empty(head)) {
+ while (!list_empty(head)) {
void (*fn)(unsigned long);
unsigned long data;
@@ -462,25 +485,26 @@ repeat:
fn = timer->function;
data = timer->data;
- list_del(&timer->entry);
set_running_timer(base, timer);
- smp_wmb();
- timer->base = NULL;
- spin_unlock_irq(&base->lock);
+ detach_timer(timer, 1);
+ spin_unlock_irq(&base->t_base.lock);
{
- u32 preempt_count = preempt_count();
+ int preempt_count = preempt_count();
fn(data);
if (preempt_count != preempt_count()) {
- printk("huh, entered %p with %08x, exited with %08x?\n", fn, preempt_count, preempt_count());
+ printk(KERN_WARNING "huh, entered %p "
+ "with preempt_count %08x, exited"
+ " with %08x?\n",
+ fn, preempt_count,
+ preempt_count());
BUG();
}
}
- spin_lock_irq(&base->lock);
- goto repeat;
+ spin_lock_irq(&base->t_base.lock);
}
}
set_running_timer(base, NULL);
- spin_unlock_irq(&base->lock);
+ spin_unlock_irq(&base->t_base.lock);
}
#ifdef CONFIG_NO_IDLE_HZ
@@ -499,7 +523,7 @@ unsigned long next_timer_interrupt(void)
int i, j;
base = &__get_cpu_var(tvec_bases);
- spin_lock(&base->lock);
+ spin_lock(&base->t_base.lock);
expires = base->timer_jiffies + (LONG_MAX >> 1);
list = 0;
@@ -547,7 +571,7 @@ found:
expires = nte->expires;
}
}
- spin_unlock(&base->lock);
+ spin_unlock(&base->t_base.lock);
return expires;
}
#endif
@@ -1286,9 +1310,9 @@ static void __devinit init_timers_cpu(int cpu)
{
int j;
tvec_base_t *base;
-
+
base = &per_cpu(tvec_bases, cpu);
- spin_lock_init(&base->lock);
+ spin_lock_init(&base->t_base.lock);
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1302,22 +1326,16 @@ static void __devinit init_timers_cpu(int cpu)
}
#ifdef CONFIG_HOTPLUG_CPU
-static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
+static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
{
struct timer_list *timer;
while (!list_empty(head)) {
timer = list_entry(head->next, struct timer_list, entry);
- /* We're locking backwards from __mod_timer order here,
- beware deadlock. */
- if (!spin_trylock(&timer->lock))
- return 0;
- list_del(&timer->entry);
+ detach_timer(timer, 0);
+ timer->base = &new_base->t_base;
internal_add_timer(new_base, timer);
- timer->base = new_base;
- spin_unlock(&timer->lock);
}
- return 1;
}
static void __devinit migrate_timers(int cpu)
@@ -1331,39 +1349,24 @@ static void __devinit migrate_timers(int cpu)
new_base = &get_cpu_var(tvec_bases);
local_irq_disable();
-again:
- /* Prevent deadlocks via ordering by old_base < new_base. */
- if (old_base < new_base) {
- spin_lock(&new_base->lock);
- spin_lock(&old_base->lock);
- } else {
- spin_lock(&old_base->lock);
- spin_lock(&new_base->lock);
- }
+ spin_lock(&new_base->t_base.lock);
+ spin_lock(&old_base->t_base.lock);
- if (old_base->running_timer)
+ if (old_base->t_base.running_timer)
BUG();
for (i = 0; i < TVR_SIZE; i++)
- if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
- goto unlock_again;
- for (i = 0; i < TVN_SIZE; i++)
- if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
- || !migrate_timer_list(new_base, old_base->tv3.vec + i)
- || !migrate_timer_list(new_base, old_base->tv4.vec + i)
- || !migrate_timer_list(new_base, old_base->tv5.vec + i))
- goto unlock_again;
- spin_unlock(&old_base->lock);
- spin_unlock(&new_base->lock);
+ migrate_timer_list(new_base, old_base->tv1.vec + i);
+ for (i = 0; i < TVN_SIZE; i++) {
+ migrate_timer_list(new_base, old_base->tv2.vec + i);
+ migrate_timer_list(new_base, old_base->tv3.vec + i);
+ migrate_timer_list(new_base, old_base->tv4.vec + i);
+ migrate_timer_list(new_base, old_base->tv5.vec + i);
+ }
+
+ spin_unlock(&old_base->t_base.lock);
+ spin_unlock(&new_base->t_base.lock);
local_irq_enable();
put_cpu_var(tvec_bases);
- return;
-
-unlock_again:
- /* Avoid deadlock with __mod_timer, by backing off. */
- spin_unlock(&old_base->lock);
- spin_unlock(&new_base->lock);
- cpu_relax();
- goto again;
}
#endif /* CONFIG_HOTPLUG_CPU */
@@ -1594,7 +1597,7 @@ void msleep(unsigned int msecs)
EXPORT_SYMBOL(msleep);
/**
- * msleep_interruptible - sleep waiting for waitqueue interruptions
+ * msleep_interruptible - sleep waiting for signals
* @msecs: Time in milliseconds to sleep for
*/
unsigned long msleep_interruptible(unsigned int msecs)