From 72b1e22dfcad1daca6906148fd956ffe404bb0bc Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Thu, 10 Jul 2008 11:16:45 -0700
Subject: x64, x2apic/intr-remap: generic irq migration support from process
 context

Generic infrastructure for migrating the irq from the process context in the
presence of CONFIG_GENERIC_PENDING_IRQ.

This will be used later for migrating irq in the presence of
interrupt-remapping.

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: akpm@linux-foundation.org
Cc: arjan@linux.intel.com
Cc: andi@firstfloor.org
Cc: ebiederm@xmission.com
Cc: jbarnes@virtuousgeek.org
Cc: steiner@sgi.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46d6611a33bb..628b5572a7c2 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -87,7 +87,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
 	set_balance_irq_affinity(irq, cpumask);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	set_pending_irq(irq, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		desc->chip->set_affinity(irq, cpumask);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	} else
+		set_pending_irq(irq, cpumask);
 #else
 	desc->affinity = cpumask;
 	desc->chip->set_affinity(irq, cpumask);
-- 
cgit v1.2.3-58-ga151


From 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 6 Jul 2008 17:23:55 +0800
Subject: rcu classic: simplify the next pending batch

use a batch number(rcp->pending) instead of a flag(rcp->next_pending)

rcu_start_batch() need to change this flag, so mb()s is needed
for memory-access safe.

but(after this patch applied) rcu_start_batch() do not change
this batch number(rcp->pending), rcp->pending is managed by
__rcu_process_callbacks only, and troublesome mb()s are eliminated.

And codes look simpler and clearer.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Gautham Shenoy <ego@in.ibm.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcuclassic.h |  2 +-
 kernel/rcuclassic.c        | 22 ++++++++--------------
 2 files changed, 9 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 8c774905dcfe..c847e59c6006 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -45,7 +45,7 @@
 struct rcu_ctrlblk {
 	long	cur;		/* Current batch number.                      */
 	long	completed;	/* Number of the last completed batch         */
-	int	next_pending;	/* Is the next batch already waiting?         */
+	long	pending;	/* Number of the last pending batch           */
 
 	int	signaled;
 
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 16eeeaa9d618..03726eb95193 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -60,12 +60,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
 static struct rcu_ctrlblk rcu_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
+	.pending = -300,
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
 	.cpumask = CPU_MASK_NONE,
 };
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
+	.pending = -300,
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
 	.cpumask = CPU_MASK_NONE,
 };
@@ -276,14 +278,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
  */
 static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 {
-	if (rcp->next_pending &&
+	if (rcp->cur != rcp->pending &&
 			rcp->completed == rcp->cur) {
-		rcp->next_pending = 0;
-		/*
-		 * next_pending == 0 must be visible in
-		 * __rcu_process_callbacks() before it can see new value of cur.
-		 */
-		smp_wmb();
 		rcp->cur++;
 
 		/*
@@ -441,16 +437,14 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 
 		/* determine batch number */
 		rdp->batch = rcp->cur + 1;
-		/* see the comment and corresponding wmb() in
-		 * the rcu_start_batch()
-		 */
-		smp_rmb();
 
-		if (!rcp->next_pending) {
+		if (rcu_batch_after(rdp->batch, rcp->pending)) {
 			/* and start it/schedule start if it's a new batch */
 			spin_lock(&rcp->lock);
-			rcp->next_pending = 1;
-			rcu_start_batch(rcp);
+			if (rcu_batch_after(rdp->batch, rcp->pending)) {
+				rcp->pending = rdp->batch;
+				rcu_start_batch(rcp);
+			}
 			spin_unlock(&rcp->lock);
 		}
 	}
-- 
cgit v1.2.3-58-ga151


From 5127bed588a2f8f3a1f732de2a8a190b7df5dce3 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Sun, 6 Jul 2008 17:23:59 +0800
Subject: rcu classic: new algorithm for callbacks-processing(v2)

This is v2, it's a little deference from v1 that I
had send to lkml.
use ACCESS_ONCE
use rcu_batch_after/rcu_batch_before for batch # comparison.

rcutorture test result:
(hotplugs: do cpu-online/offline once per second)

No CONFIG_NO_HZ:           OK, 12hours
No CONFIG_NO_HZ, hotplugs: OK, 12hours
CONFIG_NO_HZ=y:            OK, 24hours
CONFIG_NO_HZ=y, hotplugs:  Failed.
(Failed also without my patch applied, exactly the same bug occurred,
http://lkml.org/lkml/2008/7/3/24)

v1's email thread:
http://lkml.org/lkml/2008/6/2/539

v1's description:

The code/algorithm of the implement of current callbacks-processing
is very efficient and technical. But when I studied it and I found
a disadvantage:

In multi-CPU systems, when a new RCU callback is being
queued(call_rcu[_bh]), this callback will be invoked after the grace
period for the batch with batch number = rcp->cur+2 has completed
very very likely in current implement. Actually, this callback can be
invoked after the grace period for the batch with
batch number = rcp->cur+1 has completed. The delay of invocation means
that latency of synchronize_rcu() is extended. But more important thing
is that the callbacks usually free memory, and these works are delayed
too! it's necessary for reclaimer to free memory as soon as
possible when left memory is few.

A very simple way can solve this problem:
a field(struct rcu_head::batch) is added to record the batch number for
the RCU callback. And when a new RCU callback is being queued, we
determine the batch number for this callback(head->batch = rcp->cur+1)
and we move this callback to rdp->donelist if we find
that head->batch <= rcp->completed when we process callbacks.
This simple way reduces the wait time for invocation a lot. (about
2.5Grace Period -> 1.5Grace Period in average in multi-CPU systems)

This is my algorithm. But I do not add any field for struct rcu_head
in my implement. We just need to memorize the last 2 batches and
their batch number, because these 2 batches include all entries that
for whom the grace period hasn't completed. So we use a special
linked-list rather than add a field.
Please see the comment of struct rcu_data.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Dipankar Sarma <dipankar@in.ibm.com>
Cc: Gautham Shenoy <ego@in.ibm.com>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcuclassic.h |  26 +++++---
 kernel/rcuclassic.c        | 157 ++++++++++++++++++++++++++++-----------------
 2 files changed, 114 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index c847e59c6006..04c728147be0 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -66,11 +66,7 @@ static inline int rcu_batch_after(long a, long b)
 	return (a - b) > 0;
 }
 
-/*
- * Per-CPU data for Read-Copy UPdate.
- * nxtlist - new callbacks are added here
- * curlist - current batch for which quiescent cycle started if any
- */
+/* Per-CPU data for Read-Copy UPdate. */
 struct rcu_data {
 	/* 1) quiescent state handling : */
 	long		quiescbatch;     /* Batch # for grace period */
@@ -78,12 +74,24 @@ struct rcu_data {
 	int		qs_pending;	 /* core waits for quiesc state */
 
 	/* 2) batch handling */
-	long  	       	batch;           /* Batch # for current RCU batch */
+	/*
+	 * if nxtlist is not NULL, then:
+	 * batch:
+	 *	The batch # for the last entry of nxtlist
+	 * [*nxttail[1], NULL = *nxttail[2]):
+	 *	Entries that batch # <= batch
+	 * [*nxttail[0], *nxttail[1]):
+	 *	Entries that batch # <= batch - 1
+	 * [nxtlist, *nxttail[0]):
+	 *	Entries that batch # <= batch - 2
+	 *	The grace period for these entries has completed, and
+	 *	the other grace-period-completed entries may be moved
+	 *	here temporarily in rcu_process_callbacks().
+	 */
+	long  	       	batch;
 	struct rcu_head *nxtlist;
-	struct rcu_head **nxttail;
+	struct rcu_head **nxttail[3];
 	long            qlen; 	 	 /* # of queued callbacks */
-	struct rcu_head *curlist;
-	struct rcu_head **curtail;
 	struct rcu_head *donelist;
 	struct rcu_head **donetail;
 	long		blimit;		 /* Upper limit on a processed batch */
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 03726eb95193..d3553ee55f64 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -120,6 +120,43 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
 }
 #endif
 
+static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
+		struct rcu_data *rdp)
+{
+	long batch;
+	smp_mb(); /* reads the most recently updated value of rcu->cur. */
+
+	/*
+	 * Determine the batch number of this callback.
+	 *
+	 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
+	 * local variable "batch" and emits codes like this:
+	 *	1) rdp->batch = rcp->cur + 1 # gets old value
+	 *	......
+	 *	2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
+	 * then [*nxttail[0], *nxttail[1]) may contain callbacks
+	 * that batch# = rdp->batch, see the comment of struct rcu_data.
+	 */
+	batch = ACCESS_ONCE(rcp->cur) + 1;
+
+	if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
+		/* process callbacks */
+		rdp->nxttail[0] = rdp->nxttail[1];
+		rdp->nxttail[1] = rdp->nxttail[2];
+		if (rcu_batch_after(batch - 1, rdp->batch))
+			rdp->nxttail[0] = rdp->nxttail[2];
+	}
+
+	rdp->batch = batch;
+	*rdp->nxttail[2] = head;
+	rdp->nxttail[2] = &head->next;
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
+}
+
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -135,18 +172,11 @@ void call_rcu(struct rcu_head *head,
 				void (*func)(struct rcu_head *rcu))
 {
 	unsigned long flags;
-	struct rcu_data *rdp;
 
 	head->func = func;
 	head->next = NULL;
 	local_irq_save(flags);
-	rdp = &__get_cpu_var(rcu_data);
-	*rdp->nxttail = head;
-	rdp->nxttail = &head->next;
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_ctrlblk);
-	}
+	__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu);
@@ -171,20 +201,11 @@ void call_rcu_bh(struct rcu_head *head,
 				void (*func)(struct rcu_head *rcu))
 {
 	unsigned long flags;
-	struct rcu_data *rdp;
 
 	head->func = func;
 	head->next = NULL;
 	local_irq_save(flags);
-	rdp = &__get_cpu_var(rcu_bh_data);
-	*rdp->nxttail = head;
-	rdp->nxttail = &head->next;
-
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
-	}
-
+	__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -213,12 +234,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 static inline void raise_rcu_softirq(void)
 {
 	raise_softirq(RCU_SOFTIRQ);
-	/*
-	 * The smp_mb() here is required to ensure that this cpu's
-	 * __rcu_process_callbacks() reads the most recently updated
-	 * value of rcu->cur.
-	 */
-	smp_mb();
 }
 
 /*
@@ -360,13 +375,15 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
  * which is dead and hence not processing interrupts.
  */
 static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-				struct rcu_head **tail)
+				struct rcu_head **tail, long batch)
 {
-	local_irq_disable();
-	*this_rdp->nxttail = list;
-	if (list)
-		this_rdp->nxttail = tail;
-	local_irq_enable();
+	if (list) {
+		local_irq_disable();
+		this_rdp->batch = batch;
+		*this_rdp->nxttail[2] = list;
+		this_rdp->nxttail[2] = tail;
+		local_irq_enable();
+	}
 }
 
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
@@ -380,9 +397,9 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp,
 	if (rcp->cur != rcp->completed)
 		cpu_quiet(rdp->cpu, rcp);
 	spin_unlock_bh(&rcp->lock);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
-	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
-	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+	/* spin_lock implies smp_mb() */
+	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
+	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
 
 	local_irq_disable();
 	this_rdp->qlen += rdp->qlen;
@@ -416,27 +433,37 @@ static void rcu_offline_cpu(int cpu)
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
-		*rdp->donetail = rdp->curlist;
-		rdp->donetail = rdp->curtail;
-		rdp->curlist = NULL;
-		rdp->curtail = &rdp->curlist;
-	}
-
-	if (rdp->nxtlist && !rdp->curlist) {
+	if (rdp->nxtlist) {
 		local_irq_disable();
-		rdp->curlist = rdp->nxtlist;
-		rdp->curtail = rdp->nxttail;
-		rdp->nxtlist = NULL;
-		rdp->nxttail = &rdp->nxtlist;
-		local_irq_enable();
 
 		/*
-		 * start the next batch of callbacks
+		 * move the other grace-period-completed entries to
+		 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
+		 */
+		if (!rcu_batch_before(rcp->completed, rdp->batch))
+			rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
+		else if (!rcu_batch_before(rcp->completed, rdp->batch - 1))
+			rdp->nxttail[0] = rdp->nxttail[1];
+
+		/*
+		 * the grace period for entries in
+		 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
+		 * move these entries to donelist
 		 */
+		if (rdp->nxttail[0] != &rdp->nxtlist) {
+			*rdp->donetail = rdp->nxtlist;
+			rdp->donetail = rdp->nxttail[0];
+			rdp->nxtlist = *rdp->nxttail[0];
+			*rdp->donetail = NULL;
+
+			if (rdp->nxttail[1] == rdp->nxttail[0])
+				rdp->nxttail[1] = &rdp->nxtlist;
+			if (rdp->nxttail[2] == rdp->nxttail[0])
+				rdp->nxttail[2] = &rdp->nxtlist;
+			rdp->nxttail[0] = &rdp->nxtlist;
+		}
 
-		/* determine batch number */
-		rdp->batch = rcp->cur + 1;
+		local_irq_enable();
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
 			/* and start it/schedule start if it's a new batch */
@@ -462,15 +489,26 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
-	/* This cpu has pending rcu entries and the grace period
-	 * for them has completed.
-	 */
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
-		return 1;
+	if (rdp->nxtlist) {
+		/*
+		 * This cpu has pending rcu entries and the grace period
+		 * for them has completed.
+		 */
+		if (!rcu_batch_before(rcp->completed, rdp->batch))
+			return 1;
+		if (!rcu_batch_before(rcp->completed, rdp->batch - 1) &&
+				rdp->nxttail[0] != rdp->nxttail[1])
+			return 1;
+		if (rdp->nxttail[0] != &rdp->nxtlist)
+			return 1;
 
-	/* This cpu has no pending entries, but there are new entries */
-	if (!rdp->curlist && rdp->nxtlist)
-		return 1;
+		/*
+		 * This cpu has pending rcu entries and the new batch
+		 * for then hasn't been started nor scheduled start
+		 */
+		if (rcu_batch_after(rdp->batch, rcp->pending))
+			return 1;
+	}
 
 	/* This cpu has finished callbacks to invoke */
 	if (rdp->donelist)
@@ -506,7 +544,7 @@ int rcu_needs_cpu(int cpu)
 	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
 
-	return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
+	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
 }
 
 void rcu_check_callbacks(int cpu, int user)
@@ -553,8 +591,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
 	memset(rdp, 0, sizeof(*rdp));
-	rdp->curtail = &rdp->curlist;
-	rdp->nxttail = &rdp->nxtlist;
+	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
 	rdp->donetail = &rdp->donelist;
 	rdp->quiescbatch = rcp->completed;
 	rdp->qs_pending = 0;
-- 
cgit v1.2.3-58-ga151


From 67182ae1c42206e516f7efb292b745e826497b24 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 10 Aug 2008 18:35:38 -0700
Subject: rcu, debug: detect stalled grace periods

this is a diagnostic patch for Classic RCU.

The approach is to record a timestamp at the beginning
of the grace period (in rcu_start_batch()), then have
rcu_check_callbacks() complain if:

 1.	it is running on a CPU that has holding up grace periods for
 	a long time (say one second).  This will identify the culprit
 	assuming that the culprit has not disabled hardware irqs,
 	instruction execution, or some such.

 2.	it is running on a CPU that is not holding up grace periods,
 	but grace periods have been held up for an even longer time
 	(say two seconds).

It is enabled via the default-off CONFIG_DEBUG_RCU_STALL kernel parameter.

Rather than exponential backoff, it backs off to once per 30 seconds.
My feeling upon thinking on it was that if you have stalled RCU grace
periods for that long, a few extra printk() messages are probably the
least of your worries...

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: David Witbrodt <dawitbro@sbcglobal.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcuclassic.h |  3 ++
 kernel/rcuclassic.c        | 80 ++++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug          | 13 ++++++++
 3 files changed, 96 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 04c728147be0..16589958b40e 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -46,6 +46,9 @@ struct rcu_ctrlblk {
 	long	cur;		/* Current batch number.                      */
 	long	completed;	/* Number of the last completed batch         */
 	long	pending;	/* Number of the last pending batch           */
+#ifdef CONFIG_DEBUG_RCU_STALL
+	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
+#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
 
 	int	signaled;
 
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index d4271146a9bd..d7ec731de75c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
+#include <linux/time.h>
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static struct lock_class_key rcu_lock_key;
@@ -286,6 +287,81 @@ static void rcu_do_batch(struct rcu_data *rdp)
  *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
  *   period (if necessary).
  */
+
+#ifdef CONFIG_DEBUG_RCU_STALL
+
+static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+{
+	rcp->gp_check = get_seconds() + 3;
+}
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	long delta;
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock(&rcp->lock);
+	delta = get_seconds() - rcp->gp_check;
+	if (delta < 2L ||
+	    cpus_empty(rcp->cpumask)) {
+		spin_unlock(&rcp->lock);
+		return;
+	rcp->gp_check = get_seconds() + 30;
+	}
+	spin_unlock(&rcp->lock);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for_each_cpu_mask(cpu, rcp->cpumask)
+		printk(" %d", cpu);
+	printk(" (detected by %d, t=%lu/%lu)\n",
+	       smp_processor_id(), get_seconds(), rcp->gp_check);
+}
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
+			smp_processor_id(), get_seconds(), rcp->gp_check);
+	dump_stack();
+	spin_lock(&rcp->lock);
+	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
+		rcp->gp_check = get_seconds() + 30;
+	spin_unlock(&rcp->lock);
+}
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
+				   struct rcu_data *rdp)
+{
+	long delta;
+
+	delta = get_seconds() - rcp->gp_check;
+	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
+
+		/* We haven't checked in, so go dump stack. */
+
+		print_cpu_stall(rcp);
+
+	} else if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
+
+		/* They had two seconds to dump stack, so complain. */
+
+		print_other_cpu_stall(rcp);
+
+	}
+}
+
+#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
+
+static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
+				   struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
+
 /*
  * Register a new batch of callbacks, and start it up if there is currently no
  * active batch and the batch to be registered has not already occurred.
@@ -296,6 +372,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 	if (rcp->cur != rcp->pending &&
 			rcp->completed == rcp->cur) {
 		rcp->cur++;
+		record_gp_check_time(rcp);
 
 		/*
 		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -489,6 +566,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
+	/* Check for CPU stalls, if enabled. */
+	check_cpu_stall(rcp, rdp);
+
 	if (rdp->nxtlist) {
 		/*
 		 * This cpu has pending rcu entries and the grace period
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e1d4764435ed..2fb6d90bf1e6 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
+config RCU_CPU_STALL
+	bool "Check for stalled CPUs delaying RCU grace periods"
+	depends on CLASSIC_RCU
+	default n
+	help
+	  This option causes RCU to printk information on which
+	  CPUs are delaying the current grace period, but only when
+	  the grace period extends for excessive time periods.
+
+	  Say Y if you want RCU to perform such checks.
+
+	  Say N if you are unsure.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3-58-ga151


From 78635fc739b1254f3e0362ac430edbdd2cff01dc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 11 Aug 2008 13:34:15 +0200
Subject: rcu, debug: detect stalled grace periods, cleanups

small cleanups.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index d7ec731de75c..56b8712fe5aa 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -294,6 +294,7 @@ static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
 {
 	rcp->gp_check = get_seconds() + 3;
 }
+
 static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 	int cpu;
@@ -303,11 +304,9 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 
 	spin_lock(&rcp->lock);
 	delta = get_seconds() - rcp->gp_check;
-	if (delta < 2L ||
-	    cpus_empty(rcp->cpumask)) {
+	if (delta < 2L || cpus_empty(rcp->cpumask)) {
 		spin_unlock(&rcp->lock);
 		return;
-	rcp->gp_check = get_seconds() + 30;
 	}
 	spin_unlock(&rcp->lock);
 
@@ -319,6 +318,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 	printk(" (detected by %d, t=%lu/%lu)\n",
 	       smp_processor_id(), get_seconds(), rcp->gp_check);
 }
+
 static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
@@ -329,8 +329,8 @@ static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 		rcp->gp_check = get_seconds() + 30;
 	spin_unlock(&rcp->lock);
 }
-static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
-				   struct rcu_data *rdp)
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
 	long delta;
 
@@ -341,12 +341,11 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
 
 		print_cpu_stall(rcp);
 
-	} else if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
-
-		/* They had two seconds to dump stack, so complain. */
-
-		print_other_cpu_stall(rcp);
-
+	} else {
+		if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
+			/* They had two seconds to dump stack, so complain. */
+			print_other_cpu_stall(rcp);
+		}
 	}
 }
 
@@ -355,8 +354,9 @@ static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
 static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
 {
 }
-static inline void check_cpu_stall(struct rcu_ctrlblk *rcp,
-				   struct rcu_data *rdp)
+
+static inline void
+check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
 }
 
-- 
cgit v1.2.3-58-ga151


From 293a17ebc944c958e24e6ffbd1d5a49abdbf489e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 12 Aug 2008 17:25:03 -0700
Subject: rcu: prevent console flood when one CPU sees another AWOL via RCU

One small change needed to keep from flooding the console when one
CPU notices that another is AWOL.  Unless I am missing something subtle.
Otherwise the cleanups look good!

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 56b8712fe5aa..dab2676d9d72 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -308,6 +308,7 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 		spin_unlock(&rcp->lock);
 		return;
 	}
+	rcp->gp_check = get_seconds() + 30;
 	spin_unlock(&rcp->lock);
 
 	/* OK, time to rat on our buddy... */
-- 
cgit v1.2.3-58-ga151


From 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 5 Aug 2008 09:21:44 -0700
Subject: rcu: classic RCU locking and memory-barrier cleanups

This patch simplifies the locking and memory-barrier usage in the Classic
RCU grace-period-detection mechanism, incorporating Lai Jiangshan's
feedback from the earlier version (http://lkml.org/lkml/2008/8/1/400
and http://lkml.org/lkml/2008/8/3/43).  Passed 10 hours of
rcutorture concurrent with CPUs being put online and taken offline on
a 128-hardware-thread Power machine.  My apologies to whoever in the
Eastern Hemisphere was planning to use this machine over the Western
Hemisphere night, but it was sitting idle and...

So this is ready for tip/core/rcu.

This patch is in preparation for moving to a hierarchical
algorithm to allow the very large SMP machines -- requested by some
people at OLS, and there seem to have been a few recent patches in the
4096-CPU direction as well.  The general idea is to move to a much more
conservative concurrency design, then apply a hierarchy to reduce
contention on the global lock by a few orders of magnitude (larger
machines would see greater reductions).  The reason for taking a
conservative approach is that this code isn't on any fast path.

Prototype in progress.

This patch is against the linux-tip git tree (tip/core/rcu).  If you
wish to test this against 2.6.26, use the following set of patches:

http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimp-1.patch
http://www.rdrop.com/users/paulmck/patches/2.6.26-ljsimpfix-3.patch

The first patch combines commits 5127bed588a2f8f3a1f732de2a8a190b7df5dce3
and 3cac97cbb14aed00d83eb33d4613b0fe3aaea863 from Lai Jiangshan
<laijs@cn.fujitsu.com>, and the second patch contains my changes.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 51 +++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 41 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index dab2676d9d72..5de126630b10 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -87,6 +87,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 	int cpu;
 	cpumask_t cpumask;
 	set_need_resched();
+	spin_lock(&rcp->lock);
 	if (unlikely(!rcp->signaled)) {
 		rcp->signaled = 1;
 		/*
@@ -112,6 +113,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		for_each_cpu_mask_nr(cpu, cpumask)
 			smp_send_reschedule(cpu);
 	}
+	spin_unlock(&rcp->lock);
 }
 #else
 static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -125,7 +127,9 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
 		struct rcu_data *rdp)
 {
 	long batch;
-	smp_mb(); /* reads the most recently updated value of rcu->cur. */
+
+	head->next = NULL;
+	smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
 
 	/*
 	 * Determine the batch number of this callback.
@@ -175,7 +179,6 @@ void call_rcu(struct rcu_head *head,
 	unsigned long flags;
 
 	head->func = func;
-	head->next = NULL;
 	local_irq_save(flags);
 	__call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
 	local_irq_restore(flags);
@@ -204,7 +207,6 @@ void call_rcu_bh(struct rcu_head *head,
 	unsigned long flags;
 
 	head->func = func;
-	head->next = NULL;
 	local_irq_save(flags);
 	__call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
 	local_irq_restore(flags);
@@ -467,17 +469,17 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
 				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
-	/* if the cpu going offline owns the grace period
+	/*
+	 * if the cpu going offline owns the grace period
 	 * we can block indefinitely waiting for it, so flush
 	 * it here
 	 */
 	spin_lock_bh(&rcp->lock);
 	if (rcp->cur != rcp->completed)
 		cpu_quiet(rdp->cpu, rcp);
-	spin_unlock_bh(&rcp->lock);
-	/* spin_lock implies smp_mb() */
 	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
 	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
+	spin_unlock_bh(&rcp->lock);
 
 	local_irq_disable();
 	this_rdp->qlen += rdp->qlen;
@@ -511,16 +513,19 @@ static void rcu_offline_cpu(int cpu)
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
+	long completed_snap;
+
 	if (rdp->nxtlist) {
 		local_irq_disable();
+		completed_snap = ACCESS_ONCE(rcp->completed);
 
 		/*
 		 * move the other grace-period-completed entries to
 		 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
 		 */
-		if (!rcu_batch_before(rcp->completed, rdp->batch))
+		if (!rcu_batch_before(completed_snap, rdp->batch))
 			rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
-		else if (!rcu_batch_before(rcp->completed, rdp->batch - 1))
+		else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
 			rdp->nxttail[0] = rdp->nxttail[1];
 
 		/*
@@ -561,8 +566,24 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
+	/*
+	 * Memory references from any prior RCU read-side critical sections
+	 * executed by the interrupted code must be see before any RCU
+	 * grace-period manupulations below.
+	 */
+
+	smp_mb(); /* See above block comment. */
+
 	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
 	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
+
+	/*
+	 * Memory references from any later RCU read-side critical sections
+	 * executed by the interrupted code must be see after any RCU
+	 * grace-period manupulations above.
+	 */
+
+	smp_mb(); /* See above block comment. */
 }
 
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
@@ -571,13 +592,15 @@ static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 	check_cpu_stall(rcp, rdp);
 
 	if (rdp->nxtlist) {
+		long completed_snap = ACCESS_ONCE(rcp->completed);
+
 		/*
 		 * This cpu has pending rcu entries and the grace period
 		 * for them has completed.
 		 */
-		if (!rcu_batch_before(rcp->completed, rdp->batch))
+		if (!rcu_batch_before(completed_snap, rdp->batch))
 			return 1;
-		if (!rcu_batch_before(rcp->completed, rdp->batch - 1) &&
+		if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
 				rdp->nxttail[0] != rdp->nxttail[1])
 			return 1;
 		if (rdp->nxttail[0] != &rdp->nxtlist)
@@ -628,6 +651,12 @@ int rcu_needs_cpu(int cpu)
 	return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
 }
 
+/*
+ * Top-level function driving RCU grace-period detection, normally
+ * invoked from the scheduler-clock interrupt.  This function simply
+ * increments counters that are read only from softirq by this same
+ * CPU, so there are no memory barriers required.
+ */
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
@@ -671,6 +700,7 @@ void rcu_check_callbacks(int cpu, int user)
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
+	spin_lock(&rcp->lock);
 	memset(rdp, 0, sizeof(*rdp));
 	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
 	rdp->donetail = &rdp->donelist;
@@ -678,6 +708,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
 	rdp->blimit = blimit;
+	spin_unlock(&rcp->lock);
 }
 
 static void __cpuinit rcu_online_cpu(int cpu)
-- 
cgit v1.2.3-58-ga151


From 5802294f1b1895ee19a3d0ae72805da453afb9de Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 30 Jul 2008 14:20:55 -0400
Subject: rcu: trace fix possible mem-leak

In the initialization of the RCU trace module, if
rcupreempt_debugfs_init() fails, we never free the the trace buffer.

This patch frees the trace buffer in case the debugfs fails.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Reviewed-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt_trace.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bbc..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
 
 static int __init rcupreempt_trace_init(void)
 {
+	int ret;
+
 	mutex_init(&rcupreempt_trace_mutex);
 	rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
 	if (!rcupreempt_trace_buf)
 		return 1;
-	return rcupreempt_debugfs_init();
+	ret = rcupreempt_debugfs_init();
+	if (ret)
+		kfree(rcupreempt_trace_buf);
+	return ret;
 }
 
 static void __exit rcupreempt_trace_cleanup(void)
-- 
cgit v1.2.3-58-ga151


From cd95851785bcfe95fdf73689e8ecb5a1c5959231 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 17 Aug 2008 07:37:15 -0700
Subject: rcu: fix classic RCU locking cleanup lockdep problem

On Fri, Aug 15, 2008 at 04:24:30PM +0200, Ingo Molnar wrote:
>
> Paul,
>
> one of your two recent RCU patches caused this lockdep splat in -tip
> testing:
>
> ------------------->
> Brought up 2 CPUs
> Total of 2 processors activated (6850.87 BogoMIPS).
> PM: Adding info for No Bus:platform
> khelper used greatest stack depth: 3124 bytes left
>
> =================================
> [ INFO: inconsistent lock state ]
> 2.6.27-rc3-tip #1
> ---------------------------------
> inconsistent {softirq-on-W} -> {in-softirq-W} usage.
> ksoftirqd/0/4 [HC0[0]:SC1[1]:HE1:SE0] takes:
>  (&rcu_ctrlblk.lock){-+..}, at: [<c016d91c>] __rcu_process_callbacks+0x1ac/0x1f0
> {softirq-on-W} state was registered at:
>   [<c01528e4>] __lock_acquire+0x3f4/0x5b0
>   [<c0152b29>] lock_acquire+0x89/0xc0
>   [<c076142b>] _spin_lock+0x3b/0x70
>   [<c016d649>] rcu_init_percpu_data+0x29/0x80
>   [<c075e43f>] rcu_cpu_notify+0xaf/0xd0
>   [<c076458d>] notifier_call_chain+0x2d/0x60
>   [<c0145ede>] __raw_notifier_call_chain+0x1e/0x30
>   [<c075db29>] _cpu_up+0x79/0x110
>   [<c075dc0d>] cpu_up+0x4d/0x70
>   [<c0a769e1>] kernel_init+0xb1/0x200
>   [<c01048a3>] kernel_thread_helper+0x7/0x10
>   [<ffffffff>] 0xffffffff
> irq event stamp: 14
> hardirqs last  enabled at (14): [<c01534db>] trace_hardirqs_on+0xb/0x10
> hardirqs last disabled at (13): [<c014dbeb>] trace_hardirqs_off+0xb/0x10
> softirqs last  enabled at (0): [<c012b186>] copy_process+0x276/0x1190
> softirqs last disabled at (11): [<c0105c0a>] call_on_stack+0x1a/0x30
>
> other info that might help us debug this:
> no locks held by ksoftirqd/0/4.
>
> stack backtrace:
> Pid: 4, comm: ksoftirqd/0 Not tainted 2.6.27-rc3-tip #1
>  [<c01504dc>] print_usage_bug+0x16c/0x1b0
>  [<c0152455>] mark_lock+0xa75/0xb10
>  [<c0108b75>] ? sched_clock+0x15/0x30
>  [<c015289d>] __lock_acquire+0x3ad/0x5b0
>  [<c0152b29>] lock_acquire+0x89/0xc0
>  [<c016d91c>] ? __rcu_process_callbacks+0x1ac/0x1f0
>  [<c076142b>] _spin_lock+0x3b/0x70
>  [<c016d91c>] ? __rcu_process_callbacks+0x1ac/0x1f0
>  [<c016d91c>] __rcu_process_callbacks+0x1ac/0x1f0
>  [<c016d986>] rcu_process_callbacks+0x26/0x50
>  [<c0132305>] __do_softirq+0x95/0x120
>  [<c0132270>] ? __do_softirq+0x0/0x120
>  [<c0105c0a>] call_on_stack+0x1a/0x30
>  [<c0132426>] ? ksoftirqd+0x96/0x110
>  [<c0132390>] ? ksoftirqd+0x0/0x110
>  [<c01411f7>] ? kthread+0x47/0x80
>  [<c01411b0>] ? kthread+0x0/0x80
>  [<c01048a3>] ? kernel_thread_helper+0x7/0x10
>  =======================
> calling  init_cpufreq_transition_notifier_list+0x0/0x20
> initcall init_cpufreq_transition_notifier_list+0x0/0x20 returned 0 after 0 msecs
> calling  net_ns_init+0x0/0x190
> net_namespace: 676 bytes
> initcall net_ns_init+0x0/0x190 returned 0 after 0 msecs
> calling  cpufreq_tsc+0x0/0x20
> initcall cpufreq_tsc+0x0/0x20 returned 0 after 0 msecs
> calling  reboot_init+0x0/0x20
> initcall reboot_init+0x0/0x20 returned 0 after 0 msecs
> calling  print_banner+0x0/0x10
> Booting paravirtualized kernel on bare hardware
>
> <-----------------------
>
> my guess is on:
>
>  commit 1f7b94cd3d564901f9e04a8bc5832ae7bfd690a0
>  Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
>  Date:   Tue Aug 5 09:21:44 2008 -0700
>
>     rcu: classic RCU locking and memory-barrier cleanups
>
> 	Ingo

Fixes a problem detected by lockdep in which rcu->lock was acquired
both in irq context and in process context, but without disabling from
process context.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 5de126630b10..fb1f1cc45142 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -700,7 +700,9 @@ void rcu_check_callbacks(int cpu, int user)
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
-	spin_lock(&rcp->lock);
+	long flags;
+
+	spin_lock_irqsave(&rcp->lock, flags);
 	memset(rdp, 0, sizeof(*rdp));
 	rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
 	rdp->donetail = &rdp->donelist;
@@ -708,7 +710,7 @@ static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 	rdp->qs_pending = 0;
 	rdp->cpu = cpu;
 	rdp->blimit = blimit;
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 
 static void __cpuinit rcu_online_cpu(int cpu)
-- 
cgit v1.2.3-58-ga151


From ded00a56e99555c3f4000ef3eebfd5fe0d574565 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 17 Aug 2008 12:50:36 -0700
Subject: rcu: remove redundant ACCESS_ONCE definition from rcupreempt.c

Remove the redundant definition of ACCESS_ONCE() from rcupreempt.c in
favor of the one in compiler.h.  Also merge the comment header from
rcupreempt.c's definition into that in compiler.h.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/compiler.h | 4 +++-
 kernel/rcupreempt.c      | 8 --------
 2 files changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c8bd2daf95ec..8322141ee480 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -190,7 +190,9 @@ extern void __chk_io_ptr(const volatile void __iomem *);
  * ACCESS_ONCE() in different C statements.
  *
  * This macro does absolutely -nothing- to prevent the CPU from reordering,
- * merging, or refetching absolutely anything at any time.
+ * merging, or refetching absolutely anything at any time.  Its main intended
+ * use is to mediate communication between process-level code and irq/NMI
+ * handlers, all running on the same CPU.
  */
 #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
 
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 27827931ca0d..ca4bbbe04aa4 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -58,14 +58,6 @@
 #include <linux/cpumask.h>
 #include <linux/rcupreempt_trace.h>
 
-/*
- * Macro that prevents the compiler from reordering accesses, but does
- * absolutely -nothing- to prevent CPUs from reordering.  This is used
- * only to mediate communication between mainline code and hardware
- * interrupt and NMI handlers.
- */
-#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
-
 /*
  * PREEMPT_RCU data structures.
  */
-- 
cgit v1.2.3-58-ga151


From eff9b713ee3540ddab862095aaf4b1511a6758bc Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 18 Aug 2008 17:51:08 -0700
Subject: rcu: fix locking cleanup fallout

Given that the rcp->lock is now acquired from call_rcu(), which can be
invoked from irq-disable regions, all acquisitions need to disable irqs.
The following patch fixes this.

Although I don't have any reason to believe that this is the cause of
Yinghai's oops, it does need to be fixed.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index fb1f1cc45142..c6b6cf55f3e2 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -86,8 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
 {
 	int cpu;
 	cpumask_t cpumask;
+	unsigned long flags;
+
 	set_need_resched();
-	spin_lock(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	if (unlikely(!rcp->signaled)) {
 		rcp->signaled = 1;
 		/*
@@ -113,7 +115,7 @@ static void force_quiescent_state(struct rcu_data *rdp,
 		for_each_cpu_mask_nr(cpu, cpumask)
 			smp_send_reschedule(cpu);
 	}
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 #else
 static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -301,17 +303,18 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 	int cpu;
 	long delta;
+	unsigned long flags;
 
 	/* Only let one CPU complain about others per time interval. */
 
-	spin_lock(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	delta = get_seconds() - rcp->gp_check;
 	if (delta < 2L || cpus_empty(rcp->cpumask)) {
 		spin_unlock(&rcp->lock);
 		return;
 	}
 	rcp->gp_check = get_seconds() + 30;
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 
 	/* OK, time to rat on our buddy... */
 
@@ -324,13 +327,15 @@ static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
 
 static void print_cpu_stall(struct rcu_ctrlblk *rcp)
 {
+	unsigned long flags;
+
 	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
 			smp_processor_id(), get_seconds(), rcp->gp_check);
 	dump_stack();
-	spin_lock(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
 		rcp->gp_check = get_seconds() + 30;
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 
 static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
@@ -413,6 +418,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
 static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
+	unsigned long flags;
+
 	if (rdp->quiescbatch != rcp->cur) {
 		/* start new grace period: */
 		rdp->qs_pending = 1;
@@ -436,7 +443,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 		return;
 	rdp->qs_pending = 0;
 
-	spin_lock(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	/*
 	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
 	 * during cpu startup. Ignore the quiescent state.
@@ -444,7 +451,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 	if (likely(rdp->quiescbatch == rcp->cur))
 		cpu_quiet(rdp->cpu, rcp);
 
-	spin_unlock(&rcp->lock);
+	spin_unlock_irqrestore(&rcp->lock, flags);
 }
 
 
@@ -469,21 +476,22 @@ static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
 				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
+	unsigned long flags;
+
 	/*
 	 * if the cpu going offline owns the grace period
 	 * we can block indefinitely waiting for it, so flush
 	 * it here
 	 */
-	spin_lock_bh(&rcp->lock);
+	spin_lock_irqsave(&rcp->lock, flags);
 	if (rcp->cur != rcp->completed)
 		cpu_quiet(rdp->cpu, rcp);
 	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
 	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
-	spin_unlock_bh(&rcp->lock);
+	spin_unlock(&rcp->lock);
 
-	local_irq_disable();
 	this_rdp->qlen += rdp->qlen;
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 static void rcu_offline_cpu(int cpu)
@@ -550,12 +558,12 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
 			/* and start it/schedule start if it's a new batch */
-			spin_lock(&rcp->lock);
+			spin_lock_irqsave(&rcp->lock, flags);
 			if (rcu_batch_after(rdp->batch, rcp->pending)) {
 				rcp->pending = rdp->batch;
 				rcu_start_batch(rcp);
 			}
-			spin_unlock(&rcp->lock);
+			spin_unlock_irqrestore(&rcp->lock, flags);
 		}
 	}
 
-- 
cgit v1.2.3-58-ga151


From 0c925d79234fe77589d8ff3861f9f8bb9e7fc3f6 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Mon, 18 Aug 2008 21:49:51 -0700
Subject: rcuclassic: fix compilation NG

fix:

  CC      kernel/rcuclassic.o
kernel/rcuclassic.c: In function '__rcu_process_callbacks':
kernel/rcuclassic.c:561: error: 'flags' undeclared (first use in this function)
kernel/rcuclassic.c:561: error: (Each undeclared identifier is reported only once
kernel/rcuclassic.c:561: error: for each function it appears in.)

Declare missing variable flags.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index c6b6cf55f3e2..01e761a6b38c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -557,6 +557,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 		local_irq_enable();
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
+			unsigned long flags;
+
 			/* and start it/schedule start if it's a new batch */
 			spin_lock_irqsave(&rcp->lock, flags);
 			if (rcu_batch_after(rdp->batch, rcp->pending)) {
-- 
cgit v1.2.3-58-ga151


From af4491e51632d01fbc2b856ffa9ebcd4b38db68c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:02 +0200
Subject: sched: rt-bandwidth for user grouping interface

rt_runtime is a signed value

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/user.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
 {
 	struct user_struct *up = container_of(kobj, struct user_struct, kobj);
 
-	return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
+	return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
 }
 
 static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
 	unsigned long rt_runtime;
 	int rc;
 
-	sscanf(buf, "%lu", &rt_runtime);
+	sscanf(buf, "%ld", &rt_runtime);
 
 	rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
 
-- 
cgit v1.2.3-58-ga151


From 6f0d5c390e4206dcb3804a5072a048fdb7d2b428 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:03 +0200
Subject: sched: rt-bandwidth accounting fix

It fixes an accounting bug where we would continue accumulating runtime
even though the bandwidth control is disabled. This would lead to very long
throttle periods once bandwidth control gets turned on again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 998ba54b4543..77340b04a538 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -438,9 +438,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 {
 	u64 runtime = sched_rt_runtime(rt_rq);
 
-	if (runtime == RUNTIME_INF)
-		return 0;
-
 	if (rt_rq->rt_throttled)
 		return rt_rq_throttled(rt_rq);
 
@@ -491,9 +488,11 @@ static void update_curr_rt(struct rq *rq)
 		rt_rq = rt_rq_of_se(rt_se);
 
 		spin_lock(&rt_rq->rt_runtime_lock);
-		rt_rq->rt_time += delta_exec;
-		if (sched_rt_runtime_exceeded(rt_rq))
-			resched_task(curr);
+		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+			rt_rq->rt_time += delta_exec;
+			if (sched_rt_runtime_exceeded(rt_rq))
+				resched_task(curr);
+		}
 		spin_unlock(&rt_rq->rt_runtime_lock);
 	}
 }
-- 
cgit v1.2.3-58-ga151


From 0b148fa04852859972abbf848177b92daeef138a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:04 +0200
Subject: sched: rt-bandwidth group disable fixes

More extensive disable of bandwidth control. It allows sysctl_sched_rt_runtime
to disable full group bandwidth control.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 9 ++++++++-
 kernel/sched_rt.c | 5 ++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 9a1ddb84e26d..c1bee5fb8154 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 
+static inline int rt_bandwidth_enabled(void);
+
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
@@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 77340b04a538..94daace5ee15 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 	int i, idle = 1;
 	cpumask_t span;
 
-	if (rt_b->rt_runtime == RUNTIME_INF)
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return 1;
 
 	span = sched_rt_period_mask();
@@ -484,6 +484,9 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 
+	if (!rt_bandwidth_enabled())
+		return;
+
 	for_each_sched_rt_entity(rt_se) {
 		rt_rq = rt_rq_of_se(rt_se);
 
-- 
cgit v1.2.3-58-ga151


From eb755805f21bd5ded84026e167b7a90887ac42e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:05 +0200
Subject: sched: extract walk_tg_tree()

Extract walk_tg_tree() and make it a little more generic so we can use it
in the schedulablity test.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c1bee5fb8154..8c019a19d052 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1387,38 +1387,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#ifdef CONFIG_SMP
-static unsigned long source_load(int cpu, int type);
-static unsigned long target_load(int cpu, int type);
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-static unsigned long cpu_avg_load_per_task(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	if (rq->nr_running)
-		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
-
-	return rq->avg_load_per_task;
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED))
+typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static void
-walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
+static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
+	int ret;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, cpu, sd);
+	ret = (*down)(parent, data);
+	if (ret)
+		goto out_unlock;
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1426,14 +1412,42 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, cpu, sd);
+	ret = (*up)(parent, data);
+	if (ret)
+		goto out_unlock;
 
 	child = parent;
 	parent = parent->parent;
 	if (parent)
 		goto up;
+out_unlock:
 	rcu_read_unlock();
+
+	return ret;
+}
+
+static int tg_nop(struct task_group *tg, void *data)
+{
+	return 0;
 }
+#endif
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+static unsigned long cpu_avg_load_per_task(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (rq->nr_running)
+		rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+
+	return rq->avg_load_per_task;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -1493,11 +1507,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
  * This needs to be done in a bottom-up fashion because the rq weight of a
  * parent group depends on the shares of its child groups.
  */
-static void
-tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long rq_weight = 0;
 	unsigned long shares = 0;
+	struct sched_domain *sd = data;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1522,6 +1536,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
+
+	return 0;
 }
 
 /*
@@ -1529,10 +1545,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static void
-tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+static int tg_load_down(struct task_group *tg, void *data)
 {
 	unsigned long load;
+	long cpu = (long)data;
 
 	if (!tg->parent) {
 		load = cpu_rq(cpu)->load.weight;
@@ -1543,11 +1559,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 	}
 
 	tg->cfs_rq[cpu]->h_load = load;
-}
 
-static void
-tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
+	return 0;
 }
 
 static void update_shares(struct sched_domain *sd)
@@ -1557,7 +1570,7 @@ static void update_shares(struct sched_domain *sd)
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
 		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
+		walk_tg_tree(tg_nop, tg_shares_up, sd);
 	}
 }
 
@@ -1568,9 +1581,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 	spin_lock(&rq->lock);
 }
 
-static void update_h_load(int cpu)
+static void update_h_load(long cpu)
 {
-	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
+	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
 #else
-- 
cgit v1.2.3-58-ga151


From 9a7e0b180da21885988d47558671cf580279f9d6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 19 Aug 2008 12:33:06 +0200
Subject: sched: rt-bandwidth fixes

The last patch allows sysctl_sched_rt_runtime to disable bandwidth accounting
for the group scheduler - however it doesn't deal with sched_setscheduler(),
which will keep tasks out of groups that have no assigned runtime.

If we relax this, we get into the situation where RT tasks can get into a group
when we disable bandwidth control, and then starve them by enabling it again.

Rework the schedulability code to check for this condition and fail to turn
on bandwidth control with -EBUSY when this situation is found.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 125 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 63 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8c019a19d052..e41bdae2778d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -300,9 +300,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
 static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 #endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_FAIR_GROUP_SCHED */
+#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED))
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED)
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
@@ -5082,7 +5082,8 @@ recheck:
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
 		 */
-		if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
+		if (rt_bandwidth_enabled() && rt_policy(policy) &&
+				task_group(p)->rt_bandwidth.rt_runtime == 0)
 			return -EPERM;
 #endif
 
@@ -8707,73 +8708,77 @@ static DEFINE_MUTEX(rt_constraints_mutex);
 static unsigned long to_ratio(u64 period, u64 runtime)
 {
 	if (runtime == RUNTIME_INF)
-		return 1ULL << 16;
+		return 1ULL << 20;
 
-	return div64_u64(runtime << 16, period);
+	return div64_u64(runtime << 20, period);
 }
 
-#ifdef CONFIG_CGROUP_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
+/* Must be called with tasklist_lock held */
+static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_group *tgi, *parent = tg->parent;
-	unsigned long total = 0;
+	struct task_struct *g, *p;
 
-	if (!parent) {
-		if (global_rt_period() < period)
-			return 0;
+	do_each_thread(g, p) {
+		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+			return 1;
+	} while_each_thread(g, p);
 
-		return to_ratio(period, runtime) <
-			to_ratio(global_rt_period(), global_rt_runtime());
-	}
+	return 0;
+}
 
-	if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period)
-		return 0;
+struct rt_schedulable_data {
+	struct task_group *tg;
+	u64 rt_period;
+	u64 rt_runtime;
+};
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &parent->children, siblings) {
-		if (tgi == tg)
-			continue;
+static int tg_schedulable(struct task_group *tg, void *data)
+{
+	struct rt_schedulable_data *d = data;
+	struct task_group *child;
+	unsigned long total, sum = 0;
+	u64 period, runtime;
+
+	period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+	runtime = tg->rt_bandwidth.rt_runtime;
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	if (tg == d->tg) {
+		period = d->rt_period;
+		runtime = d->rt_runtime;
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) <=
-		to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
-				parent->rt_bandwidth.rt_runtime);
-}
-#elif defined CONFIG_USER_SCHED
-static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
-{
-	struct task_group *tgi;
-	unsigned long total = 0;
-	unsigned long global_ratio =
-		to_ratio(global_rt_period(), global_rt_runtime());
+	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+		return -EBUSY;
 
-	rcu_read_lock();
-	list_for_each_entry_rcu(tgi, &task_groups, list) {
-		if (tgi == tg)
-			continue;
+	total = to_ratio(period, runtime);
 
-		total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period),
-				tgi->rt_bandwidth.rt_runtime);
+	list_for_each_entry_rcu(child, &tg->children, siblings) {
+		period = ktime_to_ns(child->rt_bandwidth.rt_period);
+		runtime = child->rt_bandwidth.rt_runtime;
+
+		if (child == d->tg) {
+			period = d->rt_period;
+			runtime = d->rt_runtime;
+		}
+
+		sum += to_ratio(period, runtime);
 	}
-	rcu_read_unlock();
 
-	return total + to_ratio(period, runtime) < global_ratio;
+	if (sum > total)
+		return -EINVAL;
+
+	return 0;
 }
-#endif
 
-/* Must be called with tasklist_lock held */
-static inline int tg_has_rt_tasks(struct task_group *tg)
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_struct *g, *p;
-	do_each_thread(g, p) {
-		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
-			return 1;
-	} while_each_thread(g, p);
-	return 0;
+	struct rt_schedulable_data data = {
+		.tg = tg,
+		.rt_period = period,
+		.rt_runtime = runtime,
+	};
+
+	return walk_tg_tree(tg_schedulable, tg_nop, &data);
 }
 
 static int tg_set_bandwidth(struct task_group *tg,
@@ -8783,14 +8788,9 @@ static int tg_set_bandwidth(struct task_group *tg,
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	if (rt_runtime == 0 && tg_has_rt_tasks(tg)) {
-		err = -EBUSY;
+	err = __rt_schedulable(tg, rt_period, rt_runtime);
+	if (err)
 		goto unlock;
-	}
-	if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
-		err = -EINVAL;
-		goto unlock;
-	}
 
 	spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 	tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8867,8 +8867,9 @@ static int sched_rt_global_constraints(void)
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
 	mutex_lock(&rt_constraints_mutex);
-	if (!__rt_schedulable(tg, rt_period, rt_runtime))
-		ret = -EINVAL;
+	read_lock(&tasklist_lock);
+	ret = __rt_schedulable(tg, rt_period, rt_runtime);
+	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;
-- 
cgit v1.2.3-58-ga151


From 275a89bdd3868af3008852594d2e169eaf69441b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 21 Aug 2008 06:14:55 -0700
Subject: rcu: use irq-safe locks

Some earlier tip/core/rcu patches caused RCU to incorrectly enable irqs
too early in boot.  This caused Yinghai's repeated-kexec testing to
hit oopses, presumably due to so that device interrupts left over from
the prior kernel instance (which would oops the newly booting kernel
before it got a chance to reset said devices).  This patch therefore
converts all the local_irq_disable()s in rcuclassic.c to local_irq_save().

Besides, I never did like local_irq_disable() anyway.  ;-)

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 01e761a6b38c..3f6918966bda 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -247,6 +247,7 @@ static inline void raise_rcu_softirq(void)
  */
 static void rcu_do_batch(struct rcu_data *rdp)
 {
+	unsigned long flags;
 	struct rcu_head *next, *list;
 	int count = 0;
 
@@ -261,9 +262,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	}
 	rdp->donelist = list;
 
-	local_irq_disable();
+	local_irq_save(flags);
 	rdp->qlen -= count;
-	local_irq_enable();
+	local_irq_restore(flags);
 	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
 		rdp->blimit = blimit;
 
@@ -464,12 +465,14 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 				struct rcu_head **tail, long batch)
 {
+	unsigned long flags;
+
 	if (list) {
-		local_irq_disable();
+		local_irq_save(flags);
 		this_rdp->batch = batch;
 		*this_rdp->nxttail[2] = list;
 		this_rdp->nxttail[2] = tail;
-		local_irq_enable();
+		local_irq_restore(flags);
 	}
 }
 
@@ -521,10 +524,11 @@ static void rcu_offline_cpu(int cpu)
 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 					struct rcu_data *rdp)
 {
+	unsigned long flags;
 	long completed_snap;
 
 	if (rdp->nxtlist) {
-		local_irq_disable();
+		local_irq_save(flags);
 		completed_snap = ACCESS_ONCE(rcp->completed);
 
 		/*
@@ -554,7 +558,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 			rdp->nxttail[0] = &rdp->nxtlist;
 		}
 
-		local_irq_enable();
+		local_irq_restore(flags);
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
 			unsigned long flags;
-- 
cgit v1.2.3-58-ga151


From a38409fbb5181324fb73b359189a72e02b6c7030 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 Aug 2008 12:16:09 +0200
Subject: dma-coherent: export dma_[alloc|release]_from_coherent methods

fixes modular builds:

  ERROR: "dma_alloc_from_coherent" [sound/core/snd-page-alloc.ko] undefined!
  ERROR: "dma_release_from_coherent" [sound/core/snd-page-alloc.ko] undefined!

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/dma-coherent.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index c1d4d5b4c61c..f013a0c2e111 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -124,6 +124,7 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
 	}
 	return (mem != NULL);
 }
+EXPORT_SYMBOL(dma_alloc_from_coherent);
 
 /**
  * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
@@ -151,3 +152,4 @@ int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(dma_release_from_coherent);
-- 
cgit v1.2.3-58-ga151


From 94d3d8247de22c5b0624aa00616ceca459498e55 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 20 Aug 2008 16:54:41 -0700
Subject: sched: do_wait_for_common: use signal_pending_state()

Change do_wait_for_common() to use signal_pending_state() instead of open
coding.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d601fb0406ca..da7c5d23cc03 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4599,10 +4599,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 		wait.flags |= WQ_FLAG_EXCLUSIVE;
 		__add_wait_queue_tail(&x->wait, &wait);
 		do {
-			if ((state == TASK_INTERRUPTIBLE &&
-			     signal_pending(current)) ||
-			    (state == TASK_KILLABLE &&
-			     fatal_signal_pending(current))) {
+			if (signal_pending_state(state, current)) {
 				timeout = -ERESTARTSYS;
 				break;
 			}
-- 
cgit v1.2.3-58-ga151


From f31e11d87a5d7601636710195891ba462ad99f11 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 20 Aug 2008 16:54:44 -0700
Subject: wait_task_inactive(): don't consider task->nivcsw

If wait_task_inactive() returns success the task was deactivated.  In that
case schedule() always increments ->nvcsw which alone can be used as a
"generation counter".

If the next call returns the same number, we can be sure that the task was
unscheduled.  Otherwise, because we know that .on_rq == 0 again, ->nvcsw
should have been changed in between.

Q: perhaps it is better to do "ncsw = (p->nvcsw << 1) | 1" ?  This
decreases the possibility of "was it unscheduled" false positive when
->nvcsw == 0.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index da7c5d23cc03..908670aa215a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1921,11 +1921,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		running = task_running(rq, p);
 		on_rq = p->se.on_rq;
 		ncsw = 0;
-		if (!match_state || p->state == match_state) {
-			ncsw = p->nivcsw + p->nvcsw;
-			if (unlikely(!ncsw))
-				ncsw = 1;
-		}
+		if (!match_state || p->state == match_state)
+			ncsw = p->nvcsw ?: 1;
 		task_rq_unlock(rq, &flags);
 
 		/*
-- 
cgit v1.2.3-58-ga151


From 93dcf55f828b035fc93fc19eb03c1390e1e6d570 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 20 Aug 2008 16:54:44 -0700
Subject: wait_task_inactive: "improve" the returned value for ->nvcsw == 0

wait_task_inactive() returns 1 when p->nvcsw == 0 || p->nvcsw == 1.  This
means that two subsequent calls can return the same number while the task
was scheduled in between.

Change the code to return "nvcsw | LONG_MIN" instead of "nvcsw ?: 1", now
the overlap always needs LONG_MAX schedules.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 908670aa215a..6a43c8942b05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1922,7 +1922,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		on_rq = p->se.on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
-			ncsw = p->nvcsw ?: 1;
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, &flags);
 
 		/*
-- 
cgit v1.2.3-58-ga151


From 65eb3dc609dec17deea48dcd4de2e549d29a9824 Mon Sep 17 00:00:00 2001
From: Kevin Diggs <kevdig@hypersurf.com>
Date: Tue, 26 Aug 2008 10:26:54 +0200
Subject: sched: add kernel doc for the completion, fix
 kernel-doc-nano-HOWTO.txt

This patch adds kernel doc for the completion feature.

An error in the split-man.pl PERL snippet in kernel-doc-nano-HOWTO.txt is
also fixed.

Signed-off-by: Kevin Diggs <kevdig@hypersurf.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-doc-nano-HOWTO.txt |  4 +--
 include/linux/completion.h              | 41 ++++++++++++++++++++++++
 kernel/sched.c                          | 56 +++++++++++++++++++++++++++++++++
 3 files changed, 99 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kernel-doc-nano-HOWTO.txt b/Documentation/kernel-doc-nano-HOWTO.txt
index 0bd32748a467..c6841eee9598 100644
--- a/Documentation/kernel-doc-nano-HOWTO.txt
+++ b/Documentation/kernel-doc-nano-HOWTO.txt
@@ -168,10 +168,10 @@ if ($#ARGV < 0) {
 mkdir $ARGV[0],0777;
 $state = 0;
 while (<STDIN>) {
-    if (/^\.TH \"[^\"]*\" 4 \"([^\"]*)\"/) {
+    if (/^\.TH \"[^\"]*\" 9 \"([^\"]*)\"/) {
 	if ($state == 1) { close OUT }
 	$state = 1;
-	$fn = "$ARGV[0]/$1.4";
+	$fn = "$ARGV[0]/$1.9";
 	print STDERR "Creating $fn\n";
 	open OUT, ">$fn" or die "can't open $fn: $!\n";
 	print OUT $_;
diff --git a/include/linux/completion.h b/include/linux/completion.h
index 02ef8835999c..4a6b604ef7e4 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -10,6 +10,18 @@
 
 #include <linux/wait.h>
 
+/**
+ * struct completion - structure used to maintain state for a "completion"
+ *
+ * This is the opaque structure used to maintain the state for a "completion".
+ * Completions currently use a FIFO to queue threads that have to wait for
+ * the "completion" event.
+ *
+ * See also:  complete(), wait_for_completion() (and friends _timeout,
+ * _interruptible, _interruptible_timeout, and _killable), init_completion(),
+ * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and
+ * INIT_COMPLETION().
+ */
 struct completion {
 	unsigned int done;
 	wait_queue_head_t wait;
@@ -21,6 +33,14 @@ struct completion {
 #define COMPLETION_INITIALIZER_ONSTACK(work) \
 	({ init_completion(&work); work; })
 
+/**
+ * DECLARE_COMPLETION: - declare and initialize a completion structure
+ * @work:  identifier for the completion structure
+ *
+ * This macro declares and initializes a completion structure. Generally used
+ * for static declarations. You should use the _ONSTACK variant for automatic
+ * variables.
+ */
 #define DECLARE_COMPLETION(work) \
 	struct completion work = COMPLETION_INITIALIZER(work)
 
@@ -29,6 +49,13 @@ struct completion {
  * completions - so we use the _ONSTACK() variant for those that
  * are on the kernel stack:
  */
+/**
+ * DECLARE_COMPLETION_ONSTACK: - declare and initialize a completion structure
+ * @work:  identifier for the completion structure
+ *
+ * This macro declares and initializes a completion structure on the kernel
+ * stack.
+ */
 #ifdef CONFIG_LOCKDEP
 # define DECLARE_COMPLETION_ONSTACK(work) \
 	struct completion work = COMPLETION_INITIALIZER_ONSTACK(work)
@@ -36,6 +63,13 @@ struct completion {
 # define DECLARE_COMPLETION_ONSTACK(work) DECLARE_COMPLETION(work)
 #endif
 
+/**
+ * init_completion: - Initialize a dynamically allocated completion
+ * @x:  completion structure that is to be initialized
+ *
+ * This inline function will initialize a dynamically created completion
+ * structure.
+ */
 static inline void init_completion(struct completion *x)
 {
 	x->done = 0;
@@ -55,6 +89,13 @@ extern bool completion_done(struct completion *x);
 extern void complete(struct completion *);
 extern void complete_all(struct completion *);
 
+/**
+ * INIT_COMPLETION: - reinitialize a completion structure
+ * @x:  completion structure to be reinitialized
+ *
+ * This macro should be used to reinitialize a completion structure so it can
+ * be reused. This is especially important after complete_all() is used.
+ */
 #define INIT_COMPLETION(x)	((x).done = 0)
 
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 29e2ec0bd831..93f5ea08be97 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4565,6 +4565,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
+/**
+ * complete: - signals a single thread waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up a single thread waiting on this completion. Threads will be
+ * awakened in the same order in which they were queued.
+ *
+ * See also complete_all(), wait_for_completion() and related routines.
+ */
 void complete(struct completion *x)
 {
 	unsigned long flags;
@@ -4576,6 +4585,12 @@ void complete(struct completion *x)
 }
 EXPORT_SYMBOL(complete);
 
+/**
+ * complete_all: - signals all threads waiting on this completion
+ * @x:  holds the state of this particular completion
+ *
+ * This will wake up all threads waiting on this particular completion event.
+ */
 void complete_all(struct completion *x)
 {
 	unsigned long flags;
@@ -4624,12 +4639,31 @@ wait_for_common(struct completion *x, long timeout, int state)
 	return timeout;
 }
 
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
 void __sched wait_for_completion(struct completion *x)
 {
 	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(wait_for_completion);
 
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies. It is not
+ * interruptible.
+ */
 unsigned long __sched
 wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 {
@@ -4637,6 +4671,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
 }
 EXPORT_SYMBOL(wait_for_completion_timeout);
 
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ */
 int __sched wait_for_completion_interruptible(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4646,6 +4687,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible);
 
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x:  holds the state of this particular completion
+ * @timeout:  timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ */
 unsigned long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
@@ -4654,6 +4703,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
 }
 EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
 
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x:  holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */
 int __sched wait_for_completion_killable(struct completion *x)
 {
 	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
-- 
cgit v1.2.3-58-ga151


From 0cd418ddb1ee88df7d16d5df06cb2da68eceb9e4 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Mon, 18 Aug 2008 14:39:21 -0700
Subject: rcuclassic: fix compiler warning

CC      kernel/rcuclassic.o
kernel/rcuclassic.c: In function 'rcu_init_percpu_data':
kernel/rcuclassic.c:705: warning: comparison of distinct pointer types lacks a cast
kernel/rcuclassic.c:713: warning: comparison of distinct pointer types lacks a cast

flags should be unsigned long.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 3f6918966bda..743cf0550ff4 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -714,7 +714,7 @@ void rcu_check_callbacks(int cpu, int user)
 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 						struct rcu_data *rdp)
 {
-	long flags;
+	unsigned long flags;
 
 	spin_lock_irqsave(&rcp->lock, flags);
 	memset(rdp, 0, sizeof(*rdp));
-- 
cgit v1.2.3-58-ga151


From aec0a5142cb52aaa152d962d84a838e25d520742 Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Thu, 28 Aug 2008 14:42:49 +0530
Subject: sched: call resched_task() conditionally from new task wake up path

- During wake up of a new task, task_new_fair() can do a resched_task()
  on the current task. Later in the code path, check_preempt_curr() also ends
  up doing the same, which can be avoided. Check if TIF_NEED_RESCHED is
  already set for the current task.

- task_new_fair() does a resched_task() on the current task unconditionally.
  This can be done only in case when child runs before the parent.

So this is a small speedup.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fb8994c6d4bb..8264bb5dbd51 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1348,6 +1348,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (unlikely(se == pse))
 		return;
 
+	/*
+	 * We can come here with TIF_NEED_RESCHED already set from new task
+	 * wake up path.
+	 */
+	if (test_tsk_need_resched(curr))
+		return;
+
 	cfs_rq_of(pse)->next = pse;
 
 	/*
@@ -1620,10 +1627,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
+		resched_task(rq->curr);
 	}
 
 	enqueue_task_fair(rq, p, 0);
-	resched_task(rq->curr);
 }
 
 /*
-- 
cgit v1.2.3-58-ga151


From 29cbef4869bf288256ab76c7dc674cb132b35de2 Mon Sep 17 00:00:00 2001
From: Joe Korty <joe.korty@ccur.com>
Date: Wed, 27 Aug 2008 11:21:39 -0400
Subject: make might_sleep() display the oopsing process

Expand might_sleep's printk to indicate the oopsing process.

Signed-off-by: Joe Korty <joe.korty@ccur.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 93f5ea08be97..6e283dc7679a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8233,8 +8233,8 @@ void __might_sleep(char *file, int line)
 		prev_jiffy = jiffies;
 		printk(KERN_ERR "BUG: sleeping function called from invalid"
 				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
+		printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(), current->pid, current->comm);
 		debug_show_held_locks(current);
 		if (irqs_disabled())
 			print_irqtrace_events(current);
-- 
cgit v1.2.3-58-ga151


From aef745fca016aea45adae5c98e8698904dd8ad51 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 Aug 2008 11:34:43 +0200
Subject: sched: clean up __might_sleep()

add KERN_ to the printout and clean up the flow a bit.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6e283dc7679a..b112caaa400a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8226,20 +8226,25 @@ void __might_sleep(char *file, int line)
 #ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) &&
-	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
-		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
-			return;
-		prev_jiffy = jiffies;
-		printk(KERN_ERR "BUG: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d, pid: %d, name: %s\n",
-			in_atomic(), irqs_disabled(), current->pid, current->comm);
-		debug_show_held_locks(current);
-		if (irqs_disabled())
-			print_irqtrace_events(current);
-		dump_stack();
-	}
+	if ((!in_atomic() && !irqs_disabled()) ||
+		    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+	dump_stack();
 #endif
 }
 EXPORT_SYMBOL(__might_sleep);
-- 
cgit v1.2.3-58-ga151


From 7940ca3605b77f20cc6e9852e4ca6f2d725b5653 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 19 Aug 2008 13:40:47 +0200
Subject: sched: extract walk_tg_tree(), fix

fix:

 kernel/sched.c: In function '__rt_schedulable':
 kernel/sched.c:8771: error: implicit declaration of function 'walk_tg_tree'
 kernel/sched.c:8771: error: 'tg_nop' undeclared (first use in this function)
 kernel/sched.c:8771: error: (Each undeclared identifier is reported only once
 kernel/sched.c:8771: error: for each function it appears in.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e41bdae2778d..703f56d5db5f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
 	update_load_sub(&rq->load, load);
 }
 
-#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED)
+#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
 typedef int (*tg_visitor)(struct task_group *, void *);
 
 /*
-- 
cgit v1.2.3-58-ga151


From 268364a0f48aee2f851f9d1ef8a6cda0f3039ef1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yhlu.kernel@gmail.com>
Date: Thu, 4 Sep 2008 21:02:44 +0200
Subject: IO resources: add reserve_region_with_split()

add reserve_region_with_split() to not lose e820 reserved entries if
they overlap with existing IO regions:

with test case by extend 0xe0000000 - 0xeffffff to 0xdd800000 -
we get:
	e0000000-efffffff : PCI MMCONFIG 0
		 e0000000-efffffff : reserved

and in /proc/iomem we get:
	found conflict for reserved [dd800000, efffffff], try to reserve with split
	    __reserve_region_with_split: (PCI Bus #80) [dd000000, ddffffff], res: (reserved) [dd800000, efffffff]
	    __reserve_region_with_split: (PCI Bus #00) [de000000, dfffffff], res: (reserved) [de000000, efffffff]
	initcall pci_subsys_init+0x0/0x121 returned 0 after 381 msecs
in dmesg

various fixes and improvements suggested by Linus.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ioport.h |  3 +++
 kernel/resource.c      | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ioport.h b/include/linux/ioport.h
index 8d3b7a9afd17..fded376b94e3 100644
--- a/include/linux/ioport.h
+++ b/include/linux/ioport.h
@@ -108,6 +108,9 @@ extern struct resource iomem_resource;
 
 extern int request_resource(struct resource *root, struct resource *new);
 extern int release_resource(struct resource *new);
+extern void reserve_region_with_split(struct resource *root,
+			     resource_size_t start, resource_size_t end,
+			     const char *name);
 extern int insert_resource(struct resource *parent, struct resource *new);
 extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new);
 extern int allocate_resource(struct resource *root, struct resource *new,
diff --git a/kernel/resource.c b/kernel/resource.c
index 03d796c1b2e9..414d6fc9131e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -516,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
 	return result;
 }
 
+static void __init __reserve_region_with_split(struct resource *root,
+		resource_size_t start, resource_size_t end,
+		const char *name)
+{
+	struct resource *parent = root;
+	struct resource *conflict;
+	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+
+	if (!res)
+		return;
+
+	res->name = name;
+	res->start = start;
+	res->end = end;
+	res->flags = IORESOURCE_BUSY;
+
+	for (;;) {
+		conflict = __request_resource(parent, res);
+		if (!conflict)
+			break;
+		if (conflict != parent) {
+			parent = conflict;
+			if (!(conflict->flags & IORESOURCE_BUSY))
+				continue;
+		}
+
+		/* Uhhuh, that didn't work out.. */
+		kfree(res);
+		res = NULL;
+		break;
+	}
+
+	if (!res) {
+		printk(KERN_DEBUG "    __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n",
+			 conflict->name, conflict->start, conflict->end,
+			 name, start, end);
+
+		/* failed, split and try again */
+
+		/* conflict coverred whole area */
+		if (conflict->start <= start && conflict->end >= end)
+			return;
+
+		if (conflict->start > start)
+			__reserve_region_with_split(root, start, conflict->start-1, name);
+		if (!(conflict->flags & IORESOURCE_BUSY)) {
+			resource_size_t common_start, common_end;
+
+			common_start = max(conflict->start, start);
+			common_end = min(conflict->end, end);
+			if (common_start < common_end)
+				__reserve_region_with_split(root, common_start, common_end, name);
+		}
+		if (conflict->end < end)
+			__reserve_region_with_split(root, conflict->end+1, end, name);
+	}
+
+}
+
+void reserve_region_with_split(struct resource *root,
+		resource_size_t start, resource_size_t end,
+		const char *name)
+{
+	write_lock(&resource_lock);
+	__reserve_region_with_split(root, start, end, name);
+	write_unlock(&resource_lock);
+}
+
 EXPORT_SYMBOL(adjust_resource);
 
 /**
-- 
cgit v1.2.3-58-ga151


From c8bfff6dd4d41834f4952cbc49e28e31906a6188 Mon Sep 17 00:00:00 2001
From: Krzysztof Helt <krzysztof.h1@poczta.fm>
Date: Fri, 5 Sep 2008 23:46:19 +0200
Subject: sched: compilation fix with gcc 3.4.6

I found that 2.6.27-rc5-mm1 does not compile with gcc 3.4.6.
The error is:
  CC      kernel/sched.o
kernel/sched.c: In function `start_rt_bandwidth':
kernel/sched.c:208: sorry, unimplemented: inlining failed in call to 'rt_bandwidth_enabled': function body not available
kernel/sched.c:214: sorry, unimplemented: called from here
make[1]: *** [kernel/sched.o] Error 1
make: *** [kernel] Error 2

It seems that the gcc 3.4.6 requires full inline definition before first usage.
The patch below fixes the compilation problem.

Signed-off-by: Krzysztof Helt <krzysztof.h1@wp.pl> (if needed>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 703f56d5db5f..4de2bfb28c58 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -204,7 +204,10 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
 
-static inline int rt_bandwidth_enabled(void);
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;
+}
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
@@ -841,11 +844,6 @@ static inline u64 global_rt_runtime(void)
 	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
-static inline int rt_bandwidth_enabled(void)
-{
-	return sysctl_sched_rt_runtime >= 0;
-}
-
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
-- 
cgit v1.2.3-58-ga151


From 38736f475071b80b66be28af7b44c854073699cc Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Sat, 6 Sep 2008 14:50:23 +0530
Subject: sched: fix __load_balance_iterator() for cfq with only one task

The __load_balance_iterator() returns a NULL when there's only one
sched_entity which is a task. It is caused by the following code-path.

	/* Skip over entities that are not tasks */
	do {
		se = list_entry(next, struct sched_entity, group_node);
		next = next->next;
	} while (next != &cfs_rq->tasks && !entity_is_task(se));

	if (next == &cfs_rq->tasks)
		return NULL;
	^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
      This will return NULL even when se is a task.

As a side-effect, there was a regression in sched_mc behavior since 2.6.25,
since iter_move_one_task() when it calls load_balance_start_fair(),
would not get any tasks to move!

Fix this by checking if the last entity was a task or not.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8264bb5dbd51..a10ac0bcee64 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1458,7 +1458,7 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 		next = next->next;
 	} while (next != &cfs_rq->tasks && !entity_is_task(se));
 
-	if (next == &cfs_rq->tasks)
+	if (next == &cfs_rq->tasks && !entity_is_task(se))
 		return NULL;
 
 	cfs_rq->balance_iterator = next;
-- 
cgit v1.2.3-58-ga151


From 3ba35573ad9a149a3af19625b502679283382f6b Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Sun, 31 Aug 2008 19:58:49 +0200
Subject: kernel/cpu.c: Move the CPU_DYING notifiers

When a cpu is taken offline, the CPU_DYING notifiers are called on the
dying cpu. According to <linux/notifiers.h>, the cpu should be "not
running any task, not handling interrupts, soon dead".

For the current implementation, this is not true:
- __cpu_disable can fail. If it fails, then the cpu will remain alive
  and happy.
- At least on x86, __cpu_disable() briefly enables the local interrupts
  to handle any outstanding interrupts.

What about moving CPU_DYING down a few lines, behind the __cpu_disable()
line?
There are only two CPU_DYING handlers in the kernel right now: one in
kvm, one in the scheduler. Both should work with the patch applied
[and: I'm not sure if either one handles a failing __cpu_disable()]

The patch survives simple offlining a cpu. kvm untested due to lack
of a test setup.

Signed-off-By: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpu.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..9e7ebde1331c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -199,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
 	struct take_cpu_down_param *param = _param;
 	int err;
 
-	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
-				param->hcpu);
 	/* Ensure this CPU doesn't handle any more interrupts. */
 	err = __cpu_disable();
 	if (err < 0)
 		return err;
 
+	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
+				param->hcpu);
+
 	/* Force idle task to run as soon as we yield: it should
 	   immediately notice cpu is offline and die quickly. */
 	sched_idle_next();
-- 
cgit v1.2.3-58-ga151


From e545a6140b698b2494daf0b32107bdcc5e901390 Mon Sep 17 00:00:00 2001
From: Manfred Spraul <manfred@colorfullife.com>
Date: Sun, 7 Sep 2008 16:57:22 +0200
Subject: kernel/cpu.c: create a CPU_STARTING cpu_chain notifier

Right now, there is no notifier that is called on a new cpu, before the new
cpu begins processing interrupts/softirqs.
Various kernel function would need that notification, e.g. kvm works around
by calling smp_call_function_single(), rcu polls cpu_online_map.

The patch adds a CPU_STARTING notification. It also adds a helper function
that sends the message to all cpu_chain handlers.

Tested on x86-64.
All other archs are untested. Especially on sparc, I'm not sure if I got
it right.

Signed-off-by: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/smp.c             |  3 +++
 arch/arm/kernel/smp.c               |  1 +
 arch/cris/arch-v32/kernel/smp.c     |  1 +
 arch/ia64/kernel/smpboot.c          |  1 +
 arch/m32r/kernel/smpboot.c          |  2 ++
 arch/mips/kernel/smp.c              |  2 ++
 arch/powerpc/kernel/smp.c           |  1 +
 arch/s390/kernel/smp.c              |  2 ++
 arch/sh/kernel/smp.c                |  2 ++
 arch/sparc/kernel/sun4d_smp.c       |  1 +
 arch/sparc/kernel/sun4m_smp.c       |  2 ++
 arch/um/kernel/smp.c                |  1 +
 arch/x86/kernel/smpboot.c           |  1 +
 arch/x86/mach-voyager/voyager_smp.c |  2 ++
 include/linux/cpu.h                 |  1 +
 include/linux/notifier.h            | 10 +++++++++-
 kernel/cpu.c                        | 19 +++++++++++++++++++
 17 files changed, 51 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 83df541650fc..06b6fdab639f 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -149,6 +149,9 @@ smp_callin(void)
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
 
+	/* inform the notifiers about the new cpu */
+	notify_cpu_starting(cpuid);
+
 	/* Must have completely accurate bogos.  */
 	local_irq_enable();
 
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index e9842f6767f9..e42a749a56dd 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -277,6 +277,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void)
 	/*
 	 * Enable local interrupts.
 	 */
+	notify_cpu_starting(cpu);
 	local_irq_enable();
 	local_fiq_enable();
 
diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c
index 952a24b2f5a9..52e16c6436f9 100644
--- a/arch/cris/arch-v32/kernel/smp.c
+++ b/arch/cris/arch-v32/kernel/smp.c
@@ -178,6 +178,7 @@ void __init smp_callin(void)
 	unmask_irq(IPI_INTR_VECT);
 	unmask_irq(TIMER0_INTR_VECT);
 	preempt_disable();
+	notify_cpu_starting(cpu);
 	local_irq_enable();
 
 	cpu_set(cpu, cpu_online_map);
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index bcea81e432fd..333b58f218d0 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -401,6 +401,7 @@ smp_callin (void)
 	spin_lock(&vector_lock);
 	/* Setup the per cpu irq handling data structures */
 	__setup_vector_irq(cpuid);
+	notify_cpu_starting(cpuid);
 	cpu_set(cpuid, cpu_online_map);
 	per_cpu(cpu_state, cpuid) = CPU_ONLINE;
 	spin_unlock(&vector_lock);
diff --git a/arch/m32r/kernel/smpboot.c b/arch/m32r/kernel/smpboot.c
index 2c03ac1d005f..fc2994811f15 100644
--- a/arch/m32r/kernel/smpboot.c
+++ b/arch/m32r/kernel/smpboot.c
@@ -498,6 +498,8 @@ static void __init smp_online(void)
 {
 	int cpu_id = smp_processor_id();
 
+	notify_cpu_starting(cpu_id);
+
 	local_irq_enable();
 
 	/* Get our bogomips. */
diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c
index 4410f172b8ab..7b59cfb7e602 100644
--- a/arch/mips/kernel/smp.c
+++ b/arch/mips/kernel/smp.c
@@ -121,6 +121,8 @@ asmlinkage __cpuinit void start_secondary(void)
 	cpu = smp_processor_id();
 	cpu_data[cpu].udelay_val = loops_per_jiffy;
 
+	notify_cpu_starting(cpu);
+
 	mp_ops->smp_finish();
 	set_cpu_sibling_map(cpu);
 
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5337ca7bb649..c27b10a1bd79 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -453,6 +453,7 @@ int __devinit start_secondary(void *unused)
 	secondary_cpu_time_init();
 
 	ipi_call_lock();
+	notify_cpu_starting(cpu);
 	cpu_set(cpu, cpu_online_map);
 	/* Update sibling maps */
 	base = cpu_first_thread_in_core(cpu);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 00b9b4dec5eb..9e8b1f9b8f4d 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -585,6 +585,8 @@ int __cpuinit start_secondary(void *cpuvoid)
 	/* Enable pfault pseudo page faults on this cpu. */
 	pfault_init();
 
+	/* call cpu notifiers */
+	notify_cpu_starting(smp_processor_id());
 	/* Mark this cpu as online */
 	spin_lock(&call_lock);
 	cpu_set(smp_processor_id(), cpu_online_map);
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index 60c50841143e..001778f9adaf 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -82,6 +82,8 @@ asmlinkage void __cpuinit start_secondary(void)
 
 	preempt_disable();
 
+	notify_cpu_starting(smp_processor_id());
+
 	local_irq_enable();
 
 	calibrate_delay();
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 69596402a500..446767e8f569 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -88,6 +88,7 @@ void __init smp4d_callin(void)
 	local_flush_cache_all();
 	local_flush_tlb_all();
 
+	notify_cpu_starting(cpuid);
 	/*
 	 * Unblock the master CPU _only_ when the scheduler state
 	 * of all secondary CPUs will be up-to-date, so after
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index a14a76ac7f36..9964890dc1db 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -71,6 +71,8 @@ void __cpuinit smp4m_callin(void)
 	local_flush_cache_all();
 	local_flush_tlb_all();
 
+	notify_cpu_starting(cpuid);
+
 	/* Get our local ticker going. */
 	smp_setup_percpu_timer();
 
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
index be2d50c3aa95..045772142844 100644
--- a/arch/um/kernel/smp.c
+++ b/arch/um/kernel/smp.c
@@ -85,6 +85,7 @@ static int idle_proc(void *cpup)
 	while (!cpu_isset(cpu, smp_commenced_mask))
 		cpu_relax();
 
+	notify_cpu_starting(cpu);
 	cpu_set(cpu, cpu_online_map);
 	default_idle();
 	return 0;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7985c5b3f916..0b8261c3cac2 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -257,6 +257,7 @@ static void __cpuinit smp_callin(void)
 	end_local_APIC_setup();
 	map_cpu_to_logical_apicid();
 
+	notify_cpu_starting(cpuid);
 	/*
 	 * Get our bogomips.
 	 *
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index ee0fba092157..199a5f4a873c 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
 
 	VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
 
+	notify_cpu_starting(cpuid);
+
 	/* enable interrupts */
 	local_irq_enable();
 
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index d7faf8808497..c2747ac2ae43 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -69,6 +69,7 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
 #endif
 
 int cpu_up(unsigned int cpu);
+void notify_cpu_starting(unsigned int cpu);
 extern void cpu_hotplug_init(void);
 extern void cpu_maps_update_begin(void);
 extern void cpu_maps_update_done(void);
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index da2698b0fdd1..b86fa2ffca0c 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -213,9 +213,16 @@ static inline int notifier_to_errno(int ret)
 #define CPU_DOWN_FAILED		0x0006 /* CPU (unsigned)v NOT going down */
 #define CPU_DEAD		0x0007 /* CPU (unsigned)v dead */
 #define CPU_DYING		0x0008 /* CPU (unsigned)v not running any task,
-				        * not handling interrupts, soon dead */
+					* not handling interrupts, soon dead.
+					* Called on the dying cpu, interrupts
+					* are already disabled. Must not
+					* sleep, must not fail */
 #define CPU_POST_DEAD		0x0009 /* CPU (unsigned)v dead, cpu_hotplug
 					* lock is dropped */
+#define CPU_STARTING		0x000A /* CPU (unsigned)v soon running.
+					* Called on the new cpu, just before
+					* enabling interrupts. Must not sleep,
+					* must not fail */
 
 /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
  * operation in progress
@@ -229,6 +236,7 @@ static inline int notifier_to_errno(int ret)
 #define CPU_DOWN_FAILED_FROZEN	(CPU_DOWN_FAILED | CPU_TASKS_FROZEN)
 #define CPU_DEAD_FROZEN		(CPU_DEAD | CPU_TASKS_FROZEN)
 #define CPU_DYING_FROZEN	(CPU_DYING | CPU_TASKS_FROZEN)
+#define CPU_STARTING_FROZEN	(CPU_STARTING | CPU_TASKS_FROZEN)
 
 /* Hibernation and suspend events */
 #define PM_HIBERNATION_PREPARE	0x0001 /* Going to hibernate */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f17e9854c246..dc45f2459efb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -453,6 +453,25 @@ out:
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
 
+/**
+ * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
+ * @cpu: cpu that just started
+ *
+ * This function calls the cpu_chain notifiers with CPU_STARTING.
+ * It must be called by the arch code on the new cpu, before the new cpu
+ * enables interrupts and before the "boot" cpu returns from __cpu_up().
+ */
+void notify_cpu_starting(unsigned int cpu)
+{
+	unsigned long val = CPU_STARTING;
+
+#ifdef CONFIG_PM_SLEEP_SMP
+	if (cpu_isset(cpu, frozen_cpus))
+		val = CPU_STARTING_FROZEN;
+#endif /* CONFIG_PM_SLEEP_SMP */
+	raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
+}
+
 #endif /* CONFIG_SMP */
 
 /*
-- 
cgit v1.2.3-58-ga151


From e75b986af7881ed8d8ccb1ed154045ed17cfebd0 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Sep 2008 21:38:57 +0200
Subject: clockevents: remove WARN_ON which was used to gather information

The issue of the endless reprogramming loop due to a too small
min_delta_ns was fixed with the previous updates of the clock events
code, but we had no information about the spread of this problem. I
added a WARN_ON to get automated information via kerneloops.org and to
get some direct reports, which allowed me to analyse the affected
machines.

The WARN_ON has served its purpose and would be annoying for a release
kernel. Remove it and just keep the information about the increase of
the min_delta_ns value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-oneshot.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 2e35501e61dd..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -43,19 +43,17 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
 		 * and emit a warning.
 		 */
 		if (++i > 2) {
-			printk(KERN_WARNING "CE: __tick_program_event of %s is "
-			       "stuck %llx %llx\n", dev->name ? dev->name : "?",
-			       now.tv64, expires.tv64);
-			printk(KERN_WARNING
-			       "CE: increasing min_delta_ns %ld to %ld nsec\n",
-			       dev->min_delta_ns, dev->min_delta_ns << 1);
-			WARN_ON(1);
-
-			/* Double the min. delta and try again */
+			/* Increase the min. delta and try again */
 			if (!dev->min_delta_ns)
 				dev->min_delta_ns = 5000;
 			else
-				dev->min_delta_ns <<= 1;
+				dev->min_delta_ns += dev->min_delta_ns >> 1;
+
+			printk(KERN_WARNING
+			       "CE: %s increasing min_delta_ns to %lu nsec\n",
+			       dev->name ? dev->name : "?",
+			       dev->min_delta_ns << 1);
+
 			i = 0;
 		}
 
-- 
cgit v1.2.3-58-ga151


From baf25731e54d06eb13dc4eda78c6dc7da4ce84e8 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Tue, 9 Sep 2008 11:26:33 +0800
Subject: sched: fix 2.6.27-rc5 couldn't boot on tulsa machine randomly

On my tulsa x86-64 machine, kernel 2.6.25-rc5 couldn't boot randomly.

Basically, function __enable_runtime forgets to reset rt_rq->rt_throttled
to 0. When every cpu is up, per-cpu migration_thread is created and it runs
very fast, sometimes to mark the corresponding rt_rq->rt_throttled to 1 very
quickly. After all cpus are up, with below calling chain:

   sched_init_smp => arch_init_sched_domains => build_sched_domains => ...
=> cpu_attach_domain => rq_attach_root => set_rq_online => ...
=> _enable_runtime

_enable_runtime is called against every rt_rq again, so rt_rq->rt_time is
reset to 0, but rt_rq->rt_throttled might be still 1. Later on function
do_sched_rt_period_timer couldn't reset it, and all RT tasks couldn't be
scheduled to run on that cpu. here is RT task migration_thread which is
woken up when a task is migrated to another cpu.

Below patch fixes it against 2.6.27-rc5.

Signed-off-by: Zhang Yanmin <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..1113157b2058 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -350,6 +350,7 @@ static void __enable_runtime(struct rq *rq)
 		spin_lock(&rt_rq->rt_runtime_lock);
 		rt_rq->rt_runtime = rt_b->rt_runtime;
 		rt_rq->rt_time = 0;
+		rt_rq->rt_throttled = 0;
 		spin_unlock(&rt_rq->rt_runtime_lock);
 		spin_unlock(&rt_b->rt_runtime_lock);
 	}
-- 
cgit v1.2.3-58-ga151


From ec5d498991e87c74730509508b25c3959192b7e7 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 10 Sep 2008 17:00:19 -0700
Subject: sched: fix deadlock in setting scheduler parameter to zero

Andrei Gusev wrote:

> I played witch scheduler settings. After doing something like:
> echo -n 1000000 >sched_rt_period_us
>
> command is locked. I found in kernel.log:
>
> Sep 11 00:39:34 zaratustra
> Sep 11 00:39:34 zaratustra Pid: 4495, comm: bash Tainted: G        W
> (2.6.26.3 #12)
> Sep 11 00:39:34 zaratustra EIP: 0060:[<c0213fc7>] EFLAGS: 00210246 CPU: 0
> Sep 11 00:39:34 zaratustra EIP is at div64_u64+0x57/0x80
> Sep 11 00:39:34 zaratustra EAX: 0000389f EBX: 00000000 ECX: 00000000
> EDX: 00000000
> Sep 11 00:39:34 zaratustra ESI: d9800000 EDI: d9800000 EBP: 0000389f
> ESP: ea7a6edc
> Sep 11 00:39:34 zaratustra DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068
> Sep 11 00:39:34 zaratustra Process bash (pid: 4495, ti=ea7a6000
> task=ea744000 task.ti=ea7a6000)
> Sep 11 00:39:34 zaratustra Stack: 00000000 000003e8 d9800000 0000389f
> c0119042 00000000 00000000 00000001
> Sep 11 00:39:34 zaratustra 00000000 00000000 ea7a6f54 00010000 00000000
> c04d2e80 00000001 000e7ef0
> Sep 11 00:39:34 zaratustra c01191a3 00000000 00000000 ea7a6fa0 00000001
> ffffffff c04d2e80 ea5b2480
> Sep 11 00:39:34 zaratustra Call Trace:
> Sep 11 00:39:34 zaratustra [<c0119042>] __rt_schedulable+0x52/0x130
> Sep 11 00:39:34 zaratustra [<c01191a3>] sched_rt_handler+0x83/0x120
> Sep 11 00:39:34 zaratustra [<c01a76a6>] proc_sys_call_handler+0xb6/0xd0
> Sep 11 00:39:34 zaratustra [<c01a76c0>] proc_sys_write+0x0/0x20
> Sep 11 00:39:34 zaratustra [<c01a76d9>] proc_sys_write+0x19/0x20
> Sep 11 00:39:34 zaratustra [<c016cc68>] vfs_write+0xa8/0x140
> Sep 11 00:39:34 zaratustra [<c016cdd1>] sys_write+0x41/0x80
> Sep 11 00:39:34 zaratustra [<c0103051>] sysenter_past_esp+0x6a/0x91
> Sep 11 00:39:34 zaratustra =======================
> Sep 11 00:39:34 zaratustra Code: c8 41 0f ad f3 d3 ee f6 c1 20 0f 45 de
> 31 f6 0f ad ef d3 ed f6 c1 20 0f 45 fd 0f 45 ee 31 c9 39 eb 89 fe 89 ea
> 77 08 89 e8 31 d2 <f7> f3 89 c1 89 f0 8b 7c 24 08 f7 f3 8b 74 24 04 89
> ca 8b 1c 24
> Sep 11 00:39:34 zaratustra EIP: [<c0213fc7>] div64_u64+0x57/0x80 SS:ESP
> 0068:ea7a6edc
> Sep 11 00:39:34 zaratustra ---[ end trace 4eaa2a86a8e2da22 ]---

fix the boundary condition.

sysctl_sched_rt_period=0 makes exception at to_ratio().

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cc1f81b50b82..98890807375b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8909,6 +8909,9 @@ static int sched_rt_global_constraints(void)
 	u64 rt_runtime, rt_period;
 	int ret = 0;
 
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
 	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
@@ -8925,6 +8928,9 @@ static int sched_rt_global_constraints(void)
 	unsigned long flags;
 	int i;
 
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-- 
cgit v1.2.3-58-ga151


From 2344abbcbdb82140050e8be29d3d55e4f6fe860b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 16 Sep 2008 11:32:50 -0700
Subject: clockevents: make device shutdown robust

The device shut down does not cleanup the next_event variable of the
clock event device. So when the device is reactivated the possible
stale next_event value can prevent the device to be reprogrammed as it
claims to wait on a event already.

This is the root cause of the resurfacing suspend/resume problem,
where systems need key press to come back to life.

Fix this by setting next_event to KTIME_MAX when the device is shut
down. Use a separate function for shutdown which takes care of that
and only keep the direct set mode call in the broadcast code, where we
can not touch the next_event value.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c    | 12 +++++++++++-
 kernel/time/tick-broadcast.c |  9 ++++-----
 kernel/time/tick-common.c    |  4 ++--
 kernel/time/tick-internal.h  |  2 ++
 4 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 1876b526c778..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -71,6 +71,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
 	}
 }
 
+/**
+ * clockevents_shutdown - shutdown the device and clear next_event
+ * @dev:	device to shutdown
+ */
+void clockevents_shutdown(struct clock_event_device *dev)
+{
+	clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+	dev->next_event.tv64 = KTIME_MAX;
+}
+
 /**
  * clockevents_program_event - Reprogram the clock event device.
  * @expires:	absolute expiry time (monotonic clock)
@@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
 
 	if (new) {
 		BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
-		clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN);
+		clockevents_shutdown(new);
 	}
 	local_irq_restore(flags);
 }
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2f5a38294bf9..f1f3eee28113 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -236,8 +236,7 @@ static void tick_do_broadcast_on_off(void *why)
 		if (!cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_set(cpu, tick_broadcast_mask);
 			if (td->mode == TICKDEV_MODE_PERIODIC)
-				clockevents_set_mode(dev,
-						     CLOCK_EVT_MODE_SHUTDOWN);
+				clockevents_shutdown(dev);
 		}
 		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
 			tick_broadcast_force = 1;
@@ -254,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why)
 
 	if (cpus_empty(tick_broadcast_mask)) {
 		if (!bc_stopped)
-			clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+			clockevents_shutdown(bc);
 	} else if (bc_stopped) {
 		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
 			tick_broadcast_start_periodic(bc);
@@ -306,7 +305,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
 
 	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
 		if (bc && cpus_empty(tick_broadcast_mask))
-			clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+			clockevents_shutdown(bc);
 	}
 
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -321,7 +320,7 @@ void tick_suspend_broadcast(void)
 
 	bc = tick_broadcast_device.evtdev;
 	if (bc)
-		clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN);
+		clockevents_shutdown(bc);
 
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index c4777193d567..019315ebf9de 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -249,7 +249,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
 	 * not give it back to the clockevents layer !
 	 */
 	if (tick_is_broadcast_device(curdev)) {
-		clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN);
+		clockevents_shutdown(curdev);
 		curdev = NULL;
 	}
 	clockevents_exchange_device(curdev, newdev);
@@ -311,7 +311,7 @@ static void tick_suspend(void)
 	unsigned long flags;
 
 	spin_lock_irqsave(&tick_device_lock, flags);
-	clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN);
+	clockevents_shutdown(td->evtdev);
 	spin_unlock_irqrestore(&tick_device_lock, flags);
 }
 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 0ffc2918ea6f..6e9db9734aa6 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -10,6 +10,8 @@ extern int tick_do_timer_cpu __read_mostly;
 extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
 extern void tick_handle_periodic(struct clock_event_device *dev);
 
+extern void clockevents_shutdown(struct clock_event_device *dev);
+
 /*
  * NO_HZ / high resolution timer shared code
  */
-- 
cgit v1.2.3-58-ga151


From 15afe09bf496ae10c989e1a375a6b5da7bd3e16e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 20 Sep 2008 23:38:02 +0200
Subject: sched: wakeup preempt when small overlap

Lin Ming reported a 10% OLTP regression against 2.6.27-rc4.

The difference seems to come from different preemption agressiveness,
which affects the cache footprint of the workload and its effective
cache trashing.

Aggresively preempt a task if its avg overlap is very small, this should
avoid the task going to sleep and find it still running when we schedule
back to it - saving a wakeup.

Reported-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h   |  2 +-
 kernel/sched.c          | 12 ++++++------
 kernel/sched_fair.c     | 13 ++++++++++---
 kernel/sched_features.h |  1 +
 kernel/sched_idletask.c |  6 +++---
 kernel/sched_rt.c       |  2 +-
 6 files changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3b7a8f32477..d8e699b55858 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -897,7 +897,7 @@ struct sched_class {
 	void (*yield_task) (struct rq *rq);
 	int  (*select_task_rq)(struct task_struct *p, int sync);
 
-	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
+	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int sync);
 
 	struct task_struct * (*pick_next_task) (struct rq *rq);
 	void (*put_prev_task) (struct rq *rq, struct task_struct *p);
diff --git a/kernel/sched.c b/kernel/sched.c
index 0d8905a1b8ca..ad9d39b021f8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -604,9 +604,9 @@ struct rq {
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
-static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
+static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
 {
-	rq->curr->sched_class->check_preempt_curr(rq, p);
+	rq->curr->sched_class->check_preempt_curr(rq, p, sync);
 }
 
 static inline int cpu_of(struct rq *rq)
@@ -2282,7 +2282,7 @@ out_running:
 	trace_mark(kernel_sched_wakeup,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, sync);
 
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
@@ -2417,7 +2417,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	trace_mark(kernel_sched_wakeup_new,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
 		p->pid, p->state, rq, p, rq->curr);
-	check_preempt_curr(rq, p);
+	check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
 		p->sched_class->task_wake_up(rq, p);
@@ -2877,7 +2877,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
 	 */
-	check_preempt_curr(this_rq, p);
+	check_preempt_curr(this_rq, p, 0);
 }
 
 /*
@@ -6007,7 +6007,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	set_task_cpu(p, dest_cpu);
 	if (on_rq) {
 		activate_task(rq_dest, p, 0);
-		check_preempt_curr(rq_dest, p);
+		check_preempt_curr(rq_dest, p, 0);
 	}
 done:
 	ret = 1;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a10ac0bcee64..7328383690f1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1331,7 +1331,7 @@ static inline int depth_se(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
@@ -1367,6 +1367,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
+	if (sched_feat(WAKEUP_OVERLAP) && sync &&
+			se->avg_overlap < sysctl_sched_migration_cost &&
+			pse->avg_overlap < sysctl_sched_migration_cost) {
+		resched_task(curr);
+		return;
+	}
+
 	/*
 	 * preemption test can be made between sibling entities who are in the
 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
@@ -1649,7 +1656,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /*
@@ -1666,7 +1673,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
 	if (running)
 		resched_task(rq->curr);
 	else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 9353ca78154e..bf027a7accf8 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,3 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
+SCHED_FEAT(WAKEUP_OVERLAP, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
 /*
  * Idle tasks are unconditionally rescheduled:
  */
-static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
 {
 	resched_task(rq->idle);
 }
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
 	if (running)
 		resched_task(rq->curr);
 	else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
 		if (p->prio > oldprio)
 			resched_task(rq->curr);
 	} else
-		check_preempt_curr(rq, p);
+		check_preempt_curr(rq, p, 0);
 }
 
 /*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 552310798dad..6d2d0a5d030b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -783,7 +783,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
+static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
 {
 	if (p->prio < rq->curr->prio) {
 		resched_task(rq->curr);
-- 
cgit v1.2.3-58-ga151


From f681bbd656b01439be904250a1581ca9c27505a1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 22 Sep 2008 16:29:00 +0200
Subject: sched: turn off WAKEUP_OVERLAP

WAKEUP_OVERLAP is not a winner on a 16way box, running psql+sysbench:

       .27-rc7-NO_WAKEUP_OVERLAP  .27-rc7-WAKEUP_OVERLAP
-------------------------------------------------
    1:             694              811    +14.39%
    2:            1454             1427    -1.86%
    4:            3017             3070    +1.70%
    8:            5694             5808    +1.96%
   16:           10592            10612    +0.19%
   32:            9693             9647    -0.48%
   64:            8507             8262    -2.97%
  128:            8402             7087    -18.55%
  256:            8419             5124    -64.30%
  512:            7990             3671    -117.62%
-------------------------------------------------
  SUM:           64466            55524    -16.11%

... so turn it off by default.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index bf027a7accf8..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -11,4 +11,4 @@ SCHED_FEAT(ASYM_GRAN, 1)
 SCHED_FEAT(LB_BIAS, 1)
 SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
 SCHED_FEAT(ASYM_EFF_LOAD, 1)
-SCHED_FEAT(WAKEUP_OVERLAP, 1)
+SCHED_FEAT(WAKEUP_OVERLAP, 0)
-- 
cgit v1.2.3-58-ga151


From caea8a03702c147e8ae90da0801e7ba8297b1d46 Mon Sep 17 00:00:00 2001
From: Chris Friesen <cfriesen@nortel.com>
Date: Mon, 22 Sep 2008 11:06:09 -0600
Subject: sched: fix list traversal to use _rcu variant

load_balance_fair() calls rcu_read_lock() but then traverses the list
 using the regular list traversal routine.  This patch converts the
list traversal to use the _rcu version.

Signed-off-by: Chris Friesen <cfriesen@nortel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 7328383690f1..3b89aa6594a9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1521,7 +1521,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	rcu_read_lock();
 	update_h_load(busiest_cpu);
 
-	list_for_each_entry(tg, &task_groups, list) {
+	list_for_each_entry_rcu(tg, &task_groups, list) {
 		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		unsigned long busiest_h_load = busiest_cfs_rq->h_load;
 		unsigned long busiest_weight = busiest_cfs_rq->load.weight;
-- 
cgit v1.2.3-58-ga151


From fa748203175de7c08f2df80e5a0eeca40329b5e2 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Mon, 22 Sep 2008 14:55:45 -0700
Subject: sched: fix init_hrtick() section mismatch warning

LD      kernel/built-in.o
WARNING: kernel/built-in.o(.text+0x326): Section mismatch in reference
from the function init_hrtick() to the variable
.cpuinit.data:hotplug_hrtick_nb.8
The function init_hrtick() references
the variable __cpuinitdata hotplug_hrtick_nb.8.
This is often because init_hrtick lacks a __cpuinitdata
annotation or the annotation of hotplug_hrtick_nb.8 is wrong.

Signed-off-by: Md.Rakib H. Mullick <rakib.mullick@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 98890807375b..13dd2db9fb2d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_DONE;
 }
 
-static void init_hrtick(void)
+static __init void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
-- 
cgit v1.2.3-58-ga151


From 006c75f146e58e080d2b2725a6664f71886e112b Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Mon, 22 Sep 2008 14:55:46 -0700
Subject: sched: clarify ifdef tangle

- Add some comments to try to make the ifdef puzzle a bit clearer

- Explicitly inline one of the three init_hrtick() implementations.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ad9d39b021f8..927c9307cd00 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1102,7 +1102,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
 }
 
-static void init_hrtick(void)
+static inline void init_hrtick(void)
 {
 }
 #endif /* CONFIG_SMP */
@@ -1121,7 +1121,7 @@ static void init_rq_hrtick(struct rq *rq)
 	rq->hrtick_timer.function = hrtick;
 	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 }
-#else
+#else	/* CONFIG_SCHED_HRTICK */
 static inline void hrtick_clear(struct rq *rq)
 {
 }
@@ -1133,7 +1133,7 @@ static inline void init_rq_hrtick(struct rq *rq)
 static inline void init_hrtick(void)
 {
 }
-#endif
+#endif	/* CONFIG_SCHED_HRTICK */
 
 /*
  * resched_task - mark a task 'to be rescheduled now'.
-- 
cgit v1.2.3-58-ga151


From 3a72dc8eb5a7122fff439a22bd22486a4fff505c Mon Sep 17 00:00:00 2001
From: Harvey Harrison <harvey.harrison@gmail.com>
Date: Mon, 22 Sep 2008 14:55:46 -0700
Subject: rcu: fix sparse shadowed variable warning

kernel/rcuclassic.c:564:18: warning: symbol 'flags' shadows an earlier one
kernel/rcuclassic.c:527:16: originally declared here

Signed-off-by: Harvey Harrison <harvey.harrison@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 743cf0550ff4..ed15128ca2c9 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -561,15 +561,15 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 		local_irq_restore(flags);
 
 		if (rcu_batch_after(rdp->batch, rcp->pending)) {
-			unsigned long flags;
+			unsigned long flags2;
 
 			/* and start it/schedule start if it's a new batch */
-			spin_lock_irqsave(&rcp->lock, flags);
+			spin_lock_irqsave(&rcp->lock, flags2);
 			if (rcu_batch_after(rdp->batch, rcp->pending)) {
 				rcp->pending = rdp->batch;
 				rcu_start_batch(rcp);
 			}
-			spin_unlock_irqrestore(&rcp->lock, flags);
+			spin_unlock_irqrestore(&rcp->lock, flags2);
 		}
 	}
 
-- 
cgit v1.2.3-58-ga151


From 6441402b1f173fa38e561d3cee7c01c32e5281ad Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Sep 2008 18:46:37 +0200
Subject: clockevents: prevent cpu online to interfere with nohz

Impact: rare hang which can be triggered on CPU online.

tick_do_timer_cpu keeps track of the CPU which updates jiffies
via do_timer. The value -1 is used to signal, that currently no
CPU is doing this. There are two cases, where the variable can
have this state:

 boot:
    necessary for systems where the boot cpu id can be != 0

 nohz long idle sleep:
    When the CPU which did the jiffies update last goes into
    a long idle sleep it drops the update jiffies duty so
    another CPU which is not idle can pick it up and keep
    jiffies going.

Using the same value for both situations is wrong, as the CPU online
code can see the -1 state when the timer of the newly onlined CPU is
setup. The setup for a newly onlined CPU goes through periodic mode
and can pick up the do_timer duty without being aware of the nohz /
highres mode of the already running system.

Use two separate states and make them constants to avoid magic
numbers confusion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-common.c   | 7 ++++---
 kernel/time/tick-internal.h | 4 ++++
 kernel/time/tick-sched.c    | 8 ++++----
 3 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 019315ebf9de..b523d095decf 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
  */
 ktime_t tick_next_period;
 ktime_t tick_period;
-int tick_do_timer_cpu __read_mostly = -1;
+int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
 DEFINE_SPINLOCK(tick_device_lock);
 
 /*
@@ -148,7 +148,7 @@ static void tick_setup_device(struct tick_device *td,
 		 * If no cpu took the do_timer update, assign it to
 		 * this cpu:
 		 */
-		if (tick_do_timer_cpu == -1) {
+		if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
 			tick_do_timer_cpu = cpu;
 			tick_next_period = ktime_get();
 			tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -300,7 +300,8 @@ static void tick_shutdown(unsigned int *cpup)
 	if (*cpup == tick_do_timer_cpu) {
 		int cpu = first_cpu(cpu_online_map);
 
-		tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1;
+		tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
+			TICK_DO_TIMER_NONE;
 	}
 	spin_unlock_irqrestore(&tick_device_lock, flags);
 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 6e9db9734aa6..e18014fadf95 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
 /*
  * tick internal variable and functions used by low/high res code
  */
+
+#define TICK_DO_TIMER_NONE	-1
+#define TICK_DO_TIMER_BOOT	-2
+
 DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
 extern spinlock_t tick_device_lock;
 extern ktime_t tick_next_period;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index a87b0468568b..31a14e8caac1 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -221,7 +221,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 	 */
 	if (unlikely(!cpu_online(cpu))) {
 		if (cpu == tick_do_timer_cpu)
-			tick_do_timer_cpu = -1;
+			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 	}
 
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
@@ -303,7 +303,7 @@ void tick_nohz_stop_sched_tick(int inidle)
 		 * invoked.
 		 */
 		if (cpu == tick_do_timer_cpu)
-			tick_do_timer_cpu = -1;
+			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 
 		ts->idle_sleeps++;
 
@@ -468,7 +468,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	 * this duty, then the jiffies update is still serialized by
 	 * xtime_lock.
 	 */
-	if (unlikely(tick_do_timer_cpu == -1))
+	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
 		tick_do_timer_cpu = cpu;
 
 	/* Check, if the jiffies need an update */
@@ -570,7 +570,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 	 * this duty, then the jiffies update is still serialized by
 	 * xtime_lock.
 	 */
-	if (unlikely(tick_do_timer_cpu == -1))
+	if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
 		tick_do_timer_cpu = cpu;
 #endif
 
-- 
cgit v1.2.3-58-ga151


From 49d670fb8dd62d3ed4e3ed2513538ea65b051aed Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Sep 2008 18:56:01 +0200
Subject: clockevents: prevent stale tick_next_period for onlining CPUs

Impact: possible hang on CPU onlining in timer one shot mode.

The tick_next_period variable is only used during boot on nohz/highres
enabled systems, but for CPU onlining it needs to be maintained when
the per cpu clock events device operates in one shot mode.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 31a14e8caac1..39019b3f7621 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -75,6 +75,9 @@ static void tick_do_update_jiffies64(ktime_t now)
 							   incr * ticks);
 		}
 		do_timer(++ticks);
+
+		/* Keep the tick_next_period variable up to date */
+		tick_next_period = ktime_add(last_jiffies_update, tick_period);
 	}
 	write_sequnlock(&xtime_lock);
 }
-- 
cgit v1.2.3-58-ga151


From 302745699c1b675b5d2a1af87271de10e4d96b6a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Sep 2008 19:02:25 +0200
Subject: clockevents: check broadcast device not tick device

Impact: Possible hang on CPU online observed on AMD C1E machines.

The broadcast setup code looks at the mode of the tick device to
determine whether it needs to be shut down or setup. This is wrong
when the broadcast mode is set to one shot already. This can happen
when a CPU is brought online as it goes through the periodic setup
first.

The problem went unnoticed as sane systems do not call into that code
before the switch to one shot for the clock event device happens.
The AMD C1E idle routine switches over immediately and thereby shuts
down the just setup device before the first interrupt happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f1f3eee28113..e2b66d1c8ca5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -235,7 +235,7 @@ static void tick_do_broadcast_on_off(void *why)
 	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
 		if (!cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_set(cpu, tick_broadcast_mask);
-			if (td->mode == TICKDEV_MODE_PERIODIC)
+			if (bc->mode == TICKDEV_MODE_PERIODIC)
 				clockevents_shutdown(dev);
 		}
 		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
@@ -245,7 +245,7 @@ static void tick_do_broadcast_on_off(void *why)
 		if (!tick_broadcast_force &&
 		    cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_clear(cpu, tick_broadcast_mask);
-			if (td->mode == TICKDEV_MODE_PERIODIC)
+			if (bc->mode == TICKDEV_MODE_PERIODIC)
 				tick_setup_periodic(dev, 0);
 		}
 		break;
-- 
cgit v1.2.3-58-ga151


From 27ce4cb4a0c7cf59b9a9952266883862f2e4c99f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 22 Sep 2008 19:04:02 +0200
Subject: clockevents: prevent mode mismatch on cpu online

Impact: timer hang on CPU online observed on AMD C1E systems

When a CPU is brought online then the broadcast machinery can
be in the one shot state already. Check this and setup the timer
device of the new CPU in one shot mode so the broadcast code
can pick up the next_event value correctly.

Another AMD C1E oddity, as we switch to broadcast immediately and
not after the full bring up via the ACPI cpu idle code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 8 ++++++++
 kernel/time/tick-common.c    | 3 ++-
 kernel/time/tick-internal.h  | 2 ++
 3 files changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index e2b66d1c8ca5..bd7034542399 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -575,4 +575,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
 	spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
+/*
+ * Check, whether the broadcast device is in one shot mode
+ */
+int tick_broadcast_oneshot_active(void)
+{
+	return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
+}
+
 #endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b523d095decf..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 	if (!tick_device_is_functional(dev))
 		return;
 
-	if (dev->features & CLOCK_EVT_FEAT_PERIODIC) {
+	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
+	    !tick_broadcast_oneshot_active()) {
 		clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
 	} else {
 		unsigned long seq;
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index e18014fadf95..55c3f4be6077 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -35,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
 extern void tick_broadcast_switch_to_oneshot(void);
 extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
 extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
+extern int tick_broadcast_oneshot_active(void);
 # else /* BROADCAST */
 static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 {
@@ -43,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
 static inline void tick_broadcast_switch_to_oneshot(void) { }
 static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
 # endif /* !BROADCAST */
 
 #else /* !ONESHOT */
-- 
cgit v1.2.3-58-ga151


From f8e256c687eb53850685747757c8d75e58756e15 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Sep 2008 13:00:57 +0200
Subject: timers: fix build error in !oneshot case
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

 kernel/time/tick-common.c: In function ‘tick_setup_periodic’:
 kernel/time/tick-common.c:113: error: implicit declaration of function ‘tick_broadcast_oneshot_active’

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/tick-internal.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 55c3f4be6077..469248782c23 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -74,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
 	return 0;
 }
+static inline int tick_broadcast_oneshot_active(void) { return 0; }
 #endif /* !TICK_ONESHOT */
 
 /*
-- 
cgit v1.2.3-58-ga151


From 695698500912c4479ddf4723e492de3970ff8530 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 14:54:23 +0200
Subject: sched: rework wakeup preemption

Rework the wakeup preemption to work on real runtime instead of
the virtual runtime. This greatly simplifies the code.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 133 ++--------------------------------------------------
 1 file changed, 4 insertions(+), 129 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3b89aa6594a9..c20899763457 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -408,64 +408,6 @@ static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	return __sched_period(nr_running);
 }
 
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-	struct load_weight lw = {
-		.weight = NICE_0_LOAD,
-		.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-	};
-
-	for_each_sched_entity(se) {
-		struct load_weight *se_lw = &se->load;
-		unsigned long rw = cfs_rq_of(se)->load.weight;
-
-#ifdef CONFIG_FAIR_SCHED_GROUP
-		struct cfs_rq *cfs_rq = se->my_q;
-		struct task_group *tg = NULL
-
-		if (cfs_rq)
-			tg = cfs_rq->tg;
-
-		if (tg && tg->shares < NICE_0_LOAD) {
-			/*
-			 * scale shares to what it would have been had
-			 * tg->weight been NICE_0_LOAD:
-			 *
-			 *   weight = 1024 * shares / tg->weight
-			 */
-			lw.weight *= se->load.weight;
-			lw.weight /= tg->shares;
-
-			lw.inv_weight = 0;
-
-			se_lw = &lw;
-			rw += lw.weight - se->load.weight;
-		} else
-#endif
-
-		if (se->load.weight < NICE_0_LOAD) {
-			se_lw = &lw;
-			rw += NICE_0_LOAD - se->load.weight;
-		}
-
-		delta = calc_delta_mine(delta, rw, se_lw);
-	}
-
-	return delta;
-}
-
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
@@ -1281,53 +1223,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 	 * + nice tasks.
 	 */
 	if (sched_feat(ASYM_GRAN))
-		gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
-	else
-		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+		gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
 
 	return gran;
 }
 
-/*
- * Should 'se' preempt 'curr'.
- *
- *             |s1
- *        |s2
- *   |s3
- *         g
- *      |<--->|c
- *
- *  w(c, s1) = -1
- *  w(c, s2) =  0
- *  w(c, s3) =  1
- *
- */
-static int
-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
-{
-	s64 gran, vdiff = curr->vruntime - se->vruntime;
-
-	if (vdiff < 0)
-		return -1;
-
-	gran = wakeup_gran(curr);
-	if (vdiff > gran)
-		return 1;
-
-	return 0;
-}
-
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1336,7 +1236,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	struct task_struct *curr = rq->curr;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	struct sched_entity *se = &curr->se, *pse = &p->se;
-	int se_depth, pse_depth;
+	s64 delta_exec;
 
 	if (unlikely(rt_prio(p->prio))) {
 		update_rq_clock(rq);
@@ -1374,33 +1274,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 		return;
 	}
 
-	/*
-	 * preemption test can be made between sibling entities who are in the
-	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
-	 * both tasks until we find their ancestors who are siblings of common
-	 * parent.
-	 */
-
-	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(se);
-	pse_depth = depth_se(pse);
-
-	while (se_depth > pse_depth) {
-		se_depth--;
-		se = parent_entity(se);
-	}
-
-	while (pse_depth > se_depth) {
-		pse_depth--;
-		pse = parent_entity(pse);
-	}
-
-	while (!is_same_group(se, pse)) {
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
-
-	if (wakeup_preempt_entity(se, pse) == 1)
+	delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+	if (delta_exec > wakeup_gran(pse))
 		resched_task(curr);
 }
 
-- 
cgit v1.2.3-58-ga151


From 940959e93949e839c14f8ddc3b9b0e34a2ab6e29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:42 +0200
Subject: sched: fixlet for group load balance

We should not only correct the increment for the initial group, but should
be consistent and do so for all the groups we encounter.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c20899763457..0c59da7e3120 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1027,7 +1027,6 @@ static long effective_load(struct task_group *tg, int cpu,
 		long wl, long wg)
 {
 	struct sched_entity *se = tg->se[cpu];
-	long more_w;
 
 	if (!tg->parent)
 		return wl;
@@ -1039,18 +1038,17 @@ static long effective_load(struct task_group *tg, int cpu,
 	if (!wl && sched_feat(ASYM_EFF_LOAD))
 		return wl;
 
-	/*
-	 * Instead of using this increment, also add the difference
-	 * between when the shares were last updated and now.
-	 */
-	more_w = se->my_q->load.weight - se->my_q->rq_weight;
-	wl += more_w;
-	wg += more_w;
-
 	for_each_sched_entity(se) {
-#define D(n) (likely(n) ? (n) : 1)
-
 		long S, rw, s, a, b;
+		long more_w;
+
+		/*
+		 * Instead of using this increment, also add the difference
+		 * between when the shares were last updated and now.
+		 */
+		more_w = se->my_q->load.weight - se->my_q->rq_weight;
+		wl += more_w;
+		wg += more_w;
 
 		S = se->my_q->tg->shares;
 		s = se->my_q->shares;
@@ -1059,7 +1057,11 @@ static long effective_load(struct task_group *tg, int cpu,
 		a = S*(rw + wl);
 		b = S*rw + s*wg;
 
-		wl = s*(a-b)/D(b);
+		wl = s*(a-b);
+
+		if (likely(b))
+			wl /= b;
+
 		/*
 		 * Assume the group is already running and will
 		 * thus already be accounted for in the weight.
@@ -1068,7 +1070,6 @@ static long effective_load(struct task_group *tg, int cpu,
 		 * alter the group weight.
 		 */
 		wg = 0;
-#undef D
 	}
 
 	return wl;
-- 
cgit v1.2.3-58-ga151


From 78333cdd0e472180743d35988e576d6ecc6f6ddb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:43 +0200
Subject: sched: add some comments to the bandwidth code

Hopefully clarify some of this code a little.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 2e228bd5395e..d570a8cc4fcd 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -231,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_SMP
+/*
+ * We ran out of runtime, see if we can borrow some from our neighbours.
+ */
 static int do_balance_runtime(struct rt_rq *rt_rq)
 {
 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -250,9 +253,18 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
 			continue;
 
 		spin_lock(&iter->rt_runtime_lock);
+		/*
+		 * Either all rqs have inf runtime and there's nothing to steal
+		 * or __disable_runtime() below sets a specific rq to inf to
+		 * indicate its been disabled and disalow stealing.
+		 */
 		if (iter->rt_runtime == RUNTIME_INF)
 			goto next;
 
+		/*
+		 * From runqueues with spare time, take 1/n part of their
+		 * spare time, but no more than our period.
+		 */
 		diff = iter->rt_runtime - iter->rt_time;
 		if (diff > 0) {
 			diff = div_u64((u64)diff, weight);
@@ -274,6 +286,9 @@ next:
 	return more;
 }
 
+/*
+ * Ensure this RQ takes back all the runtime it lend to its neighbours.
+ */
 static void __disable_runtime(struct rq *rq)
 {
 	struct root_domain *rd = rq->rd;
@@ -289,17 +304,33 @@ static void __disable_runtime(struct rq *rq)
 
 		spin_lock(&rt_b->rt_runtime_lock);
 		spin_lock(&rt_rq->rt_runtime_lock);
+		/*
+		 * Either we're all inf and nobody needs to borrow, or we're
+		 * already disabled and thus have nothing to do, or we have
+		 * exactly the right amount of runtime to take out.
+		 */
 		if (rt_rq->rt_runtime == RUNTIME_INF ||
 				rt_rq->rt_runtime == rt_b->rt_runtime)
 			goto balanced;
 		spin_unlock(&rt_rq->rt_runtime_lock);
 
+		/*
+		 * Calculate the difference between what we started out with
+		 * and what we current have, that's the amount of runtime
+		 * we lend and now have to reclaim.
+		 */
 		want = rt_b->rt_runtime - rt_rq->rt_runtime;
 
+		/*
+		 * Greedy reclaim, take back as much as we can.
+		 */
 		for_each_cpu_mask(i, rd->span) {
 			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 			s64 diff;
 
+			/*
+			 * Can't reclaim from ourselves or disabled runqueues.
+			 */
 			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 				continue;
 
@@ -319,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
 		}
 
 		spin_lock(&rt_rq->rt_runtime_lock);
+		/*
+		 * We cannot be left wanting - that would mean some runtime
+		 * leaked out of the system.
+		 */
 		BUG_ON(want);
 balanced:
+		/*
+		 * Disable all the borrow logic by pretending we have inf
+		 * runtime - in which case borrowing doesn't make sense.
+		 */
 		rt_rq->rt_runtime = RUNTIME_INF;
 		spin_unlock(&rt_rq->rt_runtime_lock);
 		spin_unlock(&rt_b->rt_runtime_lock);
@@ -343,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
 	if (unlikely(!scheduler_running))
 		return;
 
+	/*
+	 * Reset each runqueue's bandwidth settings
+	 */
 	for_each_leaf_rt_rq(rt_rq, rq) {
 		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 
-- 
cgit v1.2.3-58-ga151


From 4653f803e6e0d970ffeac0efd2c01743eb6c5228 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:44 +0200
Subject: sched: more sanity checks on the bandwidth settings

While playing around with it, I noticed we missed some sanity checks.
Also add some comments while we're there.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 669c49aa57f0..e1299de1765e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8866,11 +8866,29 @@ static int tg_schedulable(struct task_group *tg, void *data)
 		runtime = d->rt_runtime;
 	}
 
+	/*
+	 * Cannot have more runtime than the period.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
+
+	/*
+	 * Ensure we don't starve existing RT tasks.
+	 */
 	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	total = to_ratio(period, runtime);
 
+	/*
+	 * Nobody can have more than the global setting allows.
+	 */
+	if (total > to_ratio(global_rt_period(), global_rt_runtime()))
+		return -EINVAL;
+
+	/*
+	 * The sum of our children's runtime should not exceed our own.
+	 */
 	list_for_each_entry_rcu(child, &tg->children, siblings) {
 		period = ktime_to_ns(child->rt_bandwidth.rt_period);
 		runtime = child->rt_bandwidth.rt_runtime;
@@ -8978,19 +8996,24 @@ long sched_group_rt_period(struct task_group *tg)
 
 static int sched_rt_global_constraints(void)
 {
-	struct task_group *tg = &root_task_group;
-	u64 rt_runtime, rt_period;
+	u64 runtime, period;
 	int ret = 0;
 
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
-	rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
-	rt_runtime = tg->rt_bandwidth.rt_runtime;
+	runtime = global_rt_runtime();
+	period = global_rt_period();
+
+	/*
+	 * Sanity check on the sysctl variables.
+	 */
+	if (runtime > period && runtime != RUNTIME_INF)
+		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
-	ret = __rt_schedulable(tg, rt_period, rt_runtime);
+	ret = __rt_schedulable(NULL, 0, 0);
 	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
-- 
cgit v1.2.3-58-ga151


From 57fdc26d4a734a3e00c6b2fc0e1e40ff8da4dc31 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 23 Sep 2008 15:33:45 +0200
Subject: sched: fixup buddy selection

We should set the buddy even though we might already have the
TIF_RESCHED flag set.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c59da7e3120..e3f3c10f7033 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1249,6 +1249,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (unlikely(se == pse))
 		return;
 
+	cfs_rq_of(pse)->next = pse;
+
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
 	 * wake up path.
@@ -1256,8 +1258,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (test_tsk_need_resched(curr))
 		return;
 
-	cfs_rq_of(pse)->next = pse;
-
 	/*
 	 * Batch tasks do not preempt (their preemption is driven by
 	 * the tick):
-- 
cgit v1.2.3-58-ga151


From f9092f358bc2ec5367621478811f046f82873376 Mon Sep 17 00:00:00 2001
From: Jonathan Steel <jon.steel@esentire.com>
Date: Mon, 22 Sep 2008 13:57:45 -0700
Subject: kexec: fix segmentation fault in kimage_add_entry

A segmentation fault can occur in kimage_add_entry in kexec.c when loading
a kernel image into memory.  The fault occurs because a page is requested
by calling kimage_alloc_page with gfp_mask GFP_KERNEL and the function may
actually return a page with gfp_mask GFP_HIGHUSER.  The high mem page is
returned because it was swapped with the kernel page due to the kernel
page being a page that will shortly be copied to.

This patch ensures that kimage_alloc_page returns a page that was created
with the correct gfp flags.

I have verified the change and fixed the whitespace damage of the original
patch.  Jonathan did a great job of tracking this down after he hit the
problem.  -- Eric

Signed-off-by: Jonathan Steel <jon.steel@esentire.com>
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 59f3f0df35d4..aef265325cd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -753,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
 			*old = addr | (*old & ~PAGE_MASK);
 
 			/* The old page I have found cannot be a
-			 * destination page, so return it.
+			 * destination page, so return it if it's
+			 * gfp_flags honor the ones passed in.
 			 */
+			if (!(gfp_mask & __GFP_HIGHMEM) &&
+			    PageHighMem(old_page)) {
+				kimage_free_pages(old_page);
+				continue;
+			}
 			addr = old_addr;
 			page = old_page;
 			break;
-- 
cgit v1.2.3-58-ga151


From b87f17242da6b2ac6db2d179b2f93fb84cff2fbe Mon Sep 17 00:00:00 2001
From: Bharata B Rao <bharata@linux.vnet.ibm.com>
Date: Thu, 25 Sep 2008 09:53:54 +0530
Subject: sched: maintain only task entities in cfs_rq->tasks list

cfs_rq->tasks list is used by the load balancer to iterate
over all the tasks. Currently it holds all the entities
(both task and group entities) because of which there is
a need to check for group entities explicitly during load
balancing. This patch changes the cfs_rq->tasks list to
hold only task entities.

Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 26 +++++++++-----------------
 1 file changed, 9 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e3f3c10f7033..95c1295ad26d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -528,11 +528,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_add(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		add_cfs_task_weight(cfs_rq, se->load.weight);
+		list_add(&se->group_node, &cfs_rq->tasks);
+	}
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
-	list_add(&se->group_node, &cfs_rq->tasks);
 }
 
 static void
@@ -541,11 +542,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	update_load_sub(&cfs_rq->load, se->load.weight);
 	if (!parent_entity(se))
 		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
+	if (entity_is_task(se)) {
 		add_cfs_task_weight(cfs_rq, -se->load.weight);
+		list_del_init(&se->group_node);
+	}
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
-	list_del_init(&se->group_node);
 }
 
 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -1335,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
 	if (next == &cfs_rq->tasks)
 		return NULL;
 
-	/* Skip over entities that are not tasks */
-	do {
-		se = list_entry(next, struct sched_entity, group_node);
-		next = next->next;
-	} while (next != &cfs_rq->tasks && !entity_is_task(se));
-
-	if (next == &cfs_rq->tasks && !entity_is_task(se))
-		return NULL;
-
-	cfs_rq->balance_iterator = next;
-
-	if (entity_is_task(se))
-		p = task_of(se);
+	se = list_entry(next, struct sched_entity, group_node);
+	p = task_of(se);
+	cfs_rq->balance_iterator = next->next;
 
 	return p;
 }
-- 
cgit v1.2.3-58-ga151


From 18d6522b86d21a04c8ac1ea79747e2e434a956d9 Mon Sep 17 00:00:00 2001
From: Atsuo Igarashi <atsuo_igarashi@tripeaks.co.jp>
Date: Fri, 26 Sep 2008 10:36:41 -0500
Subject: kgdb: could not write to the last of valid memory with kgdb

On the ARM architecture, kgdb will crash the kernel if the last byte
of valid memory is written due to a flush_icache_range flushing
beyond the memory boundary.

Signed-off-by: Atsuo Igarashi <atsuo_igarashi@tripeaks.co.jp>
Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/kgdb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index eaa21fc9ad1d..949806ab67de 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -488,7 +488,7 @@ static int write_mem_msg(int binary)
 		if (err)
 			return err;
 		if (CACHE_FLUSH_IS_SAFE)
-			flush_icache_range(addr, addr + length + 1);
+			flush_icache_range(addr, addr + length);
 		return 0;
 	}
 
-- 
cgit v1.2.3-58-ga151


From d7161a65341556bacb5e6654e133803f46f51063 Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Fri, 26 Sep 2008 10:36:41 -0500
Subject: kgdb, x86, arm, mips, powerpc: ignore user space single stepping

On the x86 arch, user space single step exceptions should be ignored
if they occur in the kernel space, such as ptrace stepping through a
system call.

First check if it is kgdb that is executing a single step, then ensure
it is not an accidental traversal into the user space, while in kgdb,
any other time the TIF_SINGLESTEP is set, kgdb should ignore the
exception.

On x86, arm, mips and powerpc, the kgdb_contthread usage was
inconsistent with the way single stepping is implemented in the kgdb
core.  The arch specific stub should always set the
kgdb_cpu_doing_single_step correctly if it is single stepping.  This
allows kgdb to correctly process an instruction steps if ptrace
happens to be requesting an instruction step over a system call.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 arch/arm/kernel/kgdb.c     |  2 --
 arch/mips/kernel/kgdb.c    |  3 +--
 arch/powerpc/kernel/kgdb.c |  5 ++---
 arch/x86/kernel/kgdb.c     | 18 +++++++++++-------
 kernel/kgdb.c              |  8 ++++----
 5 files changed, 18 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c
index aaffaecffcd1..ba8ccfede964 100644
--- a/arch/arm/kernel/kgdb.c
+++ b/arch/arm/kernel/kgdb.c
@@ -111,8 +111,6 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
 	case 'D':
 	case 'k':
 	case 'c':
-		kgdb_contthread = NULL;
-
 		/*
 		 * Try to read optional parameter, pc unchanged if no parm.
 		 * If this was a compiled breakpoint, we need to move
diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c
index 8f6d58ede33c..6e152c80cd4a 100644
--- a/arch/mips/kernel/kgdb.c
+++ b/arch/mips/kernel/kgdb.c
@@ -236,8 +236,7 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code,
 
 		atomic_set(&kgdb_cpu_doing_single_step, -1);
 		if (remcom_in_buffer[0] == 's')
-			if (kgdb_contthread)
-				atomic_set(&kgdb_cpu_doing_single_step, cpu);
+			atomic_set(&kgdb_cpu_doing_single_step, cpu);
 
 		return 0;
 	}
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index b4fdf2f2743c..fe8f71dd0b3f 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -347,9 +347,8 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code,
 			linux_regs->msr |= MSR_SE;
 #endif
 			kgdb_single_step = 1;
-			if (kgdb_contthread)
-				atomic_set(&kgdb_cpu_doing_single_step,
-					   raw_smp_processor_id());
+			atomic_set(&kgdb_cpu_doing_single_step,
+				   raw_smp_processor_id());
 		}
 		return 0;
 	}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..00f7896c9a19 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -378,10 +378,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
 		if (remcomInBuffer[0] == 's') {
 			linux_regs->flags |= X86_EFLAGS_TF;
 			kgdb_single_step = 1;
-			if (kgdb_contthread) {
-				atomic_set(&kgdb_cpu_doing_single_step,
-					   raw_smp_processor_id());
-			}
+			atomic_set(&kgdb_cpu_doing_single_step,
+				   raw_smp_processor_id());
 		}
 
 		get_debugreg(dr6, 6);
@@ -466,9 +464,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
 
 	case DIE_DEBUG:
 		if (atomic_read(&kgdb_cpu_doing_single_step) ==
-			raw_smp_processor_id() &&
-			user_mode(regs))
-			return single_step_cont(regs, args);
+		    raw_smp_processor_id()) {
+			if (user_mode(regs))
+				return single_step_cont(regs, args);
+			break;
+		} else if (test_thread_flag(TIF_SINGLESTEP))
+			/* This means a user thread is single stepping
+			 * a system call which should be ignored
+			 */
+			return NOTIFY_DONE;
 		/* fall through */
 	default:
 		if (user_mode(regs))
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 949806ab67de..25d955dbb989 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -1462,7 +1462,7 @@ acquirelock:
 	 * Get the passive CPU lock which will hold all the non-primary
 	 * CPU in a spin state while the debugger is active
 	 */
-	if (!kgdb_single_step || !kgdb_contthread) {
+	if (!kgdb_single_step) {
 		for (i = 0; i < NR_CPUS; i++)
 			atomic_set(&passive_cpu_wait[i], 1);
 	}
@@ -1475,7 +1475,7 @@ acquirelock:
 
 #ifdef CONFIG_SMP
 	/* Signal the other CPUs to enter kgdb_wait() */
-	if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup)
+	if ((!kgdb_single_step) && kgdb_do_roundup)
 		kgdb_roundup_cpus(flags);
 #endif
 
@@ -1494,7 +1494,7 @@ acquirelock:
 	kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
 	kgdb_deactivate_sw_breakpoints();
 	kgdb_single_step = 0;
-	kgdb_contthread = NULL;
+	kgdb_contthread = current;
 	exception_level = 0;
 
 	/* Talk to debugger with gdbserial protocol */
@@ -1508,7 +1508,7 @@ acquirelock:
 	kgdb_info[ks->cpu].task = NULL;
 	atomic_set(&cpu_in_kgdb[ks->cpu], 0);
 
-	if (!kgdb_single_step || !kgdb_contthread) {
+	if (!kgdb_single_step) {
 		for (i = NR_CPUS-1; i >= 0; i--)
 			atomic_set(&passive_cpu_wait[i], 0);
 		/*
-- 
cgit v1.2.3-58-ga151


From 7659e349672bb0d378ef8d7d62bae4c53d2bdd18 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Sep 2008 14:06:45 +0200
Subject: hrtimer: migrate pending list on cpu offline

Impact: hrtimers which are on the pending list are not migrated at cpu
	offline and can be stale forever

Add the pending list migration when CONFIG_HIGH_RES_TIMERS is enabled

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..580bc66ae136 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1610,10 +1610,36 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	}
 }
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+				   struct hrtimer_cpu_base *new_base)
+{
+	struct hrtimer *timer;
+	int raise = 0;
+
+	while (!list_empty(&old_base->cb_pending)) {
+		timer = list_entry(old_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
+		timer->base = &new_base->clock_base[timer->base->index];
+		list_add_tail(&timer->cb_entry, &new_base->cb_pending);
+		raise = 1;
+	}
+	return raise;
+}
+#else
+static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
+				   struct hrtimer_cpu_base *new_base)
+{
+	return 0;
+}
+#endif
+
 static void migrate_hrtimers(int cpu)
 {
 	struct hrtimer_cpu_base *old_base, *new_base;
-	int i;
+	int i, raise = 0;
 
 	BUG_ON(cpu_online(cpu));
 	old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1630,10 +1656,16 @@ static void migrate_hrtimers(int cpu)
 				     &new_base->clock_base[i]);
 	}
 
+	if (migrate_hrtimer_pending(old_base, new_base))
+		raise = 1;
+
 	spin_unlock(&old_base->lock);
 	spin_unlock(&new_base->lock);
 	local_irq_enable();
 	put_cpu_var(hrtimer_bases);
+
+	if (raise)
+		hrtimer_raise_softirq();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-- 
cgit v1.2.3-58-ga151


From 41e1022eae71707f1ce6801a746f70b1e57b7567 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Sep 2008 14:09:39 +0200
Subject: hrtimer: fix migration of CB_IRQSAFE_NO_SOFTIRQ hrtimers

Impact: Stale timers after a CPU went offline.

commit 37bb6cb4097e29ffee970065b74499cbf10603a3
       hrtimer: unlock hrtimer_wakeup

changed the hrtimer sleeper callback mode to CB_IRQSAFE_NO_SOFTIRQ due
to locking problems. A result of this change is that when enqueue is
called for an already expired hrtimer the callback function is not
longer called directly from the enqueue code. The normal callers have
been fixed in the code, but the migration code which moves hrtimers
from a dead CPU to a live CPU was not made aware of this.

This can be fixed by checking the timer state after the call to
enqueue in the migration code.


Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 28 +++++++++++++++++++++++++---
 1 file changed, 25 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 580bc66ae136..ac2f6d6d4868 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1591,11 +1591,12 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
-static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
+static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 				struct hrtimer_clock_base *new_base)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
+	int raise = 0;
 
 	while ((node = rb_first(&old_base->active))) {
 		timer = rb_entry(node, struct hrtimer, node);
@@ -1607,7 +1608,27 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * Enqueue the timer. Allow reprogramming of the event device
 		 */
 		enqueue_hrtimer(timer, new_base, 1);
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+		/*
+		 * Happens with high res enabled when the timer was
+		 * already expired and the callback mode is
+		 * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
+		 * (hrtimer_sleeper). The enqueue code does not move
+		 * them to the soft irq pending list for
+		 * performance/latency reasons, but in the migration
+		 * state, we need to do that otherwise we end up with
+		 * a stale timer.
+		 */
+		if (timer->state == HRTIMER_STATE_INACTIVE) {
+			timer->state = HRTIMER_STATE_PENDING;
+			list_add_tail(&timer->cb_entry,
+				      &new_base->cpu_base->cb_pending);
+			raise = 1;
+		}
+#endif
 	}
+	return raise;
 }
 
 #ifdef CONFIG_HIGH_RES_TIMERS
@@ -1652,8 +1673,9 @@ static void migrate_hrtimers(int cpu)
 	spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		migrate_hrtimer_list(&old_base->clock_base[i],
-				     &new_base->clock_base[i]);
+		if (migrate_hrtimer_list(&old_base->clock_base[i],
+					 &new_base->clock_base[i]))
+			raise = 1;
 	}
 
 	if (migrate_hrtimer_pending(old_base, new_base))
-- 
cgit v1.2.3-58-ga151


From b00c1a99e7758f794923c61e5cd55268d61c9469 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Sep 2008 15:44:46 +0200
Subject: hrtimer: mark migration state

Impact: during migration active hrtimers can be seen as inactive

The migration code removes the hrtimers from the queues of the dead
CPU and sets the state temporary to INACTIVE. The enqueue code sets it
to ACTIVE/PENDING again.

Prevent that the wrong state can be seen by using a separate migration
state bit.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  4 +++-
 kernel/hrtimer.c        | 12 ++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 6d93dce61cbb..bdd88df1b4e5 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -67,9 +67,10 @@ enum hrtimer_cb_mode {
  * 0x02		callback function running
  * 0x04		callback pending (high resolution mode)
  *
- * Special case:
+ * Special cases:
  * 0x03		callback function running and enqueued
  *		(was requeued on another CPU)
+ * 0x09		timer was migrated on CPU hotunplug
  * The "callback function running and enqueued" status is only possible on
  * SMP. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
@@ -87,6 +88,7 @@ enum hrtimer_cb_mode {
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
 #define HRTIMER_STATE_PENDING	0x04
+#define HRTIMER_STATE_MIGRATE	0x08
 
 /**
  * struct hrtimer - the basic hrtimer structure
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ac2f6d6d4868..ace723dd1e52 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1602,7 +1602,13 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		timer = rb_entry(node, struct hrtimer, node);
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0);
+
+		/*
+		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * timer could be seen as !active and just vanish away
+		 * under us on another CPU
+		 */
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timer. Allow reprogramming of the event device
@@ -1620,13 +1626,15 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * state, we need to do that otherwise we end up with
 		 * a stale timer.
 		 */
-		if (timer->state == HRTIMER_STATE_INACTIVE) {
+		if (timer->state == HRTIMER_STATE_MIGRATE) {
 			timer->state = HRTIMER_STATE_PENDING;
 			list_add_tail(&timer->cb_entry,
 				      &new_base->cpu_base->cb_pending);
 			raise = 1;
 		}
 #endif
+		/* Clear the migration state bit */
+		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 	return raise;
 }
-- 
cgit v1.2.3-58-ga151


From ccc7dadf736639da86f3e0c86832c11a66fc8221 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 29 Sep 2008 15:47:42 +0200
Subject: hrtimer: prevent migration of per CPU hrtimers

Impact: per CPU hrtimers can be migrated from a dead CPU

The hrtimer code has no knowledge about per CPU timers, but we need to
prevent the migration of such timers and warn when such a timer is
active at migration time.

Explicitely mark the timers as per CPU and use a more understandable
mode descriptor for the interrupts safe unlocked callback mode, which
is used by hrtimer_sleeper and the scheduler code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h      | 14 +++++++++++---
 kernel/hrtimer.c             | 37 +++++++++++++++++++++++++------------
 kernel/sched.c               |  4 ++--
 kernel/time/tick-sched.c     |  2 +-
 kernel/trace/trace_sysprof.c |  2 +-
 5 files changed, 40 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bdd88df1b4e5..2f245fe63bda 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -47,14 +47,22 @@ enum hrtimer_restart {
  *	HRTIMER_CB_IRQSAFE:		Callback may run in hardirq context
  *	HRTIMER_CB_IRQSAFE_NO_RESTART:	Callback may run in hardirq context and
  *					does not restart the timer
- *	HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:	Callback must run in hardirq context
- *					Special mode for tick emultation
+ *	HRTIMER_CB_IRQSAFE_PERCPU:	Callback must run in hardirq context
+ *					Special mode for tick emulation and
+ *					scheduler timer. Such timers are per
+ *					cpu and not allowed to be migrated on
+ *					cpu unplug.
+ *	HRTIMER_CB_IRQSAFE_UNLOCKED:	Callback should run in hardirq context
+ *					with timer->base lock unlocked
+ *					used for timers which call wakeup to
+ *					avoid lock order problems with rq->lock
  */
 enum hrtimer_cb_mode {
 	HRTIMER_CB_SOFTIRQ,
 	HRTIMER_CB_IRQSAFE,
 	HRTIMER_CB_IRQSAFE_NO_RESTART,
-	HRTIMER_CB_IRQSAFE_NO_SOFTIRQ,
+	HRTIMER_CB_IRQSAFE_PERCPU,
+	HRTIMER_CB_IRQSAFE_UNLOCKED,
 };
 
 /*
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ace723dd1e52..cdec83e722fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 			 */
 			BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
 			return 1;
-		case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ:
+		case HRTIMER_CB_IRQSAFE_PERCPU:
+		case HRTIMER_CB_IRQSAFE_UNLOCKED:
 			/*
 			 * This is solely for the sched tick emulation with
 			 * dynamic tick support to ensure that we do not
 			 * restart the tick right on the edge and end up with
 			 * the tick timer in the softirq ! The calling site
-			 * takes care of this.
+			 * takes care of this. Also used for hrtimer sleeper !
 			 */
 			debug_hrtimer_deactivate(timer);
 			return 1;
@@ -1245,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
 	timer_stats_account_hrtimer(timer);
 
 	fn = timer->function;
-	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
+	    timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
 		/*
 		 * Used for scheduler timers, avoid lock inversion with
 		 * rq->lock and tasklist_lock.
@@ -1452,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 #endif
 }
 
@@ -1592,7 +1594,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 #ifdef CONFIG_HOTPLUG_CPU
 
 static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
-				struct hrtimer_clock_base *new_base)
+				struct hrtimer_clock_base *new_base, int dcpu)
 {
 	struct hrtimer *timer;
 	struct rb_node *node;
@@ -1603,6 +1605,18 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		BUG_ON(hrtimer_callback_running(timer));
 		debug_hrtimer_deactivate(timer);
 
+		/*
+		 * Should not happen. Per CPU timers should be
+		 * canceled _before_ the migration code is called
+		 */
+		if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
+			__remove_hrtimer(timer, old_base,
+					 HRTIMER_STATE_INACTIVE, 0);
+			WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
+			     timer, timer->function, dcpu);
+			continue;
+		}
+
 		/*
 		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
@@ -1619,12 +1633,11 @@ static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		/*
 		 * Happens with high res enabled when the timer was
 		 * already expired and the callback mode is
-		 * HRTIMER_CB_IRQSAFE_NO_SOFTIRQ
-		 * (hrtimer_sleeper). The enqueue code does not move
-		 * them to the soft irq pending list for
-		 * performance/latency reasons, but in the migration
-		 * state, we need to do that otherwise we end up with
-		 * a stale timer.
+		 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
+		 * enqueue code does not move them to the soft irq
+		 * pending list for performance/latency reasons, but
+		 * in the migration state, we need to do that
+		 * otherwise we end up with a stale timer.
 		 */
 		if (timer->state == HRTIMER_STATE_MIGRATE) {
 			timer->state = HRTIMER_STATE_PENDING;
@@ -1682,7 +1695,7 @@ static void migrate_hrtimers(int cpu)
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		if (migrate_hrtimer_list(&old_base->clock_base[i],
-					 &new_base->clock_base[i]))
+					 &new_base->clock_base[i], cpu))
 			raise = 1;
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 13dd2db9fb2d..ad1962dc0aa2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -201,7 +201,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
 	hrtimer_init(&rt_b->rt_period_timer,
 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rt_b->rt_period_timer.function = sched_rt_period_timer;
-	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
 }
 
 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
@@ -1119,7 +1119,7 @@ static void init_rq_hrtick(struct rq *rq)
 
 	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	rq->hrtick_timer.function = hrtick;
-	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 }
 #else
 static inline void hrtick_clear(struct rq *rq)
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 39019b3f7621..cb02324bdb88 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -625,7 +625,7 @@ void tick_setup_sched_timer(void)
 	 */
 	hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
 	ts->sched_timer.function = tick_sched_timer;
-	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	/* Get the next period (per cpu) */
 	ts->sched_timer.expires = tick_init_jiffy_update();
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index bb948e52ce20..db58fb66a135 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
 
 	hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hrtimer->function = stack_trace_timer_fn;
-	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+	hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
 
 	hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
 }
-- 
cgit v1.2.3-58-ga151


From 31a78f23bac0069004e69f98808b6988baccb6b6 Mon Sep 17 00:00:00 2001
From: Balbir Singh <balbir@linux.vnet.ibm.com>
Date: Sun, 28 Sep 2008 23:09:31 +0100
Subject: mm owner: fix race between swapoff and exit

There's a race between mm->owner assignment and swapoff, more easily
seen when task slab poisoning is turned on.  The condition occurs when
try_to_unuse() runs in parallel with an exiting task.  A similar race
can occur with callers of get_task_mm(), such as /proc/<pid>/<mmstats>
or ptrace or page migration.

CPU0                                    CPU1
                                        try_to_unuse
                                        looks at mm = task0->mm
                                        increments mm->mm_users
task 0 exits
mm->owner needs to be updated, but no
new owner is found (mm_users > 1, but
no other task has task->mm = task0->mm)
mm_update_next_owner() leaves
                                        mmput(mm) decrements mm->mm_users
task0 freed
                                        dereferencing mm->owner fails

The fix is to notify the subsystem via mm_owner_changed callback(),
if no new owner is found, by specifying the new task as NULL.

Jiri Slaby:
mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but
must be set after that, so as not to pass NULL as old owner causing oops.

Daisuke Nishimura:
mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task()
and its callers need to take account of this situation to avoid oops.

Hugh Dickins:
Lockdep warning and hang below exec_mmap() when testing these patches.
exit_mm() up_reads mmap_sem before calling mm_update_next_owner(),
so exec_mmap() now needs to do the same.  And with that repositioning,
there's now no point in mm_need_new_owner() allowing for NULL mm.

Reported-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/exec.c       |  2 +-
 kernel/cgroup.c |  5 +++--
 kernel/exit.c   | 12 ++++++++++--
 mm/memcontrol.c | 17 +++++++++++++++++
 4 files changed, 31 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/fs/exec.c b/fs/exec.c
index 32993beecbe9..cecee501ce78 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -752,11 +752,11 @@ static int exec_mmap(struct mm_struct *mm)
 	tsk->active_mm = mm;
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
-	mm_update_next_owner(old_mm);
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
 		BUG_ON(active_mm != old_mm);
+		mm_update_next_owner(old_mm);
 		mmput(old_mm);
 		return 0;
 	}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 13932abde159..a0123d75ec9a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2738,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
  */
 void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
 {
-	struct cgroup *oldcgrp, *newcgrp;
+	struct cgroup *oldcgrp, *newcgrp = NULL;
 
 	if (need_mm_owner_callback) {
 		int i;
 		for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 			struct cgroup_subsys *ss = subsys[i];
 			oldcgrp = task_cgroup(old, ss->subsys_id);
-			newcgrp = task_cgroup(new, ss->subsys_id);
+			if (new)
+				newcgrp = task_cgroup(new, ss->subsys_id);
 			if (oldcgrp == newcgrp)
 				continue;
 			if (ss->mm_owner_changed)
diff --git a/kernel/exit.c b/kernel/exit.c
index 16395644a98f..85a83c831856 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -583,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
 	 * If there are other users of the mm and the owner (us) is exiting
 	 * we need to find a new owner to take on the responsibility.
 	 */
-	if (!mm)
-		return 0;
 	if (atomic_read(&mm->mm_users) <= 1)
 		return 0;
 	if (mm->owner != p)
@@ -627,6 +625,16 @@ retry:
 	} while_each_thread(g, c);
 
 	read_unlock(&tasklist_lock);
+	/*
+	 * We found no owner yet mm_users > 1: this implies that we are
+	 * most likely racing with swapoff (try_to_unuse()) or /proc or
+	 * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
+	 * so that subsystems can understand the callback and take action.
+	 */
+	down_write(&mm->mmap_sem);
+	cgroup_mm_owner_callbacks(mm->owner, NULL);
+	mm->owner = NULL;
+	up_write(&mm->mmap_sem);
 	return;
 
 assign_new_owner:
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c0500e4d3a2f..36896f3eb7f5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -250,6 +250,14 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
 
 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 {
+	/*
+	 * mm_update_next_owner() may clear mm->owner to NULL
+	 * if it races with swapoff, page migration, etc.
+	 * So this can be called with p == NULL.
+	 */
+	if (unlikely(!p))
+		return NULL;
+
 	return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
 				struct mem_cgroup, css);
 }
@@ -549,6 +557,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	if (likely(!memcg)) {
 		rcu_read_lock();
 		mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+		if (unlikely(!mem)) {
+			rcu_read_unlock();
+			kmem_cache_free(page_cgroup_cache, pc);
+			return 0;
+		}
 		/*
 		 * For every charge from the cgroup, increment reference count
 		 */
@@ -801,6 +814,10 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
 	rcu_read_lock();
 	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
+	if (unlikely(!mem)) {
+		rcu_read_unlock();
+		return 0;
+	}
 	css_get(&mem->css);
 	rcu_read_unlock();
 
-- 
cgit v1.2.3-58-ga151


From 64b9e0294d24a4204232e13e01630b0690e48d61 Mon Sep 17 00:00:00 2001
From: "Amit K. Arora" <aarora@linux.vnet.ibm.com>
Date: Tue, 30 Sep 2008 17:15:39 +0530
Subject: sched: minor optimizations in wake_affine and select_task_rq_fair

This patch does following:
o Removes unused variable and argument "rq".
o Optimizes one of the "if" conditions in wake_affine() - i.e.  if
  "balanced" is true, we need not do rest of the calculations in the
  condition.
o If this cpu is same as the previous cpu (on which woken up task
  was running when it went to sleep), no need to call wake_affine at all.

Signed-off-by: Amit K Arora <aarora@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 95c1295ad26d..fcbe850a5a90 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1088,7 +1088,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 #endif
 
 static int
-wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
 	    int idx, unsigned long load, unsigned long this_load,
 	    unsigned int imbalance)
@@ -1136,8 +1136,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
-			balanced) {
+	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
+			tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
@@ -1156,16 +1156,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	struct sched_domain *sd, *this_sd = NULL;
 	int prev_cpu, this_cpu, new_cpu;
 	unsigned long load, this_load;
-	struct rq *rq, *this_rq;
+	struct rq *this_rq;
 	unsigned int imbalance;
 	int idx;
 
 	prev_cpu	= task_cpu(p);
-	rq		= task_rq(p);
 	this_cpu	= smp_processor_id();
 	this_rq		= cpu_rq(this_cpu);
 	new_cpu		= prev_cpu;
 
+	if (prev_cpu == this_cpu)
+		goto out;
 	/*
 	 * 'this_sd' is the first domain that both
 	 * this_cpu and prev_cpu are present in:
@@ -1193,13 +1194,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
 	load = source_load(prev_cpu, idx);
 	this_load = target_load(this_cpu, idx);
 
-	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
 				     load, this_load, imbalance))
 		return this_cpu;
 
-	if (prev_cpu == this_cpu)
-		goto out;
-
 	/*
 	 * Start passive balancing when half the imbalance_pct
 	 * limit is reached.
-- 
cgit v1.2.3-58-ga151


From aa94fbd5ccd840c8ab26d02439ec799b03a72547 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 2 Oct 2008 14:50:14 -0700
Subject: fix error-path NULL deref in alloc_posix_timer()

Found by static checker (http://repo.or.cz/w/smatch.git).

Signed-off-by: Dan Carpenter <error27@gmail.com>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/posix-timers.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e36d5798cbff..5131e5471169 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -441,7 +441,7 @@ static struct k_itimer * alloc_posix_timer(void)
 		return tmr;
 	if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
 		kmem_cache_free(posix_timers_cache, tmr);
-		tmr = NULL;
+		return NULL;
 	}
 	memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
 	return tmr;
-- 
cgit v1.2.3-58-ga151


From 2133b5d7ff531bc15a923db4a6a50bf96c561be9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 2 Oct 2008 16:06:39 -0700
Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU

This patch adds stalled-CPU detection to Classic RCU.  This capability
is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which
defaults disabled.

This is a debugging feature to detect infinite loops in kernel code, not
something that non-kernel-hackers would be expected to care about.

This feature can detect looping CPUs in !PREEMPT builds and looping CPUs
with preemption disabled in PREEMPT builds.  This is essentially a port of
this functionality from the treercu patch, replacing the stall debug patch
that is already in tip/core/rcu (commit 67182ae1c4).

The changes from the patch in tip/core/rcu include making the config
variable name match that in treercu, changing from seconds to jiffies to
avoid spurious warnings, and printing a boot message when this feature
is enabled.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcuclassic.h |  12 +++-
 kernel/rcuclassic.c        | 166 +++++++++++++++++++++++----------------------
 lib/Kconfig.debug          |   2 +-
 3 files changed, 96 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 29bf528c7dcc..5f89b62e6983 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -40,15 +40,21 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+#define RCU_SECONDS_TILL_STALL_CHECK	( 3 * HZ) /* for rcp->jiffies_stall */
+#define RCU_SECONDS_TILL_STALL_RECHECK	(30 * HZ) /* for rcp->jiffies_stall */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
 	long	cur;		/* Current batch number.                      */
 	long	completed;	/* Number of the last completed batch         */
 	long	pending;	/* Number of the last pending batch           */
-#ifdef CONFIG_DEBUG_RCU_STALL
-	unsigned long gp_check;	/* Time grace period should end, in seconds.  */
-#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	unsigned long gp_start;	/* Time at which GP started in jiffies. */
+	unsigned long jiffies_stall;
+				/* Time at which to check for CPU stalls. */
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 
 	int	signaled;
 
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index ed15128ca2c9..0d07e6e51578 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
 	}
 }
 
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+	rcp->gp_start = jiffies;
+	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
+}
+
+static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	long delta;
+	unsigned long flags;
+
+	/* Only let one CPU complain about others per time interval. */
+
+	spin_lock_irqsave(&rcp->lock, flags);
+	delta = jiffies - rcp->jiffies_stall;
+	if (delta < 2 || rcp->cur != rcp->completed) {
+		spin_unlock_irqrestore(&rcp->lock, flags);
+		return;
+	}
+	rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+
+	/* OK, time to rat on our buddy... */
+
+	printk(KERN_ERR "RCU detected CPU stalls:");
+	for_each_possible_cpu(cpu) {
+		if (cpu_isset(cpu, rcp->cpumask))
+			printk(" %d", cpu);
+	}
+	printk(" (detected by %d, t=%ld jiffies)\n",
+	       smp_processor_id(), (long)(jiffies - rcp->gp_start));
+}
+
+static void print_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	unsigned long flags;
+
+	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
+			smp_processor_id(), jiffies,
+			jiffies - rcp->gp_start);
+	dump_stack();
+	spin_lock_irqsave(&rcp->lock, flags);
+	if ((long)(jiffies - rcp->jiffies_stall) >= 0)
+		rcp->jiffies_stall =
+			jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+	spin_unlock_irqrestore(&rcp->lock, flags);
+	set_need_resched();  /* kick ourselves to get things going. */
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp)
+{
+	long delta;
+
+	delta = jiffies - rcp->jiffies_stall;
+	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
+
+		/* We haven't checked in, so go dump stack. */
+		print_cpu_stall(rcp);
+
+	} else if (rcp->cur != rcp->completed && delta >= 2) {
+
+		/* They had two seconds to dump stack, so complain. */
+		print_other_cpu_stall(rcp);
+	}
+}
+
+#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
+static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
+{
+}
+
+static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
  * @head: structure to be used for queueing the RCU updates.
@@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp)
  *   period (if necessary).
  */
 
-#ifdef CONFIG_DEBUG_RCU_STALL
-
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
-{
-	rcp->gp_check = get_seconds() + 3;
-}
-
-static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	int cpu;
-	long delta;
-	unsigned long flags;
-
-	/* Only let one CPU complain about others per time interval. */
-
-	spin_lock_irqsave(&rcp->lock, flags);
-	delta = get_seconds() - rcp->gp_check;
-	if (delta < 2L || cpus_empty(rcp->cpumask)) {
-		spin_unlock(&rcp->lock);
-		return;
-	}
-	rcp->gp_check = get_seconds() + 30;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-
-	/* OK, time to rat on our buddy... */
-
-	printk(KERN_ERR "RCU detected CPU stalls:");
-	for_each_cpu_mask(cpu, rcp->cpumask)
-		printk(" %d", cpu);
-	printk(" (detected by %d, t=%lu/%lu)\n",
-	       smp_processor_id(), get_seconds(), rcp->gp_check);
-}
-
-static void print_cpu_stall(struct rcu_ctrlblk *rcp)
-{
-	unsigned long flags;
-
-	printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n",
-			smp_processor_id(), get_seconds(), rcp->gp_check);
-	dump_stack();
-	spin_lock_irqsave(&rcp->lock, flags);
-	if ((long)(get_seconds() - rcp->gp_check) >= 0L)
-		rcp->gp_check = get_seconds() + 30;
-	spin_unlock_irqrestore(&rcp->lock, flags);
-}
-
-static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	long delta;
-
-	delta = get_seconds() - rcp->gp_check;
-	if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) {
-
-		/* We haven't checked in, so go dump stack. */
-
-		print_cpu_stall(rcp);
-
-	} else {
-		if (!cpus_empty(rcp->cpumask) && delta >= 2L) {
-			/* They had two seconds to dump stack, so complain. */
-			print_other_cpu_stall(rcp);
-		}
-	}
-}
-
-#else /* #ifdef CONFIG_DEBUG_RCU_STALL */
-
-static inline void record_gp_check_time(struct rcu_ctrlblk *rcp)
-{
-}
-
-static inline void
-check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */
-
 /*
  * Register a new batch of callbacks, and start it up if there is currently no
  * active batch and the batch to be registered has not already occurred.
@@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 	if (rcp->cur != rcp->pending &&
 			rcp->completed == rcp->cur) {
 		rcp->cur++;
-		record_gp_check_time(rcp);
+		record_gp_stall_check_time(rcp);
 
 		/*
 		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 {
 	/* Check for CPU stalls, if enabled. */
-	check_cpu_stall(rcp, rdp);
+	check_cpu_stall(rcp);
 
 	if (rdp->nxtlist) {
 		long completed_snap = ACCESS_ONCE(rcp->completed);
@@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
  */
 void __init __rcu_init(void)
 {
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	/* Register notifier for non-boot CPUs */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index ccede1aeab38..9fee969dd60e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,7 +597,7 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
-config RCU_CPU_STALL
+config RCU_CPU_STALL_DETECTOR
 	bool "Check for stalled CPUs delaying RCU grace periods"
 	depends on CLASSIC_RCU
 	default n
-- 
cgit v1.2.3-58-ga151


From 2ec2b482b10a1ed3493c224f1893cddd3d33833b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 3 Oct 2008 10:41:00 +0200
Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU, fix

fix the !CONFIG_RCU_CPU_STALL_DETECTOR path:

 kernel/rcuclassic.c: In function '__rcu_pending':
 kernel/rcuclassic.c:609: error: too few arguments to function 'check_cpu_stall'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcuclassic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 0d07e6e51578..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -239,7 +239,7 @@ static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
 {
 }
 
-static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
 {
 }
 
-- 
cgit v1.2.3-58-ga151


From d294eb83d8d39a29f01dad391f15fc3a29aa04f9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 3 Oct 2008 12:10:10 +0200
Subject: cpusets: scan_for_empty_cpusets(), cpuset doesn't seem to be so const
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a warning on latest -tip:

 kernel/cpuset.c: Dans la fonction «scan_for_empty_cpusets» :
 kernel/cpuset.c:1932: attention : passing argument 1 of «list_add_tail» discards qualifiers from pointer target type

Actually the struct cpuset *root passed in parameter to scan_for_empty_cpusets
is not supposed to be const since an entry is added on the tail of its list.
Just correct the qualifier.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/cpuset.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 827cd9adccb2..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1921,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  * that has tasks along with an empty 'mems'.  But if we did see such
  * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
  */
-static void scan_for_empty_cpusets(const struct cpuset *root)
+static void scan_for_empty_cpusets(struct cpuset *root)
 {
 	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
-- 
cgit v1.2.3-58-ga151


From 07454bfff151d2465ada809bbaddf3548cc1097c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 4 Oct 2008 10:51:07 +0200
Subject: clockevents: check broadcast tick device not the clock events device

Impact: jiffies increment too fast.

Hugh Dickins noted that with NOHZ=n and HIGHRES=n jiffies get
incremented too fast. The reason is a wrong check in the broadcast
enter/exit code, which keeps the local apic timer in periodic mode
when the switch happens.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index bd7034542399..cb01cd8f919b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -235,7 +235,8 @@ static void tick_do_broadcast_on_off(void *why)
 	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
 		if (!cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_set(cpu, tick_broadcast_mask);
-			if (bc->mode == TICKDEV_MODE_PERIODIC)
+			if (tick_broadcast_device.mode ==
+			    TICKDEV_MODE_PERIODIC)
 				clockevents_shutdown(dev);
 		}
 		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
@@ -245,7 +246,8 @@ static void tick_do_broadcast_on_off(void *why)
 		if (!tick_broadcast_force &&
 		    cpu_isset(cpu, tick_broadcast_mask)) {
 			cpu_clear(cpu, tick_broadcast_mask);
-			if (bc->mode == TICKDEV_MODE_PERIODIC)
+			if (tick_broadcast_device.mode ==
+			    TICKDEV_MODE_PERIODIC)
 				tick_setup_periodic(dev, 0);
 		}
 		break;
-- 
cgit v1.2.3-58-ga151


From f6121f4f8708195e88cbdf8dd8d171b226b3f858 Mon Sep 17 00:00:00 2001
From: Dario Faggioli <raistlin@linux.it>
Date: Fri, 3 Oct 2008 17:40:46 +0200
Subject: sched_rt.c: resch needed in rt_rq_enqueue() for the root rt_rq

While working on the new version of the code for SCHED_SPORADIC I
noticed something strange in the present throttling mechanism. More
specifically in the throttling timer handler in sched_rt.c
(do_sched_rt_period_timer()) and in rt_rq_enqueue().

The problem is that, when unthrottling a runqueue, rt_rq_enqueue() only
asks for rescheduling if the runqueue has a sched_entity associated to
it (i.e., rt_rq->rt_se != NULL).
Now, if the runqueue is the root rq (which has a rt_se = NULL)
rescheduling does not take place, and it is delayed to some undefined
instant in the future.

This imply some random bandwidth usage by the RT tasks under throttling.
For instance, setting rt_runtime_us/rt_period_us = 950ms/1000ms an RT
task will get less than 95%. In our tests we got something varying
between 70% to 95%.
Using smaller time values, e.g., 95ms/100ms, things are even worse, and
I can see values also going down to 20-25%!!

The tests we performed are simply running 'yes' as a SCHED_FIFO task,
and checking the CPU usage with top, but we can investigate thoroughly
if you think it is needed.

Things go much better, for us, with the attached patch... Don't know if
it is the best approach, but it solved the issue for us.

Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <trimarchimichael@yahoo.it>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d570a8cc4fcd..cdf5740ab03e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
+	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 	struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
-	if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
-		struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
-
-		enqueue_rt_entity(rt_se);
+	if (rt_rq->rt_nr_running) {
+		if (rt_se && !on_rt_rq(rt_se))
+			enqueue_rt_entity(rt_se);
 		if (rt_rq->highest_prio < curr->prio)
 			resched_task(curr);
 	}
-- 
cgit v1.2.3-58-ga151


From 34b3ede2353604ec9861c1d900b2a835ff85de47 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 6 Oct 2008 09:27:00 +0800
Subject: sched: remove redundant code in cpu_cgroup_create()

css will be initialized by cgroup core.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2caedc47e764..9715f4ce6cfe 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9088,7 +9088,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
-		init_task_group.css.cgroup = cgrp;
 		return &init_task_group.css;
 	}
 
@@ -9097,9 +9096,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	if (IS_ERR(tg))
 		return ERR_PTR(-ENOMEM);
 
-	/* Bind the cgroup to task_group object we just created */
-	tg->css.cgroup = cgrp;
-
 	return &tg->css;
 }
 
-- 
cgit v1.2.3-58-ga151


From cc1e0f4f7ad95a9eb81e1904cb16068af226180d Mon Sep 17 00:00:00 2001
From: Jason Wessel <jason.wessel@windriver.com>
Date: Mon, 6 Oct 2008 13:50:59 -0500
Subject: kgdb: call touch_softlockup_watchdog on resume

The softlockup watchdog needs to be touched when resuming the from the
kgdb stopped state to avoid the printk that a CPU is stuck if the
debugger was active for longer than the softlockup threshold.

Signed-off-by: Jason Wessel <jason.wessel@windriver.com>
---
 kernel/kgdb.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 25d955dbb989..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -590,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
 
 	/* Signal the primary CPU that we are done: */
 	atomic_set(&cpu_in_kgdb[cpu], 0);
+	touch_softlockup_watchdog();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 }
@@ -1432,6 +1433,7 @@ acquirelock:
 	    atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
 
 		atomic_set(&kgdb_active, -1);
+		touch_softlockup_watchdog();
 		clocksource_touch_watchdog();
 		local_irq_restore(flags);
 
@@ -1524,6 +1526,7 @@ acquirelock:
 kgdb_restore:
 	/* Free kgdb_active */
 	atomic_set(&kgdb_active, -1);
+	touch_softlockup_watchdog();
 	clocksource_touch_watchdog();
 	local_irq_restore(flags);
 
-- 
cgit v1.2.3-58-ga151


From 2fb7635c4cea310992a39580133099dd99ad151c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 8 Oct 2008 09:16:04 +0200
Subject: sched: sync wakeups vs avg_overlap

While looking at the code I wondered why we always do:

  sync && avg_overlap < migration_cost

Which is a bit odd, since the overlap test was meant to detect sync wakeups
so using it to specialize sync wakeups doesn't make much sense.

Hence change the code to do:

  sync || avg_overlap < migration_cost

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index fcbe850a5a90..18fd17172eb6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1103,6 +1103,11 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
+	if (!sync && sched_feat(SYNC_WAKEUPS) &&
+	    curr->se.avg_overlap < sysctl_sched_migration_cost &&
+	    p->se.avg_overlap < sysctl_sched_migration_cost)
+		sync = 1;
+
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
@@ -1127,11 +1132,8 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	 * a reasonable amount of time then attract this newly
 	 * woken task:
 	 */
-	if (sync && balanced) {
-		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
-		    p->se.avg_overlap < sysctl_sched_migration_cost)
-			return 1;
-	}
+	if (sync && balanced)
+		return 1;
 
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
@@ -1268,9 +1270,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	if (sched_feat(WAKEUP_OVERLAP) && sync &&
-			se->avg_overlap < sysctl_sched_migration_cost &&
-			pse->avg_overlap < sysctl_sched_migration_cost) {
+	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+			(se->avg_overlap < sysctl_sched_migration_cost &&
+			 pse->avg_overlap < sysctl_sched_migration_cost))) {
 		resched_task(curr);
 		return;
 	}
-- 
cgit v1.2.3-58-ga151


From a5d8c3483a6e19aca95ef6a2c5890e33bfa5b293 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 9 Oct 2008 11:35:51 +0200
Subject: sched debug: add name to sched_domain sysctl entries

add /proc/sys/kernel/sched_domain/cpu0/domain0/name, to make
it easier to see which specific scheduler domain remained at
that entry.

Since we process the scheduler domain tree and
simplify it, it's not always immediately clear during debugging
which domain came from where.

depends on CONFIG_SCHED_DEBUG=y.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  3 +++
 kernel/sched.c        | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d8e699b55858..5d0819ee442a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -824,6 +824,9 @@ struct sched_domain {
 	unsigned int ttwu_move_affine;
 	unsigned int ttwu_move_balance;
 #endif
+#ifdef CONFIG_SCHED_DEBUG
+	char *name;
+#endif
 };
 
 extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
diff --git a/kernel/sched.c b/kernel/sched.c
index 9715f4ce6cfe..6f230596bd0c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6351,7 +6351,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(12);
+	struct ctl_table *table = sd_alloc_ctl_entry(13);
 
 	if (table == NULL)
 		return NULL;
@@ -6379,7 +6379,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax);
-	/* &table[11] is terminator */
+	set_table_entry(&table[11], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring);
+	/* &table[12] is terminator */
 
 	return table;
 }
@@ -7263,13 +7265,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
+#ifdef CONFIG_SCHED_DEBUG
+# define SD_INIT_NAME(sd, type)		sd->name = #type
+#else
+# define SD_INIT_NAME(sd, type)		do { } while (0)
+#endif
+
 #define	SD_INIT(sd, type)	sd_init_##type(sd)
+
 #define SD_INIT_FUNC(type)	\
 static noinline void sd_init_##type(struct sched_domain *sd)	\
 {								\
 	memset(sd, 0, sizeof(*sd));				\
 	*sd = SD_##type##_INIT;					\
 	sd->level = SD_LV_##type;				\
+	SD_INIT_NAME(sd, type);					\
 }
 
 SD_INIT_FUNC(CPU)
-- 
cgit v1.2.3-58-ga151


From 8083e4ad970e4eb567e31037060cdd4ba346f0c0 Mon Sep 17 00:00:00 2001
From: "venkatesh.pallipadi@intel.com" <venkatesh.pallipadi@intel.com>
Date: Mon, 4 Aug 2008 11:59:11 -0700
Subject: [CPUFREQ][5/6] cpufreq: Changes to get_cpu_idle_time_us(), used by
 ondemand governor

export get_cpu_idle_time_us() for it to be used in ondemand governor.
Last update time can be current time when the CPU is currently non-idle,
accounting for the busy time since last idle.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Dave Jones <davej@redhat.com>
---
 include/linux/tick.h     |  2 +-
 kernel/time/tick-sched.c | 11 ++++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 8cf8cfe2cc97..98921a3e1aa8 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -126,7 +126,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void)
 	return len;
 }
 static inline void tick_nohz_stop_idle(int cpu) { }
-static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return 0; }
+static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; }
 # endif /* !NO_HZ */
 
 #endif
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cb02324bdb88..a4d219398167 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
 #include <linux/profile.h>
 #include <linux/sched.h>
 #include <linux/tick.h>
+#include <linux/module.h>
 
 #include <asm/irq_regs.h>
 
@@ -190,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
 {
 	struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
 
-	*last_update_time = ktime_to_us(ts->idle_lastupdate);
+	if (!tick_nohz_enabled)
+		return -1;
+
+	if (ts->idle_active)
+		*last_update_time = ktime_to_us(ts->idle_lastupdate);
+	else
+		*last_update_time = ktime_to_us(ktime_get());
+
 	return ktime_to_us(ts->idle_sleeptime);
 }
+EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
 
 /**
  * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
-- 
cgit v1.2.3-58-ga151