From 82524746c27fa418c250a56dd7606b9d3fc79826 Mon Sep 17 00:00:00 2001
From: Franck Bui-Huu <fbuihuu@gmail.com>
Date: Mon, 12 May 2008 21:21:05 +0200
Subject: rcu: split list.h and move rcu-protected lists into rculist.h

Move rcu-protected lists from list.h into a new header file rculist.h.

This is done because list are a very used primitive structure all over the
kernel and it's currently impossible to include other header files in this
list.h without creating some circular dependencies.

For example, list.h implements rcu-protected list and uses rcu_dereference()
without including rcupdate.h.  It actually compiles because users of
rcu_dereference() are macros.  Others RCU functions could be used too but
aren't probably because of this.

Therefore this patch creates rculist.h which includes rcupdates without to
many changes/troubles.

Signed-off-by: Franck Bui-Huu <fbuihuu@gmail.com>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Josh Triplett <josh@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/textsearch.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'lib')

diff --git a/lib/textsearch.c b/lib/textsearch.c
index be8bda3862f5..a3e500ad51d7 100644
--- a/lib/textsearch.c
+++ b/lib/textsearch.c
@@ -97,6 +97,7 @@
 #include <linux/types.h>
 #include <linux/string.h>
 #include <linux/init.h>
+#include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/err.h>
 #include <linux/textsearch.h>
-- 
cgit v1.2.3-58-ga151


From 6e766410c4babd37bc7cd5e25009c179781742c8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Mon, 12 May 2008 21:20:41 +0200
Subject: ftrace: annotate core code that should not be traced

Mark with "notrace" functions in core code that should not be
traced.  The "notrace" attribute will prevent gcc from adding
a call to ftrace on the annotated funtions.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 lib/smp_processor_id.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 6c90fb90e19c..e555ab62fbad 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -7,7 +7,7 @@
 #include <linux/kallsyms.h>
 #include <linux/sched.h>
 
-unsigned int debug_smp_processor_id(void)
+notrace unsigned int debug_smp_processor_id(void)
 {
 	unsigned long preempt_count = preempt_count();
 	int this_cpu = raw_smp_processor_id();
-- 
cgit v1.2.3-58-ga151


From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Date: Mon, 12 May 2008 21:20:42 +0200
Subject: ftrace: add basic support for gcc profiler instrumentation

If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is
set to a non-zero value the ftrace routine will be called everytime
we enter a kernel function that is not marked with the "notrace"
attribute.

The ftrace routine will then call a registered function if a function
happens to be registered.

[ This code has been highly hacked by Steven Rostedt and Ingo Molnar,
  so don't blame Arnaldo for all of this ;-) ]

Update:
  It is now possible to register more than one ftrace function.
  If only one ftrace function is registered, that will be the
  function that ftrace calls directly. If more than one function
  is registered, then ftrace will call a function that will loop
  through the functions to call.

Signed-off-by: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Makefile                   |   4 ++
 arch/x86/Kconfig           |   1 +
 arch/x86/kernel/entry_32.S |  27 +++++++++
 arch/x86/kernel/entry_64.S |  37 ++++++++++++
 include/linux/ftrace.h     |  38 +++++++++++++
 kernel/Makefile            |   1 +
 kernel/trace/Kconfig       |   5 ++
 kernel/trace/Makefile      |   3 +
 kernel/trace/ftrace.c      | 138 +++++++++++++++++++++++++++++++++++++++++++++
 lib/Kconfig.debug          |   2 +
 10 files changed, 256 insertions(+)
 create mode 100644 include/linux/ftrace.h
 create mode 100644 kernel/trace/Kconfig
 create mode 100644 kernel/trace/Makefile
 create mode 100644 kernel/trace/ftrace.c

(limited to 'lib')

diff --git a/Makefile b/Makefile
index 20b32351906b..b4a273f19b52 100644
--- a/Makefile
+++ b/Makefile
@@ -528,6 +528,10 @@ KBUILD_CFLAGS	+= -g
 KBUILD_AFLAGS	+= -gdwarf-2
 endif
 
+ifdef CONFIG_FTRACE
+KBUILD_CFLAGS	+= -pg
+endif
+
 # We trigger additional mismatches with less inlining
 ifdef CONFIG_DEBUG_SECTION_MISMATCH
 KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe361ae7ef2f..c742dfeb0dbe 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -23,6 +23,7 @@ config X86
 	select HAVE_OPROFILE
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
+	select HAVE_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 2a609dc3271c..f47b9b5440d2 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1109,6 +1109,33 @@ ENDPROC(xen_failsafe_callback)
 
 #endif	/* CONFIG_XEN */
 
+#ifdef CONFIG_FTRACE
+ENTRY(mcount)
+	cmpl $ftrace_stub, ftrace_trace_function
+	jnz trace
+
+.globl ftrace_stub
+ftrace_stub:
+	ret
+
+	/* taken from glibc */
+trace:
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+	movl 0xc(%esp), %eax
+	movl 0x4(%ebp), %edx
+
+	call   *ftrace_trace_function
+
+	popl %edx
+	popl %ecx
+	popl %eax
+
+	jmp ftrace_stub
+END(mcount)
+#endif
+
 .section .rodata,"a"
 #include "syscall_table_32.S"
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 556a8df522a7..f046e0c64883 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -54,6 +54,43 @@
 
 	.code64
 
+#ifdef CONFIG_FTRACE
+ENTRY(mcount)
+	cmpq $ftrace_stub, ftrace_trace_function
+	jnz trace
+.globl ftrace_stub
+ftrace_stub:
+	retq
+
+trace:
+	/* taken from glibc */
+	subq $0x38, %rsp
+	movq %rax, (%rsp)
+	movq %rcx, 8(%rsp)
+	movq %rdx, 16(%rsp)
+	movq %rsi, 24(%rsp)
+	movq %rdi, 32(%rsp)
+	movq %r8, 40(%rsp)
+	movq %r9, 48(%rsp)
+
+	movq 0x38(%rsp), %rdi
+	movq 8(%rbp), %rsi
+
+	call   *ftrace_trace_function
+
+	movq 48(%rsp), %r9
+	movq 40(%rsp), %r8
+	movq 32(%rsp), %rdi
+	movq 24(%rsp), %rsi
+	movq 16(%rsp), %rdx
+	movq 8(%rsp), %rcx
+	movq (%rsp), %rax
+	addq $0x38, %rsp
+
+	jmp ftrace_stub
+END(mcount)
+#endif
+
 #ifndef CONFIG_PREEMPT
 #define retint_kernel retint_restore_args
 #endif	
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
new file mode 100644
index 000000000000..b96ef14c249a
--- /dev/null
+++ b/include/linux/ftrace.h
@@ -0,0 +1,38 @@
+#ifndef _LINUX_FTRACE_H
+#define _LINUX_FTRACE_H
+
+#ifdef CONFIG_FTRACE
+
+#include <linux/linkage.h>
+
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+
+typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip);
+
+struct ftrace_ops {
+	ftrace_func_t	  func;
+	struct ftrace_ops *next;
+};
+
+/*
+ * The ftrace_ops must be a static and should also
+ * be read_mostly.  These functions do modify read_mostly variables
+ * so use them sparely. Never free an ftrace_op or modify the
+ * next pointer after it has been registered. Even after unregistering
+ * it, the next pointer may still be used internally.
+ */
+int register_ftrace_function(struct ftrace_ops *ops);
+int unregister_ftrace_function(struct ftrace_ops *ops);
+void clear_ftrace_function(void);
+
+extern void ftrace_stub(unsigned long a0, unsigned long a1);
+extern void mcount(void);
+
+#else /* !CONFIG_FTRACE */
+# define register_ftrace_function(ops) do { } while (0)
+# define unregister_ftrace_function(ops) do { } while (0)
+# define clear_ftrace_function(ops) do { } while (0)
+#endif /* CONFIG_FTRACE */
+#endif /* _LINUX_FTRACE_H */
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..fa05f6d8bdbf 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000000..8185c91417bc
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,5 @@
+#
+# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
+#
+config HAVE_FTRACE
+	bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000000..bf4fd215a6a9
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_FTRACE) += libftrace.o
+
+libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..b6a80b98a3fb
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,138 @@
+/*
+ * Infrastructure for profiling code inserted by 'gcc -pg'.
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
+ *
+ * Originally ported from the -rt patch by:
+ *   Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
+ *
+ * Based on code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/module.h>
+#include <linux/ftrace.h>
+
+static DEFINE_SPINLOCK(ftrace_func_lock);
+static struct ftrace_ops ftrace_list_end __read_mostly =
+{
+	.func = ftrace_stub,
+};
+
+static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
+
+/* mcount is defined per arch in assembly */
+EXPORT_SYMBOL(mcount);
+
+notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_ops *op = ftrace_list;
+
+	/* in case someone actually ports this to alpha! */
+	read_barrier_depends();
+
+	while (op != &ftrace_list_end) {
+		/* silly alpha */
+		read_barrier_depends();
+		op->func(ip, parent_ip);
+		op = op->next;
+	};
+}
+
+/**
+ * register_ftrace_function - register a function for profiling
+ * @ops - ops structure that holds the function for profiling.
+ *
+ * Register a function to be called by all functions in the
+ * kernel.
+ *
+ * Note: @ops->func and all the functions it calls must be labeled
+ *       with "notrace", otherwise it will go into a
+ *       recursive loop.
+ */
+int register_ftrace_function(struct ftrace_ops *ops)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ftrace_func_lock, flags);
+	ops->next = ftrace_list;
+	/*
+	 * We are entering ops into the ftrace_list but another
+	 * CPU might be walking that list. We need to make sure
+	 * the ops->next pointer is valid before another CPU sees
+	 * the ops pointer included into the ftrace_list.
+	 */
+	smp_wmb();
+	ftrace_list = ops;
+	/*
+	 * For one func, simply call it directly.
+	 * For more than one func, call the chain.
+	 */
+	if (ops->next == &ftrace_list_end)
+		ftrace_trace_function = ops->func;
+	else
+		ftrace_trace_function = ftrace_list_func;
+	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	return 0;
+}
+
+/**
+ * unregister_ftrace_function - unresgister a function for profiling.
+ * @ops - ops structure that holds the function to unregister
+ *
+ * Unregister a function that was added to be called by ftrace profiling.
+ */
+int unregister_ftrace_function(struct ftrace_ops *ops)
+{
+	unsigned long flags;
+	struct ftrace_ops **p;
+	int ret = 0;
+
+	spin_lock_irqsave(&ftrace_func_lock, flags);
+
+	/*
+	 * If we are the only function, then the ftrace pointer is
+	 * pointing directly to that function.
+	 */
+	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
+		ftrace_trace_function = ftrace_stub;
+		ftrace_list = &ftrace_list_end;
+		goto out;
+	}
+
+	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+		if (*p == ops)
+			break;
+
+	if (*p != ops) {
+		ret = -1;
+		goto out;
+	}
+
+	*p = (*p)->next;
+
+	/* If we only have one func left, then call that directly */
+	if (ftrace_list->next == &ftrace_list_end)
+		ftrace_trace_function = ftrace_list->func;
+
+ out:
+	spin_unlock_irqrestore(&ftrace_func_lock, flags);
+
+	return 0;
+}
+
+/**
+ * clear_ftrace_function - reset the ftrace function
+ *
+ * This NULLs the ftrace function and in essence stops
+ * tracing.  There may be lag
+ */
+void clear_ftrace_function(void)
+{
+	ftrace_trace_function = ftrace_stub;
+}
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d2099f41aa1e..d8b6279a9b42 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -634,6 +634,8 @@ config LATENCYTOP
 	  Enable this option if you want to use the LatencyTOP tool
 	  to find out which userspace is blocking on what kernel operations.
 
+source kernel/trace/Kconfig
+
 config PROVIDE_OHCI1394_DMA_INIT
 	bool "Remote debugging over FireWire early on boot"
 	depends on PCI && X86
-- 
cgit v1.2.3-58-ga151


From 5568b139f4d196273958ae2947a736fdf1ffeece Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:44 +0200
Subject: ftrace: debug smp_processor_id, use notrace preempt disable

The debug smp_processor_id caused a recursive fault in debugging
the irqsoff tracer. The tracer used a smp_processor_id in the
ftrace callback, and this function called preempt_disable which
also is traced. This caused a recursive fault (stack overload).

Since using smp_processor_id without debugging on does not cause
faults with the tracer (even when the tracer is wrong), the
debug version should not cause a system reboot.

This changes the debug_smp_processor_id to use the notrace versions
of preempt_disable and enable.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 lib/smp_processor_id.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index e555ab62fbad..3b4dc098181e 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -37,7 +37,7 @@ notrace unsigned int debug_smp_processor_id(void)
 	/*
 	 * Avoid recursion:
 	 */
-	preempt_disable();
+	preempt_disable_notrace();
 
 	if (!printk_ratelimit())
 		goto out_enable;
@@ -49,7 +49,7 @@ notrace unsigned int debug_smp_processor_id(void)
 	dump_stack();
 
 out_enable:
-	preempt_enable_no_resched();
+	preempt_enable_no_resched_notrace();
 out:
 	return this_cpu;
 }
-- 
cgit v1.2.3-58-ga151


From 3594136ad67a54d77bcb2547e70011754a2f91d5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 12 May 2008 21:20:50 +0200
Subject: ftrace: do not profile lib/string.o

Most archs define the string and memory compare functions in assembly.
Some do not. But these functions may be used in some archs at early
boot up.

Since most archs define this code in assembly and they are not usually
traced, there's no need to trace them when they are not defined in
assembly.

This patch removes the -pg from the CFLAGS for lib/string.o.
This prevents the string functions use in either vdso or early bootup
from crashing the system.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 lib/Makefile | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'lib')

diff --git a/lib/Makefile b/lib/Makefile
index 74b0cfb1fcc3..6ca9e6ee1e33 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -8,6 +8,14 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o prio_heap.o ratelimit.o
 
+ifdef CONFIG_FTRACE
+# Do not profile string.o, since it may be used in early boot or vdso
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(if $(subst string,,$(basename $(notdir $@))), \
+	$(ORIG_CFLAGS), \
+	$(subst -pg,,$(ORIG_CFLAGS)))
+endif
+
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
 
-- 
cgit v1.2.3-58-ga151


From 9d0a420b737f72d84fabebf29634d800cbf54538 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Mon, 12 May 2008 21:20:55 +0200
Subject: ftrace: remove function tracing from spinlock debug

The debug functions in spin_lock debugging pollute the output of the
function tracer. This patch adds the debug files in the lib director
to those that should not be compiled with mcount tracing.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 lib/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/Makefile b/lib/Makefile
index 6ca9e6ee1e33..d97ad1100b6f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,8 +10,9 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 
 ifdef CONFIG_FTRACE
 # Do not profile string.o, since it may be used in early boot or vdso
+# Also do not profile any debug utilities
 ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(if $(subst string,,$(basename $(notdir $@))), \
+KBUILD_CFLAGS = $(if $(filter-out %debug debug% string%,$(basename $(notdir $@))), \
 	$(ORIG_CFLAGS), \
 	$(subst -pg,,$(ORIG_CFLAGS)))
 endif
-- 
cgit v1.2.3-58-ga151


From 654e4787689faffdd2137fe91f59fd3ef3363ad2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Wed, 14 May 2008 21:30:31 -0400
Subject: ftrace: use the new kbuild CFLAGS_REMOVE for lib directory

This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro
in the lib directory.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 lib/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'lib')

diff --git a/lib/Makefile b/lib/Makefile
index d97ad1100b6f..4b836a53c08f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -10,11 +10,11 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 
 ifdef CONFIG_FTRACE
 # Do not profile string.o, since it may be used in early boot or vdso
+CFLAGS_REMOVE_string.o = -pg
 # Also do not profile any debug utilities
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(if $(filter-out %debug debug% string%,$(basename $(notdir $@))), \
-	$(ORIG_CFLAGS), \
-	$(subst -pg,,$(ORIG_CFLAGS)))
+CFLAGS_REMOVE_spinlock_debug.o = -pg
+CFLAGS_REMOVE_list_debug.o = -pg
+CFLAGS_REMOVE_debugobjects.o = -pg
 endif
 
 lib-$(CONFIG_MMU) += ioremap.o
-- 
cgit v1.2.3-58-ga151


From 886dd58258e6ddebe20e7aebef7b167a24bad7ee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 15:44:38 +0200
Subject: debugging: make stacktrace independent from DEBUG_KERNEL

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 lib/Kconfig.debug | 1 -
 1 file changed, 1 deletion(-)

(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d2099f41aa1e..9c17fb9d1d5e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -419,7 +419,6 @@ config DEBUG_LOCKING_API_SELFTESTS
 
 config STACKTRACE
 	bool
-	depends on DEBUG_KERNEL
 	depends on STACKTRACE_SUPPORT
 
 config DEBUG_KOBJECT
-- 
cgit v1.2.3-58-ga151


From 1462a200057df08be12f3719e1f37adbd2c6e4d0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 16 Jun 2008 08:40:04 +0200
Subject: Revert "prohibit rcutorture from being compiled into the kernel"

This reverts commit 9aaffc898ff4a3df18c5fc4b9e0fa47e779ad726.

That commit was a very bad idea. RCU_TORTURE found many boot timing
bugs and other sorts of bugs in the past, so excluding it from
boot images is very silly.

The option already depends on DEBUG_KERNEL and is disabled by default.
Even when it runs, the test threads are reniced. If it annoys people
we could add a runtime sysctl.
---
 lib/Kconfig.debug | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index d2099f41aa1e..f51ba2fa2662 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -531,13 +531,14 @@ config BOOT_PRINTK_DELAY
 config RCU_TORTURE_TEST
 	tristate "torture tests for RCU"
 	depends on DEBUG_KERNEL
-	depends on m
 	default n
 	help
 	  This option provides a kernel module that runs torture tests
 	  on the RCU infrastructure.  The kernel module may be built
 	  after the fact on the running kernel to be tested, if desired.
 
+	  Say Y here if you want RCU torture tests to start automatically
+	  at boot time (you probably don't).
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
-- 
cgit v1.2.3-58-ga151


From 31a72bce0bd6f3e0114009288bccbc96376eeeca Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 18 Jun 2008 09:26:49 -0700
Subject: rcu: make rcutorture more vicious: reinstate boot-time testing

This patch re-institutes the ability to build rcutorture directly into
the Linux kernel.  The reason that this capability was removed was that
this could result in your kernel being pretty much useless, as rcutorture
would be running starting from early boot.  This problem has been avoided
by (1) making rcutorture run only three seconds of every six by default,
(2) adding a CONFIG_RCU_TORTURE_TEST_RUNNABLE that permits rcutorture
to be quiesced at boot time, and (3) adding a sysctl in /proc named
/proc/sys/kernel/rcutorture_runnable that permits rcutorture to be
quiesced and unquiesced when built into the kernel.

Please note that this /proc file is -not- available when rcutorture
is built as a module.  Please also note that to get the earlier
take-no-prisoners behavior, you must use the boot command line to set
rcutorture's "stutter" parameter to zero.

The rcutorture quiescing mechanism is currently quite crude: loops
in each rcutorture process that poll a global variable once per tick.
Suggestions for improvement are welcome.  The default action will
be to reduce the polling rate to a few times per second.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/RCU/torture.txt | 21 ++++++++++++++-------
 kernel/rcutorture.c           |  9 ++++++++-
 kernel/sysctl.c               | 13 +++++++++++++
 lib/Kconfig.debug             | 21 +++++++++++++++++++--
 4 files changed, 54 insertions(+), 10 deletions(-)

(limited to 'lib')

diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt
index 02b3d14c0209..516527d4bc55 100644
--- a/Documentation/RCU/torture.txt
+++ b/Documentation/RCU/torture.txt
@@ -10,13 +10,20 @@ status messages via printk(), which can be examined via the dmesg
 command (perhaps grepping for "torture").  The test is started
 when the module is loaded, and stops when the module is unloaded.
 
-However, actually setting this config option to "y" results in the system
-running the test immediately upon boot, and ending only when the system
-is taken down.  Normally, one will instead want to build the system
-with CONFIG_RCU_TORTURE_TEST=m and to use modprobe and rmmod to control
-the test, perhaps using a script similar to the one shown at the end of
-this document.  Note that you will need CONFIG_MODULE_UNLOAD in order
-to be able to end the test.
+CONFIG_RCU_TORTURE_TEST_RUNNABLE
+
+It is also possible to specify CONFIG_RCU_TORTURE_TEST=y, which will
+result in the tests being loaded into the base kernel.  In this case,
+the CONFIG_RCU_TORTURE_TEST_RUNNABLE config option is used to specify
+whether the RCU torture tests are to be started immediately during
+boot or whether the /proc/sys/kernel/rcutorture_runnable file is used
+to enable them.  This /proc file can be used to repeatedly pause and
+restart the tests, regardless of the initial state specified by the
+CONFIG_RCU_TORTURE_TEST_RUNNABLE config option.
+
+You will normally -not- want to start the RCU torture tests during boot
+(and thus the default is CONFIG_RCU_TORTURE_TEST_RUNNABLE=n), but doing
+this can sometimes be useful in finding boot-time bugs.
 
 
 MODULE PARAMETERS
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98ae7d168225..27003e2421c7 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -125,6 +125,13 @@ static struct list_head rcu_torture_removed;
 
 static int stutter_pause_test = 0;
 
+#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
+#define RCUTORTURE_RUNNABLE_INIT 1
+#else
+#define RCUTORTURE_RUNNABLE_INIT 0
+#endif
+int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
+
 /*
  * Allocate an element from the rcu_tortures pool.
  */
@@ -188,7 +195,7 @@ rcu_random(struct rcu_random_state *rrsp)
 static void
 rcu_stutter_wait(void)
 {
-	while (stutter_pause_test)
+	while (stutter_pause_test || !rcutorture_runnable)
 		schedule_timeout_interruptible(1);
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..c6887cf135c8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -82,6 +82,9 @@ extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
+#ifdef CONFIG_RCU_TORTURE_TEST
+extern int rcutorture_runnable;
+#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
 /* Constants used for minimum and  maximum */
 #if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
@@ -813,6 +816,16 @@ static struct ctl_table kern_table[] = {
 		.child		= key_sysctls,
 	},
 #endif
+#ifdef CONFIG_RCU_TORTURE_TEST
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "rcutorture_runnable",
+		.data           = &rcutorture_runnable,
+		.maxlen         = sizeof(int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f51ba2fa2662..c35a86a516a0 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -537,11 +537,28 @@ config RCU_TORTURE_TEST
 	  on the RCU infrastructure.  The kernel module may be built
 	  after the fact on the running kernel to be tested, if desired.
 
-	  Say Y here if you want RCU torture tests to start automatically
-	  at boot time (you probably don't).
+	  Say Y here if you want RCU torture tests to be built into
+	  the kernel.
 	  Say M if you want the RCU torture tests to build as a module.
 	  Say N if you are unsure.
 
+config RCU_TORTURE_TEST_RUNNABLE
+	bool "torture tests for RCU runnable by default"
+	depends on RCU_TORTURE_TEST = y
+	default n
+	help
+	  This option provides a way to build the RCU torture tests
+	  directly into the kernel without them starting up at boot
+	  time.  You can use /proc/sys/kernel/rcutorture_runnable
+	  to manually override this setting.  This /proc file is
+	  available only when the RCU torture tests have been built
+	  into the kernel.
+
+	  Say Y here if you want the RCU torture tests to start during
+	  boot (you probably don't).
+	  Say N here if you want the RCU torture tests to start only
+	  after being manually enabled via /proc.
+
 config KPROBES_SANITY_TEST
 	bool "Kprobes sanity tests"
 	depends on DEBUG_KERNEL
-- 
cgit v1.2.3-58-ga151


From ad118c54a3587b2c69a769d0ba37d4d8dce4559d Mon Sep 17 00:00:00 2001
From: Vegard Nossum <vegard.nossum@gmail.com>
Date: Fri, 27 Jun 2008 18:04:48 +0200
Subject: stacktrace: add saved stack traces to backtrace self-test

This patch adds saved stack-traces to the backtrace suite of self-tests.

Note that we don't depend on or unconditionally enable CONFIG_STACKTRACE
because not all architectures may have it (and we still want to enable the
other tests for those architectures).

Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/backtracetest.c | 30 +++++++++++++++++++++++++++++-
 lib/Kconfig.debug      |  3 +++
 2 files changed, 32 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index d1a7605c5b8f..50f7abd0813d 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -10,9 +10,10 @@
  * of the License.
  */
 
+#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/stacktrace.h>
 
 static struct timer_list backtrace_timer;
 
@@ -22,6 +23,31 @@ static void backtrace_test_timer(unsigned long data)
 	printk("The following trace is a kernel self test and not a bug!\n");
 	dump_stack();
 }
+
+#ifdef CONFIG_STACKTRACE
+static void backtrace_test_saved(void)
+{
+	struct stack_trace trace;
+	unsigned long entries[8];
+
+	printk("Testing a saved backtrace.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+
+	trace.nr_entries = 0;
+	trace.max_entries = ARRAY_SIZE(entries);
+	trace.entries = entries;
+	trace.skip = 0;
+
+	save_stack_trace(&trace);
+	print_stack_trace(&trace, 0);
+}
+#else
+static void backtrace_test_saved(void)
+{
+	printk("Saved backtrace test skipped.\n");
+}
+#endif
+
 static int backtrace_regression_test(void)
 {
 	printk("====[ backtrace testing ]===========\n");
@@ -29,6 +55,8 @@ static int backtrace_regression_test(void)
 	printk("The following trace is a kernel self test and not a bug!\n");
 	dump_stack();
 
+	backtrace_test_saved();
+
 	init_timer(&backtrace_timer);
 	backtrace_timer.function = backtrace_test_timer;
 	mod_timer(&backtrace_timer, jiffies + 10);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 9c17fb9d1d5e..6263e2d851f1 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -562,6 +562,9 @@ config BACKTRACE_SELF_TEST
 	  for distributions or general kernels, but only for kernel
 	  developers working on architecture code.
 
+	  Note that if you want to also test saved backtraces, you will
+	  have to enable STACKTRACE as well.
+
 	  Say N if you are unsure.
 
 config LKDTM
-- 
cgit v1.2.3-58-ga151


From 0f9bfa569d46f2346a53a940b2b9e49a38635732 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 6 Jul 2008 16:06:25 -0700
Subject: vsprintf: split out '%s' handling logic

The actual code is the same, just split out into a helper function.
This makes it easier to read, and allows for future sharing of the
string code.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/vsprintf.c | 57 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 31 insertions(+), 26 deletions(-)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 6021757a4496..926c7e00e2dc 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -482,6 +482,35 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
 	return buf;
 }
 
+static char *string(char *buf, char *end, char *s, int field_width, int precision, int flags)
+{
+	int len, i;
+
+	if ((unsigned long)s < PAGE_SIZE)
+		s = "<NULL>";
+
+	len = strnlen(s, precision);
+
+	if (!(flags & LEFT)) {
+		while (len < field_width--) {
+			if (buf < end)
+				*buf = ' ';
+			++buf;
+		}
+	}
+	for (i = 0; i < len; ++i) {
+		if (buf < end)
+			*buf = *s;
+		++buf; ++s;
+	}
+	while (len < field_width--) {
+		if (buf < end)
+			*buf = ' ';
+		++buf;
+	}
+	return buf;
+}
+
 /**
  * vsnprintf - Format a string and place it in a buffer
  * @buf: The buffer to place the result into
@@ -502,11 +531,9 @@ static char *number(char *buf, char *end, unsigned long long num, int base, int
  */
 int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 {
-	int len;
 	unsigned long long num;
-	int i, base;
+	int base;
 	char *str, *end, c;
-	const char *s;
 
 	int flags;		/* flags to number() */
 
@@ -622,29 +649,7 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 				continue;
 
 			case 's':
-				s = va_arg(args, char *);
-				if ((unsigned long)s < PAGE_SIZE)
-					s = "<NULL>";
-
-				len = strnlen(s, precision);
-
-				if (!(flags & LEFT)) {
-					while (len < field_width--) {
-						if (str < end)
-							*str = ' ';
-						++str;
-					}
-				}
-				for (i = 0; i < len; ++i) {
-					if (str < end)
-						*str = *s;
-					++str; ++s;
-				}
-				while (len < field_width--) {
-					if (str < end)
-						*str = ' ';
-					++str;
-				}
+				str = string(str, end, va_arg(args, char *), field_width, precision, flags);
 				continue;
 
 			case 'p':
-- 
cgit v1.2.3-58-ga151


From 78a8bf69b32980879975f7e31d30386c50bfe851 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 6 Jul 2008 16:16:15 -0700
Subject: vsprintf: split out '%p' handling logic

The actual code is the same, just split out into a helper function.
This makes it easier to read, and allows for simple future extension
of %p handling.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/vsprintf.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 926c7e00e2dc..f569feb7662e 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -511,6 +511,16 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio
 	return buf;
 }
 
+static char *pointer(char *buf, char *end, void *ptr, int field_width, int precision, int flags)
+{
+	flags |= SMALL;
+	if (field_width == -1) {
+		field_width = 2*sizeof(void *);
+		flags |= ZEROPAD;
+	}
+	return number(buf, end, (unsigned long) ptr, 16, field_width, precision, flags);
+}
+
 /**
  * vsnprintf - Format a string and place it in a buffer
  * @buf: The buffer to place the result into
@@ -653,17 +663,9 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 				continue;
 
 			case 'p':
-				flags |= SMALL;
-				if (field_width == -1) {
-					field_width = 2*sizeof(void *);
-					flags |= ZEROPAD;
-				}
-				str = number(str, end,
-						(unsigned long) va_arg(args, void *),
-						16, field_width, precision, flags);
+				str = pointer(str, end, va_arg(args, void *), field_width, precision, flags);
 				continue;
 
-
 			case 'n':
 				/* FIXME:
 				* What does C99 say about the overflow case here? */
-- 
cgit v1.2.3-58-ga151


From 4d8a743cdd2690c0bc8d1b8cbd02cffb1ead849f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 6 Jul 2008 16:24:57 -0700
Subject: vsprintf: add infrastructure support for extended '%p' specifiers

This expands the kernel '%p' handling with an arbitrary alphanumberic
specifier extension string immediately following the '%p'.  Right now
it's just being ignored, but the next commit will start adding some
specific pointer type extensions.

NOTE! The reason the extension is appended to the '%p' is to allow
minimal gcc type checking: gcc will still see the '%p' and will check
that the argument passed in is indeed a pointer, and yet will not
complain about the extended information that gcc doesn't understand
about (on the other hand, it also won't actually check that the pointer
type and the extension are compatible).

Alphanumeric characters were chosen because there is no sane existing
use for a string format with a hex pointer representation immediately
followed by alphanumerics (which is what such a format string would have
traditionally resulted in).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/vsprintf.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index f569feb7662e..5d6f0718b6d9 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -511,7 +511,14 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio
 	return buf;
 }
 
-static char *pointer(char *buf, char *end, void *ptr, int field_width, int precision, int flags)
+/*
+ * Show a '%p' thing.  A kernel extension is that the '%p' is followed
+ * by an extra set of alphanumeric characters that are extended format
+ * specifiers.
+ *
+ * Right now don't actually handle any such, but we will..
+ */
+static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field_width, int precision, int flags)
 {
 	flags |= SMALL;
 	if (field_width == -1) {
@@ -663,7 +670,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 				continue;
 
 			case 'p':
-				str = pointer(str, end, va_arg(args, void *), field_width, precision, flags);
+				str = pointer(fmt+1, str, end,
+						va_arg(args, void *),
+						field_width, precision, flags);
+				/* Skip all alphanumeric pointer suffixes */
+				while (isalnum(fmt[1]))
+					fmt++;
 				continue;
 
 			case 'n':
-- 
cgit v1.2.3-58-ga151


From 0fe1ef24f7bd0020f29ffe287dfdb9ead33ca0b2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 6 Jul 2008 16:43:12 -0700
Subject: vsprintf: add support for '%pS' and '%pF' pointer formats

They print out a pointer in symbolic format, if possible (ie using
symbolic KALLSYMS information).  The '%pS' format is for regular direct
pointers (which can point to data or code and that you find on the stack
during backtraces etc), while '%pF' is for C function pointer types.

On most architectures, the two mean exactly the same thing, but some
architectures use an indirect pointer for C function pointers, where the
function pointer points to a function descriptor (which in turn contains
the actual pointer to the code).  The '%pF' code automatically does the
appropriate function descriptor dereference on such architectures.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 lib/vsprintf.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

(limited to 'lib')

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 5d6f0718b6d9..1dc2d1d18fa8 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -22,6 +22,8 @@
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/kernel.h>
+#include <linux/kallsyms.h>
+#include <linux/uaccess.h>
 
 #include <asm/page.h>		/* for PAGE_SIZE */
 #include <asm/div64.h>
@@ -511,15 +513,52 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio
 	return buf;
 }
 
+static inline void *dereference_function_descriptor(void *ptr)
+{
+#if defined(CONFIG_IA64) || defined(CONFIG_PPC64)
+	void *p;
+	if (!probe_kernel_address(ptr, p))
+		ptr = p;
+#endif
+	return ptr;
+}
+
+static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int precision, int flags)
+{
+	unsigned long value = (unsigned long) ptr;
+#ifdef CONFIG_KALLSYMS
+	char sym[KSYM_SYMBOL_LEN];
+	sprint_symbol(sym, value);
+	return string(buf, end, sym, field_width, precision, flags);
+#else
+	field_width = 2*sizeof(void *);
+	flags |= SPECIAL | SMALL | ZEROPAD;
+	return number(buf, end, value, 16, field_width, precision, flags);
+#endif
+}
+
 /*
  * Show a '%p' thing.  A kernel extension is that the '%p' is followed
  * by an extra set of alphanumeric characters that are extended format
  * specifiers.
  *
- * Right now don't actually handle any such, but we will..
+ * Right now we just handle 'F' (for symbolic Function descriptor pointers)
+ * and 'S' (for Symbolic direct pointers), but this can easily be
+ * extended in the future (network address types etc).
+ *
+ * The difference between 'S' and 'F' is that on ia64 and ppc64 function
+ * pointers are really function descriptors, which contain a pointer the
+ * real address. 
  */
 static char *pointer(const char *fmt, char *buf, char *end, void *ptr, int field_width, int precision, int flags)
 {
+	switch (*fmt) {
+	case 'F':
+		ptr = dereference_function_descriptor(ptr);
+		/* Fallthrough */
+	case 'S':
+		return symbol_string(buf, end, ptr, field_width, precision, flags);
+	}
 	flags |= SMALL;
 	if (field_width == -1) {
 		field_width = 2*sizeof(void *);
-- 
cgit v1.2.3-58-ga151


From e8ced39d5e8911c662d4d69a342b9d053eaaac4e Mon Sep 17 00:00:00 2001
From: Mingming Cao <cmm@us.ibm.com>
Date: Fri, 11 Jul 2008 19:27:31 -0400
Subject: percpu_counter: new function percpu_counter_sum_and_set

Delayed allocation need to check free blocks at every write time.
percpu_counter_read_positive() is not quit accurate. delayed
allocation need a more accurate accounting, but using
percpu_counter_sum_positive() is frequently is quite expensive.

This patch added a new function to update center counter when sum
per-cpu counter, to increase the accurate rate for next
percpu_counter_read() and require less calling expensive
percpu_counter_sum().

Signed-off-by: Mingming Cao <cmm@us.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c               |  2 +-
 include/linux/percpu_counter.h | 12 +++++++++---
 lib/percpu_counter.c           |  7 ++++++-
 3 files changed, 16 insertions(+), 5 deletions(-)

(limited to 'lib')

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 25f63d8c1b3d..6369bacf0dcb 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -1621,7 +1621,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
 #ifdef CONFIG_SMP
 	if (free_blocks - root_blocks < FBC_BATCH)
 		free_blocks =
-			percpu_counter_sum_positive(&sbi->s_freeblocks_counter);
+			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
 #endif
 	if (free_blocks - root_blocks < nblocks)
 		return free_blocks - root_blocks;
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 9007ccdfc112..208388835357 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount);
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
 void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
-s64 __percpu_counter_sum(struct percpu_counter *fbc);
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set);
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
 {
-	s64 ret = __percpu_counter_sum(fbc);
+	s64 ret = __percpu_counter_sum(fbc, 0);
 	return ret < 0 ? 0 : ret;
 }
 
+static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc)
+{
+	return __percpu_counter_sum(fbc, 1);
+}
+
+
 static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
 {
-	return __percpu_counter_sum(fbc);
+	return __percpu_counter_sum(fbc, 0);
 }
 
 static inline s64 percpu_counter_read(struct percpu_counter *fbc)
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 119174494cb5..4a8ba4bf5f6f 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add);
  * Add up all the per-cpu counts, return the result.  This is a more accurate
  * but much slower version of percpu_counter_read_positive()
  */
-s64 __percpu_counter_sum(struct percpu_counter *fbc)
+s64 __percpu_counter_sum(struct percpu_counter *fbc, int set)
 {
 	s64 ret;
 	int cpu;
@@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
 	for_each_online_cpu(cpu) {
 		s32 *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
+		if (set)
+			*pcount = 0;
 	}
+	if (set)
+		fbc->count = ret;
+
 	spin_unlock(&fbc->lock);
 	return ret;
 }
-- 
cgit v1.2.3-58-ga151