From 2244d07bfa2097cb00600da91c715a8aa547917e Mon Sep 17 00:00:00 2001
From: Oliver Hartkopp <socketcan@hartkopp.net>
Date: Tue, 17 Aug 2010 08:59:14 +0000
Subject: net: simplify flags for tx timestamping

This patch removes the abstraction introduced by the union skb_shared_tx in
the shared skb data.

The access of the different union elements at several places led to some
confusion about accessing the shared tx_flags e.g. in skb_orphan_try().

    http://marc.info/?l=linux-netdev&m=128084897415886&w=2

Signed-off-by: Oliver Hartkopp <socketcan@hartkopp.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/icmp.c      | 4 ++--
 net/ipv4/ip_output.c | 6 +++---
 net/ipv4/raw.c       | 2 +-
 net/ipv4/udp.c       | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index a0d847c7cba5..96bc7f9475a3 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
 	inet->tos = ip_hdr(skb)->tos;
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 	if (icmp_param->replyopts.optlen) {
 		ipc.opt = &icmp_param->replyopts;
 		if (ipc.opt->srr)
@@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info)
 	inet_sk(sk)->tos = tos;
 	ipc.addr = iph->saddr;
 	ipc.opt = &icmp_param.replyopts;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 
 	{
 		struct flowi fl = {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 04b69896df5f..e807492f1777 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -953,7 +953,7 @@ alloc_new_skb:
 				else
 					/* only the initial fragment is
 					   time stamped */
-					ipc->shtx.flags = 0;
+					ipc->tx_flags = 0;
 			}
 			if (skb == NULL)
 				goto error;
@@ -964,7 +964,7 @@ alloc_new_skb:
 			skb->ip_summed = csummode;
 			skb->csum = 0;
 			skb_reserve(skb, hh_len);
-			*skb_tx(skb) = ipc->shtx;
+			skb_shinfo(skb)->tx_flags = ipc->tx_flags;
 
 			/*
 			 *	Find where to start putting bytes.
@@ -1384,7 +1384,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar
 
 	daddr = ipc.addr = rt->rt_src;
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 
 	if (replyopts.opt.optlen) {
 		ipc.opt = &replyopts.opt;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 009a7b2aa1ef..1f85ef289895 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 	ipc.addr = inet->inet_saddr;
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 	ipc.oif = sk->sk_bound_dev_if;
 
 	if (msg->msg_controllen) {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 32e0bef60d0a..86e757e162ee 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		return -EOPNOTSUPP;
 
 	ipc.opt = NULL;
-	ipc.shtx.flags = 0;
+	ipc.tx_flags = 0;
 
 	if (up->pending) {
 		/*
@@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	ipc.addr = inet->inet_saddr;
 
 	ipc.oif = sk->sk_bound_dev_if;
-	err = sock_tx_timestamp(msg, sk, &ipc.shtx);
+	err = sock_tx_timestamp(sk, &ipc.tx_flags);
 	if (err)
 		return err;
 	if (msg->msg_controllen) {
-- 
cgit v1.2.3-58-ga151


From 3d04ebb6ab2ac9a3bea7644f0d13cdf65002b870 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 17 Aug 2010 20:34:40 +0000
Subject: netfilter: ipt_CLUSTERIP: use proto_ports_offset() to support AH
 message

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/netfilter/ipt_CLUSTERIP.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c
index 3a43cf36db87..1e26a4897655 100644
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -29,6 +29,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/net_namespace.h>
 #include <net/checksum.h>
+#include <net/ip.h>
 
 #define CLUSTERIP_VERSION "0.8"
 
@@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb,
 {
 	const struct iphdr *iph = ip_hdr(skb);
 	unsigned long hashval;
-	u_int16_t sport, dport;
-	const u_int16_t *ports;
-
-	switch (iph->protocol) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_UDPLITE:
-	case IPPROTO_SCTP:
-	case IPPROTO_DCCP:
-	case IPPROTO_ICMP:
-		ports = (const void *)iph+iph->ihl*4;
-		sport = ports[0];
-		dport = ports[1];
-		break;
-	default:
+	u_int16_t sport = 0, dport = 0;
+	int poff;
+
+	poff = proto_ports_offset(iph->protocol);
+	if (poff >= 0) {
+		const u_int16_t *ports;
+		u16 _ports[2];
+
+		ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports);
+		if (ports) {
+			sport = ports[0];
+			dport = ports[1];
+		}
+	} else {
 		if (net_ratelimit())
 			pr_info("unknown protocol %u\n", iph->protocol);
-		sport = dport = 0;
 	}
 
 	switch (config->hash_mode) {
-- 
cgit v1.2.3-58-ga151


From 49e8ab03ebcacd8e37660ffec20c0c46721a2800 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 19 Aug 2010 06:10:45 +0000
Subject: net: build_ehash_secret() and rt_bind_peer() cleanups

Now cmpxchg() is available on all arches, we can use it in
build_ehash_secret() and rt_bind_peer() instead of using spinlocks.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/af_inet.c | 8 +++-----
 net/ipv4/route.c   | 9 +--------
 2 files changed, 4 insertions(+), 13 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6a1100c25a9f..f581f77d1097 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret);
 
 /*
  * inet_ehash_secret must be set exactly once
- * Instead of using a dedicated spinlock, we (ab)use inetsw_lock
  */
 void build_ehash_secret(void)
 {
 	u32 rnd;
+
 	do {
 		get_random_bytes(&rnd, sizeof(rnd));
 	} while (rnd == 0);
-	spin_lock_bh(&inetsw_lock);
-	if (!inet_ehash_secret)
-		inet_ehash_secret = rnd;
-	spin_unlock_bh(&inetsw_lock);
+
+	cmpxchg(&inet_ehash_secret, 0, rnd);
 }
 EXPORT_SYMBOL(build_ehash_secret);
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3f56b6e6c6aa..85a67c9d5982 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1268,18 +1268,11 @@ skip_hashing:
 
 void rt_bind_peer(struct rtable *rt, int create)
 {
-	static DEFINE_SPINLOCK(rt_peer_lock);
 	struct inet_peer *peer;
 
 	peer = inet_getpeer(rt->rt_dst, create);
 
-	spin_lock_bh(&rt_peer_lock);
-	if (rt->peer == NULL) {
-		rt->peer = peer;
-		peer = NULL;
-	}
-	spin_unlock_bh(&rt_peer_lock);
-	if (peer)
+	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
 		inet_putpeer(peer);
 }
 
-- 
cgit v1.2.3-58-ga151


From 00959ade36acadc00e757f87060bf6e4501d545f Mon Sep 17 00:00:00 2001
From: Dmitry Kozlov <xeb@mail.ru>
Date: Sat, 21 Aug 2010 23:05:39 -0700
Subject: PPTP: PPP over IPv4 (Point-to-Point Tunneling Protocol)

PPP: introduce "pptp" module which implements point-to-point tunneling protocol using pppox framework
NET: introduce the "gre" module for demultiplexing GRE packets on version criteria
     (required so that pptp and ip_gre may coexist)
NET: ip_gre: update to use the "gre" module

This patch introduces pptp support to the Linux kernel, which
dramatically speeds up pptp vpn connections and decreases cpu usage
compared to the existing user-space implementations
(poptop/pptpclient). The accel-pptp project
(https://sourceforge.net/projects/accel-pptp/) makes use of this module;
it contains a plugin for pppd to use pptp in client mode and a modified
pptpd (poptop) to build a high-performance pptp NAS.

There were many changes from the initially submitted patch; the most important are:
1. using rcu instead of read-write locks
2. using static bitmap instead of dynamically allocated
3. using vmalloc for memory allocation instead of BITS_PER_LONG + __get_free_pages
4. fixed many coding style issues
Thanks to Eric Dumazet.

Signed-off-by: Dmitry Kozlov <xeb@mail.ru>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 MAINTAINERS              |  14 +
 drivers/net/Kconfig      |  11 +
 drivers/net/Makefile     |   1 +
 drivers/net/pptp.c       | 726 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/if_pppox.h |  52 ++--
 include/net/gre.h        |  18 ++
 net/ipv4/Kconfig         |   7 +
 net/ipv4/Makefile        |   1 +
 net/ipv4/gre.c           | 151 ++++++++++
 net/ipv4/ip_gre.c        |  14 +-
 10 files changed, 971 insertions(+), 24 deletions(-)
 create mode 100644 drivers/net/pptp.c
 create mode 100644 include/net/gre.h
 create mode 100644 net/ipv4/gre.c

(limited to 'net/ipv4')

diff --git a/MAINTAINERS b/MAINTAINERS
index b5b8baa1d70e..43c9efcd8e10 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6528,6 +6528,20 @@ M:	"Maciej W. Rozycki" <macro@linux-mips.org>
 S:	Maintained
 F:	drivers/serial/zs.*
 
+GRE DEMULTIPLEXER DRIVER
+M:	Dmitry Kozlov <xeb@mail.ru>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	net/ipv4/gre.c
+F:	include/net/gre.h
+
+PPTP DRIVER
+M:	Dmitry Kozlov <xeb@mail.ru>
+L:	netdev@vger.kernel.org
+S:	Maintained
+F:	drivers/net/pptp.c
+W:	http://sourceforge.net/projects/accel-pptp
+
 THE REST
 M:	Linus Torvalds <torvalds@linux-foundation.org>
 L:	linux-kernel@vger.kernel.org
diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index 5a6895320b48..9b2a72089a68 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -3192,6 +3192,17 @@ config PPPOE
 	  which contains instruction on how to use this driver (under 
 	  the heading "Kernel mode PPPoE").
 
+config PPTP
+	tristate "PPP over IPv4 (PPTP) (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && PPP && NET_IPGRE_DEMUX
+	help
+	  Support for PPP over IPv4 (Point-to-Point Tunneling Protocol).
+
+	  This driver requires a pppd plugin to work in client mode or a
+	  modified pptpd (poptop) to work in server mode.
+	  See http://accel-pptp.sourceforge.net/ for information on how to
+	  use this module.
+
 config PPPOATM
 	tristate "PPP over ATM"
 	depends on ATM && PPP
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 56e8c27f77ce..0b371083d481 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -162,6 +162,7 @@ obj-$(CONFIG_PPP_BSDCOMP) += bsd_comp.o
 obj-$(CONFIG_PPP_MPPE) += ppp_mppe.o
 obj-$(CONFIG_PPPOE) += pppox.o pppoe.o
 obj-$(CONFIG_PPPOL2TP) += pppox.o
+obj-$(CONFIG_PPTP) += pppox.o pptp.o
 
 obj-$(CONFIG_SLIP) += slip.o
 obj-$(CONFIG_SLHC) += slhc.o
diff --git a/drivers/net/pptp.c b/drivers/net/pptp.c
new file mode 100644
index 000000000000..761f0eced724
--- /dev/null
+++ b/drivers/net/pptp.c
@@ -0,0 +1,726 @@
+/*
+ *  Point-to-Point Tunneling Protocol for Linux
+ *
+ *	Authors: Dmitry Kozlov <xeb@mail.ru>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/ppp_channel.h>
+#include <linux/ppp_defs.h>
+#include <linux/if_pppox.h>
+#include <linux/if_ppp.h>
+#include <linux/notifier.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/version.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+
+#include <net/sock.h>
+#include <net/protocol.h>
+#include <net/ip.h>
+#include <net/icmp.h>
+#include <net/route.h>
+#include <net/gre.h>
+
+#include <linux/uaccess.h>
+
+#define PPTP_DRIVER_VERSION "0.8.5"
+
+#define MAX_CALLID 65535
+
+static DECLARE_BITMAP(callid_bitmap, MAX_CALLID + 1);
+static struct pppox_sock **callid_sock;
+
+static DEFINE_SPINLOCK(chan_lock);
+
+static struct proto pptp_sk_proto __read_mostly;
+static struct ppp_channel_ops pptp_chan_ops;
+static const struct proto_ops pptp_ops;
+
+#define PPP_LCP_ECHOREQ 0x09
+#define PPP_LCP_ECHOREP 0x0A
+#define SC_RCV_BITS	(SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP)
+
+#define MISSING_WINDOW 20
+#define WRAPPED(curseq, lastseq)\
+	((((curseq) & 0xffffff00) == 0) &&\
+	(((lastseq) & 0xffffff00) == 0xffffff00))
+
+#define PPTP_GRE_PROTO  0x880B
+#define PPTP_GRE_VER    0x1
+
+#define PPTP_GRE_FLAG_C	0x80
+#define PPTP_GRE_FLAG_R	0x40
+#define PPTP_GRE_FLAG_K	0x20
+#define PPTP_GRE_FLAG_S	0x10
+#define PPTP_GRE_FLAG_A	0x80
+
+#define PPTP_GRE_IS_C(f) ((f)&PPTP_GRE_FLAG_C)
+#define PPTP_GRE_IS_R(f) ((f)&PPTP_GRE_FLAG_R)
+#define PPTP_GRE_IS_K(f) ((f)&PPTP_GRE_FLAG_K)
+#define PPTP_GRE_IS_S(f) ((f)&PPTP_GRE_FLAG_S)
+#define PPTP_GRE_IS_A(f) ((f)&PPTP_GRE_FLAG_A)
+
+#define PPTP_HEADER_OVERHEAD (2+sizeof(struct pptp_gre_header))
+/*
+ * GRE header as used by PPTP.  The multi-byte fields are filled with
+ * htons()/htonl() on transmit and read with ntohs()/ntohl() on receive.
+ */
+struct pptp_gre_header {
+	u8  flags;	/* C/R/K/S bits, see PPTP_GRE_FLAG_* */
+	u8  ver;	/* PPTP_GRE_VER, or'ed with PPTP_GRE_FLAG_A when an ack is carried */
+	u16 protocol;	/* PPTP_GRE_PROTO */
+	u16 payload_len;	/* length of the data following this header */
+	u16 call_id;	/* call id the packet is addressed to */
+	u32 seq;	/* sequence number; carries the ack instead when S is clear */
+	u32 ack;	/* left out of the header when no ack is sent */
+} __packed;
+
+/*
+ * Find the socket registered for local @call_id whose peer address is
+ * @s_addr.
+ *
+ * The table slot is read under rcu_read_lock().  On a match a reference is
+ * taken with sock_hold() before the read-side section ends, so the caller
+ * owns that reference.
+ *
+ * Returns the matching pppox_sock, or NULL when the slot is empty or the
+ * peer address differs.
+ */
+static struct pppox_sock *lookup_chan(u16 call_id, __be32 s_addr)
+{
+	struct pppox_sock *sock;
+	struct pptp_opt *opt;
+
+	rcu_read_lock();
+	sock = rcu_dereference(callid_sock[call_id]);
+	if (sock) {
+		opt = &sock->proto.pptp;
+		/* the call id alone is not enough: the sender must be our peer */
+		if (opt->dst_addr.sin_addr.s_addr != s_addr)
+			sock = NULL;
+		else
+			sock_hold(sk_pppox(sock));
+	}
+	rcu_read_unlock();
+
+	return sock;
+}
+
+/*
+ * Report whether any registered socket already has (@call_id, @d_addr) as
+ * its destination.
+ *
+ * Walks the set bits of callid_bitmap starting at bit 1 and compares the
+ * dst_addr of each socket found.  No reference is taken on the sockets.
+ *
+ * Returns non-zero when a match was found, 0 otherwise.
+ *
+ * NOTE(review): the scan is bounded by MAX_CALLID while the bitmap holds
+ * MAX_CALLID + 1 bits, so bit MAX_CALLID itself is never examined here -
+ * confirm that this is intended.
+ */
+static int lookup_chan_dst(u16 call_id, __be32 d_addr)
+{
+	struct pppox_sock *sock;
+	struct pptp_opt *opt;
+	int i;
+
+	rcu_read_lock();
+	for (i = find_next_bit(callid_bitmap, MAX_CALLID, 1); i < MAX_CALLID;
+	     i = find_next_bit(callid_bitmap, MAX_CALLID, i + 1)) {
+		sock = rcu_dereference(callid_sock[i]);
+		/* a bit may be set while its table slot is still/already NULL */
+		if (!sock)
+			continue;
+		opt = &sock->proto.pptp;
+		if (opt->dst_addr.call_id == call_id &&
+			  opt->dst_addr.sin_addr.s_addr == d_addr)
+			break;
+	}
+	rcu_read_unlock();
+
+	/* the loop only stops early (i < MAX_CALLID) on a match */
+	return i < MAX_CALLID;
+}
+
+/*
+ * Register @sock in the call id table.
+ *
+ * When src_addr.call_id is 0 a free id is picked, searching upwards from
+ * the id handed out last (kept in the function-static call_id) and
+ * restarting once from 1; otherwise the id requested by the caller must not
+ * be in use yet.  Bitmap and table are updated under chan_lock.
+ *
+ * Returns 0 on success, -1 when no id is free or the requested one is taken.
+ */
+static int add_chan(struct pppox_sock *sock)
+{
+	static int call_id;
+
+	spin_lock(&chan_lock);
+	if (!sock->proto.pptp.src_addr.call_id)	{
+		call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, call_id + 1);
+		if (call_id == MAX_CALLID) {
+			/* nothing free above the last id: retry from the bottom */
+			call_id = find_next_zero_bit(callid_bitmap, MAX_CALLID, 1);
+			if (call_id == MAX_CALLID)
+				goto out_err;
+		}
+		sock->proto.pptp.src_addr.call_id = call_id;
+	} else if (test_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap))
+		goto out_err;
+
+	set_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap);
+	/* publish the socket to the RCU readers in lookup_chan*() */
+	rcu_assign_pointer(callid_sock[sock->proto.pptp.src_addr.call_id], sock);
+	spin_unlock(&chan_lock);
+
+	return 0;
+
+out_err:
+	spin_unlock(&chan_lock);
+	return -1;
+}
+
+/*
+ * Remove @sock from the call id table: clear its bit and its table slot
+ * under chan_lock, then call synchronize_rcu() before returning so that
+ * readers which started before the slot was cleared have finished.
+ */
+static void del_chan(struct pppox_sock *sock)
+{
+	spin_lock(&chan_lock);
+	clear_bit(sock->proto.pptp.src_addr.call_id, callid_bitmap);
+	rcu_assign_pointer(callid_sock[sock->proto.pptp.src_addr.call_id], NULL);
+	spin_unlock(&chan_lock);
+	synchronize_rcu();
+}
+
+/*
+ * ppp_channel_ops.start_xmit: wrap a PPP frame in a PPTP GRE header and an
+ * IPv4 header and send it to the peer stored in the socket's pptp options.
+ *
+ * Always returns 1, i.e. the frame is reported as taken.  Because of that
+ * every path has to dispose of the skb itself: it is either passed on to
+ * ip_local_out() or freed at tx_error.
+ */
+static int pptp_xmit(struct ppp_channel *chan, struct sk_buff *skb)
+{
+	struct sock *sk = (struct sock *) chan->private;
+	struct pppox_sock *po = pppox_sk(sk);
+	struct pptp_opt *opt = &po->proto.pptp;
+	struct pptp_gre_header *hdr;
+	unsigned int header_len = sizeof(*hdr);
+	int err = 0;
+	int islcp;
+	int len;
+	unsigned char *data;
+	__u32 seq_recv;
+
+	struct rtable *rt;
+	struct net_device *tdev;
+	struct iphdr  *iph;
+	int    max_headroom;
+
+	if (sk_pppox(po)->sk_state & PPPOX_DEAD)
+		goto tx_error;
+
+	{
+		struct flowi fl = { .oif = 0,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = opt->dst_addr.sin_addr.s_addr,
+					.saddr = opt->src_addr.sin_addr.s_addr,
+					.tos = RT_TOS(0) } },
+			.proto = IPPROTO_GRE };
+		err = ip_route_output_key(&init_net, &rt, &fl);
+		if (err)
+			goto tx_error;
+	}
+	tdev = rt->dst.dev;
+
+	/* room for link layer + IP + GRE headers + PPP address/control bytes */
+	max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(*iph) + sizeof(*hdr) + 2;
+
+	if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) {
+		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+		if (!new_skb) {
+			ip_rt_put(rt);
+			goto tx_error;
+		}
+		if (skb->sk)
+			skb_set_owner_w(new_skb, skb->sk);
+		kfree_skb(skb);
+		skb = new_skb;
+	}
+
+	data = skb->data;
+	islcp = ((data[0] << 8) + data[1]) == PPP_LCP && 1 <= data[2] && data[2] <= 7;
+
+	/* compress protocol field */
+	if ((opt->ppp_flags & SC_COMP_PROT) && data[0] == 0 && !islcp)
+		skb_pull(skb, 1);
+
+	/* Put in the address/control bytes if necessary */
+	if ((opt->ppp_flags & SC_COMP_AC) == 0 || islcp) {
+		data = skb_push(skb, 2);
+		data[0] = PPP_ALLSTATIONS;
+		data[1] = PPP_UI;
+	}
+
+	len = skb->len;
+
+	seq_recv = opt->seq_recv;
+
+	/* nothing new to acknowledge: leave the ack word out of the header */
+	if (opt->ack_sent == seq_recv)
+		header_len -= sizeof(hdr->ack);
+
+	/* Push down and install GRE header */
+	skb_push(skb, header_len);
+	hdr = (struct pptp_gre_header *)(skb->data);
+
+	hdr->flags       = PPTP_GRE_FLAG_K;
+	hdr->ver         = PPTP_GRE_VER;
+	hdr->protocol    = htons(PPTP_GRE_PROTO);
+	hdr->call_id     = htons(opt->dst_addr.call_id);
+
+	hdr->flags      |= PPTP_GRE_FLAG_S;
+	hdr->seq         = htonl(++opt->seq_sent);
+	if (opt->ack_sent != seq_recv)	{
+		/* send ack with this message */
+		hdr->ver |= PPTP_GRE_FLAG_A;
+		hdr->ack  = htonl(seq_recv);
+		opt->ack_sent = seq_recv;
+	}
+	hdr->payload_len = htons(len);
+
+	/*	Push down and install the IP header. */
+
+	skb_reset_transport_header(skb);
+	skb_push(skb, sizeof(*iph));
+	skb_reset_network_header(skb);
+	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED);
+
+	iph =	ip_hdr(skb);
+	iph->version =	4;
+	iph->ihl =	sizeof(struct iphdr) >> 2;
+	if (ip_dont_fragment(sk, &rt->dst))
+		iph->frag_off	=	htons(IP_DF);
+	else
+		iph->frag_off	=	0;
+	iph->protocol = IPPROTO_GRE;
+	iph->tos      = 0;
+	iph->daddr    = rt->rt_dst;
+	iph->saddr    = rt->rt_src;
+	iph->ttl      = dst_metric(&rt->dst, RTAX_HOPLIMIT);
+	iph->tot_len  = htons(skb->len);
+
+	/* the route reference obtained above now belongs to the skb */
+	skb_dst_drop(skb);
+	skb_dst_set(skb, &rt->dst);
+
+	nf_reset(skb);
+
+	skb->ip_summed = CHECKSUM_NONE;
+	ip_select_ident(iph, &rt->dst, NULL);
+	ip_send_check(iph);
+
+	ip_local_out(skb);
+	return 1;
+
+tx_error:
+	/* nobody took the frame: free it here instead of leaking it */
+	kfree_skb(skb);
+	return 1;
+}
+
+/*
+ * Per-socket receive handler (installed as sk->sk_backlog_rcv).
+ *
+ * While the socket is not connected the raw skb is queued on the socket.
+ * Otherwise the GRE header is parsed: an acknowledgement, if present,
+ * updates ack_recv; a payload, if present and acceptable by sequence
+ * number (or an LCP echo request/reply), is stripped of the GRE header and
+ * passed to ppp_input().
+ *
+ * Returns NET_RX_SUCCESS when the skb was queued or passed on, NET_RX_DROP
+ * after freeing it.
+ */
+static int pptp_rcv_core(struct sock *sk, struct sk_buff *skb)
+{
+	struct pppox_sock *po = pppox_sk(sk);
+	struct pptp_opt *opt = &po->proto.pptp;
+	int headersize, payload_len, seq;
+	__u8 *payload;
+	struct pptp_gre_header *header;
+
+	if (!(sk->sk_state & PPPOX_CONNECTED)) {
+		if (sock_queue_rcv_skb(sk, skb))
+			goto drop;
+		return NET_RX_SUCCESS;
+	}
+
+	header = (struct pptp_gre_header *)(skb->data);
+	headersize = sizeof(*header);
+
+	/* test if acknowledgement present */
+	if (PPTP_GRE_IS_A(header->ver)) {
+		__u32 ack;
+
+		if (PPTP_GRE_IS_S(header->flags)) {
+			/*
+			 * The ack word follows seq, so the complete header
+			 * must be readable before it is touched.
+			 */
+			if (!pskb_may_pull(skb, headersize))
+				goto drop;
+			/* pskb_may_pull() may have relocated the data */
+			header = (struct pptp_gre_header *)(skb->data);
+			ack = header->ack;
+		} else {
+			/* ack in different place if S = 0 */
+			ack = header->seq;
+		}
+
+		ack = ntohl(ack);
+
+		if (ack > opt->ack_recv)
+			opt->ack_recv = ack;
+		/* also handle sequence number wrap-around  */
+		if (WRAPPED(ack, opt->ack_recv))
+			opt->ack_recv = ack;
+	} else {
+		/* no ack present: the header is one word shorter */
+		headersize -= sizeof(header->ack);
+	}
+
+	/* test if payload present */
+	if (!PPTP_GRE_IS_S(header->flags))
+		goto drop;
+
+	payload_len = ntohs(header->payload_len);
+	seq         = ntohl(header->seq);
+
+	/*
+	 * Check for an incomplete packet (length smaller than expected).
+	 * pskb_may_pull() is used instead of "skb->len - headersize", which
+	 * is an unsigned subtraction and wraps for packets shorter than the
+	 * header; it also makes header and payload linear.  "header" must not
+	 * be used past this point as the data may have been relocated.
+	 */
+	if (!pskb_may_pull(skb, headersize + payload_len))
+		goto drop;
+
+	payload = skb->data + headersize;
+	/* check for expected sequence number */
+	if (seq < opt->seq_recv + 1 || WRAPPED(opt->seq_recv, seq)) {
+		/* old sequence number: only LCP echo request/reply may pass */
+		if ((payload[0] == PPP_ALLSTATIONS) && (payload[1] == PPP_UI) &&
+				(PPP_PROTOCOL(payload) == PPP_LCP) &&
+				((payload[4] == PPP_LCP_ECHOREQ) || (payload[4] == PPP_LCP_ECHOREP)))
+			goto allow_packet;
+	} else {
+		opt->seq_recv = seq;
+allow_packet:
+		skb_pull(skb, headersize);
+
+		if (payload[0] == PPP_ALLSTATIONS && payload[1] == PPP_UI) {
+			/* chop off address/control */
+			if (skb->len < 3)
+				goto drop;
+			skb_pull(skb, 2);
+		}
+
+		if ((*skb->data) & 1) {
+			/* protocol is compressed */
+			skb_push(skb, 1)[0] = 0;
+		}
+
+		skb->ip_summed = CHECKSUM_NONE;
+		skb_set_network_header(skb, skb->head-skb->data);
+		ppp_input(&po->chan, skb);
+
+		return NET_RX_SUCCESS;
+	}
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * GRE receive handler (registered through gre_pptp_protocol).
+ *
+ * Validates the fixed part of the PPTP GRE header, looks up the socket by
+ * call id and source address and hands the packet to it.  Packets that are
+ * not addressed to this host, are malformed or match no socket are freed.
+ *
+ * Returns the result of sk_receive_skb(), or NET_RX_DROP.
+ */
+static int pptp_rcv(struct sk_buff *skb)
+{
+	struct pppox_sock *po;
+	struct pptp_gre_header *header;
+	struct iphdr *iph;
+
+	if (skb->pkt_type != PACKET_HOST)
+		goto drop;
+
+	/* everything up to, but not including, the optional ack word */
+	if (!pskb_may_pull(skb, sizeof(*header) - sizeof(header->ack)))
+		goto drop;
+
+	iph = ip_hdr(skb);
+
+	header = (struct pptp_gre_header *)skb->data;
+
+	if (ntohs(header->protocol) != PPTP_GRE_PROTO || /* PPTP-GRE protocol for PPTP */
+		PPTP_GRE_IS_C(header->flags) ||                /* flag C should be clear */
+		PPTP_GRE_IS_R(header->flags) ||                /* flag R should be clear */
+		!PPTP_GRE_IS_K(header->flags) ||               /* flag K should be set */
+		(header->flags&0xF) != 0)                      /* routing and recursion ctrl = 0 */
+		/* if invalid, discard this packet */
+		goto drop;
+
+	/* call_id comes off the wire: convert network to host order */
+	po = lookup_chan(ntohs(header->call_id), iph->saddr);
+	if (po) {
+		skb_dst_drop(skb);
+		nf_reset(skb);
+		/*
+		 * NOTE(review): the reference taken by lookup_chan() is not
+		 * dropped here; presumably sk_receive_skb() releases it -
+		 * confirm.
+		 */
+		return sk_receive_skb(sk_pppox(po), skb, 0);
+	}
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+/*
+ * bind() for PPTP sockets: store the local address/call id given by the
+ * caller and register the socket in the call id table.
+ *
+ * Returns 0 on success, -EINVAL when the address is too short to be a
+ * sockaddr_pppox, -EBUSY when add_chan() could not register the socket.
+ */
+static int pptp_bind(struct socket *sock, struct sockaddr *uservaddr,
+	int sockaddr_len)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr;
+	struct pppox_sock *po = pppox_sk(sk);
+	struct pptp_opt *opt = &po->proto.pptp;
+	int error = 0;
+
+	/* sp is read below: refuse addresses too short to hold it */
+	if (sockaddr_len < (int)sizeof(struct sockaddr_pppox))
+		return -EINVAL;
+
+	lock_sock(sk);
+
+	opt->src_addr = sp->sa_addr.pptp;
+	/* the socket lock is dropped exactly once, on the common exit below */
+	if (add_chan(po))
+		error = -EBUSY;
+
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * connect() for PPTP sockets: check the requested peer, look up a route,
+ * derive the channel MTU from it, register the PPP channel and finally
+ * record the peer address and mark the socket connected.
+ *
+ * Returns 0 on success, or:
+ *   -EINVAL       address too short, wrong sa_protocol, or local/peer
+ *                 IP address not set
+ *   -EALREADY     another socket already uses this peer call id/address,
+ *                 or the socket is already dead
+ *   -EBUSY        the socket is already connected
+ *   -EHOSTUNREACH the route lookup failed
+ *   the error returned by ppp_register_channel()
+ */
+static int pptp_connect(struct socket *sock, struct sockaddr *uservaddr,
+	int sockaddr_len, int flags)
+{
+	struct sock *sk = sock->sk;
+	struct sockaddr_pppox *sp = (struct sockaddr_pppox *) uservaddr;
+	struct pppox_sock *po = pppox_sk(sk);
+	struct pptp_opt *opt = &po->proto.pptp;
+	struct rtable *rt;
+	int error = 0;
+
+	/* sp is dereferenced below: refuse addresses too short to hold it */
+	if (sockaddr_len < (int)sizeof(struct sockaddr_pppox))
+		return -EINVAL;
+
+	if (sp->sa_protocol != PX_PROTO_PPTP)
+		return -EINVAL;
+
+	if (lookup_chan_dst(sp->sa_addr.pptp.call_id, sp->sa_addr.pptp.sin_addr.s_addr))
+		return -EALREADY;
+
+	lock_sock(sk);
+	/* Check for already bound sockets */
+	if (sk->sk_state & PPPOX_CONNECTED) {
+		error = -EBUSY;
+		goto end;
+	}
+
+	/* Check for already disconnected sockets, on attempts to disconnect */
+	if (sk->sk_state & PPPOX_DEAD) {
+		error = -EALREADY;
+		goto end;
+	}
+
+	if (!opt->src_addr.sin_addr.s_addr || !sp->sa_addr.pptp.sin_addr.s_addr) {
+		error = -EINVAL;
+		goto end;
+	}
+
+	po->chan.private = sk;
+	po->chan.ops = &pptp_chan_ops;
+
+	{
+		/*
+		 * NOTE(review): opt->dst_addr is only assigned from sp at the
+		 * end of this function, so the lookup below does not use the
+		 * peer address passed in by the caller - confirm intent.
+		 */
+		struct flowi fl = {
+			.nl_u = {
+				.ip4_u = {
+					.daddr = opt->dst_addr.sin_addr.s_addr,
+					.saddr = opt->src_addr.sin_addr.s_addr,
+					.tos = RT_CONN_FLAGS(sk) } },
+			.proto = IPPROTO_GRE };
+		security_sk_classify_flow(sk, &fl);
+		if (ip_route_output_key(&init_net, &rt, &fl)) {
+			error = -EHOSTUNREACH;
+			goto end;
+		}
+		sk_setup_caps(sk, &rt->dst);
+	}
+	po->chan.mtu = dst_mtu(&rt->dst);
+	if (!po->chan.mtu)
+		po->chan.mtu = PPP_MTU;
+	/*
+	 * NOTE(review): rt was also passed to sk_setup_caps() above; verify
+	 * that dropping the reference here is balanced.
+	 */
+	ip_rt_put(rt);
+	po->chan.mtu -= PPTP_HEADER_OVERHEAD;
+
+	po->chan.hdrlen = 2 + sizeof(struct pptp_gre_header);
+	error = ppp_register_channel(&po->chan);
+	if (error) {
+		pr_err("PPTP: failed to register PPP channel (%d)\n", error);
+		goto end;
+	}
+
+	opt->dst_addr = sp->sa_addr.pptp;
+	sk->sk_state = PPPOX_CONNECTED;
+
+ end:
+	release_sock(sk);
+	return error;
+}
+
+/*
+ * getsockname()/getpeername() for PPTP sockets.
+ *
+ * Fills @uaddr with an AF_PPPOX/PX_PROTO_PPTP address carrying the socket's
+ * local (src_addr) pptp address and stores its size in @usockaddr_len.
+ * @peer is not looked at.  Always returns 0.
+ */
+static int pptp_getname(struct socket *sock, struct sockaddr *uaddr,
+	int *usockaddr_len, int peer)
+{
+	int len = sizeof(struct sockaddr_pppox);
+	struct sockaddr_pppox sp;
+
+	/*
+	 * The pptp address fills only part of the sa_addr union, yet the
+	 * whole structure is copied out below: clear it first so that no
+	 * uninitialized stack bytes are handed to the caller.
+	 */
+	memset(&sp, 0, sizeof(sp));
+
+	sp.sa_family	  = AF_PPPOX;
+	sp.sa_protocol  = PX_PROTO_PPTP;
+	sp.sa_addr.pptp = pppox_sk(sock->sk)->proto.pptp.src_addr;
+
+	memcpy(uaddr, &sp, len);
+
+	*usockaddr_len = len;
+
+	return 0;
+}
+
+/*
+ * release() for PPTP sockets: take the socket out of the call id table,
+ * unbind it from the PPP channel, mark it dead and detach it from @sock.
+ *
+ * Returns 0, or -EBADF when the sock is already flagged SOCK_DEAD.
+ */
+static int pptp_release(struct socket *sock)
+{
+	struct sock *sk = sock->sk;
+
+	if (sk == NULL)
+		return 0;
+
+	lock_sock(sk);
+
+	if (sock_flag(sk, SOCK_DEAD)) {
+		release_sock(sk);
+		return -EBADF;
+	}
+
+	del_chan(pppox_sk(sk));
+
+	pppox_unbind_sock(sk);
+	sk->sk_state = PPPOX_DEAD;
+
+	/* cut the link between the struct socket and the sock */
+	sock_orphan(sk);
+	sock->sk = NULL;
+
+	release_sock(sk);
+	sock_put(sk);
+
+	return 0;
+}
+
+/*
+ * sk_destruct callback: a sock that was not marked PPPOX_DEAD is removed
+ * from the call id table and unbound here; whatever is still sitting on
+ * the receive queue is purged in every case.
+ */
+static void pptp_sock_destruct(struct sock *sk)
+{
+	int is_dead = sk->sk_state & PPPOX_DEAD;
+
+	if (is_dead == 0) {
+		del_chan(pppox_sk(sk));
+		pppox_unbind_sock(sk);
+	}
+
+	skb_queue_purge(&sk->sk_receive_queue);
+}
+
+/*
+ * pppox_proto.create: allocate and initialise the sock behind a new PPTP
+ * socket and reset its sequence/ack counters.
+ *
+ * Returns 0 on success, -ENOMEM when sk_alloc() fails.
+ */
+static int pptp_create(struct net *net, struct socket *sock)
+{
+	struct pptp_opt *pptp;
+	struct sock *sk;
+
+	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pptp_sk_proto);
+	if (sk == NULL)
+		return -ENOMEM;
+
+	sock_init_data(sock, sk);
+
+	sock->state = SS_UNCONNECTED;
+	sock->ops   = &pptp_ops;
+
+	sk->sk_backlog_rcv = pptp_rcv_core;
+	sk->sk_state       = PPPOX_NONE;
+	sk->sk_type        = SOCK_STREAM;
+	sk->sk_family      = PF_PPPOX;
+	sk->sk_protocol    = PX_PROTO_PPTP;
+	sk->sk_destruct    = pptp_sock_destruct;
+
+	/* a fresh call starts with all counters at zero */
+	pptp = &pppox_sk(sk)->proto.pptp;
+	pptp->seq_sent = 0;
+	pptp->seq_recv = 0;
+	pptp->ack_recv = 0;
+	pptp->ack_sent = 0;
+
+	return 0;
+}
+
+/*
+ * ppp_channel_ops.ioctl: read (PPPIOCGFLAGS) or write (PPPIOCSFLAGS) the
+ * per-channel ppp_flags through the user pointer in @arg.  The SC_RCV_BITS
+ * are masked out of any value written.
+ *
+ * Returns 0 on success, -EFAULT when the user access fails and -ENOTTY for
+ * any other command.
+ */
+static int pptp_ppp_ioctl(struct ppp_channel *chan, unsigned int cmd,
+	unsigned long arg)
+{
+	struct sock *sk = (struct sock *) chan->private;
+	struct pptp_opt *opt = &pppox_sk(sk)->proto.pptp;
+	int __user *uflags = (int __user *)arg;
+	int flags;
+
+	switch (cmd) {
+	case PPPIOCGFLAGS:
+		flags = opt->ppp_flags;
+		if (put_user(flags, uflags))
+			return -EFAULT;
+		return 0;
+	case PPPIOCSFLAGS:
+		if (get_user(flags, uflags))
+			return -EFAULT;
+		opt->ppp_flags = flags & ~SC_RCV_BITS;
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+/* Channel callbacks; installed in po->chan.ops by pptp_connect(). */
+static struct ppp_channel_ops pptp_chan_ops = {
+	.start_xmit = pptp_xmit,
+	.ioctl      = pptp_ppp_ioctl,
+};
+
+/* Protocol descriptor passed to sk_alloc() and proto_register(). */
+static struct proto pptp_sk_proto __read_mostly = {
+	.name     = "PPTP",
+	.owner    = THIS_MODULE,
+	.obj_size = sizeof(struct pppox_sock),
+};
+
+/*
+ * Socket operations; installed in sock->ops by pptp_create().  Only
+ * release, bind, connect, getname and ioctl are provided by this driver or
+ * pppox, everything else is a sock_no_*() stub.
+ */
+static const struct proto_ops pptp_ops = {
+	.family     = AF_PPPOX,
+	.owner      = THIS_MODULE,
+	.release    = pptp_release,
+	.bind       = pptp_bind,
+	.connect    = pptp_connect,
+	.socketpair = sock_no_socketpair,
+	.accept     = sock_no_accept,
+	.getname    = pptp_getname,
+	.poll       = sock_no_poll,
+	.listen     = sock_no_listen,
+	.shutdown   = sock_no_shutdown,
+	.setsockopt = sock_no_setsockopt,
+	.getsockopt = sock_no_getsockopt,
+	.sendmsg    = sock_no_sendmsg,
+	.recvmsg    = sock_no_recvmsg,
+	.mmap       = sock_no_mmap,
+	.ioctl      = pppox_ioctl,
+};
+
+/* Registered for PX_PROTO_PPTP with register_pppox_proto(). */
+static struct pppox_proto pppox_pptp_proto = {
+	.create = pptp_create,
+	.owner  = THIS_MODULE,
+};
+
+/* Registered for GREPROTO_PPTP with gre_add_protocol(). */
+static struct gre_protocol gre_pptp_protocol = {
+	.handler = pptp_rcv,
+};
+
+/*
+ * Module init: allocate the zeroed call id -> socket table, then register
+ * the GRE handler, the sock protocol and the pppox protocol, in that order.
+ * A failing step undoes the steps before it in reverse order.
+ *
+ * Returns 0 on success, -ENOMEM when the table cannot be allocated, or the
+ * error of the registration that failed.
+ */
+static int __init pptp_init_module(void)
+{
+	int err = 0;
+	pr_info("PPTP driver version " PPTP_DRIVER_VERSION "\n");
+
+	/* one pointer per possible call id (0..MAX_CALLID) */
+	callid_sock = __vmalloc((MAX_CALLID + 1) * sizeof(void *),
+		GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
+	if (!callid_sock) {
+		pr_err("PPTP: cann't allocate memory\n");
+		return -ENOMEM;
+	}
+
+	err = gre_add_protocol(&gre_pptp_protocol, GREPROTO_PPTP);
+	if (err) {
+		pr_err("PPTP: can't add gre protocol\n");
+		goto out_mem_free;
+	}
+
+	err = proto_register(&pptp_sk_proto, 0);
+	if (err) {
+		pr_err("PPTP: can't register sk_proto\n");
+		goto out_gre_del_protocol;
+	}
+
+	err = register_pppox_proto(PX_PROTO_PPTP, &pppox_pptp_proto);
+	if (err) {
+		pr_err("PPTP: can't register pppox_proto\n");
+		goto out_unregister_sk_proto;
+	}
+
+	return 0;
+
+	/* unwind in reverse order of setup */
+out_unregister_sk_proto:
+	proto_unregister(&pptp_sk_proto);
+out_gre_del_protocol:
+	gre_del_protocol(&gre_pptp_protocol, GREPROTO_PPTP);
+out_mem_free:
+	vfree(callid_sock);
+
+	return err;
+}
+
+/*
+ * Module exit: undo pptp_init_module() in reverse order - unregister the
+ * pppox protocol, the sock protocol and the GRE handler, then free the
+ * call id table.
+ */
+static void __exit pptp_exit_module(void)
+{
+	unregister_pppox_proto(PX_PROTO_PPTP);
+	proto_unregister(&pptp_sk_proto);
+	gre_del_protocol(&gre_pptp_protocol, GREPROTO_PPTP);
+	vfree(callid_sock);
+}
+
+module_init(pptp_init_module);
+module_exit(pptp_exit_module);
+
+MODULE_DESCRIPTION("Point-to-Point Tunneling Protocol");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h
index 1925e0c3f162..1525b2156b2a 100644
--- a/include/linux/if_pppox.h
+++ b/include/linux/if_pppox.h
@@ -40,25 +40,35 @@
  * PPPoE addressing definition 
  */ 
 typedef __be16 sid_t;
-struct pppoe_addr{ 
-       sid_t           sid;                    /* Session identifier */ 
-       unsigned char   remote[ETH_ALEN];       /* Remote address */ 
-       char            dev[IFNAMSIZ];          /* Local device to use */ 
+struct pppoe_addr {
+	sid_t         sid;                    /* Session identifier */
+	unsigned char remote[ETH_ALEN];       /* Remote address */
+	char          dev[IFNAMSIZ];          /* Local device to use */
 }; 
  
 /************************************************************************ 
- * Protocols supported by AF_PPPOX 
- */ 
+ * PPTP addressing definition
+ */
+/*
+ * This structure is part of sockaddr_pppox, so it uses __u16 rather than
+ * the kernel-internal u16 type.
+ */
+struct pptp_addr {
+	__u16           call_id;	/* PPTP call identifier */
+	struct in_addr  sin_addr;	/* IPv4 address of the endpoint */
+};
+
+/************************************************************************
+ * Protocols supported by AF_PPPOX
+ */
 #define PX_PROTO_OE    0 /* Currently just PPPoE */
 #define PX_PROTO_OL2TP 1 /* Now L2TP also */
-#define PX_MAX_PROTO   2
-
-struct sockaddr_pppox { 
-       sa_family_t     sa_family;            /* address family, AF_PPPOX */ 
-       unsigned int    sa_protocol;          /* protocol identifier */ 
-       union{ 
-               struct pppoe_addr       pppoe; 
-       }sa_addr; 
+#define PX_PROTO_PPTP  2
+#define PX_MAX_PROTO   3
+
+struct sockaddr_pppox {
+	sa_family_t     sa_family;            /* address family, AF_PPPOX */
+	unsigned int    sa_protocol;          /* protocol identifier */
+	union {
+		struct pppoe_addr  pppoe;
+		struct pptp_addr   pptp;
+	} sa_addr;
 } __packed;
 
 /* The use of the above union isn't viable because the size of this
@@ -101,7 +111,7 @@ struct pppoe_tag {
 	__be16 tag_type;
 	__be16 tag_len;
 	char tag_data[0];
-} __attribute ((packed));
+} __packed;
 
 /* Tag identifiers */
 #define PTT_EOL		__cpu_to_be16(0x0000)
@@ -150,15 +160,23 @@ struct pppoe_opt {
 					     relayed to (PPPoE relaying) */
 };
 
+/* Per-socket PPTP state, kept in the proto union of struct pppox_sock. */
+struct pptp_opt {
+	struct pptp_addr src_addr;	/* local call id and address */
+	struct pptp_addr dst_addr;	/* peer call id and address */
+	u32 ack_sent, ack_recv;		/* last acknowledgement sent / received */
+	u32 seq_sent, seq_recv;		/* last sequence number sent / received */
+	int ppp_flags;			/* flags set through PPPIOCSFLAGS */
+};
 #include <net/sock.h>
 
 struct pppox_sock {
 	/* struct sock must be the first member of pppox_sock */
-	struct sock		sk;
-	struct ppp_channel	chan;
+	struct sock sk;
+	struct ppp_channel chan;
 	struct pppox_sock	*next;	  /* for hash table */
 	union {
 		struct pppoe_opt pppoe;
+		struct pptp_opt  pptp;
 	} proto;
 	__be16			num;
 };
diff --git a/include/net/gre.h b/include/net/gre.h
new file mode 100644
index 000000000000..82665474bcb7
--- /dev/null
+++ b/include/net/gre.h
@@ -0,0 +1,18 @@
+#ifndef __LINUX_GRE_H
+#define __LINUX_GRE_H
+
+#include <linux/skbuff.h>
+
+#define GREPROTO_CISCO		0
+#define GREPROTO_PPTP		1
+#define GREPROTO_MAX		2
+
+/*
+ * Callbacks of one GRE protocol user.  Registered and removed per GRE
+ * version (GREPROTO_*) with gre_add_protocol() / gre_del_protocol().
+ */
+struct gre_protocol {
+	int  (*handler)(struct sk_buff *skb);	/* receive handler */
+	void (*err_handler)(struct sk_buff *skb, u32 info);	/* error handler */
+};
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version);
+int gre_del_protocol(const struct gre_protocol *proto, u8 version);
+
+#endif
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 7c3a7d191249..7458bdae7e9f 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -215,8 +215,15 @@ config NET_IPIP
 	  be inserted in and removed from the running kernel whenever you
 	  want). Most people won't need this and can say N.
 
+config NET_IPGRE_DEMUX
+	tristate "IP: GRE demultiplexer"
+	help
+	 This is a helper module that demultiplexes GRE packets by GRE version field.
+	 Required by ip_gre and pptp modules.
+
 config NET_IPGRE
 	tristate "IP: GRE tunnels over IP"
+	depends on NET_IPGRE_DEMUX
 	help
 	  Tunneling means encapsulating data of one protocol type within
 	  another protocol and sending it over a channel that understands the
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 80ff87ce43aa..4978d22f9a75 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
 obj-$(CONFIG_NET_IPIP) += ipip.o
+obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
new file mode 100644
index 000000000000..b546736da2e1
--- /dev/null
+++ b/net/ipv4/gre.c
@@ -0,0 +1,151 @@
+/*
+ *	GRE over IPv4 demultiplexer driver
+ *
+ *	Authors: Dmitry Kozlov (xeb@mail.ru)
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/kmod.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/version.h>
+#include <linux/spinlock.h>
+#include <net/protocol.h>
+#include <net/gre.h>
+
+
+const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
+static DEFINE_SPINLOCK(gre_proto_lock);
+
+int gre_add_protocol(const struct gre_protocol *proto, u8 version)
+{
+	if (version >= GREPROTO_MAX)
+		goto err_out;
+	/* gre_proto_lock serializes updates; a slot already in use is refused. */
+	spin_lock(&gre_proto_lock);
+	if (gre_proto[version])
+		goto err_out_unlock;
+	/* Slot is free: publish @proto for the rcu_dereference() lookups. */
+	rcu_assign_pointer(gre_proto[version], proto);
+	spin_unlock(&gre_proto_lock);
+	return 0;
+
+err_out_unlock:
+	spin_unlock(&gre_proto_lock);
+err_out:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(gre_add_protocol);
+
+int gre_del_protocol(const struct gre_protocol *proto, u8 version)
+{
+	if (version >= GREPROTO_MAX)
+		goto err_out;
+	/* Clear the slot only if @proto owns it, then wait out RCU readers. */
+	spin_lock(&gre_proto_lock);
+	if (gre_proto[version] != proto)
+		goto err_out_unlock;
+	rcu_assign_pointer(gre_proto[version], NULL);
+	spin_unlock(&gre_proto_lock);
+	synchronize_rcu();
+	return 0;
+
+err_out_unlock:
+	spin_unlock(&gre_proto_lock);
+err_out:
+	return -1;
+}
+EXPORT_SYMBOL_GPL(gre_del_protocol);
+
+static int gre_rcv(struct sk_buff *skb)
+{
+	const struct gre_protocol *proto;
+	u8 ver;
+	int ret;
+
+	if (!pskb_may_pull(skb, 12))
+		goto drop;
+	/* The low 7 bits of the second header byte select the gre_proto[] slot. */
+	ver = skb->data[1]&0x7f;
+	if (ver >= GREPROTO_MAX)
+		goto drop;
+	/* Run the registered handler under RCU; without one the skb is dropped. */
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (!proto || !proto->handler)
+		goto drop_unlock;
+	ret = proto->handler(skb);
+	rcu_read_unlock();
+	return ret;
+
+drop_unlock:
+	rcu_read_unlock();
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+
+static void gre_err(struct sk_buff *skb, u32 info)
+{
+	const struct gre_protocol *proto;
+	unsigned int hlen;
+	u8 ver;
+
+	/*
+	 * skb->data points at the IP header quoted in the ICMP error, so
+	 * the GRE header starts ihl * 4 bytes in.  The skb stays owned by
+	 * the ICMP layer, which frees it afterwards: never free it here.
+	 */
+	hlen = (skb->data[0] & 0x0f) << 2;
+	if (!pskb_may_pull(skb, hlen + 2))
+		return;
+
+	ver = skb->data[hlen + 1] & 0x7f;
+	if (ver >= GREPROTO_MAX)
+		return;
+
+	rcu_read_lock();
+	proto = rcu_dereference(gre_proto[ver]);
+	if (proto && proto->err_handler)
+		proto->err_handler(skb, info);
+	rcu_read_unlock();
+}
+
+static const struct net_protocol net_gre_protocol = {
+	.handler     = gre_rcv,
+	.err_handler = gre_err,
+	.netns_ok    = 1,
+};
+
+static int __init gre_init(void)
+{
+	/* Announce the driver, then claim IPPROTO_GRE for the demultiplexer. */
+	pr_info("GRE over IPv4 demultiplexor driver\n");
+	if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
+		pr_err("gre: can't add protocol\n");
+		return -EAGAIN;
+	}
+
+	return 0;
+}
+
+static void __exit gre_exit(void)
+{
+	inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
+}
+
+module_init(gre_init);
+module_exit(gre_exit);
+
+MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
+MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
+MODULE_LICENSE("GPL");
+
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 945b20a5ad50..85176895495a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -44,6 +44,7 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/rtnetlink.h>
+#include <net/gre.h>
 
 #ifdef CONFIG_IPV6
 #include <net/ipv6.h>
@@ -1278,10 +1279,9 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
 }
 
 
-static const struct net_protocol ipgre_protocol = {
-	.handler	=	ipgre_rcv,
-	.err_handler	=	ipgre_err,
-	.netns_ok	=	1,
+static const struct gre_protocol ipgre_protocol = {
+	.handler     = ipgre_rcv,
+	.err_handler = ipgre_err,
 };
 
 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
@@ -1663,7 +1663,7 @@ static int __init ipgre_init(void)
 	if (err < 0)
 		return err;
 
-	err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE);
+	err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
 	if (err < 0) {
 		printk(KERN_INFO "ipgre init: can't add protocol\n");
 		goto add_proto_failed;
@@ -1683,7 +1683,7 @@ out:
 tap_ops_failed:
 	rtnl_link_unregister(&ipgre_link_ops);
 rtnl_link_failed:
-	inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
+	gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
 add_proto_failed:
 	unregister_pernet_device(&ipgre_net_ops);
 	goto out;
@@ -1693,7 +1693,7 @@ static void __exit ipgre_fini(void)
 {
 	rtnl_link_unregister(&ipgre_tap_ops);
 	rtnl_link_unregister(&ipgre_link_ops);
-	if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
+	if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
 		printk(KERN_INFO "ipgre close: can't remove protocol\n");
 	unregister_pernet_device(&ipgre_net_ops);
 }
-- 
cgit v1.2.3-58-ga151


From e88c64f0a42575e01c7ace903d0570bc0b7fcf85 Mon Sep 17 00:00:00 2001
From: Hagen Paul Pfeifer <hagen@jauu.net>
Date: Thu, 19 Aug 2010 06:33:05 +0000
Subject: tcp: allow effective reduction of TCP's rcv-buffer via setsockopt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Via setsockopt it is possible to reduce the socket RX buffer
(SO_RCVBUF). TCP method to select the initial window and window scaling
option in tcp_select_initial_window() currently misbehaves and does not
consider a reduced RX socket buffer via setsockopt.

Even though the server's RX buffer is reduced via setsockopt() to 256
byte (Initial Window 384 byte => 256 * 2 - (256 * 2 / 4)) the window
scale option is still 7:

192.168.1.38.40676 > 78.47.222.210.5001: Flags [S], seq 2577214362, win 5840, options [mss 1460,sackOK,TS val 338417 ecr 0,nop,wscale 0], length 0
78.47.222.210.5001 > 192.168.1.38.40676: Flags [S.], seq 1570631029, ack 2577214363, win 384, options [mss 1452,sackOK,TS val 2435248895 ecr 338417,nop,wscale 7], length 0
192.168.1.38.40676 > 78.47.222.210.5001: Flags [.], ack 1, win 5840, options [nop,nop,TS val 338421 ecr 2435248895], length 0

Within tcp_select_initial_window() the original space argument - a
representation of the rx buffer size - is expanded. Only
sysctl_tcp_rmem[2], sysctl_rmem_max and window_clamp are considered
when calculating the initial window.

This patch adjusts the window_clamp argument if the user explicitly
reduces the receive buffer.

Signed-off-by: Hagen Paul Pfeifer <hagen@jauu.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: Patrick McHardy <kaber@trash.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index de3bd8458588..01b94b8d9ec9 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2429,6 +2429,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
 		__u8 rcv_wscale;
 		/* Set this up on the first call only */
 		req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+
+		/* limit the window selection if the user enforces a smaller rx buffer */
+		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+		    (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0))
+			req->window_clamp = tcp_full_space(sk);
+
 		/* tcp_full_space because it is guaranteed to be the first packet */
 		tcp_select_initial_window(tcp_full_space(sk),
 			mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
@@ -2555,6 +2561,11 @@ static void tcp_connect_init(struct sock *sk)
 
 	tcp_initialize_rcv_mss(sk);
 
+	/* limit the window selection if the user enforces a smaller rx buffer */
+	if (sk->sk_userlocks & SOCK_RCVBUF_LOCK &&
+	    (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0))
+		tp->window_clamp = tcp_full_space(sk);
+
 	tcp_select_initial_window(tcp_full_space(sk),
 				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
 				  &tp->rcv_wnd,
-- 
cgit v1.2.3-58-ga151


From 21dc330157454046dd7c494961277d76e1c957fe Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Mon, 23 Aug 2010 00:13:46 -0700
Subject: net: Rename skb_has_frags to skb_has_frag_list

SKBs can be "fragmented" in two ways, via a page array (called
skb_shinfo(skb)->frags[]) and via a list of SKBs (called
skb_shinfo(skb)->frag_list).

Since skb_has_frags() tests the latter, its name is confusing
since it sounds more like it's testing the former.

Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/netdevice.h               |  2 +-
 include/linux/skbuff.h                  |  4 ++--
 net/core/dev.c                          |  4 ++--
 net/core/skbuff.c                       | 18 +++++++++---------
 net/ipv4/ip_fragment.c                  |  2 +-
 net/ipv4/ip_output.c                    |  2 +-
 net/ipv6/ip6_output.c                   |  2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c |  2 +-
 net/ipv6/reassembly.c                   |  2 +-
 9 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 46c36ffe20ee..ce2de8b64083 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2191,7 +2191,7 @@ static inline int net_gso_ok(int features, int gso_type)
 static inline int skb_gso_ok(struct sk_buff *skb, int features)
 {
 	return net_gso_ok(features, skb_shinfo(skb)->gso_type) &&
-	       (!skb_has_frags(skb) || (features & NETIF_F_FRAGLIST));
+	       (!skb_has_frag_list(skb) || (features & NETIF_F_FRAGLIST));
 }
 
 static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f067c95cf18a..f900ffcd847e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1120,7 +1120,7 @@ extern void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page,
 			    int off, int size);
 
 #define SKB_PAGE_ASSERT(skb) 	BUG_ON(skb_shinfo(skb)->nr_frags)
-#define SKB_FRAG_ASSERT(skb) 	BUG_ON(skb_has_frags(skb))
+#define SKB_FRAG_ASSERT(skb) 	BUG_ON(skb_has_frag_list(skb))
 #define SKB_LINEAR_ASSERT(skb)  BUG_ON(skb_is_nonlinear(skb))
 
 #ifdef NET_SKBUFF_DATA_USES_OFFSET
@@ -1784,7 +1784,7 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
 		     skb = skb->prev)
 
 
-static inline bool skb_has_frags(const struct sk_buff *skb)
+static inline bool skb_has_frag_list(const struct sk_buff *skb)
 {
 	return skb_shinfo(skb)->frag_list != NULL;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index d569f88bcf80..859e30ff044a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1930,7 +1930,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb,
 				      struct net_device *dev)
 {
 	return skb_is_nonlinear(skb) &&
-	       ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
+	       ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
 	        (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
 					      illegal_highdma(dev, skb))));
 }
@@ -3090,7 +3090,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
 		goto normal;
 
-	if (skb_is_gso(skb) || skb_has_frags(skb))
+	if (skb_is_gso(skb) || skb_has_frag_list(skb))
 		goto normal;
 
 	rcu_read_lock();
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 99ef721f773d..e2535fb4985d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -340,7 +340,7 @@ static void skb_release_data(struct sk_buff *skb)
 				put_page(skb_shinfo(skb)->frags[i].page);
 		}
 
-		if (skb_has_frags(skb))
+		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 
 		kfree(skb->head);
@@ -759,7 +759,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
 		skb_shinfo(n)->nr_frags = i;
 	}
 
-	if (skb_has_frags(skb)) {
+	if (skb_has_frag_list(skb)) {
 		skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list;
 		skb_clone_fraglist(n);
 	}
@@ -822,7 +822,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
 		get_page(skb_shinfo(skb)->frags[i].page);
 
-	if (skb_has_frags(skb))
+	if (skb_has_frag_list(skb))
 		skb_clone_fraglist(skb);
 
 	skb_release_data(skb);
@@ -1099,7 +1099,7 @@ drop_pages:
 		for (; i < nfrags; i++)
 			put_page(skb_shinfo(skb)->frags[i].page);
 
-		if (skb_has_frags(skb))
+		if (skb_has_frag_list(skb))
 			skb_drop_fraglist(skb);
 		goto done;
 	}
@@ -1194,7 +1194,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
 	/* Optimization: no fragments, no reasons to preestimate
 	 * size of pulled pages. Superb.
 	 */
-	if (!skb_has_frags(skb))
+	if (!skb_has_frag_list(skb))
 		goto pull_pages;
 
 	/* Estimate size of pulled pages. */
@@ -2323,7 +2323,7 @@ next_skb:
 		st->frag_data = NULL;
 	}
 
-	if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) {
+	if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) {
 		st->cur_skb = skb_shinfo(st->root_skb)->frag_list;
 		st->frag_idx = 0;
 		goto next_skb;
@@ -2889,7 +2889,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 		return -ENOMEM;
 
 	/* Easy case. Most of packets will go this way. */
-	if (!skb_has_frags(skb)) {
+	if (!skb_has_frag_list(skb)) {
 		/* A little of trouble, not enough of space for trailer.
 		 * This should not happen, when stack is tuned to generate
 		 * good frames. OK, on miss we reallocate and reserve even more
@@ -2924,7 +2924,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 
 		if (skb1->next == NULL && tailbits) {
 			if (skb_shinfo(skb1)->nr_frags ||
-			    skb_has_frags(skb1) ||
+			    skb_has_frag_list(skb1) ||
 			    skb_tailroom(skb1) < tailbits)
 				ntail = tailbits + 128;
 		}
@@ -2933,7 +2933,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer)
 		    skb_cloned(skb1) ||
 		    ntail ||
 		    skb_shinfo(skb1)->nr_frags ||
-		    skb_has_frags(skb1)) {
+		    skb_has_frag_list(skb1)) {
 			struct sk_buff *skb2;
 
 			/* Fuck, we are miserable poor guys... */
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b7c41654dde5..f4dc879e258e 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	/* If the first fragment is fragmented itself, we split
 	 * it to two chunks: the first with data and paged part
 	 * and the second, holding only fragments. */
-	if (skb_has_frags(head)) {
+	if (skb_has_frag_list(head)) {
 		struct sk_buff *clone;
 		int i, plen = 0;
 
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e807492f1777..6d2753c7ffdd 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -487,7 +487,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	 * LATER: this step can be merged to real generation of fragments,
 	 * we can switch to copy when see the first bad fragment.
 	 */
-	if (skb_has_frags(skb)) {
+	if (skb_has_frag_list(skb)) {
 		struct sk_buff *frag;
 		int first_len = skb_pagelen(skb);
 		int truesizes = 0;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index d40b330c0ee6..1838927a2243 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -637,7 +637,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	}
 	mtu -= hlen + sizeof(struct frag_hdr);
 
-	if (skb_has_frags(skb)) {
+	if (skb_has_frag_list(skb)) {
 		int first_len = skb_pagelen(skb);
 		int truesizes = 0;
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 13ef5bc05cf5..089c598773c7 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -413,7 +413,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev)
 	/* If the first fragment is fragmented itself, we split
 	 * it to two chunks: the first with data and paged part
 	 * and the second, holding only fragments. */
-	if (skb_has_frags(head)) {
+	if (skb_has_frag_list(head)) {
 		struct sk_buff *clone;
 		int i, plen = 0;
 
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 545c4141b755..8aea3f3f18d7 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -499,7 +499,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	/* If the first fragment is fragmented itself, we split
 	 * it to two chunks: the first with data and paged part
 	 * and the second, holding only fragments. */
-	if (skb_has_frags(head)) {
+	if (skb_has_frag_list(head)) {
 		struct sk_buff *clone;
 		int i, plen = 0;
 
-- 
cgit v1.2.3-58-ga151


From ec550d246e38e1b4ea8604b5c71ccb72e38f3290 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 24 Aug 2010 14:45:09 -0700
Subject: net: ip_append_data() optim

Compiler is not smart enough to avoid a conditional branch.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 6d2753c7ffdd..e42762023c27 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -837,10 +837,9 @@ int ip_append_data(struct sock *sk,
 		inet->cork.length = 0;
 		sk->sk_sndmsg_page = NULL;
 		sk->sk_sndmsg_off = 0;
-		if ((exthdrlen = rt->dst.header_len) != 0) {
-			length += exthdrlen;
-			transhdrlen += exthdrlen;
-		}
+		exthdrlen = rt->dst.header_len;
+		length += exthdrlen;
+		transhdrlen += exthdrlen;
 	} else {
 		rt = (struct rtable *)inet->cork.dst;
 		if (inet->cork.flags & IPCORK_OPT)
-- 
cgit v1.2.3-58-ga151


From dca43c75e7e545694a9dd6288553f55c53e2a3a3 Mon Sep 17 00:00:00 2001
From: Jerry Chu <hkchu@google.com>
Date: Fri, 27 Aug 2010 19:13:28 +0000
Subject: tcp: Add TCP_USER_TIMEOUT socket option.

This patch provides a "user timeout" support as described in RFC793. The
socket option is also needed for the local half of RFC5482 "TCP User
Timeout Option".

TCP_USER_TIMEOUT is a TCP level socket option that takes an unsigned int,
when > 0, to specify the maximum amount of time in ms that transmitted
data may remain unacknowledged before TCP will forcefully close the
corresponding connection and return ETIMEDOUT to the application. If
0 is given, TCP will continue to use the system default.

Increasing the user timeouts allows a TCP connection to survive extended
periods without end-to-end connectivity. Decreasing the user timeouts
allows applications to "fail fast" if so desired. Otherwise it may take
up to 20 minutes with the current system defaults in a normal WAN
environment.

The socket option can be set during any state of a TCP connection, but
is only effective during the synchronized states of a connection
(ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, or LAST-ACK).
Moreover, when used with the TCP keepalive (SO_KEEPALIVE) option,
TCP_USER_TIMEOUT will overtake keepalive to determine when to close a
connection due to keepalive failure.

The option does not change in any way when TCP retransmits a packet, nor
when a keepalive probe will be sent.

This option, like many others, will be inherited by an acceptor from its
listener.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h                |  1 +
 include/net/inet_connection_sock.h |  1 +
 net/ipv4/tcp.c                     | 11 ++++++++++-
 net/ipv4/tcp_timer.c               | 40 ++++++++++++++++++++++++--------------
 4 files changed, 37 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a778ee024590..e64f4c67d0ef 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -105,6 +105,7 @@ enum {
 #define TCP_COOKIE_TRANSACTIONS	15	/* TCP Cookie Transactions */
 #define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/
 #define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */
+#define TCP_USER_TIMEOUT	18	/* How long for loss retry before timeout */
 
 /* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b6d3b55da19b..e4f494b42e06 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -125,6 +125,7 @@ struct inet_connection_sock {
 		int		  probe_size;
 	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
+	u32			  icsk_user_timeout;
 #define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 176e11aaea77..cf3254528753 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2391,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		err = tp->af_specific->md5_parse(sk, optval, optlen);
 		break;
 #endif
-
+	case TCP_USER_TIMEOUT:
+		/* Cap the max timeout in ms TCP will retry/retrans
+		 * before giving up and aborting (ETIMEDOUT) a connection.
+		 */
+		icsk->icsk_user_timeout = msecs_to_jiffies(val);
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -2610,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_THIN_DUPACK:
 		val = tp->thin_dupack;
 		break;
+
+	case TCP_USER_TIMEOUT:
+		val = jiffies_to_msecs(icsk->icsk_user_timeout);
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 808bb920c9f5..11569deccbea 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -138,10 +138,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
  * retransmissions with an initial RTO of TCP_RTO_MIN.
  */
 static bool retransmits_timed_out(struct sock *sk,
-				  unsigned int boundary)
+				  unsigned int boundary,
+				  unsigned int timeout)
 {
-	unsigned int timeout, linear_backoff_thresh;
-	unsigned int start_ts;
+	unsigned int linear_backoff_thresh, start_ts;
 
 	if (!inet_csk(sk)->icsk_retransmits)
 		return false;
@@ -151,14 +151,15 @@ static bool retransmits_timed_out(struct sock *sk,
 	else
 		start_ts = tcp_sk(sk)->retrans_stamp;
 
-	linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
-
-	if (boundary <= linear_backoff_thresh)
-		timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
-	else
-		timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
-			  (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+	if (likely(timeout == 0)) {
+		linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN);
 
+		if (boundary <= linear_backoff_thresh)
+			timeout = ((2 << boundary) - 1) * TCP_RTO_MIN;
+		else
+			timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
+				(boundary - linear_backoff_thresh) * TCP_RTO_MAX;
+	}
 	return (tcp_time_stamp - start_ts) >= timeout;
 }
 
@@ -174,7 +175,7 @@ static int tcp_write_timeout(struct sock *sk)
 			dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
-		if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
+		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) {
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
@@ -187,14 +188,16 @@ static int tcp_write_timeout(struct sock *sk)
 
 			retry_until = tcp_orphan_retries(sk, alive);
 			do_reset = alive ||
-				   !retransmits_timed_out(sk, retry_until);
+				   !retransmits_timed_out(sk, retry_until, 0);
 
 			if (tcp_out_of_resources(sk, do_reset))
 				return 1;
 		}
 	}
 
-	if (retransmits_timed_out(sk, retry_until)) {
+	if (retransmits_timed_out(sk, retry_until,
+	    (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) ? 0 :
+	    icsk->icsk_user_timeout)) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
@@ -436,7 +439,7 @@ out_reset_timer:
 		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
-	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
+	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0))
 		__sk_dst_reset(sk);
 
 out:;
@@ -556,7 +559,14 @@ static void tcp_keepalive_timer (unsigned long data)
 	elapsed = keepalive_time_elapsed(tp);
 
 	if (elapsed >= keepalive_time_when(tp)) {
-		if (icsk->icsk_probes_out >= keepalive_probes(tp)) {
+		/* If the TCP_USER_TIMEOUT option is enabled, use that
+		 * to determine when to timeout instead.
+		 */
+		if ((icsk->icsk_user_timeout != 0 &&
+		    elapsed >= icsk->icsk_user_timeout &&
+		    icsk->icsk_probes_out > 0) ||
+		    (icsk->icsk_user_timeout == 0 &&
+		    icsk->icsk_probes_out >= keepalive_probes(tp))) {
 			tcp_send_active_reset(sk, GFP_ATOMIC);
 			tcp_write_err(sk);
 			goto out;
-- 
cgit v1.2.3-58-ga151


From 22b71c8f4f3db8df92f5e7b081c265bc56c0bd2f Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Sun, 29 Aug 2010 19:23:12 +0000
Subject: tcp/dccp: Consolidate common code for RFC 3390 conversion

This patch consolidates initial-window code common to TCP and CCID-2:
 * TCP uses RFC 3390 in a packet-oriented manner (tcp_input.c) and
 * CCID-2 uses RFC 3390 in a packet-oriented manner (RFC 4341).

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h      | 15 +++++++++++++++
 net/dccp/ccids/ccid2.c |  8 ++------
 net/ipv4/tcp_input.c   | 17 ++---------------
 3 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index df6a2eb20193..a64022199b62 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -779,6 +779,21 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk)
 /* Use define here intentionally to get WARN_ON location shown at the caller */
 #define tcp_verify_left_out(tp)	WARN_ON(tcp_left_out(tp) > tp->packets_out)
 
+/*
+ * Convert RFC 3390 larger initial window into an equivalent number of packets.
+ *
+ * John Heffner states:
+ *
+ *	The RFC specifies a window of no more than 4380 bytes
+ *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
+ *	is a bit misleading because they use a clamp at 4380 bytes
+ *	rather than a multiplier in the relevant range.
+ */
+static inline u32 rfc3390_bytes_to_packets(const u32 smss)
+{
+	return smss > 1460 ? 2 : (smss > 1095 ? 3 : 4);
+}
+
 extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh);
 extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst);
 
diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c
index 8c95813bcc67..b9c942a09c98 100644
--- a/net/dccp/ccids/ccid2.c
+++ b/net/dccp/ccids/ccid2.c
@@ -641,12 +641,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk)
 	/* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */
 	hc->tx_ssthresh = ~0U;
 
-	/*
-	 * RFC 4341, 5: "The cwnd parameter is initialized to at most four
-	 * packets for new connections, following the rules from [RFC3390]".
-	 * We need to convert the bytes of RFC3390 into the packets of RFC 4341.
-	 */
-	hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U);
+	/* Use larger initial windows (RFC 4341, section 5). */
+	hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache);
 
 	/* Make sure that Ack Ratio is enabled and within bounds. */
 	max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2);
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e663b78a2ef6..1bc87a05c734 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk)
 	}
 }
 
-/* Numbers are taken from RFC3390.
- *
- * John Heffner states:
- *
- *	The RFC specifies a window of no more than 4380 bytes
- *	unless 2*MSS > 4380.  Reading the pseudocode in the RFC
- *	is a bit misleading because they use a clamp at 4380 bytes
- *	rather than use a multiplier in the relevant range.
- */
 __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
 {
 	__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);
 
-	if (!cwnd) {
-		if (tp->mss_cache > 1460)
-			cwnd = 2;
-		else
-			cwnd = (tp->mss_cache > 1095) ? 3 : 4;
-	}
+	if (!cwnd)
+		cwnd = rfc3390_bytes_to_packets(tp->mss_cache);
 	return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
 }
 
-- 
cgit v1.2.3-58-ga151


From 6dcd814bd08bc7989f7f3eac9bbe8b20aec0182a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 30 Aug 2010 07:04:14 +0000
Subject: net: struct xfrm_tunnel in read_mostly section

The tunnel4_handlers chain is scanned for each incoming packet, so
make sure it doesn't share an often-dirtied cache line.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipip.c         | 2 +-
 net/ipv4/tunnel4.c      | 4 ++--
 net/ipv4/xfrm4_tunnel.c | 4 ++--
 net/ipv6/sit.c          | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index ec036731a70b..3c6f8f3968a6 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -744,7 +744,7 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
 	ipn->tunnels_wc[0]	= tunnel;
 }
 
-static struct xfrm_tunnel ipip_handler = {
+static struct xfrm_tunnel ipip_handler __read_mostly = {
 	.handler	=	ipip_rcv,
 	.err_handler	=	ipip_err,
 	.priority	=	1,
diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 59186ca7808a..942f02da07d8 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -14,8 +14,8 @@
 #include <net/protocol.h>
 #include <net/xfrm.h>
 
-static struct xfrm_tunnel *tunnel4_handlers;
-static struct xfrm_tunnel *tunnel64_handlers;
+static struct xfrm_tunnel *tunnel4_handlers __read_mostly;
+static struct xfrm_tunnel *tunnel64_handlers __read_mostly;
 static DEFINE_MUTEX(tunnel4_mutex);
 
 static inline struct xfrm_tunnel **fam_handlers(unsigned short family)
diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c
index 41f5982d2087..82806455e859 100644
--- a/net/ipv4/xfrm4_tunnel.c
+++ b/net/ipv4/xfrm4_tunnel.c
@@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info)
 	return -ENOENT;
 }
 
-static struct xfrm_tunnel xfrm_tunnel_handler = {
+static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = {
 	.handler	=	xfrm_tunnel_rcv,
 	.err_handler	=	xfrm_tunnel_err,
 	.priority	=	2,
 };
 
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static struct xfrm_tunnel xfrm64_tunnel_handler = {
+static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = {
 	.handler	=	xfrm_tunnel_rcv,
 	.err_handler	=	xfrm_tunnel_err,
 	.priority	=	2,
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 4699cd3c3118..86618eb30335 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -1132,7 +1132,7 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev)
 	sitn->tunnels_wc[0]	= tunnel;
 }
 
-static struct xfrm_tunnel sit_handler = {
+static struct xfrm_tunnel sit_handler __read_mostly = {
 	.handler	=	ipip6_rcv,
 	.err_handler	=	ipip6_err,
 	.priority	=	1,
-- 
cgit v1.2.3-58-ga151


From 1639ab6f7831f056286c64d98e8e5eb04e3bacac Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 31 Aug 2010 10:23:47 +0000
Subject: gro: unexport tcp4_gro_receive and tcp4_gro_complete

tcp4_gro_receive() and tcp4_gro_complete() don't need to be exported.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_ipv4.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 020766292bb0..a0232f3a358b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2571,7 +2571,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
 
 	return tcp_gro_receive(head, skb);
 }
-EXPORT_SYMBOL(tcp4_gro_receive);
 
 int tcp4_gro_complete(struct sk_buff *skb)
 {
@@ -2584,7 +2583,6 @@ int tcp4_gro_complete(struct sk_buff *skb)
 
 	return tcp_gro_complete(skb);
 }
-EXPORT_SYMBOL(tcp4_gro_complete);
 
 struct proto tcp_prot = {
 	.name			= "TCP",
-- 
cgit v1.2.3-58-ga151


From 875168a9330d3aa6481ce62ce8fa77c7be0c75fb Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 30 Aug 2010 11:07:25 +0000
Subject: net: tunnels should use rcu_dereference

tunnel4_handlers, tunnel64_handlers, tunnel6_handlers and
tunnel46_handlers are protected by RCU, but we don't use the appropriate
RCU primitives to scan them. rcu_read_lock() is already held by the caller.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tunnel4.c | 13 +++++++++----
 net/ipv6/tunnel6.c | 11 ++++++++---
 2 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index 942f02da07d8..df59d16337f2 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -73,6 +73,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family)
 }
 EXPORT_SYMBOL(xfrm4_tunnel_deregister);
 
+#define for_each_tunnel_rcu(head, handler)		\
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))	\
+	
 static int tunnel4_rcv(struct sk_buff *skb)
 {
 	struct xfrm_tunnel *handler;
@@ -80,7 +85,7 @@ static int tunnel4_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto drop;
 
-	for (handler = tunnel4_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel4_handlers, handler)
 		if (!handler->handler(skb))
 			return 0;
 
@@ -99,7 +104,7 @@ static int tunnel64_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
-	for (handler = tunnel64_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel64_handlers, handler)
 		if (!handler->handler(skb))
 			return 0;
 
@@ -115,7 +120,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
-	for (handler = tunnel4_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel4_handlers, handler)
 		if (!handler->err_handler(skb, info))
 			break;
 }
@@ -125,7 +130,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info)
 {
 	struct xfrm_tunnel *handler;
 
-	for (handler = tunnel64_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel64_handlers, handler)
 		if (!handler->err_handler(skb, info))
 			break;
 }
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index d203e6df1f8b..3177fe0459e0 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -88,6 +88,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family)
 
 EXPORT_SYMBOL(xfrm6_tunnel_deregister);
 
+#define for_each_tunnel_rcu(head, handler)		\
+	for (handler = rcu_dereference(head);		\
+	     handler != NULL;				\
+	     handler = rcu_dereference(handler->next))	\
+
 static int tunnel6_rcv(struct sk_buff *skb)
 {
 	struct xfrm6_tunnel *handler;
@@ -95,7 +100,7 @@ static int tunnel6_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
 		goto drop;
 
-	for (handler = tunnel6_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel6_handlers, handler)
 		if (!handler->handler(skb))
 			return 0;
 
@@ -113,7 +118,7 @@ static int tunnel46_rcv(struct sk_buff *skb)
 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
 		goto drop;
 
-	for (handler = tunnel46_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel46_handlers, handler)
 		if (!handler->handler(skb))
 			return 0;
 
@@ -129,7 +134,7 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
 {
 	struct xfrm6_tunnel *handler;
 
-	for (handler = tunnel6_handlers; handler; handler = handler->next)
+	for_each_tunnel_rcu(tunnel6_handlers, handler)
 		if (!handler->err_handler(skb, opt, type, code, offset, info))
 			break;
 }
-- 
cgit v1.2.3-58-ga151


From 0705c6f0e2d39333645bf77cf1efb94526ff1f82 Mon Sep 17 00:00:00 2001
From: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Date: Wed, 1 Sep 2010 00:28:35 +0000
Subject: tcp: update also tcp_output with regard to RFC 5681

Thanks to Ilpo Jarvinen, this updates also the initial window
setting for tcp_output with regard to RFC 5681.

Signed-off-by: Gerrit Renker <gerrit@erg.abdn.ac.uk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_output.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 01b94b8d9ec9..ea09d2fd50c7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss,
 		}
 	}
 
-	/* Set initial window to value enough for senders,
-	 * following RFC2414. Senders, not following this RFC,
-	 * will be satisfied with 2.
-	 */
+	/* Set initial window to value enough for senders, following RFC5681. */
 	if (mss > (1 << *rcv_wscale)) {
-		int init_cwnd = 4;
-		if (mss > 1460 * 3)
-			init_cwnd = 2;
-		else if (mss > 1460)
-			init_cwnd = 3;
+		int init_cwnd = rfc3390_bytes_to_packets(mss);
+
 		/* when initializing use the value from init_rcv_wnd
 		 * rather than the default from above
 		 */
-- 
cgit v1.2.3-58-ga151


From deffd77759e3ceb936f0760cc54a213881577a83 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 2 Sep 2010 03:56:51 +0000
Subject: net: arp: code cleanup

Clean the code up according to Documentation/CodingStyle.

Don't initialize the variable dont_send in arp_process().

Remove the temporary variable flags in arp_state_to_flags().

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/arp.c | 226 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 123 insertions(+), 103 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 96c1955b3e2f..dcfe7e961c10 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -55,7 +55,7 @@
  *		Stuart Cheshire	:	Metricom and grat arp fixes
  *					*** FOR 2.1 clean this up ***
  *		Lawrence V. Stefani: (08/12/96) Added FDDI support.
- *		Alan Cox 	:	Took the AP1000 nasty FDDI hack and
+ *		Alan Cox	:	Took the AP1000 nasty FDDI hack and
  *					folded into the mainstream FDDI code.
  *					Ack spit, Linus how did you allow that
  *					one in...
@@ -120,7 +120,7 @@ EXPORT_SYMBOL(clip_tbl_hook);
 #endif
 
 #include <asm/system.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include <linux/netfilter_arp.h>
 
@@ -173,32 +173,32 @@ const struct neigh_ops arp_broken_ops = {
 EXPORT_SYMBOL(arp_broken_ops);
 
 struct neigh_table arp_tbl = {
-	.family =	AF_INET,
-	.entry_size =	sizeof(struct neighbour) + 4,
-	.key_len =	4,
-	.hash =		arp_hash,
-	.constructor =	arp_constructor,
-	.proxy_redo =	parp_redo,
-	.id =		"arp_cache",
-	.parms = {
-		.tbl =			&arp_tbl,
-		.base_reachable_time =	30 * HZ,
-		.retrans_time =	1 * HZ,
-		.gc_staletime =	60 * HZ,
-		.reachable_time =		30 * HZ,
-		.delay_probe_time =	5 * HZ,
-		.queue_len =		3,
-		.ucast_probes =	3,
-		.mcast_probes =	3,
-		.anycast_delay =	1 * HZ,
-		.proxy_delay =		(8 * HZ) / 10,
-		.proxy_qlen =		64,
-		.locktime =		1 * HZ,
+	.family		= AF_INET,
+	.entry_size	= sizeof(struct neighbour) + 4,
+	.key_len	= 4,
+	.hash		= arp_hash,
+	.constructor	= arp_constructor,
+	.proxy_redo	= parp_redo,
+	.id		= "arp_cache",
+	.parms		= {
+		.tbl			= &arp_tbl,
+		.base_reachable_time	= 30 * HZ,
+		.retrans_time		= 1 * HZ,
+		.gc_staletime		= 60 * HZ,
+		.reachable_time		= 30 * HZ,
+		.delay_probe_time	= 5 * HZ,
+		.queue_len		= 3,
+		.ucast_probes		= 3,
+		.mcast_probes		= 3,
+		.anycast_delay		= 1 * HZ,
+		.proxy_delay		= (8 * HZ) / 10,
+		.proxy_qlen		= 64,
+		.locktime		= 1 * HZ,
 	},
-	.gc_interval =	30 * HZ,
-	.gc_thresh1 =	128,
-	.gc_thresh2 =	512,
-	.gc_thresh3 =	1024,
+	.gc_interval	= 30 * HZ,
+	.gc_thresh1	= 128,
+	.gc_thresh2	= 512,
+	.gc_thresh3	= 1024,
 };
 EXPORT_SYMBOL(arp_tbl);
 
@@ -233,7 +233,7 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev)
 
 static int arp_constructor(struct neighbour *neigh)
 {
-	__be32 addr = *(__be32*)neigh->primary_key;
+	__be32 addr = *(__be32 *)neigh->primary_key;
 	struct net_device *dev = neigh->dev;
 	struct in_device *in_dev;
 	struct neigh_parms *parms;
@@ -296,16 +296,19 @@ static int arp_constructor(struct neighbour *neigh)
 			neigh->ops = &arp_broken_ops;
 			neigh->output = neigh->ops->output;
 			return 0;
+#else
+			break;
 #endif
-		;}
+		}
 #endif
 		if (neigh->type == RTN_MULTICAST) {
 			neigh->nud_state = NUD_NOARP;
 			arp_mc_map(addr, neigh->ha, dev, 1);
-		} else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) {
+		} else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) {
 			neigh->nud_state = NUD_NOARP;
 			memcpy(neigh->ha, dev->dev_addr, dev->addr_len);
-		} else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) {
+		} else if (neigh->type == RTN_BROADCAST ||
+			   (dev->flags & IFF_POINTOPOINT)) {
 			neigh->nud_state = NUD_NOARP;
 			memcpy(neigh->ha, dev->broadcast, dev->addr_len);
 		}
@@ -315,7 +318,7 @@ static int arp_constructor(struct neighbour *neigh)
 		else
 			neigh->ops = &arp_generic_ops;
 
-		if (neigh->nud_state&NUD_VALID)
+		if (neigh->nud_state & NUD_VALID)
 			neigh->output = neigh->ops->connected_output;
 		else
 			neigh->output = neigh->ops->output;
@@ -334,7 +337,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	__be32 saddr = 0;
 	u8  *dst_ha = NULL;
 	struct net_device *dev = neigh->dev;
-	__be32 target = *(__be32*)neigh->primary_key;
+	__be32 target = *(__be32 *)neigh->primary_key;
 	int probes = atomic_read(&neigh->probes);
 	struct in_device *in_dev;
 
@@ -347,7 +350,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
 	default:
 	case 0:		/* By default announce any local IP */
-		if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL)
+		if (skb && inet_addr_type(dev_net(dev),
+					  ip_hdr(skb)->saddr) == RTN_LOCAL)
 			saddr = ip_hdr(skb)->saddr;
 		break;
 	case 1:		/* Restrict announcements of saddr in same subnet */
@@ -369,16 +373,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb)
 	if (!saddr)
 		saddr = inet_select_addr(dev, target, RT_SCOPE_LINK);
 
-	if ((probes -= neigh->parms->ucast_probes) < 0) {
-		if (!(neigh->nud_state&NUD_VALID))
-			printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n");
+	probes -= neigh->parms->ucast_probes;
+	if (probes < 0) {
+		if (!(neigh->nud_state & NUD_VALID))
+			printk(KERN_DEBUG
+			       "trying to ucast probe in NUD_INVALID\n");
 		dst_ha = neigh->ha;
 		read_lock_bh(&neigh->lock);
-	} else if ((probes -= neigh->parms->app_probes) < 0) {
+	} else {
+		probes -= neigh->parms->app_probes;
+		if (probes < 0) {
 #ifdef CONFIG_ARPD
-		neigh_app_ns(neigh);
+			neigh_app_ns(neigh);
 #endif
-		return;
+			return;
+		}
 	}
 
 	arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr,
@@ -451,7 +460,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev)
  *	is allowed to use this function, it is scheduled to be removed. --ANK
  */
 
-static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev)
+static int arp_set_predefined(int addr_hint, unsigned char *haddr,
+			      __be32 paddr, struct net_device *dev)
 {
 	switch (addr_hint) {
 	case RTN_LOCAL:
@@ -483,7 +493,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 
 	paddr = skb_rtable(skb)->rt_gateway;
 
-	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev))
+	if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr,
+			       paddr, dev))
 		return 0;
 
 	n = __neigh_lookup(&arp_tbl, &paddr, dev, 1);
@@ -515,13 +526,14 @@ int arp_bind_neighbour(struct dst_entry *dst)
 		return -EINVAL;
 	if (n == NULL) {
 		__be32 nexthop = ((struct rtable *)dst)->rt_gateway;
-		if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT))
+		if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
 			nexthop = 0;
 		n = __neigh_lookup_errno(
 #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
-		    dev->type == ARPHRD_ATM ? clip_tbl_hook :
+					 dev->type == ARPHRD_ATM ?
+					 clip_tbl_hook :
 #endif
-		    &arp_tbl, &nexthop, dev);
+					 &arp_tbl, &nexthop, dev);
 		if (IS_ERR(n))
 			return PTR_ERR(n);
 		dst->neighbour = n;
@@ -543,8 +555,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
 
 	if (!IN_DEV_PROXY_ARP(in_dev))
 		return 0;
-
-	if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0)
+	imi = IN_DEV_MEDIUM_ID(in_dev);
+	if (imi == 0)
 		return 1;
 	if (imi == -1)
 		return 0;
@@ -685,7 +697,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 	arp->ar_pln = 4;
 	arp->ar_op = htons(type);
 
-	arp_ptr=(unsigned char *)(arp+1);
+	arp_ptr = (unsigned char *)(arp + 1);
 
 	memcpy(arp_ptr, src_hw, dev->addr_len);
 	arp_ptr += dev->addr_len;
@@ -735,9 +747,8 @@ void arp_send(int type, int ptype, __be32 dest_ip,
 
 	skb = arp_create(type, ptype, dest_ip, dev, src_ip,
 			 dest_hw, src_hw, target_hw);
-	if (skb == NULL) {
+	if (skb == NULL)
 		return;
-	}
 
 	arp_xmit(skb);
 }
@@ -815,7 +826,7 @@ static int arp_process(struct sk_buff *skb)
 /*
  *	Extract fields
  */
-	arp_ptr= (unsigned char *)(arp+1);
+	arp_ptr = (unsigned char *)(arp + 1);
 	sha	= arp_ptr;
 	arp_ptr += dev->addr_len;
 	memcpy(&sip, arp_ptr, 4);
@@ -869,16 +880,17 @@ static int arp_process(struct sk_buff *skb)
 		addr_type = rt->rt_type;
 
 		if (addr_type == RTN_LOCAL) {
-			int dont_send = 0;
+			int dont_send;
 
-			if (!dont_send)
-				dont_send |= arp_ignore(in_dev,sip,tip);
+			dont_send = arp_ignore(in_dev, sip, tip);
 			if (!dont_send && IN_DEV_ARPFILTER(in_dev))
-				dont_send |= arp_filter(sip,tip,dev);
+				dont_send |= arp_filter(sip, tip, dev);
 			if (!dont_send) {
 				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 				if (n) {
-					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+						 dev, tip, sha, dev->dev_addr,
+						 sha);
 					neigh_release(n);
 				}
 			}
@@ -887,8 +899,7 @@ static int arp_process(struct sk_buff *skb)
 			if (addr_type == RTN_UNICAST  &&
 			    (arp_fwd_proxy(in_dev, dev, rt) ||
 			     arp_fwd_pvlan(in_dev, dev, rt, sip, tip) ||
-			     pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))
-			{
+			     pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) {
 				n = neigh_event_ns(&arp_tbl, sha, &sip, dev);
 				if (n)
 					neigh_release(n);
@@ -896,9 +907,12 @@ static int arp_process(struct sk_buff *skb)
 				if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED ||
 				    skb->pkt_type == PACKET_HOST ||
 				    in_dev->arp_parms->proxy_delay == 0) {
-					arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha);
+					arp_send(ARPOP_REPLY, ETH_P_ARP, sip,
+						 dev, tip, sha, dev->dev_addr,
+						 sha);
 				} else {
-					pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb);
+					pneigh_enqueue(&arp_tbl,
+						       in_dev->arp_parms, skb);
 					return 0;
 				}
 				goto out;
@@ -939,7 +953,8 @@ static int arp_process(struct sk_buff *skb)
 		if (arp->ar_op != htons(ARPOP_REPLY) ||
 		    skb->pkt_type != PACKET_HOST)
 			state = NUD_STALE;
-		neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0);
+		neigh_update(n, sha, state,
+			     override ? NEIGH_UPDATE_F_OVERRIDE : 0);
 		neigh_release(n);
 	}
 
@@ -975,7 +990,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
 	    arp->ar_pln != 4)
 		goto freeskb;
 
-	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (skb == NULL)
 		goto out_of_mem;
 
 	memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb));
@@ -1019,7 +1035,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 		return -EINVAL;
 	if (!dev && (r->arp_flags & ATF_COM)) {
 		dev = dev_getbyhwaddr(net, r->arp_ha.sa_family,
-				r->arp_ha.sa_data);
+				      r->arp_ha.sa_data);
 		if (!dev)
 			return -ENODEV;
 	}
@@ -1033,7 +1049,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r,
 }
 
 static int arp_req_set(struct net *net, struct arpreq *r,
-		struct net_device * dev)
+		       struct net_device *dev)
 {
 	__be32 ip;
 	struct neighbour *neigh;
@@ -1046,10 +1062,11 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 	if (r->arp_flags & ATF_PERM)
 		r->arp_flags |= ATF_COM;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
-							 .tos = RTO_ONLINK } } };
-		struct rtable * rt;
-		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
+		struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
+						    .tos = RTO_ONLINK } };
+		struct rtable *rt;
+		err = ip_route_output_key(net, &rt, &fl);
+		if (err != 0)
 			return err;
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
@@ -1083,9 +1100,9 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 		unsigned state = NUD_STALE;
 		if (r->arp_flags & ATF_PERM)
 			state = NUD_PERMANENT;
-		err = neigh_update(neigh, (r->arp_flags&ATF_COM) ?
+		err = neigh_update(neigh, (r->arp_flags & ATF_COM) ?
 				   r->arp_ha.sa_data : NULL, state,
-				   NEIGH_UPDATE_F_OVERRIDE|
+				   NEIGH_UPDATE_F_OVERRIDE |
 				   NEIGH_UPDATE_F_ADMIN);
 		neigh_release(neigh);
 	}
@@ -1094,12 +1111,12 @@ static int arp_req_set(struct net *net, struct arpreq *r,
 
 static unsigned arp_state_to_flags(struct neighbour *neigh)
 {
-	unsigned flags = 0;
 	if (neigh->nud_state&NUD_PERMANENT)
-		flags = ATF_PERM|ATF_COM;
+		return ATF_PERM | ATF_COM;
 	else if (neigh->nud_state&NUD_VALID)
-		flags = ATF_COM;
-	return flags;
+		return ATF_COM;
+	else
+		return 0;
 }
 
 /*
@@ -1142,7 +1159,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r,
 }
 
 static int arp_req_delete(struct net *net, struct arpreq *r,
-		struct net_device * dev)
+			  struct net_device *dev)
 {
 	int err;
 	__be32 ip;
@@ -1153,10 +1170,11 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 
 	ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr;
 	if (dev == NULL) {
-		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip,
-							 .tos = RTO_ONLINK } } };
-		struct rtable * rt;
-		if ((err = ip_route_output_key(net, &rt, &fl)) != 0)
+		struct flowi fl = { .nl_u.ip4_u = { .daddr = ip,
+						    .tos = RTO_ONLINK } };
+		struct rtable *rt;
+		err = ip_route_output_key(net, &rt, &fl);
+		if (err != 0)
 			return err;
 		dev = rt->dst.dev;
 		ip_rt_put(rt);
@@ -1166,7 +1184,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r,
 	err = -ENXIO;
 	neigh = neigh_lookup(&arp_tbl, &ip, dev);
 	if (neigh) {
-		if (neigh->nud_state&~NUD_NOARP)
+		if (neigh->nud_state & ~NUD_NOARP)
 			err = neigh_update(neigh, NULL, NUD_FAILED,
 					   NEIGH_UPDATE_F_OVERRIDE|
 					   NEIGH_UPDATE_F_ADMIN);
@@ -1186,24 +1204,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	struct net_device *dev = NULL;
 
 	switch (cmd) {
-		case SIOCDARP:
-		case SIOCSARP:
-			if (!capable(CAP_NET_ADMIN))
-				return -EPERM;
-		case SIOCGARP:
-			err = copy_from_user(&r, arg, sizeof(struct arpreq));
-			if (err)
-				return -EFAULT;
-			break;
-		default:
-			return -EINVAL;
+	case SIOCDARP:
+	case SIOCSARP:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
+	case SIOCGARP:
+		err = copy_from_user(&r, arg, sizeof(struct arpreq));
+		if (err)
+			return -EFAULT;
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	if (r.arp_pa.sa_family != AF_INET)
 		return -EPFNOSUPPORT;
 
 	if (!(r.arp_flags & ATF_PUBL) &&
-	    (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB)))
+	    (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB)))
 		return -EINVAL;
 	if (!(r.arp_flags & ATF_NETMASK))
 		((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr =
@@ -1211,7 +1229,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	rtnl_lock();
 	if (r.arp_dev[0]) {
 		err = -ENODEV;
-		if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL)
+		dev = __dev_get_by_name(net, r.arp_dev);
+		if (dev == NULL)
 			goto out;
 
 		/* Mmmm... It is wrong... ARPHRD_NETROM==0 */
@@ -1243,7 +1262,8 @@ out:
 	return err;
 }
 
-static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
+static int arp_netdev_event(struct notifier_block *this, unsigned long event,
+			    void *ptr)
 {
 	struct net_device *dev = ptr;
 
@@ -1311,12 +1331,13 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	for (n = 0, s = buf; n < 6; n++) {
 		c = (a->ax25_call[n] >> 1) & 0x7F;
 
-		if (c != ' ') *s++ = c;
+		if (c != ' ')
+			*s++ = c;
 	}
 
 	*s++ = '-';
-
-	if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) {
+	n = (a->ax25_call[6] >> 1) & 0x0F;
+	if (n > 9) {
 		*s++ = '1';
 		n -= 10;
 	}
@@ -1325,10 +1346,9 @@ static char *ax2asc2(ax25_address *a, char *buf)
 	*s++ = '\0';
 
 	if (*buf == '\0' || *buf == '-')
-	   return "*";
+		return "*";
 
 	return buf;
-
 }
 #endif /* CONFIG_AX25 */
 
@@ -1408,10 +1428,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos)
 /* ------------------------------------------------------------------------ */
 
 static const struct seq_operations arp_seq_ops = {
-	.start  = arp_seq_start,
-	.next   = neigh_seq_next,
-	.stop   = neigh_seq_stop,
-	.show   = arp_seq_show,
+	.start	= arp_seq_start,
+	.next	= neigh_seq_next,
+	.stop	= neigh_seq_stop,
+	.show	= arp_seq_show,
 };
 
 static int arp_seq_open(struct inode *inode, struct file *file)
-- 
cgit v1.2.3-58-ga151


From e0386005ff2a729998735e10769d99e1acbc2dd1 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 8 Sep 2010 21:31:35 -0700
Subject: net: inet_add_protocol() can use cmpxchg()

Use cmpxchg() to get rid of spinlocks in inet_add_protocol() and
friends.

inet_protos[] & inet6_protos[] are moved to read_mostly section

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/protocol.c | 31 +++++--------------------------
 net/ipv6/protocol.c | 32 ++++----------------------------
 2 files changed, 9 insertions(+), 54 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c
index f2d297351405..65699c24411c 100644
--- a/net/ipv4/protocol.c
+++ b/net/ipv4/protocol.c
@@ -28,8 +28,7 @@
 #include <linux/spinlock.h>
 #include <net/protocol.h>
 
-const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp;
-static DEFINE_SPINLOCK(inet_proto_lock);
+const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly;
 
 /*
  *	Add a protocol handler to the hash tables
@@ -37,20 +36,9 @@ static DEFINE_SPINLOCK(inet_proto_lock);
 
 int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash, ret;
+	int hash = protocol & (MAX_INET_PROTOS - 1);
 
-	hash = protocol & (MAX_INET_PROTOS - 1);
-
-	spin_lock_bh(&inet_proto_lock);
-	if (inet_protos[hash]) {
-		ret = -1;
-	} else {
-		inet_protos[hash] = prot;
-		ret = 0;
-	}
-	spin_unlock_bh(&inet_proto_lock);
-
-	return ret;
+	return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1;
 }
 EXPORT_SYMBOL(inet_add_protocol);
 
@@ -60,18 +48,9 @@ EXPORT_SYMBOL(inet_add_protocol);
 
 int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol)
 {
-	int hash, ret;
-
-	hash = protocol & (MAX_INET_PROTOS - 1);
+	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
 
-	spin_lock_bh(&inet_proto_lock);
-	if (inet_protos[hash] == prot) {
-		inet_protos[hash] = NULL;
-		ret = 0;
-	} else {
-		ret = -1;
-	}
-	spin_unlock_bh(&inet_proto_lock);
+	ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
 
diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c
index 1fa3468f0f32..9bb936ae2452 100644
--- a/net/ipv6/protocol.c
+++ b/net/ipv6/protocol.c
@@ -25,28 +25,14 @@
 #include <linux/spinlock.h>
 #include <net/protocol.h>
 
-const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS];
-static DEFINE_SPINLOCK(inet6_proto_lock);
-
+const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS] __read_mostly;
 
 int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol)
 {
-	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
-
-	spin_lock_bh(&inet6_proto_lock);
-
-	if (inet6_protos[hash]) {
-		ret = -1;
-	} else {
-		inet6_protos[hash] = prot;
-		ret = 0;
-	}
-
-	spin_unlock_bh(&inet6_proto_lock);
+	int hash = protocol & (MAX_INET_PROTOS - 1);
 
-	return ret;
+	return !cmpxchg(&inet6_protos[hash], NULL, prot) ? 0 : -1;
 }
-
 EXPORT_SYMBOL(inet6_add_protocol);
 
 /*
@@ -57,20 +43,10 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol
 {
 	int ret, hash = protocol & (MAX_INET_PROTOS - 1);
 
-	spin_lock_bh(&inet6_proto_lock);
-
-	if (inet6_protos[hash] != prot) {
-		ret = -1;
-	} else {
-		inet6_protos[hash] = NULL;
-		ret = 0;
-	}
-
-	spin_unlock_bh(&inet6_proto_lock);
+	ret = (cmpxchg(&inet6_protos[hash], prot, NULL) == prot) ? 0 : -1;
 
 	synchronize_net();
 
 	return ret;
 }
-
 EXPORT_SYMBOL(inet6_del_protocol);
-- 
cgit v1.2.3-58-ga151


From 49d61e2390c92bd226fc395a6165eb5a65ae4de6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 9 Sep 2010 05:33:43 +0000
Subject: tunnels: missing rcu_assign_pointer()

xfrm4_tunnel_register() & xfrm6_tunnel_register() should
use rcu_assign_pointer() to make sure previous writes
(to handler->next) are committed to memory before chain
insertion.

The deregister functions don't need a particular barrier.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tunnel4.c | 2 +-
 net/ipv6/tunnel6.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c
index df59d16337f2..9a17bd2a0a37 100644
--- a/net/ipv4/tunnel4.c
+++ b/net/ipv4/tunnel4.c
@@ -39,7 +39,7 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family)
 	}
 
 	handler->next = *pprev;
-	*pprev = handler;
+	rcu_assign_pointer(*pprev, handler);
 
 	ret = 0;
 
diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c
index 3177fe0459e0..d9864725d0c6 100644
--- a/net/ipv6/tunnel6.c
+++ b/net/ipv6/tunnel6.c
@@ -51,7 +51,7 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family)
 	}
 
 	handler->next = *pprev;
-	*pprev = handler;
+	rcu_assign_pointer(*pprev, handler);
 
 	ret = 0;
 
-- 
cgit v1.2.3-58-ga151


From a034ee3cca5726b14107f281f4bed1c0fd44472a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 9 Sep 2010 23:32:28 +0000
Subject: fib: cleanups

Use rcu_dereference_rtnl() helper

Change hard coded constants in fib_flag_trans()
 7 -> RTN_UNREACHABLE
 8 -> RTN_PROHIBIT

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 55 ++++++++++++++++++++++-------------------------------
 1 file changed, 23 insertions(+), 32 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 4a8e370862bc..a96e5ec211a0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -186,9 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node)
 {
 	struct tnode *ret = node_parent(node);
 
-	return rcu_dereference_check(ret,
-				     rcu_read_lock_held() ||
-				     lockdep_rtnl_is_held());
+	return rcu_dereference_rtnl(ret);
 }
 
 /* Same as rcu_assign_pointer
@@ -211,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
 {
 	struct node *ret = tnode_get_child(tn, i);
 
-	return rcu_dereference_check(ret,
-				     rcu_read_lock_held() ||
-				     lockdep_rtnl_is_held());
+	return rcu_dereference_rtnl(ret);
 }
 
 static inline int tnode_child_length(const struct tnode *tn)
@@ -459,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits)
 		tn->empty_children = 1<<bits;
 	}
 
-	pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
-		 (unsigned long) (sizeof(struct node) << bits));
+	pr_debug("AT %p s=%zu %zu\n", tn, sizeof(struct tnode),
+		 sizeof(struct node) << bits);
 	return tn;
 }
 
@@ -609,11 +605,10 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 
 	/* Keep root node larger  */
 
-	if (!node_parent((struct node*) tn)) {
+	if (!node_parent((struct node *)tn)) {
 		inflate_threshold_use = inflate_threshold_root;
 		halve_threshold_use = halve_threshold_root;
-	}
-	else {
+	} else {
 		inflate_threshold_use = inflate_threshold;
 		halve_threshold_use = halve_threshold;
 	}
@@ -639,7 +634,7 @@ static struct node *resize(struct trie *t, struct tnode *tn)
 	check_tnode(tn);
 
 	/* Return if at least one inflate is run */
-	if( max_work != MAX_WORK)
+	if (max_work != MAX_WORK)
 		return (struct node *) tn;
 
 	/*
@@ -966,9 +961,7 @@ fib_find_node(struct trie *t, u32 key)
 	struct node *n;
 
 	pos = 0;
-	n = rcu_dereference_check(t->trie,
-				  rcu_read_lock_held() ||
-				  lockdep_rtnl_is_held());
+	n = rcu_dereference_rtnl(t->trie);
 
 	while (n != NULL &&  NODE_TYPE(n) == T_TNODE) {
 		tn = (struct tnode *) n;
@@ -1748,16 +1741,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
 
 		/* Node empty, walk back up to parent */
 		c = (struct node *) p;
-	} while ( (p = node_parent_rcu(c)) != NULL);
+	} while ((p = node_parent_rcu(c)) != NULL);
 
 	return NULL; /* Root of trie */
 }
 
 static struct leaf *trie_firstleaf(struct trie *t)
 {
-	struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie,
-							rcu_read_lock_held() ||
-							lockdep_rtnl_is_held());
+	struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie);
 
 	if (!n)
 		return NULL;
@@ -2043,14 +2034,14 @@ struct fib_trie_iter {
 	struct seq_net_private p;
 	struct fib_table *tb;
 	struct tnode *tnode;
-	unsigned index;
-	unsigned depth;
+	unsigned int index;
+	unsigned int depth;
 };
 
 static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
 {
 	struct tnode *tn = iter->tnode;
-	unsigned cindex = iter->index;
+	unsigned int cindex = iter->index;
 	struct tnode *p;
 
 	/* A single entry routing table */
@@ -2159,7 +2150,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s)
  */
 static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
 {
-	unsigned i, max, pointers, bytes, avdepth;
+	unsigned int i, max, pointers, bytes, avdepth;
 
 	if (stat->leaves)
 		avdepth = stat->totdepth*100 / stat->leaves;
@@ -2356,7 +2347,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v)
 
 static void seq_indent(struct seq_file *seq, int n)
 {
-	while (n-- > 0) seq_puts(seq, "   ");
+	while (n-- > 0)
+		seq_puts(seq, "   ");
 }
 
 static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
@@ -2388,7 +2380,7 @@ static const char *const rtn_type_names[__RTN_MAX] = {
 	[RTN_XRESOLVE] = "XRESOLVE",
 };
 
-static inline const char *rtn_type(char *buf, size_t len, unsigned t)
+static inline const char *rtn_type(char *buf, size_t len, unsigned int t)
 {
 	if (t < __RTN_MAX && rtn_type_names[t])
 		return rtn_type_names[t];
@@ -2544,13 +2536,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v)
 	rcu_read_unlock();
 }
 
-static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
+static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
 {
-	static unsigned type2flags[RTN_MAX + 1] = {
-		[7] = RTF_REJECT, [8] = RTF_REJECT,
-	};
-	unsigned flags = type2flags[type];
+	unsigned int flags = 0;
 
+	if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT)
+		flags = RTF_REJECT;
 	if (fi && fi->fib_nh->nh_gw)
 		flags |= RTF_GATEWAY;
 	if (mask == htonl(0xFFFFFFFF))
@@ -2562,7 +2553,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
 /*
  *	This outputs /proc/net/route.
  *	The format of the file is not supposed to be changed
- * 	and needs to be same as fib_hash output to avoid breaking
+ *	and needs to be same as fib_hash output to avoid breaking
  *	legacy utilities
  */
 static int fib_route_seq_show(struct seq_file *seq, void *v)
@@ -2587,7 +2578,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v)
 
 		list_for_each_entry_rcu(fa, &li->falh, fa_list) {
 			const struct fib_info *fi = fa->fa_info;
-			unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
+			unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi);
 			int len;
 
 			if (fa->fa_type == RTN_BROADCAST
-- 
cgit v1.2.3-58-ga151


From 0abee5260babe0a12d468b485a28336551697925 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Wed, 15 Sep 2010 19:24:50 +0200
Subject: netfilter: nf_nat: add nf_nat_csum()

Add a static function nf_nat_csum() to replace the duplicate code in
nf_nat_mangle_udp_packet() and __nf_nat_mangle_tcp_packet().

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_helper.c | 76 ++++++++++++++++----------------------
 1 file changed, 31 insertions(+), 45 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c
index 4a0c6b548eee..31427fb57aa8 100644
--- a/net/ipv4/netfilter/nf_nat_helper.c
+++ b/net/ipv4/netfilter/nf_nat_helper.c
@@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo,
 }
 EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust);
 
+static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data,
+			int datalen, __sum16 *check, int oldlen)
+{
+	struct rtable *rt = skb_rtable(skb);
+
+	if (skb->ip_summed != CHECKSUM_PARTIAL) {
+		if (!(rt->rt_flags & RTCF_LOCAL) &&
+		    skb->dev->features & NETIF_F_V4_CSUM) {
+			skb->ip_summed = CHECKSUM_PARTIAL;
+			skb->csum_start = skb_headroom(skb) +
+					  skb_network_offset(skb) +
+					  iph->ihl * 4;
+			skb->csum_offset = (void *)check - data;
+			*check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
+						    datalen, iph->protocol, 0);
+		} else {
+			*check = 0;
+			*check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+						   datalen, iph->protocol,
+						   csum_partial(data, datalen,
+								0));
+			if (iph->protocol == IPPROTO_UDP && !*check)
+				*check = CSUM_MANGLED_0;
+		}
+	} else
+		inet_proto_csum_replace2(check, skb,
+					 htons(oldlen), htons(datalen), 1);
+}
+
 /* Generic function for mangling variable-length address changes inside
  * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX
  * command in FTP).
@@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 			       const char *rep_buffer,
 			       unsigned int rep_len, bool adjust)
 {
-	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 	struct tcphdr *tcph;
 	int oldlen, datalen;
@@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb,
 			match_offset, match_len, rep_buffer, rep_len);
 
 	datalen = skb->len - iph->ihl*4;
-	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    skb->dev->features & NETIF_F_V4_CSUM) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  iph->ihl * 4;
-			skb->csum_offset = offsetof(struct tcphdr, check);
-			tcph->check = ~tcp_v4_check(datalen,
-						    iph->saddr, iph->daddr, 0);
-		} else {
-			tcph->check = 0;
-			tcph->check = tcp_v4_check(datalen,
-						   iph->saddr, iph->daddr,
-						   csum_partial(tcph,
-								datalen, 0));
-		}
-	} else
-		inet_proto_csum_replace2(&tcph->check, skb,
-					 htons(oldlen), htons(datalen), 1);
+	nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen);
 
 	if (adjust && rep_len != match_len)
 		nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq,
@@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
 			 const char *rep_buffer,
 			 unsigned int rep_len)
 {
-	struct rtable *rt = skb_rtable(skb);
 	struct iphdr *iph;
 	struct udphdr *udph;
 	int datalen, oldlen;
@@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb,
 	if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL)
 		return 1;
 
-	if (skb->ip_summed != CHECKSUM_PARTIAL) {
-		if (!(rt->rt_flags & RTCF_LOCAL) &&
-		    skb->dev->features & NETIF_F_V4_CSUM) {
-			skb->ip_summed = CHECKSUM_PARTIAL;
-			skb->csum_start = skb_headroom(skb) +
-					  skb_network_offset(skb) +
-					  iph->ihl * 4;
-			skb->csum_offset = offsetof(struct udphdr, check);
-			udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-							 datalen, IPPROTO_UDP,
-							 0);
-		} else {
-			udph->check = 0;
-			udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
-							datalen, IPPROTO_UDP,
-							csum_partial(udph,
-								     datalen, 0));
-			if (!udph->check)
-				udph->check = CSUM_MANGLED_0;
-		}
-	} else
-		inet_proto_csum_replace2(&udph->check, skb,
-					 htons(oldlen), htons(datalen), 1);
+	nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen);
 
 	return 1;
 }
-- 
cgit v1.2.3-58-ga151


From b7285b7912776a4492744949c747c88d539006fa Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 15 Sep 2010 11:07:24 +0000
Subject: ipip: get rid of ipip_lock

As RTNL is held while doing tunnels inserts and deletes, we can remove
ipip_lock spinlock. My initial RCU conversion was conservative and
converted the rwlock to spinlock, with no RTNL requirement.

Use appropriate rcu annotations and modern lockdep checks as well.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipip.c | 67 +++++++++++++++++++++++++++++----------------------------
 1 file changed, 34 insertions(+), 33 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 3c6f8f3968a6..8de8888dc95a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -122,11 +122,11 @@
 
 static int ipip_net_id __read_mostly;
 struct ipip_net {
-	struct ip_tunnel *tunnels_r_l[HASH_SIZE];
-	struct ip_tunnel *tunnels_r[HASH_SIZE];
-	struct ip_tunnel *tunnels_l[HASH_SIZE];
-	struct ip_tunnel *tunnels_wc[1];
-	struct ip_tunnel **tunnels[4];
+	struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_r[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_l[HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels_wc[1];
+	struct ip_tunnel __rcu **tunnels[4];
 
 	struct net_device *fb_tunnel_dev;
 };
@@ -135,9 +135,8 @@ static void ipip_tunnel_init(struct net_device *dev);
 static void ipip_tunnel_setup(struct net_device *dev);
 
 /*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
  */
-static DEFINE_SPINLOCK(ipip_lock);
 
 #define for_each_ip_tunnel_rcu(start) \
 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
@@ -145,8 +144,8 @@ static DEFINE_SPINLOCK(ipip_lock);
 static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
 		__be32 remote, __be32 local)
 {
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(local);
+	unsigned int h0 = HASH(remote);
+	unsigned int h1 = HASH(local);
 	struct ip_tunnel *t;
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
@@ -169,12 +168,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
 	return NULL;
 }
 
-static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
+static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn,
 		struct ip_tunnel_parm *parms)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
-	unsigned h = 0;
+	unsigned int h = 0;
 	int prio = 0;
 
 	if (remote) {
@@ -188,7 +187,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn,
 	return &ipn->tunnels[prio][h];
 }
 
-static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
+static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn,
 		struct ip_tunnel *t)
 {
 	return __ipip_bucket(ipn, &t->parms);
@@ -196,13 +195,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn,
 
 static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp;
-
-	for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) {
-		if (t == *tp) {
-			spin_lock_bh(&ipip_lock);
-			*tp = t->next;
-			spin_unlock_bh(&ipip_lock);
+	struct ip_tunnel __rcu **tp;
+	struct ip_tunnel *iter;
+
+	for (tp = ipip_bucket(ipn, t);
+	     (iter = rtnl_dereference(*tp)) != NULL;
+	     tp = &iter->next) {
+		if (t == iter) {
+			rcu_assign_pointer(*tp, t->next);
 			break;
 		}
 	}
@@ -210,12 +210,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t)
 
 static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp = ipip_bucket(ipn, t);
+	struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t);
 
-	spin_lock_bh(&ipip_lock);
-	t->next = *tp;
+	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
-	spin_unlock_bh(&ipip_lock);
 }
 
 static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
@@ -223,12 +221,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
-	struct ip_tunnel *t, **tp, *nt;
+	struct ip_tunnel *t, *nt;
+	struct ip_tunnel __rcu **tp;
 	struct net_device *dev;
 	char name[IFNAMSIZ];
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
-	for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) {
+	for (tp = __ipip_bucket(ipn, parms);
+		 (t = rtnl_dereference(*tp)) != NULL;
+		 tp = &t->next) {
 		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr)
 			return t;
 	}
@@ -268,16 +269,15 @@ failed_free:
 	return NULL;
 }
 
+/* called with RTNL */
 static void ipip_tunnel_uninit(struct net_device *dev)
 {
 	struct net *net = dev_net(dev);
 	struct ipip_net *ipn = net_generic(net, ipip_net_id);
 
-	if (dev == ipn->fb_tunnel_dev) {
-		spin_lock_bh(&ipip_lock);
-		ipn->tunnels_wc[0] = NULL;
-		spin_unlock_bh(&ipip_lock);
-	} else
+	if (dev == ipn->fb_tunnel_dev)
+		rcu_assign_pointer(ipn->tunnels_wc[0], NULL);
+	else
 		ipip_tunnel_unlink(ipn, netdev_priv(dev));
 	dev_put(dev);
 }
@@ -741,7 +741,7 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
 	iph->ihl		= 5;
 
 	dev_hold(dev);
-	ipn->tunnels_wc[0]	= tunnel;
+	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
 }
 
 static struct xfrm_tunnel ipip_handler __read_mostly = {
@@ -760,11 +760,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head)
 	for (prio = 1; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t = ipn->tunnels[prio][h];
+			struct ip_tunnel *t;
 
+			t = rtnl_dereference(ipn->tunnels[prio][h]);
 			while (t != NULL) {
 				unregister_netdevice_queue(t->dev, head);
-				t = t->next;
+				t = rtnl_dereference(t->next);
 			}
 		}
 	}
-- 
cgit v1.2.3-58-ga151


From 1507850b400492fdedc3064d3b8db5e9a1c871e3 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 15 Sep 2010 11:07:53 +0000
Subject: gre: get rid of ipgre_lock

As RTNL is held while doing tunnels inserts and deletes, we can remove
ipgre_lock spinlock. My initial RCU conversion was conservative and
converted the rwlock to spinlock, with no RTNL requirement.

Use appropriate rcu annotations and modern lockdep checks as well.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 61 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 32 insertions(+), 29 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 85176895495a..fc20e687e933 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -129,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev);
 
 static int ipgre_net_id __read_mostly;
 struct ipgre_net {
-	struct ip_tunnel *tunnels[4][HASH_SIZE];
+	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
 
 	struct net_device *fb_tunnel_dev;
 };
@@ -159,9 +159,8 @@ struct ipgre_net {
 #define tunnels_l	tunnels[1]
 #define tunnels_wc	tunnels[0]
 /*
- * Locking : hash tables are protected by RCU and a spinlock
+ * Locking : hash tables are protected by RCU and RTNL
  */
-static DEFINE_SPINLOCK(ipgre_lock);
 
 #define for_each_ip_tunnel_rcu(start) \
 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
@@ -174,8 +173,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 {
 	struct net *net = dev_net(dev);
 	int link = dev->ifindex;
-	unsigned h0 = HASH(remote);
-	unsigned h1 = HASH(key);
+	unsigned int h0 = HASH(remote);
+	unsigned int h1 = HASH(key);
 	struct ip_tunnel *t, *cand = NULL;
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
@@ -290,13 +289,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
 	return NULL;
 }
 
-static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
+static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
 		struct ip_tunnel_parm *parms)
 {
 	__be32 remote = parms->iph.daddr;
 	__be32 local = parms->iph.saddr;
 	__be32 key = parms->i_key;
-	unsigned h = HASH(key);
+	unsigned int h = HASH(key);
 	int prio = 0;
 
 	if (local)
@@ -309,7 +308,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
 	return &ign->tunnels[prio][h];
 }
 
-static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
+static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
 		struct ip_tunnel *t)
 {
 	return __ipgre_bucket(ign, &t->parms);
@@ -317,23 +316,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
 
 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp = ipgre_bucket(ign, t);
+	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
 
-	spin_lock_bh(&ipgre_lock);
-	t->next = *tp;
+	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
 	rcu_assign_pointer(*tp, t);
-	spin_unlock_bh(&ipgre_lock);
 }
 
 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
 {
-	struct ip_tunnel **tp;
-
-	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
-		if (t == *tp) {
-			spin_lock_bh(&ipgre_lock);
-			*tp = t->next;
-			spin_unlock_bh(&ipgre_lock);
+	struct ip_tunnel __rcu **tp;
+	struct ip_tunnel *iter;
+
+	for (tp = ipgre_bucket(ign, t);
+	     (iter = rtnl_dereference(*tp)) != NULL;
+	     tp = &iter->next) {
+		if (t == iter) {
+			rcu_assign_pointer(*tp, t->next);
 			break;
 		}
 	}
@@ -347,10 +345,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 	__be32 local = parms->iph.saddr;
 	__be32 key = parms->i_key;
 	int link = parms->link;
-	struct ip_tunnel *t, **tp;
+	struct ip_tunnel *t;
+	struct ip_tunnel __rcu **tp;
 	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
 
-	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
+	for (tp = __ipgre_bucket(ign, parms);
+	     (t = rtnl_dereference(*tp)) != NULL;
+	     tp = &t->next)
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr &&
 		    key == t->parms.i_key &&
@@ -361,7 +362,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
 	return t;
 }
 
-static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
+static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
 		struct ip_tunnel_parm *parms, int create)
 {
 	struct ip_tunnel *t, *nt;
@@ -669,7 +670,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	u8     tos;
 	__be16 df;
 	struct rtable *rt;     			/* Route to the other host */
-	struct net_device *tdev;			/* Device to other host */
+	struct net_device *tdev;		/* Device to other host */
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
 	int    gre_hlen;
@@ -1013,7 +1014,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
 					break;
 				}
 			} else {
-				unsigned nflags = 0;
+				unsigned int nflags = 0;
 
 				t = netdev_priv(dev);
 
@@ -1126,7 +1127,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
 
 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
 			unsigned short type,
-			const void *daddr, const void *saddr, unsigned len)
+			const void *daddr, const void *saddr, unsigned int len)
 {
 	struct ip_tunnel *t = netdev_priv(dev);
 	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
@@ -1275,7 +1276,7 @@ static void ipgre_fb_tunnel_init(struct net_device *dev)
 	tunnel->hlen		= sizeof(struct iphdr) + 4;
 
 	dev_hold(dev);
-	ign->tunnels_wc[0]	= tunnel;
+	rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
 }
 
 
@@ -1291,11 +1292,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
 	for (prio = 0; prio < 4; prio++) {
 		int h;
 		for (h = 0; h < HASH_SIZE; h++) {
-			struct ip_tunnel *t = ign->tunnels[prio][h];
+			struct ip_tunnel *t;
+
+			t = rtnl_dereference(ign->tunnels[prio][h]);
 
 			while (t != NULL) {
 				unregister_netdevice_queue(t->dev, head);
-				t = t->next;
+				t = rtnl_dereference(t->next);
 			}
 		}
 	}
@@ -1522,7 +1525,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
 		t = nt;
 
 		if (dev->type != ARPHRD_ETHER) {
-			unsigned nflags = 0;
+			unsigned int nflags = 0;
 
 			if (ipv4_is_multicast(p.iph.daddr))
 				nflags = IFF_BROADCAST;
-- 
cgit v1.2.3-58-ga151


From 95ae6b228f814fc0528d0506ee9f18ac333d6851 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 15 Sep 2010 04:04:31 +0000
Subject: ipv4: ip_ptr cleanups

dev->ip_ptr is protected by rtnl and rcu.

Yet some places don't use appropriate primitives and/or locking rules.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/plip.c           |  8 ++++++--
 drivers/net/via-velocity.h   | 11 +++++++----
 drivers/net/wan/hdlc_cisco.c |  4 +++-
 include/linux/inetdevice.h   | 14 +++++---------
 include/linux/netdevice.h    |  2 +-
 net/core/dev.c               |  2 +-
 net/ipv4/devinet.c           |  4 ++--
 net/ipv4/ipmr.c              |  2 +-
 net/mac80211/main.c          |  2 +-
 9 files changed, 27 insertions(+), 22 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/plip.c b/drivers/net/plip.c
index 7e82a82422cf..ca4df7f4cf21 100644
--- a/drivers/net/plip.c
+++ b/drivers/net/plip.c
@@ -995,8 +995,10 @@ plip_tx_packet(struct sk_buff *skb, struct net_device *dev)
 static void
 plip_rewrite_address(const struct net_device *dev, struct ethhdr *eth)
 {
-	const struct in_device *in_dev = dev->ip_ptr;
+	const struct in_device *in_dev;
 
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		/* Any address will do - we take the first */
 		const struct in_ifaddr *ifa = in_dev->ifa_list;
@@ -1006,6 +1008,7 @@ plip_rewrite_address(const struct net_device *dev, struct ethhdr *eth)
 			memcpy(eth->h_dest+2, &ifa->ifa_address, 4);
 		}
 	}
+	rcu_read_unlock();
 }
 
 static int
@@ -1088,7 +1091,8 @@ plip_open(struct net_device *dev)
 	   when the device address isn't identical to the address of a
 	   received frame, the kernel incorrectly drops it).             */
 
-	if ((in_dev=dev->ip_ptr) != NULL) {
+	in_dev=__in_dev_get_rtnl(dev);
+	if (in_dev) {
 		/* Any address will do - we take the first. We already
 		   have the first two bytes filled with 0xfc, from
 		   plip_init_dev(). */
diff --git a/drivers/net/via-velocity.h b/drivers/net/via-velocity.h
index f7b33ae7a703..b5e120b0074b 100644
--- a/drivers/net/via-velocity.h
+++ b/drivers/net/via-velocity.h
@@ -1504,22 +1504,25 @@ struct velocity_info {
  *	addresses on this chain then we use the first - multi-IP WOL is not
  *	supported.
  *
- *	CHECK ME: locking
  */
 
 static inline int velocity_get_ip(struct velocity_info *vptr)
 {
-	struct in_device *in_dev = (struct in_device *) vptr->dev->ip_ptr;
+	struct in_device *in_dev;
 	struct in_ifaddr *ifa;
+	int res = -ENOENT;
 
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(vptr->dev);
 	if (in_dev != NULL) {
 		ifa = (struct in_ifaddr *) in_dev->ifa_list;
 		if (ifa != NULL) {
 			memcpy(vptr->ip_addr, &ifa->ifa_address, 4);
-			return 0;
+			res = 0;
 		}
 	}
-	return -ENOENT;
+	rcu_read_unlock();
+	return res;
 }
 
 /**
diff --git a/drivers/net/wan/hdlc_cisco.c b/drivers/net/wan/hdlc_cisco.c
index b38ffa149aba..b1e5e5b69c2a 100644
--- a/drivers/net/wan/hdlc_cisco.c
+++ b/drivers/net/wan/hdlc_cisco.c
@@ -191,7 +191,8 @@ static int cisco_rx(struct sk_buff *skb)
 
 		switch (ntohl (cisco_data->type)) {
 		case CISCO_ADDR_REQ: /* Stolen from syncppp.c :-) */
-			in_dev = dev->ip_ptr;
+			rcu_read_lock();
+			in_dev = __in_dev_get_rcu(dev);
 			addr = 0;
 			mask = ~cpu_to_be32(0); /* is the mask correct? */
 
@@ -211,6 +212,7 @@ static int cisco_rx(struct sk_buff *skb)
 				cisco_keepalive_send(dev, CISCO_ADDR_REPLY,
 						     addr, mask);
 			}
+			rcu_read_unlock();
 			dev_kfree_skb_any(skb);
 			return NET_RX_SUCCESS;
 
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 2be1a1a2beb9..1ec09bb4a3ab 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -9,6 +9,7 @@
 #include <linux/rcupdate.h>
 #include <linux/timer.h>
 #include <linux/sysctl.h>
+#include <linux/rtnetlink.h>
 
 enum
 {
@@ -198,14 +199,10 @@ static __inline__ int bad_mask(__be32 mask, __be32 addr)
 
 static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
 {
-	struct in_device *in_dev = dev->ip_ptr;
-	if (in_dev)
-		in_dev = rcu_dereference(in_dev);
-	return in_dev;
+	return rcu_dereference(dev->ip_ptr);
 }
 
-static __inline__ struct in_device *
-in_dev_get(const struct net_device *dev)
+static inline struct in_device *in_dev_get(const struct net_device *dev)
 {
 	struct in_device *in_dev;
 
@@ -217,10 +214,9 @@ in_dev_get(const struct net_device *dev)
 	return in_dev;
 }
 
-static __inline__ struct in_device *
-__in_dev_get_rtnl(const struct net_device *dev)
+static inline struct in_device *__in_dev_get_rtnl(const struct net_device *dev)
 {
-	return (struct in_device*)dev->ip_ptr;
+	return rcu_dereference_check(dev->ip_ptr, lockdep_rtnl_is_held());
 }
 
 extern void in_dev_finish_destroy(struct in_device *idev);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index af05186d5b36..8992fffb8104 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -942,7 +942,7 @@ struct net_device {
 	void			*dsa_ptr;	/* dsa specific data */
 #endif
 	void 			*atalk_ptr;	/* AppleTalk link 	*/
-	void			*ip_ptr;	/* IPv4 specific data	*/
+	struct in_device __rcu	*ip_ptr;	/* IPv4 specific data	*/
 	void                    *dn_ptr;        /* DECnet specific data */
 	void                    *ip6_ptr;       /* IPv6 specific data */
 	void			*ec_ptr;	/* Econet specific data	*/
diff --git a/net/core/dev.c b/net/core/dev.c
index fc2dc933bee5..5bdce97b8175 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5286,7 +5286,7 @@ void netdev_run_todo(void)
 
 		/* paranoia */
 		BUG_ON(atomic_read(&dev->refcnt));
-		WARN_ON(dev->ip_ptr);
+		WARN_ON(rcu_dereference_raw(dev->ip_ptr));
 		WARN_ON(dev->ip6_ptr);
 		WARN_ON(dev->dn_ptr);
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index da14c49284f4..c2ff48fa18c7 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev)
 		inet_free_ifa(ifa);
 	}
 
-	dev->ip_ptr = NULL;
+	rcu_assign_pointer(dev->ip_ptr, NULL);
 
 	devinet_sysctl_unregister(in_dev);
 	neigh_parms_release(&arp_tbl, in_dev->arp_parms);
@@ -1059,7 +1059,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_REGISTER:
 		printk(KERN_DEBUG "inetdev_event: bug\n");
-		dev->ip_ptr = NULL;
+		rcu_assign_pointer(dev->ip_ptr, NULL);
 		break;
 	case NETDEV_UP:
 		if (!inetdev_valid_mtu(dev->mtu))
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 179fcab866fc..10b24c02deb0 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -724,7 +724,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	case 0:
 		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
 			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
-			if (dev && dev->ip_ptr == NULL) {
+			if (dev && __in_dev_get_rtnl(dev) == NULL) {
 				dev_put(dev);
 				return -EADDRNOTAVAIL;
 			}
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 4935b843bcca..b8cf2821f00d 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -362,7 +362,7 @@ static int ieee80211_ifa_changed(struct notifier_block *nb,
 	if (sdata->vif.type != NL80211_IFTYPE_STATION)
 		return NOTIFY_DONE;
 
-	idev = sdata->dev->ip_ptr;
+	idev = __in_dev_get_rtnl(sdata->dev);
 	if (!idev)
 		return NOTIFY_DONE;
 
-- 
cgit v1.2.3-58-ga151


From 99ad3c53b36a056a472927de9c79eda231ecc6fe Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 16 Sep 2010 19:45:19 +0200
Subject: netfilter: nf_nat_core: don't check if the tuple is used if there is
 no other choice

Skip the nf_nat_used_tuple() check to save some CPU cycles when there is no
other choice (the specified range has range->min.all == range->max.all).

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_core.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d9b93c..2c084b3a8f0c 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -262,11 +262,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
 	proto = __nf_nat_proto_find(orig_tuple->dst.protonum);
 
 	/* Only bother mapping if it's not already in range and unique */
-	if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) &&
-	    (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) ||
-	     proto->in_range(tuple, maniptype, &range->min, &range->max)) &&
-	    !nf_nat_used_tuple(tuple, ct))
-		goto out;
+	if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) {
+		if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) {
+			if (proto->in_range(tuple, maniptype, &range->min,
+					    &range->max) &&
+			    (range->min.all == range->max.all ||
+			     !nf_nat_used_tuple(tuple, ct)))
+				goto out;
+		} else if (!nf_nat_used_tuple(tuple, ct)) {
+			goto out;
+		}
+	}
 
 	/* Last change: get protocol to try to obtain unique tuple. */
 	proto->unique_tuple(tuple, range, maniptype, ct);
-- 
cgit v1.2.3-58-ga151


From ed0b6d7581b54455062f09ccac123814e70cd02f Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Thu, 16 Sep 2010 19:47:51 +0200
Subject: netfilter: nf_nat: no IP_NAT_RANGE_MAP_IPS flags when
 alloc_null_binding()

In alloc_null_binding(), no IP_NAT_RANGE_MAP_IPS in flags means no IP address
translation is needed. It isn't necessary to specify the address explicitly.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_rule.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c
index ebbd319f62f5..21c30426480b 100644
--- a/net/ipv4/netfilter/nf_nat_rule.c
+++ b/net/ipv4/netfilter/nf_nat_rule.c
@@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
 {
 	/* Force range to this IP; let proto decide mapping for
 	   per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
-	   Use reply in case it's already been mangled (eg local packet).
 	*/
-	__be32 ip
-		= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
-		   ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
-		   : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
-	struct nf_nat_range range
-		= { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
-
-	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
+	struct nf_nat_range range;
+
+	range.flags = 0;
+	pr_debug("Allocating NULL binding for %p (%pI4)\n", ct,
+		 HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ?
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip :
+		 &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
+
 	return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
 }
 
-- 
cgit v1.2.3-58-ga151


From 462fb2af9788a82a534f8184abfde31574e1cfa0 Mon Sep 17 00:00:00 2001
From: Bandan Das <bandan.das@stratus.com>
Date: Sun, 19 Sep 2010 09:34:33 +0000
Subject: bridge : Sanitize skb before it enters the IP stack

Related discussion here : http://lkml.org/lkml/2010/9/3/16

Introduce a function br_parse_ip_options that will audit the
skb and possibly refill IP options before a packet enters the
IP stack. If no options are present, the function will zero out
the skb cb area so that it is not misinterpreted as options by some
unsuspecting IP layer routine. If packet consistency fails, drop it.

Signed-off-by: Bandan Das <bandan.das@stratus.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_netfilter.c | 107 +++++++++++++++++++++++++++++++++-------------
 net/ipv4/ip_options.c     |   3 +-
 2 files changed, 80 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 137f23259a93..77f7b5fda45a 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -209,6 +209,72 @@ static inline void nf_bridge_update_protocol(struct sk_buff *skb)
 		skb->protocol = htons(ETH_P_PPP_SES);
 }
 
+/* When handing a packet over to the IP layer
+ * check whether we have a skb that is in the
+ * expected format
+ */
+
+int br_parse_ip_options(struct sk_buff *skb)
+{
+	struct ip_options *opt;
+	struct iphdr *iph;
+	struct net_device *dev = skb->dev;
+	u32 len;
+
+	iph = ip_hdr(skb);
+	opt = &(IPCB(skb)->opt);
+
+	/* Basic sanity checks */
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error;
+
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < (iph->ihl*4))
+		goto inhdr_error;
+
+	if (pskb_trim_rcsum(skb, len)) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	/* Zero out the CB buffer if no options present */
+	if (iph->ihl == 5) {
+		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+		return 0;
+	}
+
+	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+	if (ip_options_compile(dev_net(dev), opt, skb))
+		goto inhdr_error;
+
+	/* Check correct handling of SRR option */
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+		if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev))
+			goto drop;
+
+		if (ip_options_rcv_srr(skb))
+			goto drop;
+	}
+
+	return 0;
+
+inhdr_error:
+	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* Fill in the header for fragmented IP packets handled by
  * the IPv4 connection tracking code.
  */
@@ -549,7 +615,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 {
 	struct net_bridge_port *p;
 	struct net_bridge *br;
-	struct iphdr *iph;
 	__u32 len = nf_bridge_encap_header_len(skb);
 
 	if (unlikely(!pskb_may_pull(skb, len)))
@@ -578,28 +643,9 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 
 	nf_bridge_pull_encap_header_rcsum(skb);
 
-	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
-		goto inhdr_error;
-
-	iph = ip_hdr(skb);
-	if (iph->ihl < 5 || iph->version != 4)
-		goto inhdr_error;
-
-	if (!pskb_may_pull(skb, 4 * iph->ihl))
-		goto inhdr_error;
-
-	iph = ip_hdr(skb);
-	if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0)
-		goto inhdr_error;
-
-	len = ntohs(iph->tot_len);
-	if (skb->len < len || len < 4 * iph->ihl)
-		goto inhdr_error;
-
-	pskb_trim_rcsum(skb, len);
-
-	/* BUG: Should really parse the IP options here. */
-	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	if (br_parse_ip_options(skb))
+		/* Drop invalid packet */
+		goto out;
 
 	nf_bridge_put(skb->nf_bridge);
 	if (!nf_bridge_alloc(skb))
@@ -614,8 +660,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb,
 
 	return NF_STOLEN;
 
-inhdr_error:
-//      IP_INC_STATS_BH(IpInHdrErrors);
 out:
 	return NF_DROP;
 }
@@ -759,14 +803,19 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb,
 #if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
+	int ret;
+
 	if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
-		/* BUG: Should really parse the IP options here. */
-		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-		return ip_fragment(skb, br_dev_queue_push_xmit);
+		if (br_parse_ip_options(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		ret = ip_fragment(skb, br_dev_queue_push_xmit);
 	} else
-		return br_dev_queue_push_xmit(skb);
+		ret = br_dev_queue_push_xmit(skb);
+
+	return ret;
 }
 #else
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index ba9836c488ed..1906fa35860c 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -466,7 +466,7 @@ error:
 	}
 	return -EINVAL;
 }
-
+EXPORT_SYMBOL(ip_options_compile);
 
 /*
  *	Undo all the changes done by ip_options_compile().
@@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb)
 	}
 	return 0;
 }
+EXPORT_SYMBOL(ip_options_rcv_srr);
-- 
cgit v1.2.3-58-ga151


From 8990f468ae9010ab0af4be8f51bf7ab833a67202 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 20 Sep 2010 00:12:11 +0000
Subject: net: rx_dropped accounting

Under load, netif_rx() can drop incoming packets, but administrators don't
have a chance to spot which device needs some tuning (RPS activation, for
example).

This patch adds rx_dropped accounting in vlans and tunnels.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.h      | 2 ++
 net/8021q/vlan_dev.c  | 9 +++++++--
 net/ipv4/ip_gre.c     | 6 ++++--
 net/ipv4/ipip.c       | 5 ++++-
 net/ipv6/ip6_tunnel.c | 5 ++++-
 net/ipv6/ip6mr.c      | 4 +++-
 net/ipv6/sit.c        | 5 ++++-
 7 files changed, 28 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index 8d9503ad01da..b26ce343072c 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -25,6 +25,7 @@ struct vlan_priority_tci_mapping {
  *	@rx_multicast: number of received multicast packets
  *	@syncp: synchronization point for 64bit counters
  *	@rx_errors: number of errors
+ *	@rx_dropped: number of dropped packets
  */
 struct vlan_rx_stats {
 	u64			rx_packets;
@@ -32,6 +33,7 @@ struct vlan_rx_stats {
 	u64			rx_multicast;
 	struct u64_stats_sync	syncp;
 	unsigned long		rx_errors;
+	unsigned long		rx_dropped;
 };
 
 /**
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 3bccdd12a264..94a1feddeb49 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -225,7 +225,10 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	netif_rx(skb);
+	if (unlikely(netif_rx(skb) == NET_RX_DROP)) {
+		if (rx_stats)
+			rx_stats->rx_dropped++;
+	}
 	rcu_read_unlock();
 	return NET_RX_SUCCESS;
 
@@ -843,13 +846,15 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
 			accum.rx_packets += rxpackets;
 			accum.rx_bytes   += rxbytes;
 			accum.rx_multicast += rxmulticast;
-			/* rx_errors is an ulong, not protected by syncp */
+			/* rx_errors, rx_dropped are ulong, not protected by syncp */
 			accum.rx_errors  += p->rx_errors;
+			accum.rx_dropped += p->rx_dropped;
 		}
 		stats->rx_packets = accum.rx_packets;
 		stats->rx_bytes   = accum.rx_bytes;
 		stats->rx_errors  = accum.rx_errors;
 		stats->multicast  = accum.rx_multicast;
+		stats->rx_dropped = accum.rx_dropped;
 	}
 	return stats;
 }
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index fc20e687e933..714b6a80361d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -647,9 +647,11 @@ static int ipgre_rcv(struct sk_buff *skb)
 		skb_reset_network_header(skb);
 		ipgre_ecn_decapsulate(iph, skb);
 
-		netif_rx(skb);
+		if (netif_rx(skb) == NET_RX_DROP)
+			stats->rx_dropped++;
+
 		rcu_read_unlock();
-		return(0);
+		return 0;
 	}
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 8de8888dc95a..babd25278106 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -377,7 +377,10 @@ static int ipip_rcv(struct sk_buff *skb)
 		skb_tunnel_rx(skb, tunnel->dev);
 
 		ipip_ecn_decapsulate(iph, skb);
-		netif_rx(skb);
+
+		if (netif_rx(skb) == NET_RX_DROP)
+			tunnel->dev->stats.rx_dropped++;
+
 		rcu_read_unlock();
 		return 0;
 	}
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9289cecac4de..f6d9f683543e 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -725,7 +725,10 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 		skb_tunnel_rx(skb, t->dev);
 
 		dscp_ecn_decapsulate(t, ipv6h, skb);
-		netif_rx(skb);
+
+		if (netif_rx(skb) == NET_RX_DROP)
+			t->dev->stats.rx_dropped++;
+
 		rcu_read_unlock();
 		return 0;
 	}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 66078dad7fe8..2640c9be589d 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -666,7 +666,9 @@ static int pim6_rcv(struct sk_buff *skb)
 
 	skb_tunnel_rx(skb, reg_dev);
 
-	netif_rx(skb);
+	if (netif_rx(skb) == NET_RX_DROP)
+		reg_dev->stats.rx_dropped++;
+
 	dev_put(reg_dev);
 	return 0;
  drop:
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 6822481ff766..8a0399822230 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -564,7 +564,10 @@ static int ipip6_rcv(struct sk_buff *skb)
 		skb_tunnel_rx(skb, tunnel->dev);
 
 		ipip6_ecn_decapsulate(iph, skb);
-		netif_rx(skb);
+
+		if (netif_rx(skb) == NET_RX_DROP)
+			tunnel->dev->stats.rx_dropped++;
+
 		rcu_read_unlock();
 		return 0;
 	}
-- 
cgit v1.2.3-58-ga151


From 5b92b61f3891517d18d0573ad2c939c81b59ecfe Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Wed, 22 Sep 2010 08:34:12 +0200
Subject: netfilter: nf_nat: better error handling of nf_ct_expect_related() in
 helpers

This patch improves the situation in which the expectation table is
full for conntrack NAT helpers. Basically, we give up if we don't
find a place in the table instead of looping over nf_ct_expect_related()
with a different port (we should only retry with another port if it
returns -EBUSY; for -EMFILE or -ESHUTDOWN I think that it's better to
give up right away).

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/nf_nat_amanda.c |  9 ++++++-
 net/ipv4/netfilter/nf_nat_ftp.c    |  9 ++++++-
 net/ipv4/netfilter/nf_nat_h323.c   | 53 +++++++++++++++++++++++++++++++++-----
 net/ipv4/netfilter/nf_nat_irc.c    |  9 ++++++-
 net/ipv4/netfilter/nf_nat_sip.c    | 27 ++++++++++++++++---
 5 files changed, 93 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c
index c31b87668250..0f23b3f06df0 100644
--- a/net/ipv4/netfilter/nf_nat_amanda.c
+++ b/net/ipv4/netfilter/nf_nat_amanda.c
@@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb,
 
 	/* Try to get same port: if not, try to change it. */
 	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c
index 86e0e84ff0a0..dc73abb3fe27 100644
--- a/net/ipv4/netfilter/nf_nat_ftp.c
+++ b/net/ipv4/netfilter/nf_nat_ftp.c
@@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb,
 
 	/* Try to get same port: if not, try to change it. */
 	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c
index 5045196d853c..790f3160e012 100644
--- a/net/ipv4/netfilter/nf_nat_h323.c
+++ b/net/ipv4/netfilter/nf_nat_h323.c
@@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct,
 	/* Try to get a pair of ports. */
 	for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port);
 	     nated_port != 0; nated_port += 2) {
+		int ret;
+
 		rtp_exp->tuple.dst.u.udp.port = htons(nated_port);
-		if (nf_ct_expect_related(rtp_exp) == 0) {
+		ret = nf_ct_expect_related(rtp_exp);
+		if (ret == 0) {
 			rtcp_exp->tuple.dst.u.udp.port =
 			    htons(nated_port + 1);
-			if (nf_ct_expect_related(rtcp_exp) == 0)
+			ret = nf_ct_expect_related(rtcp_exp);
+			if (ret == 0)
+				break;
+			else if (ret != -EBUSY) {
+				nf_ct_unexpect_related(rtp_exp);
+				nated_port = 0;
 				break;
-			nf_ct_unexpect_related(rtp_exp);
+			}
+		} else if (ret != -EBUSY) {
+			nated_port = 0;
+			break;
 		}
 	}
 
@@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct,
 
 	/* Try to get same port: if not, try to change it. */
 	for (; nated_port != 0; nated_port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(nated_port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			nated_port = 0;
 			break;
+		}
 	}
 
 	if (nated_port == 0) {	/* No port available */
@@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct,
 
 	/* Try to get same port: if not, try to change it. */
 	for (; nated_port != 0; nated_port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(nated_port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
 			break;
+		else if (ret != -EBUSY) {
+			nated_port = 0;
+			break;
+		}
 	}
 
 	if (nated_port == 0) {	/* No port available */
@@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct,
 
 	/* Try to get same port: if not, try to change it. */
 	for (; nated_port != 0; nated_port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(nated_port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			nated_port = 0;
 			break;
+		}
 	}
 
 	if (nated_port == 0) {	/* No port available */
@@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct,
 
 	/* Try to get same port: if not, try to change it. */
 	for (nated_port = ntohs(port); nated_port != 0; nated_port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(nated_port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
 			break;
+		else if (ret != -EBUSY) {
+			nated_port = 0;
+			break;
+		}
 	}
 
 	if (nated_port == 0) {	/* No port available */
diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c
index ea83a886b03e..535e1a802356 100644
--- a/net/ipv4/netfilter/nf_nat_irc.c
+++ b/net/ipv4/netfilter/nf_nat_irc.c
@@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb,
 
 	/* Try to get same port: if not, try to change it. */
 	for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.tcp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
index 11b538deaaec..e40cf7816fdb 100644
--- a/net/ipv4/netfilter/nf_nat_sip.c
+++ b/net/ipv4/netfilter/nf_nat_sip.c
@@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff,
 	exp->expectfn = ip_nat_sip_expected;
 
 	for (; port != 0; port++) {
+		int ret;
+
 		exp->tuple.dst.u.udp.port = htons(port);
-		if (nf_ct_expect_related(exp) == 0)
+		ret = nf_ct_expect_related(exp);
+		if (ret == 0)
+			break;
+		else if (ret != -EBUSY) {
+			port = 0;
 			break;
+		}
 	}
 
 	if (port == 0)
@@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff,
 	/* Try to get same pair of ports: if not, try to change them. */
 	for (port = ntohs(rtp_exp->tuple.dst.u.udp.port);
 	     port != 0; port += 2) {
+		int ret;
+
 		rtp_exp->tuple.dst.u.udp.port = htons(port);
-		if (nf_ct_expect_related(rtp_exp) != 0)
+		ret = nf_ct_expect_related(rtp_exp);
+		if (ret == -EBUSY)
 			continue;
+		else if (ret < 0) {
+			port = 0;
+			break;
+		}
 		rtcp_exp->tuple.dst.u.udp.port = htons(port + 1);
-		if (nf_ct_expect_related(rtcp_exp) == 0)
+		ret = nf_ct_expect_related(rtcp_exp);
+		if (ret == 0)
 			break;
-		nf_ct_unexpect_related(rtp_exp);
+		else if (ret != -EBUSY) {
+			nf_ct_unexpect_related(rtp_exp);
+			port = 0;
+			break;
+		}
 	}
 
 	if (port == 0)
-- 
cgit v1.2.3-58-ga151


From a02cec2155fbea457eca8881870fd2de1a4c4c76 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 22 Sep 2010 20:43:57 +0000
Subject: net: return operator cleanup

Change "return (EXPR);" to "return EXPR;"

return is not a function, parentheses are not required.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/atmdev.h                  |  2 +-
 include/linux/etherdevice.h             |  4 +--
 include/linux/netdevice.h               |  2 +-
 include/linux/skbuff.h                  |  6 ++---
 include/net/bluetooth/hci_core.h        |  2 +-
 include/net/bluetooth/l2cap.h           |  2 +-
 include/net/inet_ecn.h                  |  2 +-
 include/net/ip.h                        |  4 +--
 include/net/ipv6.h                      | 35 +++++++++++++-------------
 include/net/irda/irlap.h                |  2 +-
 include/net/irda/irlmp.h                |  2 +-
 include/net/irda/irttp.h                |  2 +-
 include/net/sch_generic.h               |  2 +-
 include/net/sctp/sctp.h                 | 12 ++++-----
 include/net/sctp/sm.h                   | 10 ++++----
 include/net/sctp/structs.h              |  2 +-
 include/net/sctp/tsnmap.h               |  2 +-
 include/net/tipc/tipc_msg.h             | 10 ++++----
 net/802/fc.c                            |  2 +-
 net/802/fddi.c                          | 12 ++++-----
 net/802/hippi.c                         |  2 +-
 net/802/tr.c                            |  2 +-
 net/8021q/vlan_core.c                   |  2 +-
 net/9p/client.c                         |  4 +--
 net/bluetooth/rfcomm/core.c             |  4 +--
 net/core/flow.c                         |  4 +--
 net/core/neighbour.c                    |  6 ++---
 net/core/utils.c                        |  2 +-
 net/dccp/ccids/lib/loss_interval.c      |  2 +-
 net/econet/af_econet.c                  |  4 +--
 net/ethernet/eth.c                      |  2 +-
 net/ipv4/arp.c                          |  2 +-
 net/ipv4/datagram.c                     |  2 +-
 net/ipv4/inet_diag.c                    |  2 +-
 net/ipv4/ip_fragment.c                  |  4 +--
 net/ipv4/ip_gre.c                       |  2 +-
 net/ipv4/netfilter/arp_tables.c         |  2 +-
 net/ipv4/route.c                        |  2 +-
 net/ipv4/tcp_input.c                    | 10 ++++----
 net/ipv4/tcp_minisocks.c                |  2 +-
 net/ipv4/tcp_output.c                   |  8 +++---
 net/ipv4/tcp_westwood.c                 |  2 +-
 net/ipv6/addrconf.c                     |  2 +-
 net/ipv6/addrlabel.c                    |  5 ++--
 net/ipv6/af_inet6.c                     |  6 ++---
 net/ipv6/exthdrs_core.c                 |  4 +--
 net/ipv6/ip6_output.c                   |  4 +--
 net/ipv6/ndisc.c                        |  8 +++---
 net/ipv6/netfilter/ip6_tables.c         | 14 +++++------
 net/ipv6/raw.c                          | 12 ++++-----
 net/ipv6/route.c                        | 14 +++++------
 net/ipv6/tcp_ipv6.c                     |  2 +-
 net/ipv6/xfrm6_policy.c                 |  2 +-
 net/irda/af_irda.c                      | 14 +++++------
 net/irda/discovery.c                    |  2 +-
 net/irda/ircomm/ircomm_tty.c            |  4 +--
 net/irda/irlmp.c                        |  2 +-
 net/irda/irlmp_frame.c                  |  2 +-
 net/irda/irnet/irnet_irda.c             | 22 ++++++++---------
 net/irda/irnet/irnet_ppp.c              |  8 +++---
 net/key/af_key.c                        |  4 +--
 net/mac80211/rate.c                     |  2 +-
 net/rfkill/input.c                      |  2 +-
 net/rose/rose_link.c                    |  4 +--
 net/sctp/protocol.c                     |  2 +-
 net/sctp/socket.c                       |  6 ++---
 net/sunrpc/auth_gss/auth_gss.c          |  2 +-
 net/sunrpc/auth_gss/gss_generic_token.c | 44 ++++++++++++++++-----------------
 net/sunrpc/auth_gss/gss_krb5_seqnum.c   |  2 +-
 net/sunrpc/auth_gss/gss_mech_switch.c   |  2 +-
 net/sunrpc/sched.c                      |  2 +-
 net/tipc/addr.c                         |  2 +-
 net/tipc/bcast.c                        |  2 +-
 net/tipc/bearer.c                       |  2 +-
 net/tipc/dbg.c                          |  4 +--
 net/tipc/link.c                         |  6 ++---
 net/tipc/link.h                         | 16 ++++++------
 net/tipc/msg.h                          |  6 ++---
 net/tipc/name_table.c                   |  2 +-
 net/tipc/node.c                         |  6 ++---
 net/tipc/port.h                         |  2 +-
 net/tipc/socket.c                       |  2 +-
 net/tipc/subscr.c                       |  2 +-
 net/wireless/core.h                     |  2 +-
 84 files changed, 220 insertions(+), 222 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/atmdev.h b/include/linux/atmdev.h
index f6481daf6e52..a8e4e832cdbb 100644
--- a/include/linux/atmdev.h
+++ b/include/linux/atmdev.h
@@ -449,7 +449,7 @@ void vcc_insert_socket(struct sock *sk);
 
 static inline int atm_guess_pdu2truesize(int size)
 {
-	return (SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info));
+	return SKB_DATA_ALIGN(size) + sizeof(struct skb_shared_info);
 }
 
 
diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index fb6aa6070921..f16a01081e15 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -71,7 +71,7 @@ static inline int is_zero_ether_addr(const u8 *addr)
  */
 static inline int is_multicast_ether_addr(const u8 *addr)
 {
-	return (0x01 & addr[0]);
+	return 0x01 & addr[0];
 }
 
 /**
@@ -82,7 +82,7 @@ static inline int is_multicast_ether_addr(const u8 *addr)
  */
 static inline int is_local_ether_addr(const u8 *addr)
 {
-	return (0x02 & addr[0]);
+	return 0x02 & addr[0];
 }
 
 /**
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index f7f1302138af..45dcda5bfda9 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1676,7 +1676,7 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
  */
 static inline int netif_is_multiqueue(const struct net_device *dev)
 {
-	return (dev->num_tx_queues > 1);
+	return dev->num_tx_queues > 1;
 }
 
 extern void netif_set_real_num_tx_queues(struct net_device *dev,
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 9e8085a89589..b2c41d19735c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -601,7 +601,7 @@ static inline int skb_queue_empty(const struct sk_buff_head *list)
 static inline bool skb_queue_is_last(const struct sk_buff_head *list,
 				     const struct sk_buff *skb)
 {
-	return (skb->next == (struct sk_buff *) list);
+	return skb->next == (struct sk_buff *)list;
 }
 
 /**
@@ -614,7 +614,7 @@ static inline bool skb_queue_is_last(const struct sk_buff_head *list,
 static inline bool skb_queue_is_first(const struct sk_buff_head *list,
 				      const struct sk_buff *skb)
 {
-	return (skb->prev == (struct sk_buff *) list);
+	return skb->prev == (struct sk_buff *)list;
 }
 
 /**
@@ -2156,7 +2156,7 @@ static inline u16 skb_get_rx_queue(const struct sk_buff *skb)
 
 static inline bool skb_rx_queue_recorded(const struct sk_buff *skb)
 {
-	return (skb->queue_mapping != 0);
+	return skb->queue_mapping != 0;
 }
 
 extern u16 skb_tx_hash(const struct net_device *dev,
diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h
index 4568b938ca35..ebec8c9a929d 100644
--- a/include/net/bluetooth/hci_core.h
+++ b/include/net/bluetooth/hci_core.h
@@ -233,7 +233,7 @@ static inline void inquiry_cache_init(struct hci_dev *hdev)
 static inline int inquiry_cache_empty(struct hci_dev *hdev)
 {
 	struct inquiry_cache *c = &hdev->inq_cache;
-	return (c->list == NULL);
+	return c->list == NULL;
 }
 
 static inline long inquiry_cache_age(struct hci_dev *hdev)
diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h
index 6c241444f902..c819c8bf9b68 100644
--- a/include/net/bluetooth/l2cap.h
+++ b/include/net/bluetooth/l2cap.h
@@ -414,7 +414,7 @@ static inline int l2cap_tx_window_full(struct sock *sk)
 	if (sub < 0)
 		sub += 64;
 
-	return (sub == pi->remote_tx_win);
+	return sub == pi->remote_tx_win;
 }
 
 #define __get_txseq(ctrl) ((ctrl) & L2CAP_CTRL_TXSEQ) >> 1
diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h
index 9b5d08f4f6e8..88bdd010d65d 100644
--- a/include/net/inet_ecn.h
+++ b/include/net/inet_ecn.h
@@ -27,7 +27,7 @@ static inline int INET_ECN_is_not_ect(__u8 dsfield)
 
 static inline int INET_ECN_is_capable(__u8 dsfield)
 {
-	return (dsfield & INET_ECN_ECT_0);
+	return dsfield & INET_ECN_ECT_0;
 }
 
 static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner)
diff --git a/include/net/ip.h b/include/net/ip.h
index 7691aca133db..dbee3fe260e1 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -238,9 +238,9 @@ int ip_decrease_ttl(struct iphdr *iph)
 static inline
 int ip_dont_fragment(struct sock *sk, struct dst_entry *dst)
 {
-	return (inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO ||
+	return  inet_sk(sk)->pmtudisc == IP_PMTUDISC_DO ||
 		(inet_sk(sk)->pmtudisc == IP_PMTUDISC_WANT &&
-		 !(dst_metric_locked(dst, RTAX_MTU))));
+		 !(dst_metric_locked(dst, RTAX_MTU)));
 }
 
 extern void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more);
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 1f8412410998..4a3cd2cd2f5e 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -262,7 +262,7 @@ static inline int ipv6_addr_scope(const struct in6_addr *addr)
 
 static inline int __ipv6_addr_src_scope(int type)
 {
-	return (type == IPV6_ADDR_ANY ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16));
+	return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16);
 }
 
 static inline int ipv6_addr_src_scope(const struct in6_addr *addr)
@@ -279,10 +279,10 @@ static inline int
 ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m,
 		     const struct in6_addr *a2)
 {
-	return (!!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) |
-		   ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) |
-		   ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) |
-		   ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])));
+	return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) |
+		  ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) |
+		  ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) |
+		  ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3]));
 }
 
 static inline void ipv6_addr_copy(struct in6_addr *a1, const struct in6_addr *a2)
@@ -317,10 +317,10 @@ static inline void ipv6_addr_set(struct in6_addr *addr,
 static inline int ipv6_addr_equal(const struct in6_addr *a1,
 				  const struct in6_addr *a2)
 {
-	return (((a1->s6_addr32[0] ^ a2->s6_addr32[0]) |
-		 (a1->s6_addr32[1] ^ a2->s6_addr32[1]) |
-		 (a1->s6_addr32[2] ^ a2->s6_addr32[2]) |
-		 (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0);
+	return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) |
+		(a1->s6_addr32[1] ^ a2->s6_addr32[1]) |
+		(a1->s6_addr32[2] ^ a2->s6_addr32[2]) |
+		(a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0;
 }
 
 static inline int __ipv6_prefix_equal(const __be32 *a1, const __be32 *a2,
@@ -373,20 +373,20 @@ int ip6_frag_match(struct inet_frag_queue *q, void *a);
 
 static inline int ipv6_addr_any(const struct in6_addr *a)
 {
-	return ((a->s6_addr32[0] | a->s6_addr32[1] | 
-		 a->s6_addr32[2] | a->s6_addr32[3] ) == 0); 
+	return (a->s6_addr32[0] | a->s6_addr32[1] |
+		a->s6_addr32[2] | a->s6_addr32[3]) == 0;
 }
 
 static inline int ipv6_addr_loopback(const struct in6_addr *a)
 {
-	return ((a->s6_addr32[0] | a->s6_addr32[1] |
-		 a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0);
+	return (a->s6_addr32[0] | a->s6_addr32[1] |
+		a->s6_addr32[2] | (a->s6_addr32[3] ^ htonl(1))) == 0;
 }
 
 static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
 {
-	return ((a->s6_addr32[0] | a->s6_addr32[1] |
-		 (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0);
+	return (a->s6_addr32[0] | a->s6_addr32[1] |
+		 (a->s6_addr32[2] ^ htonl(0x0000ffff))) == 0;
 }
 
 /*
@@ -395,8 +395,7 @@ static inline int ipv6_addr_v4mapped(const struct in6_addr *a)
  */
 static inline int ipv6_addr_orchid(const struct in6_addr *a)
 {
-	return ((a->s6_addr32[0] & htonl(0xfffffff0))
-		== htonl(0x20010010));
+	return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010);
 }
 
 static inline void ipv6_addr_set_v4mapped(const __be32 addr,
@@ -441,7 +440,7 @@ static inline int __ipv6_addr_diff(const void *token1, const void *token2, int a
 	 *	if returned value is greater than prefix length.
 	 *					--ANK (980803)
 	 */
-	return (addrlen << 5);
+	return addrlen << 5;
 }
 
 static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2)
diff --git a/include/net/irda/irlap.h b/include/net/irda/irlap.h
index 9d0c78ea92f5..17fcd964f9d9 100644
--- a/include/net/irda/irlap.h
+++ b/include/net/irda/irlap.h
@@ -282,7 +282,7 @@ static inline int irlap_is_primary(struct irlap_cb *self)
 	default:
 		ret = -1;
 	}
-	return(ret);
+	return ret;
 }
 
 /* Clear a pending IrLAP disconnect. - Jean II */
diff --git a/include/net/irda/irlmp.h b/include/net/irda/irlmp.h
index 3ffc1d0f93d6..fff11b7fe8a4 100644
--- a/include/net/irda/irlmp.h
+++ b/include/net/irda/irlmp.h
@@ -274,7 +274,7 @@ static inline int irlmp_lap_tx_queue_full(struct lsap_cb *self)
 	if (self->lap->irlap == NULL)
 		return 0;
 
-	return(IRLAP_GET_TX_QUEUE_LEN(self->lap->irlap) >= LAP_HIGH_THRESHOLD);
+	return IRLAP_GET_TX_QUEUE_LEN(self->lap->irlap) >= LAP_HIGH_THRESHOLD;
 }
 
 /* After doing a irlmp_dup(), this get one of the two socket back into
diff --git a/include/net/irda/irttp.h b/include/net/irda/irttp.h
index 11aee7a2972a..af4b87721d13 100644
--- a/include/net/irda/irttp.h
+++ b/include/net/irda/irttp.h
@@ -204,7 +204,7 @@ static inline int irttp_is_primary(struct tsap_cb *self)
 	    (self->lsap->lap == NULL) ||
 	    (self->lsap->lap->irlap == NULL))
 		return -2;
-	return(irlap_is_primary(self->lsap->lap->irlap));
+	return irlap_is_primary(self->lsap->lap->irlap);
 }
 
 #endif /* IRTTP_H */
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 3c8728aaab4e..eda8808fdacd 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -601,7 +601,7 @@ static inline u32 qdisc_l2t(struct qdisc_rate_table* rtab, unsigned int pktlen)
 		slot = 0;
 	slot >>= rtab->rate.cell_log;
 	if (slot > 255)
-		return (rtab->data[255]*(slot >> 8) + rtab->data[slot & 0xFF]);
+		return rtab->data[255]*(slot >> 8) + rtab->data[slot & 0xFF];
 	return rtab->data[slot];
 }
 
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 2cb3980b1616..505845ddb0be 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -405,7 +405,7 @@ static inline void sctp_v6_del_protocol(void) { return; }
 /* Map an association to an assoc_id. */
 static inline sctp_assoc_t sctp_assoc2id(const struct sctp_association *asoc)
 {
-	return (asoc?asoc->assoc_id:0);
+	return asoc ? asoc->assoc_id : 0;
 }
 
 /* Look up the association by its id.  */
@@ -473,7 +473,7 @@ static inline void sctp_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
 /* Tests if the list has one and only one entry. */
 static inline int sctp_list_single_entry(struct list_head *head)
 {
-	return ((head->next != head) && (head->next == head->prev));
+	return (head->next != head) && (head->next == head->prev);
 }
 
 /* Generate a random jitter in the range of -50% ~ +50% of input RTO. */
@@ -631,13 +631,13 @@ static inline int sctp_sanity_check(void)
 /* This is the hash function for the SCTP port hash table. */
 static inline int sctp_phashfn(__u16 lport)
 {
-	return (lport & (sctp_port_hashsize - 1));
+	return lport & (sctp_port_hashsize - 1);
 }
 
 /* This is the hash function for the endpoint hash table. */
 static inline int sctp_ep_hashfn(__u16 lport)
 {
-	return (lport & (sctp_ep_hashsize - 1));
+	return lport & (sctp_ep_hashsize - 1);
 }
 
 /* This is the hash function for the association hash table. */
@@ -645,7 +645,7 @@ static inline int sctp_assoc_hashfn(__u16 lport, __u16 rport)
 {
 	int h = (lport << 16) + rport;
 	h ^= h>>8;
-	return (h & (sctp_assoc_hashsize - 1));
+	return h & (sctp_assoc_hashsize - 1);
 }
 
 /* This is the hash function for the association hash table.  This is
@@ -656,7 +656,7 @@ static inline int sctp_vtag_hashfn(__u16 lport, __u16 rport, __u32 vtag)
 {
 	int h = (lport << 16) + rport;
 	h ^= vtag;
-	return (h & (sctp_assoc_hashsize-1));
+	return h & (sctp_assoc_hashsize - 1);
 }
 
 #define sctp_for_each_hentry(epb, node, head) \
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 4088c89a9055..9352d12f02de 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -345,12 +345,12 @@ enum {
 
 static inline int TSN_lt(__u32 s, __u32 t)
 {
-	return (((s) - (t)) & TSN_SIGN_BIT);
+	return ((s) - (t)) & TSN_SIGN_BIT;
 }
 
 static inline int TSN_lte(__u32 s, __u32 t)
 {
-	return (((s) == (t)) || (((s) - (t)) & TSN_SIGN_BIT));
+	return ((s) == (t)) || (((s) - (t)) & TSN_SIGN_BIT);
 }
 
 /* Compare two SSNs */
@@ -369,12 +369,12 @@ enum {
 
 static inline int SSN_lt(__u16 s, __u16 t)
 {
-	return (((s) - (t)) & SSN_SIGN_BIT);
+	return ((s) - (t)) & SSN_SIGN_BIT;
 }
 
 static inline int SSN_lte(__u16 s, __u16 t)
 {
-	return (((s) == (t)) || (((s) - (t)) & SSN_SIGN_BIT));
+	return ((s) == (t)) || (((s) - (t)) & SSN_SIGN_BIT);
 }
 
 /*
@@ -388,7 +388,7 @@ enum {
 
 static inline int ADDIP_SERIAL_gte(__u16 s, __u16 t)
 {
-	return (((s) == (t)) || (((t) - (s)) & ADDIP_SERIAL_SIGN_BIT));
+	return ((s) == (t)) || (((t) - (s)) & ADDIP_SERIAL_SIGN_BIT);
 }
 
 /* Check VTAG of the packet matches the sender's own tag. */
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index f9e7473613bd..69fef4fb79c0 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -847,7 +847,7 @@ void sctp_packet_free(struct sctp_packet *);
 
 static inline int sctp_packet_empty(struct sctp_packet *packet)
 {
-	return (packet->size == packet->overhead);
+	return packet->size == packet->overhead;
 }
 
 /* This represents a remote transport address.
diff --git a/include/net/sctp/tsnmap.h b/include/net/sctp/tsnmap.h
index 4aabc5a96cf6..e7728bc14ccf 100644
--- a/include/net/sctp/tsnmap.h
+++ b/include/net/sctp/tsnmap.h
@@ -157,7 +157,7 @@ __u16 sctp_tsnmap_pending(struct sctp_tsnmap *map);
 /* Is there a gap in the TSN map?  */
 static inline int sctp_tsnmap_has_gap(const struct sctp_tsnmap *map)
 {
-	return (map->cumulative_tsn_ack_point != map->max_tsn_seen);
+	return map->cumulative_tsn_ack_point != map->max_tsn_seen;
 }
 
 /* Mark a duplicate TSN.  Note:  limit the storage of duplicate TSN
diff --git a/include/net/tipc/tipc_msg.h b/include/net/tipc/tipc_msg.h
index 2e159a812f83..ffe50b4e7b93 100644
--- a/include/net/tipc/tipc_msg.h
+++ b/include/net/tipc/tipc_msg.h
@@ -107,7 +107,7 @@ static inline u32 msg_hdr_sz(struct tipc_msg *m)
 
 static inline int msg_short(struct tipc_msg *m)
 {
-	return (msg_hdr_sz(m) == 24);
+	return msg_hdr_sz(m) == 24;
 }
 
 static inline u32 msg_size(struct tipc_msg *m)
@@ -117,7 +117,7 @@ static inline u32 msg_size(struct tipc_msg *m)
 
 static inline u32 msg_data_sz(struct tipc_msg *m)
 {
-	return (msg_size(m) - msg_hdr_sz(m));
+	return msg_size(m) - msg_hdr_sz(m);
 }
 
 static inline unchar *msg_data(struct tipc_msg *m)
@@ -132,17 +132,17 @@ static inline u32 msg_type(struct tipc_msg *m)
 
 static inline u32 msg_named(struct tipc_msg *m)
 {
-	return (msg_type(m) == TIPC_NAMED_MSG);
+	return msg_type(m) == TIPC_NAMED_MSG;
 }
 
 static inline u32 msg_mcast(struct tipc_msg *m)
 {
-	return (msg_type(m) == TIPC_MCAST_MSG);
+	return msg_type(m) == TIPC_MCAST_MSG;
 }
 
 static inline u32 msg_connected(struct tipc_msg *m)
 {
-	return (msg_type(m) == TIPC_CONN_MSG);
+	return msg_type(m) == TIPC_CONN_MSG;
 }
 
 static inline u32 msg_errcode(struct tipc_msg *m)
diff --git a/net/802/fc.c b/net/802/fc.c
index 34cf1ee014b8..1e49f2d4ea96 100644
--- a/net/802/fc.c
+++ b/net/802/fc.c
@@ -70,7 +70,7 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev,
 	if(daddr)
 	{
 		memcpy(fch->daddr,daddr,dev->addr_len);
-		return(hdr_len);
+		return hdr_len;
 	}
 	return -hdr_len;
 }
diff --git a/net/802/fddi.c b/net/802/fddi.c
index 3ef0ab0a543a..94b3ad08f39a 100644
--- a/net/802/fddi.c
+++ b/net/802/fddi.c
@@ -82,10 +82,10 @@ static int fddi_header(struct sk_buff *skb, struct net_device *dev,
 	if (daddr != NULL)
 	{
 		memcpy(fddi->daddr, daddr, dev->addr_len);
-		return(hl);
+		return hl;
 	}
 
-	return(-hl);
+	return -hl;
 }
 
 
@@ -108,7 +108,7 @@ static int fddi_rebuild_header(struct sk_buff	*skb)
 	{
 		printk("%s: Don't know how to resolve type %04X addresses.\n",
 		       skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype));
-		return(0);
+		return 0;
 	}
 }
 
@@ -162,7 +162,7 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
 
 	/* Assume 802.2 SNAP frames, for now */
 
-	return(type);
+	return type;
 }
 
 EXPORT_SYMBOL(fddi_type_trans);
@@ -170,9 +170,9 @@ EXPORT_SYMBOL(fddi_type_trans);
 int fddi_change_mtu(struct net_device *dev, int new_mtu)
 {
 	if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN))
-		return(-EINVAL);
+		return -EINVAL;
 	dev->mtu = new_mtu;
-	return(0);
+	return 0;
 }
 EXPORT_SYMBOL(fddi_change_mtu);
 
diff --git a/net/802/hippi.c b/net/802/hippi.c
index cd3e8e929529..91aca8780fd0 100644
--- a/net/802/hippi.c
+++ b/net/802/hippi.c
@@ -152,7 +152,7 @@ int hippi_change_mtu(struct net_device *dev, int new_mtu)
 	if ((new_mtu < 68) || (new_mtu > 65280))
 		return -EINVAL;
 	dev->mtu = new_mtu;
-	return(0);
+	return 0;
 }
 EXPORT_SYMBOL(hippi_change_mtu);
 
diff --git a/net/802/tr.c b/net/802/tr.c
index 1c6e596074df..5e20cf8a074b 100644
--- a/net/802/tr.c
+++ b/net/802/tr.c
@@ -145,7 +145,7 @@ static int tr_header(struct sk_buff *skb, struct net_device *dev,
 	{
 		memcpy(trh->daddr,daddr,dev->addr_len);
 		tr_source_route(skb, trh, dev);
-		return(hdr_len);
+		return hdr_len;
 	}
 
 	return -hdr_len;
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 889f4ac4459a..0eb486d342dc 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -27,7 +27,7 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 	else if (vlan_id)
 		goto drop;
 
-	return (polling ? netif_receive_skb(skb) : netif_rx(skb));
+	return polling ? netif_receive_skb(skb) : netif_rx(skb);
 
 drop:
 	dev_kfree_skb_any(skb);
diff --git a/net/9p/client.c b/net/9p/client.c
index dc6f2f26d023..f34b9f510818 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -61,13 +61,13 @@ static const match_table_t tokens = {
 
 inline int p9_is_proto_dotl(struct p9_client *clnt)
 {
-	return (clnt->proto_version == p9_proto_2000L);
+	return clnt->proto_version == p9_proto_2000L;
 }
 EXPORT_SYMBOL(p9_is_proto_dotl);
 
 inline int p9_is_proto_dotu(struct p9_client *clnt)
 {
-	return (clnt->proto_version == p9_proto_2000u);
+	return clnt->proto_version == p9_proto_2000u;
 }
 EXPORT_SYMBOL(p9_is_proto_dotu);
 
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 7dca91bb8c57..15ea84ba344e 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -179,13 +179,13 @@ static unsigned char rfcomm_crc_table[256] = {
 /* FCS on 2 bytes */
 static inline u8 __fcs(u8 *data)
 {
-	return (0xff - __crc(data));
+	return 0xff - __crc(data);
 }
 
 /* FCS on 3 bytes */
 static inline u8 __fcs2(u8 *data)
 {
-	return (0xff - rfcomm_crc_table[__crc(data) ^ data[2]]);
+	return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]];
 }
 
 /* Check FCS */
diff --git a/net/core/flow.c b/net/core/flow.c
index b143b86b1f2a..127c8a7ffd61 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -176,8 +176,8 @@ static u32 flow_hash_code(struct flow_cache *fc,
 {
 	u32 *k = (u32 *) key;
 
-	return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
-		& (flow_cache_hash_size(fc) - 1));
+	return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd)
+		& (flow_cache_hash_size(fc) - 1);
 }
 
 typedef unsigned long flow_compare_t;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index a4e0a7482c2b..96b1a749abb4 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -122,7 +122,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh)
 
 unsigned long neigh_rand_reach_time(unsigned long base)
 {
-	return (base ? (net_random() % base) + (base >> 1) : 0);
+	return base ? (net_random() % base) + (base >> 1) : 0;
 }
 EXPORT_SYMBOL(neigh_rand_reach_time);
 
@@ -766,9 +766,9 @@ next_elt:
 static __inline__ int neigh_max_probes(struct neighbour *n)
 {
 	struct neigh_parms *p = n->parms;
-	return (n->nud_state & NUD_PROBE ?
+	return (n->nud_state & NUD_PROBE) ?
 		p->ucast_probes :
-		p->ucast_probes + p->app_probes + p->mcast_probes);
+		p->ucast_probes + p->app_probes + p->mcast_probes;
 }
 
 static void neigh_invalidate(struct neighbour *neigh)
diff --git a/net/core/utils.c b/net/core/utils.c
index ec6bb322f372..5fea0ab21902 100644
--- a/net/core/utils.c
+++ b/net/core/utils.c
@@ -75,7 +75,7 @@ __be32 in_aton(const char *str)
 				str++;
 		}
 	}
-	return(htonl(l));
+	return htonl(l);
 }
 EXPORT_SYMBOL(in_aton);
 
diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c
index 8fc3cbf79071..497723c4d4bb 100644
--- a/net/dccp/ccids/lib/loss_interval.c
+++ b/net/dccp/ccids/lib/loss_interval.c
@@ -116,7 +116,7 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb)
 	cur->li_length = len;
 	tfrc_lh_calc_i_mean(lh);
 
-	return (lh->i_mean < old_i_mean);
+	return lh->i_mean < old_i_mean;
 }
 
 /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */
diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c
index baa98fb83552..f8c1ae4b41f0 100644
--- a/net/econet/af_econet.c
+++ b/net/econet/af_econet.c
@@ -392,7 +392,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock,
 		dev_queue_xmit(skb);
 		dev_put(dev);
 		mutex_unlock(&econet_mutex);
-		return(len);
+		return len;
 
 	out_free:
 		kfree_skb(skb);
@@ -637,7 +637,7 @@ static int econet_create(struct net *net, struct socket *sock, int protocol,
 	eo->num = protocol;
 
 	econet_insert_socket(&econet_sklist, sk);
-	return(0);
+	return 0;
 out:
 	return err;
 }
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 85e7b4551326..f00ef2f1d814 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -387,6 +387,6 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len)
 
 	l = _format_mac_addr(buf, PAGE_SIZE, addr, len);
 	l += scnprintf(buf + l, PAGE_SIZE - l, "\n");
-	return ((ssize_t) l);
+	return (ssize_t)l;
 }
 EXPORT_SYMBOL(sysfs_format_mac);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index dcfe7e961c10..4083c186fd30 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -567,7 +567,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev,
 	if (out_dev)
 		omi = IN_DEV_MEDIUM_ID(out_dev);
 
-	return (omi != imi && omi != -1);
+	return omi != imi && omi != -1;
 }
 
 /*
diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c
index 721a8a37b45c..174be6caa5c8 100644
--- a/net/ipv4/datagram.c
+++ b/net/ipv4/datagram.c
@@ -73,6 +73,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	inet->inet_id = jiffies;
 
 	sk_dst_set(sk, &rt->dst);
-	return(0);
+	return 0;
 }
 EXPORT_SYMBOL(ip4_datagram_connect);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index e5fa2ddce320..ba8042665849 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len,
 			bc += op->no;
 		}
 	}
-	return (len == 0);
+	return len == 0;
 }
 
 static int valid_cc(const void *bc, int len, int cc)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index f4dc879e258e..168440834ade 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -116,11 +116,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a)
 	struct ip4_create_arg *arg = a;
 
 	qp = container_of(q, struct ipq, q);
-	return (qp->id == arg->iph->id &&
+	return	qp->id == arg->iph->id &&
 			qp->saddr == arg->iph->saddr &&
 			qp->daddr == arg->iph->daddr &&
 			qp->protocol == arg->iph->protocol &&
-			qp->user == arg->user);
+			qp->user == arg->user;
 }
 
 /* Memory Tracking Functions. */
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 714b6a80361d..0967d02fefd8 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -659,7 +659,7 @@ drop:
 	rcu_read_unlock();
 drop_nolock:
 	kfree_skb(skb);
-	return(0);
+	return 0;
 }
 
 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e8f4f9a57f12..8b642f152468 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap,
 	for (i = 0; i < len; i++)
 		ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i];
 
-	return (ret != 0);
+	return ret != 0;
 }
 
 /*
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index e24d48dd99d3..ae1d4a41f1c6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2791,7 +2791,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi
 
 	dst_release(&(*rp)->dst);
 	*rp = rt;
-	return (rt ? 0 : -ENOMEM);
+	return rt ? 0 : -ENOMEM;
 }
 
 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 1bc87a05c734..51966b3f9719 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2301,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp)
 
 static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb)
 {
-	return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto);
+	return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto;
 }
 
 static inline int tcp_head_timedout(struct sock *sk)
@@ -3398,8 +3398,8 @@ static void tcp_ack_probe(struct sock *sk)
 
 static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag)
 {
-	return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
-		inet_csk(sk)->icsk_ca_state != TCP_CA_Open);
+	return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) ||
+		inet_csk(sk)->icsk_ca_state != TCP_CA_Open;
 }
 
 static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag)
@@ -3416,9 +3416,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp,
 					const u32 ack, const u32 ack_seq,
 					const u32 nwin)
 {
-	return (after(ack, tp->snd_una) ||
+	return	after(ack, tp->snd_una) ||
 		after(ack_seq, tp->snd_wl1) ||
-		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd));
+		(ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd);
 }
 
 /* Update our send window.
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index f25b56cb85cb..43cf901d7659 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -55,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
 		return 1;
 	if (after(end_seq, s_win) && before(seq, e_win))
 		return 1;
-	return (seq == e_win && seq == end_seq);
+	return seq == e_win && seq == end_seq;
 }
 
 /*
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index ea09d2fd50c7..05b1ecf36763 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1370,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp,
 				  const struct sk_buff *skb,
 				  unsigned mss_now, int nonagle)
 {
-	return (skb->len < mss_now &&
+	return skb->len < mss_now &&
 		((nonagle & TCP_NAGLE_CORK) ||
-		 (!nonagle && tp->packets_out && tcp_minshall_check(tp))));
+		 (!nonagle && tp->packets_out && tcp_minshall_check(tp)));
 }
 
 /* Return non-zero if the Nagle test allows this packet to be
@@ -1443,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb = tcp_send_head(sk);
 
-	return (skb &&
+	return skb &&
 		tcp_snd_test(sk, skb, tcp_current_mss(sk),
 			     (tcp_skb_is_last(sk, skb) ?
-			      tp->nonagle : TCP_NAGLE_PUSH)));
+			      tp->nonagle : TCP_NAGLE_PUSH));
 }
 
 /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet
diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c
index 20151d6a6241..a534dda5456e 100644
--- a/net/ipv4/tcp_westwood.c
+++ b/net/ipv4/tcp_westwood.c
@@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk)
  */
 static inline u32 westwood_do_filter(u32 a, u32 b)
 {
-	return (((7 * a) + b) >> 3);
+	return ((7 * a) + b) >> 3;
 }
 
 static void westwood_filter(struct westwood *w, u32 delta)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 5bc893e28008..89aa54394a08 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -243,7 +243,7 @@ static inline bool addrconf_qdisc_ok(const struct net_device *dev)
 /* Check if a route is valid prefix route */
 static inline int addrconf_is_prefix_route(const struct rt6_info *rt)
 {
-	return ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0);
+	return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0;
 }
 
 static void addrconf_del_timer(struct inet6_ifaddr *ifp)
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index f0e774cea386..921dcf6c271a 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -513,10 +513,9 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 
 static inline int ip6addrlbl_msgsize(void)
 {
-	return (NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
+	return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg))
 		+ nla_total_size(16)	/* IFAL_ADDRESS */
-		+ nla_total_size(4)	/* IFAL_LABEL */
-	);
+		+ nla_total_size(4);	/* IFAL_LABEL */
 }
 
 static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh,
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 56b9bf2516f4..60220985bb80 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -467,7 +467,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
 	if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL)
 		sin->sin6_scope_id = sk->sk_bound_dev_if;
 	*uaddr_len = sizeof(*sin);
-	return(0);
+	return 0;
 }
 
 EXPORT_SYMBOL(inet6_getname);
@@ -488,7 +488,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 	case SIOCADDRT:
 	case SIOCDELRT:
 
-		return(ipv6_route_ioctl(net, cmd, (void __user *)arg));
+		return ipv6_route_ioctl(net, cmd, (void __user *)arg);
 
 	case SIOCSIFADDR:
 		return addrconf_add_ifaddr(net, (void __user *) arg);
@@ -502,7 +502,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
 		return sk->sk_prot->ioctl(sk, cmd, arg);
 	}
 	/*NOTREACHED*/
-	return(0);
+	return 0;
 }
 
 EXPORT_SYMBOL(inet6_ioctl);
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index e1caa5d526c2..14ed0a955b56 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -13,12 +13,12 @@ int ipv6_ext_hdr(u8 nexthdr)
 	/*
 	 * find out if nexthdr is an extension header or a protocol
 	 */
-	return ( (nexthdr == NEXTHDR_HOP)	||
+	return   (nexthdr == NEXTHDR_HOP)	||
 		 (nexthdr == NEXTHDR_ROUTING)	||
 		 (nexthdr == NEXTHDR_FRAGMENT)	||
 		 (nexthdr == NEXTHDR_AUTH)	||
 		 (nexthdr == NEXTHDR_NONE)	||
-		 (nexthdr == NEXTHDR_DEST) );
+		 (nexthdr == NEXTHDR_DEST);
 }
 
 /*
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1838927a2243..efbbbce68f9e 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -870,8 +870,8 @@ static inline int ip6_rt_check(struct rt6key *rt_key,
 			       struct in6_addr *fl_addr,
 			       struct in6_addr *addr_cache)
 {
-	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
-		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
+	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
+		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
 }
 
 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 69a0051cea67..b3dd844cd34f 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -228,12 +228,12 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur,
 	do {
 		cur = ((void *)cur) + (cur->nd_opt_len << 3);
 	} while(cur < end && cur->nd_opt_type != type);
-	return (cur <= end && cur->nd_opt_type == type ? cur : NULL);
+	return cur <= end && cur->nd_opt_type == type ? cur : NULL;
 }
 
 static inline int ndisc_is_useropt(struct nd_opt_hdr *opt)
 {
-	return (opt->nd_opt_type == ND_OPT_RDNSS);
+	return opt->nd_opt_type == ND_OPT_RDNSS;
 }
 
 static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
@@ -244,7 +244,7 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur,
 	do {
 		cur = ((void *)cur) + (cur->nd_opt_len << 3);
 	} while(cur < end && !ndisc_is_useropt(cur));
-	return (cur <= end && ndisc_is_useropt(cur) ? cur : NULL);
+	return cur <= end && ndisc_is_useropt(cur) ? cur : NULL;
 }
 
 static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len,
@@ -319,7 +319,7 @@ static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p,
 	int prepad = ndisc_addr_option_pad(dev->type);
 	if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad))
 		return NULL;
-	return (lladdr + prepad);
+	return lladdr + prepad;
 }
 
 int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 8e754be92c24..6b331e9b5706 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -82,13 +82,13 @@ EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table);
 int
 ip6t_ext_hdr(u8 nexthdr)
 {
-	return ( (nexthdr == IPPROTO_HOPOPTS)   ||
-		 (nexthdr == IPPROTO_ROUTING)   ||
-		 (nexthdr == IPPROTO_FRAGMENT)  ||
-		 (nexthdr == IPPROTO_ESP)       ||
-		 (nexthdr == IPPROTO_AH)        ||
-		 (nexthdr == IPPROTO_NONE)      ||
-		 (nexthdr == IPPROTO_DSTOPTS) );
+	return  (nexthdr == IPPROTO_HOPOPTS)   ||
+		(nexthdr == IPPROTO_ROUTING)   ||
+		(nexthdr == IPPROTO_FRAGMENT)  ||
+		(nexthdr == IPPROTO_ESP)       ||
+		(nexthdr == IPPROTO_AH)        ||
+		(nexthdr == IPPROTO_NONE)      ||
+		(nexthdr == IPPROTO_DSTOPTS);
 }
 
 /* Returns whether matches rule or not. */
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index e677937a07fc..45e6efb7f171 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -764,7 +764,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 			return -EINVAL;
 
 		if (sin6->sin6_family && sin6->sin6_family != AF_INET6)
-			return(-EAFNOSUPPORT);
+			return -EAFNOSUPPORT;
 
 		/* port is the proto value [0..255] carried in nexthdr */
 		proto = ntohs(sin6->sin6_port);
@@ -772,10 +772,10 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk,
 		if (!proto)
 			proto = inet->inet_num;
 		else if (proto != inet->inet_num)
-			return(-EINVAL);
+			return -EINVAL;
 
 		if (proto > 255)
-			return(-EINVAL);
+			return -EINVAL;
 
 		daddr = &sin6->sin6_addr;
 		if (np->sndflow) {
@@ -985,7 +985,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
 			/* You may get strange result with a positive odd offset;
 			   RFC2292bis agrees with me. */
 			if (val > 0 && (val&1))
-				return(-EINVAL);
+				return -EINVAL;
 			if (val < 0) {
 				rp->checksum = 0;
 			} else {
@@ -997,7 +997,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
 			break;
 
 		default:
-			return(-ENOPROTOOPT);
+			return -ENOPROTOOPT;
 	}
 }
 
@@ -1190,7 +1190,7 @@ static int rawv6_init_sk(struct sock *sk)
 	default:
 		break;
 	}
-	return(0);
+	return 0;
 }
 
 struct proto rawv6_prot = {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d126365ac046..25b0beda4331 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -217,14 +217,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 
 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
 {
-	return (rt->rt6i_flags & RTF_EXPIRES &&
-		time_after(jiffies, rt->rt6i_expires));
+	return (rt->rt6i_flags & RTF_EXPIRES) &&
+		time_after(jiffies, rt->rt6i_expires);
 }
 
 static inline int rt6_need_strict(struct in6_addr *daddr)
 {
-	return (ipv6_addr_type(daddr) &
-		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK));
+	return ipv6_addr_type(daddr) &
+		(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
 }
 
 /*
@@ -440,7 +440,7 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
 		  __func__, match);
 
 	net = dev_net(rt0->rt6i_dev);
-	return (match ? match : net->ipv6.ip6_null_entry);
+	return match ? match : net->ipv6.ip6_null_entry;
 }
 
 #ifdef CONFIG_IPV6_ROUTE_INFO
@@ -859,7 +859,7 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl
 
 	dst_release(*dstp);
 	*dstp = new;
-	return (new ? 0 : -ENOMEM);
+	return new ? 0 : -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
 
@@ -1070,7 +1070,7 @@ static int ip6_dst_gc(struct dst_ops *ops)
 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
 out:
 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
-	return (atomic_read(&ops->entries) > rt_max_size);
+	return atomic_read(&ops->entries) > rt_max_size;
 }
 
 /* Clean host part of a prefix. Not necessary in radix tree,
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fe6d40418c0b..8d93f6d81979 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -139,7 +139,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 		return -EINVAL;
 
 	if (usin->sin6_family != AF_INET6)
-		return(-EAFNOSUPPORT);
+		return -EAFNOSUPPORT;
 
 	memset(&fl, 0, sizeof(fl));
 
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 6baeabbbca82..39676eac3a37 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -199,7 +199,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
 	struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
 
 	xfrm6_policy_afinfo.garbage_collect(net);
-	return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+	return atomic_read(&ops->entries) > ops->gc_thresh * 2;
 }
 
 static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index fd55b5135de5..bf3635129b17 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -573,9 +573,9 @@ static int irda_find_lsap_sel(struct irda_sock *self, char *name)
 		/* Requested object/attribute doesn't exist */
 		if((self->errno == IAS_CLASS_UNKNOWN) ||
 		   (self->errno == IAS_ATTRIB_UNKNOWN))
-			return (-EADDRNOTAVAIL);
+			return -EADDRNOTAVAIL;
 		else
-			return (-EHOSTUNREACH);
+			return -EHOSTUNREACH;
 	}
 
 	/* Get the remote TSAP selector */
@@ -663,7 +663,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
 					   __func__, name);
 				self->daddr = DEV_ADDR_ANY;
 				kfree(discoveries);
-				return(-ENOTUNIQ);
+				return -ENOTUNIQ;
 			}
 			/* First time we found that one, save it ! */
 			daddr = self->daddr;
@@ -677,7 +677,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
 			IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __func__);
 			self->daddr = DEV_ADDR_ANY;
 			kfree(discoveries);
-			return(-EHOSTUNREACH);
+			return -EHOSTUNREACH;
 			break;
 		}
 	}
@@ -689,7 +689,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name)
 		IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n",
 			   __func__, name);
 		self->daddr = DEV_ADDR_ANY;
-		return(-EADDRNOTAVAIL);
+		return -EADDRNOTAVAIL;
 	}
 
 	/* Revert back to discovered device & service */
@@ -2465,9 +2465,9 @@ bed:
 			/* Requested object/attribute doesn't exist */
 			if((self->errno == IAS_CLASS_UNKNOWN) ||
 			   (self->errno == IAS_ATTRIB_UNKNOWN))
-				return (-EADDRNOTAVAIL);
+				return -EADDRNOTAVAIL;
 			else
-				return (-EHOSTUNREACH);
+				return -EHOSTUNREACH;
 		}
 
 		/* Translate from internal to user structure */
diff --git a/net/irda/discovery.c b/net/irda/discovery.c
index c1c8ae939126..36c3f037f172 100644
--- a/net/irda/discovery.c
+++ b/net/irda/discovery.c
@@ -315,7 +315,7 @@ struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn,
 
 	/* Get the actual number of device in the buffer and return */
 	*pn = i;
-	return(buffer);
+	return buffer;
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index faa82ca2dfdc..a39cca8331df 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -449,8 +449,8 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp)
 		}
 
 #ifdef SERIAL_DO_RESTART
-		return ((self->flags & ASYNC_HUP_NOTIFY) ?
-			-EAGAIN : -ERESTARTSYS);
+		return (self->flags & ASYNC_HUP_NOTIFY) ?
+			-EAGAIN : -ERESTARTSYS;
 #else
 		return -EAGAIN;
 #endif
diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c
index 0e7d8bde145d..6115a44c0a24 100644
--- a/net/irda/irlmp.c
+++ b/net/irda/irlmp.c
@@ -939,7 +939,7 @@ struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots)
 	}
 
 	/* Return current cached discovery log */
-	return(irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE));
+	return irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE);
 }
 EXPORT_SYMBOL(irlmp_get_discoveries);
 
diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c
index 3750884094da..062e63b1c5c4 100644
--- a/net/irda/irlmp_frame.c
+++ b/net/irda/irlmp_frame.c
@@ -448,7 +448,7 @@ static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel,
 	    (self->cache.slsap_sel == slsap_sel) &&
 	    (self->cache.dlsap_sel == dlsap_sel))
 	{
-		return (self->cache.lsap);
+		return self->cache.lsap;
 	}
 #endif
 
diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c
index e98e40d76f4f..7f17a8020e8a 100644
--- a/net/irda/irnet/irnet_irda.c
+++ b/net/irda/irnet/irnet_irda.c
@@ -238,7 +238,7 @@ irnet_ias_to_tsap(irnet_socket *	self,
   DEXIT(IRDA_SR_TRACE, "\n");
 
   /* Return the TSAP */
-  return(dtsap_sel);
+  return dtsap_sel;
 }
 
 /*------------------------------------------------------------------*/
@@ -301,7 +301,7 @@ irnet_connect_tsap(irnet_socket *	self)
     {
       clear_bit(0, &self->ttp_connect);
       DERROR(IRDA_SR_ERROR, "connect aborted!\n");
-      return(err);
+      return err;
     }
 
   /* Connect to remote device */
@@ -312,7 +312,7 @@ irnet_connect_tsap(irnet_socket *	self)
     {
       clear_bit(0, &self->ttp_connect);
       DERROR(IRDA_SR_ERROR, "connect aborted!\n");
-      return(err);
+      return err;
     }
 
   /* The above call is non-blocking.
@@ -321,7 +321,7 @@ irnet_connect_tsap(irnet_socket *	self)
    * See you there ;-) */
 
   DEXIT(IRDA_SR_TRACE, "\n");
-  return(err);
+  return err;
 }
 
 /*------------------------------------------------------------------*/
@@ -362,10 +362,10 @@ irnet_discover_next_daddr(irnet_socket *	self)
       /* The above request is non-blocking.
        * After a while, IrDA will call us back in irnet_discovervalue_confirm()
        * We will then call irnet_ias_to_tsap() and come back here again... */
-      return(0);
+      return 0;
     }
   else
-    return(1);
+    return 1;
 }
 
 /*------------------------------------------------------------------*/
@@ -436,7 +436,7 @@ irnet_discover_daddr_and_lsap_sel(irnet_socket *	self)
   /* Follow me in irnet_discovervalue_confirm() */
 
   DEXIT(IRDA_SR_TRACE, "\n");
-  return(0);
+  return 0;
 }
 
 /*------------------------------------------------------------------*/
@@ -485,7 +485,7 @@ irnet_dname_to_daddr(irnet_socket *	self)
   /* No luck ! */
   DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname);
   kfree(discoveries);
-  return(-EADDRNOTAVAIL);
+  return -EADDRNOTAVAIL;
 }
 
 
@@ -527,7 +527,7 @@ irda_irnet_create(irnet_socket *	self)
   INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect);
 
   DEXIT(IRDA_SOCK_TRACE, "\n");
-  return(0);
+  return 0;
 }
 
 /*------------------------------------------------------------------*/
@@ -601,7 +601,7 @@ irda_irnet_connect(irnet_socket *	self)
    * We will finish the connection procedure in irnet_connect_tsap().
    */
   DEXIT(IRDA_SOCK_TRACE, "\n");
-  return(0);
+  return 0;
 }
 
 /*------------------------------------------------------------------*/
@@ -733,7 +733,7 @@ irnet_daddr_to_dname(irnet_socket *	self)
   /* No luck ! */
   DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr);
   kfree(discoveries);
-  return(-EADDRNOTAVAIL);
+  return -EADDRNOTAVAIL;
 }
 
 /*------------------------------------------------------------------*/
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index dfe7b38dd4af..69f1fa64994e 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -166,7 +166,7 @@ irnet_ctrl_write(irnet_socket *	ap,
     }
 
   /* Success : we have parsed all commands successfully */
-  return(count);
+  return count;
 }
 
 #ifdef INITIAL_DISCOVERY
@@ -300,7 +300,7 @@ irnet_ctrl_read(irnet_socket *	ap,
 	}
 
       DEXIT(CTRL_TRACE, "\n");
-      return(strlen(event));
+      return strlen(event);
     }
 #endif /* INITIAL_DISCOVERY */
 
@@ -409,7 +409,7 @@ irnet_ctrl_read(irnet_socket *	ap,
     }
 
   DEXIT(CTRL_TRACE, "\n");
-  return(strlen(event));
+  return strlen(event);
 }
 
 /*------------------------------------------------------------------*/
@@ -623,7 +623,7 @@ dev_irnet_poll(struct file *	file,
     mask |= irnet_ctrl_poll(ap, file, wait);
 
   DEXIT(FS_TRACE, " - mask=0x%X\n", mask);
-  return(mask);
+  return mask;
 }
 
 /*------------------------------------------------------------------*/
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 43040e97c474..d87c22df6f1e 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -565,12 +565,12 @@ pfkey_proto2satype(uint16_t proto)
 
 static uint8_t pfkey_proto_to_xfrm(uint8_t proto)
 {
-	return (proto == IPSEC_PROTO_ANY ? 0 : proto);
+	return proto == IPSEC_PROTO_ANY ? 0 : proto;
 }
 
 static uint8_t pfkey_proto_from_xfrm(uint8_t proto)
 {
-	return (proto ? proto : IPSEC_PROTO_ANY);
+	return proto ? proto : IPSEC_PROTO_ANY;
 }
 
 static inline int pfkey_sockaddr_len(sa_family_t family)
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 4f772de2f213..b0cc385bf989 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -207,7 +207,7 @@ static bool rc_no_data_or_no_ack(struct ieee80211_tx_rate_control *txrc)
 
 	fc = hdr->frame_control;
 
-	return ((info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc));
+	return (info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc);
 }
 
 static void rc_send_low_broadcast(s8 *idx, u32 basic_rates, u8 max_rate_idx)
diff --git a/net/rfkill/input.c b/net/rfkill/input.c
index 3713d7ecab96..1bca6d49ec96 100644
--- a/net/rfkill/input.c
+++ b/net/rfkill/input.c
@@ -142,7 +142,7 @@ static unsigned long rfkill_last_scheduled;
 static unsigned long rfkill_ratelimit(const unsigned long last)
 {
 	const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY);
-	return (time_after(jiffies, last + delay)) ? 0 : delay;
+	return time_after(jiffies, last + delay) ? 0 : delay;
 }
 
 static void rfkill_schedule_ratelimited(void)
diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c
index a750a28e0221..fa5f5641a2c2 100644
--- a/net/rose/rose_link.c
+++ b/net/rose/rose_link.c
@@ -114,7 +114,7 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh)
 	if (ax25s)
 		ax25_cb_put(ax25s);
 
-	return (neigh->ax25 != NULL);
+	return neigh->ax25 != NULL;
 }
 
 /*
@@ -137,7 +137,7 @@ static int rose_link_up(struct rose_neigh *neigh)
 	if (ax25s)
 		ax25_cb_put(ax25s);
 
-	return (neigh->ax25 != NULL);
+	return neigh->ax25 != NULL;
 }
 
 /*
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f774e657641a..1ef29c74d85e 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -799,7 +799,7 @@ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len)
 static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp)
 {
 	/* PF_INET only supports AF_INET addresses. */
-	return (AF_INET == family);
+	return AF_INET == family;
 }
 
 /* Address matching with wildcards allowed. */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 6a691d84aef4..535659fdbaa1 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3884,7 +3884,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len,
 	}
 
 out:
-	return (retval);
+	return retval;
 }
 
 
@@ -3940,7 +3940,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len,
 	}
 
 out:
-	return (retval);
+	return retval;
 }
 
 /* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS)
@@ -5594,7 +5594,7 @@ static int sctp_get_port(struct sock *sk, unsigned short snum)
 	/* Note: sk->sk_num gets filled in if ephemeral port request. */
 	ret = sctp_get_port_local(sk, &addr);
 
-	return (ret ? 1 : 0);
+	return ret ? 1 : 0;
 }
 
 /*
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index dcfc66bab2bb..597c493392ad 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1049,7 +1049,7 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags)
 out:
 	if (acred->machine_cred != gss_cred->gc_machine_cred)
 		return 0;
-	return (rc->cr_uid == acred->uid);
+	return rc->cr_uid == acred->uid;
 }
 
 /*
diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c
index 310b78e99456..c586e92bcf76 100644
--- a/net/sunrpc/auth_gss/gss_generic_token.c
+++ b/net/sunrpc/auth_gss/gss_generic_token.c
@@ -76,19 +76,19 @@ static int
 der_length_size( int length)
 {
 	if (length < (1<<7))
-		return(1);
+		return 1;
 	else if (length < (1<<8))
-		return(2);
+		return 2;
 #if (SIZEOF_INT == 2)
 	else
-		return(3);
+		return 3;
 #else
 	else if (length < (1<<16))
-		return(3);
+		return 3;
 	else if (length < (1<<24))
-		return(4);
+		return 4;
 	else
-		return(5);
+		return 5;
 #endif
 }
 
@@ -121,14 +121,14 @@ der_read_length(unsigned char **buf, int *bufsize)
 	int ret;
 
 	if (*bufsize < 1)
-		return(-1);
+		return -1;
 	sf = *(*buf)++;
 	(*bufsize)--;
 	if (sf & 0x80) {
 		if ((sf &= 0x7f) > ((*bufsize)-1))
-			return(-1);
+			return -1;
 		if (sf > SIZEOF_INT)
-			return (-1);
+			return -1;
 		ret = 0;
 		for (; sf; sf--) {
 			ret = (ret<<8) + (*(*buf)++);
@@ -138,7 +138,7 @@ der_read_length(unsigned char **buf, int *bufsize)
 		ret = sf;
 	}
 
-	return(ret);
+	return ret;
 }
 
 /* returns the length of a token, given the mech oid and the body size */
@@ -148,7 +148,7 @@ g_token_size(struct xdr_netobj *mech, unsigned int body_size)
 {
 	/* set body_size to sequence contents size */
 	body_size += 2 + (int) mech->len;         /* NEED overflow check */
-	return(1 + der_length_size(body_size) + body_size);
+	return 1 + der_length_size(body_size) + body_size;
 }
 
 EXPORT_SYMBOL_GPL(g_token_size);
@@ -186,27 +186,27 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
 	int ret = 0;
 
 	if ((toksize-=1) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 	if (*buf++ != 0x60)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if ((seqsize = der_read_length(&buf, &toksize)) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if (seqsize != toksize)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if ((toksize-=1) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 	if (*buf++ != 0x06)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if ((toksize-=1) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 	toid.len = *buf++;
 
 	if ((toksize-=toid.len) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 	toid.data = buf;
 	buf+=toid.len;
 
@@ -217,17 +217,17 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size,
       to return G_BAD_TOK_HEADER if the token header is in fact bad */
 
 	if ((toksize-=2) < 0)
-		return(G_BAD_TOK_HEADER);
+		return G_BAD_TOK_HEADER;
 
 	if (ret)
-		return(ret);
+		return ret;
 
 	if (!ret) {
 		*buf_in = buf;
 		*body_size = toksize;
 	}
 
-	return(ret);
+	return ret;
 }
 
 EXPORT_SYMBOL_GPL(g_verify_token_header);
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 415c013ba382..62ac90c62cb1 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -162,5 +162,5 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
 	*seqnum = ((plain[0]) |
 		   (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24));
 
-	return (0);
+	return 0;
 }
diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c
index 2689de39dc78..8b4061049d76 100644
--- a/net/sunrpc/auth_gss/gss_mech_switch.c
+++ b/net/sunrpc/auth_gss/gss_mech_switch.c
@@ -331,7 +331,7 @@ gss_delete_sec_context(struct gss_ctx	**context_handle)
 			*context_handle);
 
 	if (!*context_handle)
-		return(GSS_S_NO_CONTEXT);
+		return GSS_S_NO_CONTEXT;
 	if ((*context_handle)->internal_ctx_id)
 		(*context_handle)->mech_type->gm_ops
 			->gss_delete_sec_context((*context_handle)
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index cace6049e4a5..aa5dbda6608c 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -376,7 +376,7 @@ int rpc_queue_empty(struct rpc_wait_queue *queue)
 	spin_lock_bh(&queue->lock);
 	res = queue->qlen;
 	spin_unlock_bh(&queue->lock);
-	return (res == 0);
+	return res == 0;
 }
 EXPORT_SYMBOL_GPL(rpc_queue_empty);
 
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index c048543ffbeb..2ddc351b3be9 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -89,7 +89,7 @@ int tipc_addr_domain_valid(u32 addr)
 
 int tipc_addr_node_valid(u32 addr)
 {
-	return (tipc_addr_domain_valid(addr) && tipc_node(addr));
+	return tipc_addr_domain_valid(addr) && tipc_node(addr);
 }
 
 int tipc_in_scope(u32 domain, u32 addr)
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index b11248c2d788..ecfaac10d0b4 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -184,7 +184,7 @@ static void bclink_set_gap(struct tipc_node *n_ptr)
 
 static int bclink_ack_allowed(u32 n)
 {
-	return((n % TIPC_MIN_LINK_WIN) == tipc_own_tag);
+	return (n % TIPC_MIN_LINK_WIN) == tipc_own_tag;
 }
 
 
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 52ae17b2583e..9c10c6b7c12b 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -63,7 +63,7 @@ static int media_name_valid(const char *name)
 	len = strlen(name);
 	if ((len + 1) > TIPC_MAX_MEDIA_NAME)
 		return 0;
-	return (strspn(name, tipc_alphabet) == len);
+	return strspn(name, tipc_alphabet) == len;
 }
 
 /**
diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c
index 1885a7edb0c8..6569d45bfb9a 100644
--- a/net/tipc/dbg.c
+++ b/net/tipc/dbg.c
@@ -134,7 +134,7 @@ void tipc_printbuf_reset(struct print_buf *pb)
 
 int tipc_printbuf_empty(struct print_buf *pb)
 {
-	return (!pb->buf || (pb->crs == pb->buf));
+	return !pb->buf || (pb->crs == pb->buf);
 }
 
 /**
@@ -169,7 +169,7 @@ int tipc_printbuf_validate(struct print_buf *pb)
 			tipc_printf(pb, err);
 		}
 	}
-	return (pb->crs - pb->buf + 1);
+	return pb->crs - pb->buf + 1;
 }
 
 /**
diff --git a/net/tipc/link.c b/net/tipc/link.c
index a6a3102bb4d6..b8cf1e9d0b86 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -239,13 +239,13 @@ int tipc_link_is_up(struct link *l_ptr)
 {
 	if (!l_ptr)
 		return 0;
-	return (link_working_working(l_ptr) || link_working_unknown(l_ptr));
+	return link_working_working(l_ptr) || link_working_unknown(l_ptr);
 }
 
 int tipc_link_is_active(struct link *l_ptr)
 {
-	return ((l_ptr->owner->active_links[0] == l_ptr) ||
-		(l_ptr->owner->active_links[1] == l_ptr));
+	return	(l_ptr->owner->active_links[0] == l_ptr) ||
+		(l_ptr->owner->active_links[1] == l_ptr);
 }
 
 /**
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 2e5385c47d30..26151d30589d 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -279,12 +279,12 @@ static inline int between(u32 lower, u32 upper, u32 n)
 
 static inline int less_eq(u32 left, u32 right)
 {
-	return (mod(right - left) < 32768u);
+	return mod(right - left) < 32768u;
 }
 
 static inline int less(u32 left, u32 right)
 {
-	return (less_eq(left, right) && (mod(right) != mod(left)));
+	return less_eq(left, right) && (mod(right) != mod(left));
 }
 
 static inline u32 lesser(u32 left, u32 right)
@@ -299,32 +299,32 @@ static inline u32 lesser(u32 left, u32 right)
 
 static inline int link_working_working(struct link *l_ptr)
 {
-	return (l_ptr->state == WORKING_WORKING);
+	return l_ptr->state == WORKING_WORKING;
 }
 
 static inline int link_working_unknown(struct link *l_ptr)
 {
-	return (l_ptr->state == WORKING_UNKNOWN);
+	return l_ptr->state == WORKING_UNKNOWN;
 }
 
 static inline int link_reset_unknown(struct link *l_ptr)
 {
-	return (l_ptr->state == RESET_UNKNOWN);
+	return l_ptr->state == RESET_UNKNOWN;
 }
 
 static inline int link_reset_reset(struct link *l_ptr)
 {
-	return (l_ptr->state == RESET_RESET);
+	return l_ptr->state == RESET_RESET;
 }
 
 static inline int link_blocked(struct link *l_ptr)
 {
-	return (l_ptr->exp_msg_count || l_ptr->blocked);
+	return l_ptr->exp_msg_count || l_ptr->blocked;
 }
 
 static inline int link_congested(struct link *l_ptr)
 {
-	return (l_ptr->out_queue_size >= l_ptr->queue_limit[0]);
+	return l_ptr->out_queue_size >= l_ptr->queue_limit[0];
 }
 
 #endif
diff --git a/net/tipc/msg.h b/net/tipc/msg.h
index 995d2da35b01..031aad18efce 100644
--- a/net/tipc/msg.h
+++ b/net/tipc/msg.h
@@ -104,7 +104,7 @@ static inline u32 msg_user(struct tipc_msg *m)
 
 static inline u32 msg_isdata(struct tipc_msg *m)
 {
-	return (msg_user(m) <= TIPC_CRITICAL_IMPORTANCE);
+	return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE;
 }
 
 static inline void msg_set_user(struct tipc_msg *m, u32 n)
@@ -289,7 +289,7 @@ static inline void msg_set_destnode(struct tipc_msg *m, u32 a)
 
 static inline int msg_is_dest(struct tipc_msg *m, u32 d)
 {
-	return(msg_short(m) || (msg_destnode(m) == d));
+	return msg_short(m) || (msg_destnode(m) == d);
 }
 
 static inline u32 msg_routed(struct tipc_msg *m)
@@ -632,7 +632,7 @@ static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n)
 
 static inline u32 msg_max_pkt(struct tipc_msg *m)
 {
-	return (msg_bits(m, 9, 16, 0xffff) * 4);
+	return msg_bits(m, 9, 16, 0xffff) * 4;
 }
 
 static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n)
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index c13c2c7c4b57..9ca4b0689237 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -116,7 +116,7 @@ DEFINE_RWLOCK(tipc_nametbl_lock);
 
 static int hash(int x)
 {
-	return(x & (tipc_nametbl_size - 1));
+	return x & (tipc_nametbl_size - 1);
 }
 
 /**
diff --git a/net/tipc/node.c b/net/tipc/node.c
index b702c7bf580f..7c49cd056df7 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -242,17 +242,17 @@ int tipc_node_has_active_links(struct tipc_node *n_ptr)
 
 int tipc_node_has_redundant_links(struct tipc_node *n_ptr)
 {
-	return (n_ptr->working_links > 1);
+	return n_ptr->working_links > 1;
 }
 
 static int tipc_node_has_active_routes(struct tipc_node *n_ptr)
 {
-	return (n_ptr && (n_ptr->last_router >= 0));
+	return n_ptr && (n_ptr->last_router >= 0);
 }
 
 int tipc_node_is_up(struct tipc_node *n_ptr)
 {
-	return (tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr));
+	return tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr);
 }
 
 struct tipc_node *tipc_node_attach_link(struct link *l_ptr)
diff --git a/net/tipc/port.h b/net/tipc/port.h
index 8d1652aab298..e74bd9563739 100644
--- a/net/tipc/port.h
+++ b/net/tipc/port.h
@@ -157,7 +157,7 @@ static inline u32 tipc_peer_node(struct port *p_ptr)
 
 static inline int tipc_port_congested(struct port *p_ptr)
 {
-	return((p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2));
+	return (p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2);
 }
 
 /**
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index f7ac94de24fe..33217fc3d697 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1195,7 +1195,7 @@ static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base)
 	if (msg_connected(msg))
 		threshold *= 4;
 
-	return (queue_size >= threshold);
+	return queue_size >= threshold;
 }
 
 /**
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index ab6eab4c45e2..1a5b9a6bd128 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -604,6 +604,6 @@ int tipc_ispublished(struct tipc_name const *name)
 {
 	u32 domain = 0;
 
-	return(tipc_nametbl_translate(name->type, name->instance,&domain) != 0);
+	return tipc_nametbl_translate(name->type, name->instance, &domain) != 0;
 }
 
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 37580e090a3d..5d89310b3587 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -86,7 +86,7 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy)
 static inline
 bool wiphy_idx_valid(int wiphy_idx)
 {
-	return (wiphy_idx >= 0);
+	return wiphy_idx >= 0;
 }
 
 
-- 
cgit v1.2.3-58-ga151


From 59104f062435c7816e39ee5ed504a69cb8037f10 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 20 Sep 2010 20:16:27 +0000
Subject: ip: take care of last fragment in ip_append_data

While investigating a bit, I found that the ip_fragment() slow path was
taken because ip_append_data() provides the following layout for a
send(MTU + N*(MTU - 20)) syscall:

- one skb with 1500 (mtu) bytes
- N fragments of 1480 (mtu-20) bytes (before adding IP header)

The last fragment gets 17 bytes of trailer data because of the following
bit:

	if (datalen == length + fraggap)
		alloclen += rt->dst.trailer_len;

Then esp4 adds 16 bytes of data (while trailer_len is 17... hmm...
another bug ?)

In ip_fragment(), we notice that the last fragment is too big
((1496 + 20) > mtu), so we take the slow path, building another skb chain.

In order to avoid taking the slow path, we should correct ip_append_data()
to make sure the last fragment has real trailer space, under the mtu...

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_output.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index e42762023c27..3551b6dc7419 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -926,16 +926,19 @@ alloc_new_skb:
 			    !(rt->dst.dev->features&NETIF_F_SG))
 				alloclen = mtu;
 			else
-				alloclen = datalen + fragheaderlen;
+				alloclen = fraglen;
 
 			/* The last fragment gets additional space at tail.
 			 * Note, with MSG_MORE we overallocate on fragments,
 			 * because we have no idea what fragment will be
 			 * the last.
 			 */
-			if (datalen == length + fraggap)
+			if (datalen == length + fraggap) {
 				alloclen += rt->dst.trailer_len;
-
+				/* make sure mtu is not reached */
+				if (datalen > mtu - fragheaderlen - rt->dst.trailer_len)
+					datalen -= ALIGN(rt->dst.trailer_len, 8);
+			}
 			if (transhdrlen) {
 				skb = sock_alloc_send_skb(sk,
 						alloclen + hh_len + 15,
-- 
cgit v1.2.3-58-ga151


From 83180af0b0ea166adf8249f4513beb7355f9b4c9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 23 Sep 2010 21:46:03 +0000
Subject: net: fix rcu use in ip_route_output_slow

__in_dev_get_rtnl(dev_out) is called while RTNL is not held, thus
triggers a lockdep fault.

At this point, we only perform a raw test of dev_out->ip_ptr being NULL;
we don't need to make sure ip_ptr can't change right afterwards.

We can use rcu_dereference_raw() for this.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ae1d4a41f1c6..98beda47bc99 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2579,7 +2579,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			goto out;
 
 		/* RACE: Check return value of inet_select_addr instead. */
-		if (__in_dev_get_rtnl(dev_out) == NULL) {
+		if (rcu_dereference_raw(dev_out->ip_ptr) == NULL) {
 			dev_put(dev_out);
 			goto out;	/* Wrong error code */
 		}
-- 
cgit v1.2.3-58-ga151


From e985aad723d7709e6bee566bacb100d33d9b791b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 27 Sep 2010 03:57:11 +0000
Subject: ip_gre: percpu stats accounting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Le lundi 27 septembre 2010 à 14:29 +0100, Ben Hutchings a écrit :

> > diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
> > index 5d6ddcb..de39b22 100644
> > --- a/net/ipv4/ip_gre.c
> > +++ b/net/ipv4/ip_gre.c
> [...]
> > @@ -377,7 +405,7 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
> >  	if (parms->name[0])
> >  		strlcpy(name, parms->name, IFNAMSIZ);
> >  	else
> > -		sprintf(name, "gre%%d");
> > +		strcpy(name, "gre%d");
> >
> >  	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
> >  	if (!dev)
> [...]
>
> This is a valid fix, but doesn't belong in this patch!
>

Sorry ? It was not a fix, but at most a cleanup ;)

Anyway I forgot the gretap case...

[PATCH 2/4 v2] ip_gre: percpu stats accounting

Maintain per_cpu tx_bytes, tx_packets, rx_bytes, rx_packets.

Other seldom-used fields are kept in the netdev->stats structure, possibly
unsafe.

This is preliminary work to support a lockless transmit path and to
correct the RX stats, which are already unsafe.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 143 +++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 104 insertions(+), 39 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 5d6ddcb7403b..a1b5d5e03064 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -165,6 +165,34 @@ struct ipgre_net {
 #define for_each_ip_tunnel_rcu(start) \
 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
+{
+	struct pcpu_tstats sum = { 0 };
+	int i;
+
+	for_each_possible_cpu(i) {
+		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+		sum.rx_packets += tstats->rx_packets;
+		sum.rx_bytes   += tstats->rx_bytes;
+		sum.tx_packets += tstats->tx_packets;
+		sum.tx_bytes   += tstats->tx_bytes;
+	}
+	dev->stats.rx_packets = sum.rx_packets;
+	dev->stats.rx_bytes   = sum.rx_bytes;
+	dev->stats.tx_packets = sum.tx_packets;
+	dev->stats.tx_bytes   = sum.tx_bytes;
+	return &dev->stats;
+}
+
 /* Given src, dst and key, find appropriate for input tunnel. */
 
 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
@@ -584,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
 					  iph->saddr, iph->daddr, key,
 					  gre_proto))) {
-		struct net_device_stats *stats = &tunnel->dev->stats;
+		struct pcpu_tstats *tstats;
 
 		secpath_reset(skb);
 
@@ -608,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb)
 			/* Looped back packet, drop it! */
 			if (skb_rtable(skb)->fl.iif == 0)
 				goto drop;
-			stats->multicast++;
+			tunnel->dev->stats.multicast++;
 			skb->pkt_type = PACKET_BROADCAST;
 		}
 #endif
 
 		if (((flags&GRE_CSUM) && csum) ||
 		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
-			stats->rx_crc_errors++;
-			stats->rx_errors++;
+			tunnel->dev->stats.rx_crc_errors++;
+			tunnel->dev->stats.rx_errors++;
 			goto drop;
 		}
 		if (tunnel->parms.i_flags&GRE_SEQ) {
 			if (!(flags&GRE_SEQ) ||
 			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
-				stats->rx_fifo_errors++;
-				stats->rx_errors++;
+				tunnel->dev->stats.rx_fifo_errors++;
+				tunnel->dev->stats.rx_errors++;
 				goto drop;
 			}
 			tunnel->i_seqno = seqno + 1;
@@ -632,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb)
 		/* Warning: All skb pointers will be invalidated! */
 		if (tunnel->dev->type == ARPHRD_ETHER) {
 			if (!pskb_may_pull(skb, ETH_HLEN)) {
-				stats->rx_length_errors++;
-				stats->rx_errors++;
+				tunnel->dev->stats.rx_length_errors++;
+				tunnel->dev->stats.rx_errors++;
 				goto drop;
 			}
 
@@ -642,13 +670,17 @@ static int ipgre_rcv(struct sk_buff *skb)
 			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 		}
 
-		skb_tunnel_rx(skb, tunnel->dev);
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, tunnel->dev);
 
 		skb_reset_network_header(skb);
 		ipgre_ecn_decapsulate(iph, skb);
 
 		if (netif_rx(skb) == NET_RX_DROP)
-			stats->rx_dropped++;
+			tunnel->dev->stats.rx_dropped++;
 
 		rcu_read_unlock();
 		return 0;
@@ -665,8 +697,7 @@ drop_nolock:
 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+	struct pcpu_tstats *tstats;
 	struct iphdr  *old_iph = ip_hdr(skb);
 	struct iphdr  *tiph;
 	u8     tos;
@@ -694,7 +725,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 		/* NBMA tunnel */
 
 		if (skb_dst(skb) == NULL) {
-			stats->tx_fifo_errors++;
+			dev->stats.tx_fifo_errors++;
 			goto tx_error;
 		}
 
@@ -740,14 +771,20 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	}
 
 	{
-		struct flowi fl = { .oif = tunnel->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = dst,
-						.saddr = tiph->saddr,
-						.tos = RT_TOS(tos) } },
-				    .proto = IPPROTO_GRE };
+		struct flowi fl = {
+			.oif = tunnel->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = dst,
+					.saddr = tiph->saddr,
+					.tos = RT_TOS(tos)
+				}
+			},
+			.proto = IPPROTO_GRE
+		}
+;
 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
-			stats->tx_carrier_errors++;
+			dev->stats.tx_carrier_errors++;
 			goto tx_error;
 		}
 	}
@@ -755,7 +792,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 
 	if (tdev == dev) {
 		ip_rt_put(rt);
-		stats->collisions++;
+		dev->stats.collisions++;
 		goto tx_error;
 	}
 
@@ -818,7 +855,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 			dev->needed_headroom = max_headroom;
 		if (!new_skb) {
 			ip_rt_put(rt);
-			txq->tx_dropped++;
+			dev->stats.tx_dropped++;
 			dev_kfree_skb(skb);
 			return NETDEV_TX_OK;
 		}
@@ -885,15 +922,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
 	}
 
 	nf_reset(skb);
-
-	IPTUNNEL_XMIT();
+	tstats = this_cpu_ptr(dev->tstats);
+	__IPTUNNEL_XMIT(tstats, &dev->stats);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
 	dst_link_failure(skb);
 
 tx_error:
-	stats->tx_errors++;
+	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
 }
@@ -913,13 +950,19 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev)
 	/* Guess output device to choose reasonable mtu and needed_headroom */
 
 	if (iph->daddr) {
-		struct flowi fl = { .oif = tunnel->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = iph->daddr,
-						.saddr = iph->saddr,
-						.tos = RT_TOS(iph->tos) } },
-				    .proto = IPPROTO_GRE };
+		struct flowi fl = {
+			.oif = tunnel->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.saddr = iph->saddr,
+					.tos = RT_TOS(iph->tos)
+				}
+			},
+			.proto = IPPROTO_GRE
+		};
 		struct rtable *rt;
+
 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
 			tdev = rt->dst.dev;
 			ip_rt_put(rt);
@@ -1171,13 +1214,19 @@ static int ipgre_open(struct net_device *dev)
 	struct ip_tunnel *t = netdev_priv(dev);
 
 	if (ipv4_is_multicast(t->parms.iph.daddr)) {
-		struct flowi fl = { .oif = t->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = t->parms.iph.daddr,
-						.saddr = t->parms.iph.saddr,
-						.tos = RT_TOS(t->parms.iph.tos) } },
-				    .proto = IPPROTO_GRE };
+		struct flowi fl = {
+			.oif = t->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = t->parms.iph.daddr,
+					.saddr = t->parms.iph.saddr,
+					.tos = RT_TOS(t->parms.iph.tos)
+				}
+			},
+			.proto = IPPROTO_GRE
+		};
 		struct rtable *rt;
+
 		if (ip_route_output_key(dev_net(dev), &rt, &fl))
 			return -EADDRNOTAVAIL;
 		dev = rt->dst.dev;
@@ -1217,12 +1266,19 @@ static const struct net_device_ops ipgre_netdev_ops = {
 	.ndo_start_xmit		= ipgre_tunnel_xmit,
 	.ndo_do_ioctl		= ipgre_tunnel_ioctl,
 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+	.ndo_get_stats		= ipgre_get_stats,
 };
 
+static void ipgre_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
 static void ipgre_tunnel_setup(struct net_device *dev)
 {
 	dev->netdev_ops		= &ipgre_netdev_ops;
-	dev->destructor 	= free_netdev;
+	dev->destructor 	= ipgre_dev_free;
 
 	dev->type		= ARPHRD_IPGRE;
 	dev->needed_headroom 	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
@@ -1260,6 +1316,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
 	} else
 		dev->header_ops = &ipgre_header_ops;
 
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -1446,6 +1506,10 @@ static int ipgre_tap_init(struct net_device *dev)
 
 	ipgre_tunnel_bind_dev(dev);
 
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -1456,6 +1520,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = {
 	.ndo_set_mac_address 	= eth_mac_addr,
 	.ndo_validate_addr	= eth_validate_addr,
 	.ndo_change_mtu		= ipgre_tunnel_change_mtu,
+	.ndo_get_stats		= ipgre_get_stats,
 };
 
 static void ipgre_tap_setup(struct net_device *dev)
@@ -1464,7 +1529,7 @@ static void ipgre_tap_setup(struct net_device *dev)
 	ether_setup(dev);
 
 	dev->netdev_ops		= &ipgre_tap_netdev_ops;
-	dev->destructor 	= free_netdev;
+	dev->destructor 	= ipgre_dev_free;
 
 	dev->iflink		= 0;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
-- 
cgit v1.2.3-58-ga151


From 3c97af99a5aa17feaebb4eb0f85f51ab6c055797 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 27 Sep 2010 00:35:50 +0000
Subject: ipip: percpu stats accounting

Maintain per_cpu tx_bytes, tx_packets, rx_bytes, rx_packets.

Other seldom-used fields are kept in the netdev->stats structure, possibly
unsafe.

This is preliminary work to support a lockless transmit path and to
correct the RX stats, which are already unsafe.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipip.c | 127 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 93 insertions(+), 34 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index babd25278106..12b6fde6f65a 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -131,8 +131,9 @@ struct ipip_net {
 	struct net_device *fb_tunnel_dev;
 };
 
-static void ipip_tunnel_init(struct net_device *dev);
+static int ipip_tunnel_init(struct net_device *dev);
 static void ipip_tunnel_setup(struct net_device *dev);
+static void ipip_dev_free(struct net_device *dev);
 
 /*
  * Locking : hash tables are protected by RCU and RTNL
@@ -141,6 +142,34 @@ static void ipip_tunnel_setup(struct net_device *dev);
 #define for_each_ip_tunnel_rcu(start) \
 	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
 
+/* often modified stats are per cpu, other are shared (netdev->stats) */
+struct pcpu_tstats {
+	unsigned long	rx_packets;
+	unsigned long	rx_bytes;
+	unsigned long	tx_packets;
+	unsigned long	tx_bytes;
+};
+
+static struct net_device_stats *ipip_get_stats(struct net_device *dev)
+{
+	struct pcpu_tstats sum = { 0 };
+	int i;
+
+	for_each_possible_cpu(i) {
+		const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
+
+		sum.rx_packets += tstats->rx_packets;
+		sum.rx_bytes   += tstats->rx_bytes;
+		sum.tx_packets += tstats->tx_packets;
+		sum.tx_bytes   += tstats->tx_bytes;
+	}
+	dev->stats.rx_packets = sum.rx_packets;
+	dev->stats.rx_bytes   = sum.rx_bytes;
+	dev->stats.tx_packets = sum.tx_packets;
+	dev->stats.tx_bytes   = sum.tx_bytes;
+	return &dev->stats;
+}
+
 static struct ip_tunnel * ipip_tunnel_lookup(struct net *net,
 		__be32 remote, __be32 local)
 {
@@ -239,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 	if (parms->name[0])
 		strlcpy(name, parms->name, IFNAMSIZ);
 	else
-		sprintf(name, "tunl%%d");
+		strcpy(name, "tunl%d");
 
 	dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup);
 	if (dev == NULL)
@@ -255,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 	nt = netdev_priv(dev);
 	nt->parms = *parms;
 
-	ipip_tunnel_init(dev);
+	if (ipip_tunnel_init(dev) < 0)
+		goto failed_free;
 
 	if (register_netdevice(dev) < 0)
 		goto failed_free;
@@ -265,7 +295,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net,
 	return nt;
 
 failed_free:
-	free_netdev(dev);
+	ipip_dev_free(dev);
 	return NULL;
 }
 
@@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb)
 	const struct iphdr *iph = ip_hdr(skb);
 
 	rcu_read_lock();
-	if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev),
-					iph->saddr, iph->daddr)) != NULL) {
+	tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr);
+	if (tunnel != NULL) {
+		struct pcpu_tstats *tstats;
+
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
 			rcu_read_unlock();
 			kfree_skb(skb);
@@ -374,7 +406,11 @@ static int ipip_rcv(struct sk_buff *skb)
 		skb->protocol = htons(ETH_P_IP);
 		skb->pkt_type = PACKET_HOST;
 
-		skb_tunnel_rx(skb, tunnel->dev);
+		tstats = this_cpu_ptr(tunnel->dev->tstats);
+		tstats->rx_packets++;
+		tstats->rx_bytes += skb->len;
+
+		__skb_tunnel_rx(skb, tunnel->dev);
 
 		ipip_ecn_decapsulate(iph, skb);
 
@@ -397,13 +433,12 @@ static int ipip_rcv(struct sk_buff *skb)
 static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
-	struct net_device_stats *stats = &dev->stats;
-	struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
+	struct pcpu_tstats *tstats;
 	struct iphdr  *tiph = &tunnel->parms.iph;
 	u8     tos = tunnel->parms.iph.tos;
 	__be16 df = tiph->frag_off;
 	struct rtable *rt;     			/* Route to the other host */
-	struct net_device *tdev;			/* Device to other host */
+	struct net_device *tdev;		/* Device to other host */
 	struct iphdr  *old_iph = ip_hdr(skb);
 	struct iphdr  *iph;			/* Our new IP header */
 	unsigned int max_headroom;		/* The extra header space needed */
@@ -413,13 +448,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	if (skb->protocol != htons(ETH_P_IP))
 		goto tx_error;
 
-	if (tos&1)
+	if (tos & 1)
 		tos = old_iph->tos;
 
 	if (!dst) {
 		/* NBMA tunnel */
 		if ((rt = skb_rtable(skb)) == NULL) {
-			stats->tx_fifo_errors++;
+			dev->stats.tx_fifo_errors++;
 			goto tx_error;
 		}
 		if ((dst = rt->rt_gateway) == 0)
@@ -427,14 +462,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 	}
 
 	{
-		struct flowi fl = { .oif = tunnel->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = dst,
-						.saddr = tiph->saddr,
-						.tos = RT_TOS(tos) } },
-				    .proto = IPPROTO_IPIP };
+		struct flowi fl = {
+			.oif = tunnel->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = dst,
+					.saddr = tiph->saddr,
+					.tos = RT_TOS(tos)
+				}
+			},
+			.proto = IPPROTO_IPIP
+		};
+
 		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
-			stats->tx_carrier_errors++;
+			dev->stats.tx_carrier_errors++;
 			goto tx_error_icmp;
 		}
 	}
@@ -442,7 +483,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	if (tdev == dev) {
 		ip_rt_put(rt);
-		stats->collisions++;
+		dev->stats.collisions++;
 		goto tx_error;
 	}
 
@@ -452,7 +493,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr);
 
 		if (mtu < 68) {
-			stats->collisions++;
+			dev->stats.collisions++;
 			ip_rt_put(rt);
 			goto tx_error;
 		}
@@ -488,7 +529,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
 		if (!new_skb) {
 			ip_rt_put(rt);
-			txq->tx_dropped++;
+			dev->stats.tx_dropped++;
 			dev_kfree_skb(skb);
 			return NETDEV_TX_OK;
 		}
@@ -525,14 +566,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
 		iph->ttl	=	old_iph->ttl;
 
 	nf_reset(skb);
-
-	IPTUNNEL_XMIT();
+	tstats = this_cpu_ptr(dev->tstats);
+	__IPTUNNEL_XMIT(tstats, &dev->stats);
 	return NETDEV_TX_OK;
 
 tx_error_icmp:
 	dst_link_failure(skb);
 tx_error:
-	stats->tx_errors++;
+	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
 	return NETDEV_TX_OK;
 }
@@ -547,13 +588,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev)
 	iph = &tunnel->parms.iph;
 
 	if (iph->daddr) {
-		struct flowi fl = { .oif = tunnel->parms.link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = iph->daddr,
-						.saddr = iph->saddr,
-						.tos = RT_TOS(iph->tos) } },
-				    .proto = IPPROTO_IPIP };
+		struct flowi fl = {
+			.oif = tunnel->parms.link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.saddr = iph->saddr,
+					.tos = RT_TOS(iph->tos)
+				}
+			},
+			.proto = IPPROTO_IPIP
+		};
 		struct rtable *rt;
+
 		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
 			tdev = rt->dst.dev;
 			ip_rt_put(rt);
@@ -699,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = {
 	.ndo_start_xmit	= ipip_tunnel_xmit,
 	.ndo_do_ioctl	= ipip_tunnel_ioctl,
 	.ndo_change_mtu	= ipip_tunnel_change_mtu,
-
+	.ndo_get_stats  = ipip_get_stats,
 };
 
+static void ipip_dev_free(struct net_device *dev)
+{
+	free_percpu(dev->tstats);
+	free_netdev(dev);
+}
+
 static void ipip_tunnel_setup(struct net_device *dev)
 {
 	dev->netdev_ops		= &ipip_netdev_ops;
-	dev->destructor		= free_netdev;
+	dev->destructor		= ipip_dev_free;
 
 	dev->type		= ARPHRD_TUNNEL;
 	dev->hard_header_len 	= LL_MAX_HEADER + sizeof(struct iphdr);
@@ -717,7 +770,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 }
 
-static void ipip_tunnel_init(struct net_device *dev)
+static int ipip_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 
@@ -728,6 +781,12 @@ static void ipip_tunnel_init(struct net_device *dev)
 	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
 
 	ipip_tunnel_bind_dev(dev);
+
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
+	return 0;
 }
 
 static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
-- 
cgit v1.2.3-58-ga151


From 4465b469008bc03b98a1b8df4e9ae501b6c69d4b Mon Sep 17 00:00:00 2001
From: Tom Herbert <therbert@google.com>
Date: Sun, 23 May 2010 19:54:12 +0000
Subject: ipv4: Allow configuring subnets as local addresses

This patch allows a host to be configured to respond to any address in
a specified range as if it were local, without actually needing to
configure the address on an interface.  This is done through routing
table configuration.  For instance, to configure a host to respond
to any address in 10.1/16 received on eth0 as a local address we can do:

ip rule add from all iif eth0 lookup 200
ip route add local 10.1/16 dev lo proto kernel scope host src 127.0.0.1 table 200

This host is now reachable by any 10.1/16 address (route lookup on
input for packets received on eth0 can find the route).  On output, the
rule will not be matched so that this host can still send packets to
10.1/16 (not sent on loopback).  Presumably, external routing can be
configured to make sense out of this.

To make this work, we needed to modify the logic in finding the
interface which is assigned a given source address for output
(ip_dev_find).  We perform a normal fib_lookup instead of just a
lookup on the local table, and in the lookup we ignore the input
interface for matching.

This patch is useful to implement IP-anycast for subnets of virtual
addresses.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/flow.h      | 1 +
 net/core/fib_rules.c    | 3 ++-
 net/ipv4/fib_frontend.c | 7 +++----
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/flow.h b/include/net/flow.h
index bb08692a20b0..0ac3fb5e0973 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -49,6 +49,7 @@ struct flowi {
 	__u8	proto;
 	__u8	flags;
 #define FLOWI_FLAG_ANYSRC 0x01
+#define FLOWI_FLAG_MATCH_ANY_IIF 0x02
 	union {
 		struct {
 			__be16	sport;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index d0787284cb07..332c2e31d048 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -182,7 +182,8 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops,
 {
 	int ret = 0;
 
-	if (rule->iifindex && (rule->iifindex != fl->iif))
+	if (rule->iifindex && (rule->iifindex != fl->iif) &&
+	    !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF))
 		goto out;
 
 	if (rule->oifindex && (rule->oifindex != fl->oif))
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7d02a9f999fa..981f3c59b334 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -153,17 +153,16 @@ static void fib_flush(struct net *net)
 
 struct net_device * ip_dev_find(struct net *net, __be32 addr)
 {
-	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } };
+	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } },
+			    .flags = FLOWI_FLAG_MATCH_ANY_IIF };
 	struct fib_result res;
 	struct net_device *dev = NULL;
-	struct fib_table *local_table;
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	res.r = NULL;
 #endif
 
-	local_table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!local_table || fib_table_lookup(local_table, &fl, &res))
+	if (fib_lookup(net, &fl, &res))
 		return NULL;
 	if (res.type != RTN_LOCAL)
 		goto out;
-- 
cgit v1.2.3-58-ga151


From fada5636fe41fd1423fe4e6af7b9f609378acde6 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 27 Sep 2010 23:56:46 +0000
Subject: ipip: fix percpu stats accounting

commit 3c97af99a5aa1 (ipip: percpu stats accounting) forgot the fallback
tunnel case (tunl0): its percpu stats were never allocated, so it can
crash pretty fast.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipip.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 12b6fde6f65a..9e78f11da786 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -789,7 +789,7 @@ static int ipip_tunnel_init(struct net_device *dev)
 	return 0;
 }
 
-static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
+static int __net_init ipip_fb_tunnel_init(struct net_device *dev)
 {
 	struct ip_tunnel *tunnel = netdev_priv(dev);
 	struct iphdr *iph = &tunnel->parms.iph;
@@ -802,8 +802,13 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev)
 	iph->protocol		= IPPROTO_IPIP;
 	iph->ihl		= 5;
 
+	dev->tstats = alloc_percpu(struct pcpu_tstats);
+	if (!dev->tstats)
+		return -ENOMEM;
+
 	dev_hold(dev);
 	rcu_assign_pointer(ipn->tunnels_wc[0], tunnel);
+	return 0;
 }
 
 static struct xfrm_tunnel ipip_handler __read_mostly = {
@@ -852,7 +857,9 @@ static int __net_init ipip_init_net(struct net *net)
 	}
 	dev_net_set(ipn->fb_tunnel_dev, net);
 
-	ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+	err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev);
+	if (err)
+		goto err_reg_dev;
 
 	if ((err = register_netdev(ipn->fb_tunnel_dev)))
 		goto err_reg_dev;
@@ -860,7 +867,7 @@ static int __net_init ipip_init_net(struct net *net)
 	return 0;
 
 err_reg_dev:
-	free_netdev(ipn->fb_tunnel_dev);
+	ipip_dev_free(ipn->fb_tunnel_dev);
 err_alloc_dev:
 	/* nothing */
 	return err;
-- 
cgit v1.2.3-58-ga151


From b790e01aee74c23a5d92576177934f13aa51f718 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 27 Sep 2010 23:05:47 +0000
Subject: ip_gre: lockless xmit

GRE tunnels can benefit from lockless xmits, using NETIF_F_LLTX

Note: If tunnels are created with the "oseq" option, LLTX is not
enabled :

Even using an atomic_t o_seq, we would increase the chance of packets
arriving out of order at the receiver.

Bench on a 16 cpus machine (dual E5540 cpus), 16 threads sending
10000000 UDP frames via one gre tunnel (size:200 bytes per frame)

Before patch :
real	3m0.094s
user	0m9.365s
sys	47m50.103s

After patch:
real	0m29.756s
user	0m11.097s
sys	7m33.012s

Last problem to solve is the contention on dst :

38660.00 21.4% __ip_route_output_key          vmlinux
20786.00 11.5% dst_release                    vmlinux
14191.00  7.8% __xfrm_lookup                  vmlinux
12410.00  6.9% ip_finish_output               vmlinux
 4540.00  2.5% ip_push_pending_frames         vmlinux
 4427.00  2.4% ip_append_data                 vmlinux
 4265.00  2.4% __alloc_skb                    vmlinux
 4140.00  2.3% __ip_local_out                 vmlinux
 3991.00  2.2% dev_queue_xmit                 vmlinux

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index a1b5d5e03064..035db6350037 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1557,6 +1557,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla
 	if (!tb[IFLA_MTU])
 		dev->mtu = mtu;
 
+	/* Can use a lockless transmit, unless we generate output sequences */
+	if (!(nt->parms.o_flags & GRE_SEQ))
+		dev->features |= NETIF_F_LLTX;
+
 	err = register_netdevice(dev);
 	if (err)
 		goto out;
-- 
cgit v1.2.3-58-ga151


From 153f0943382e9ae0bff7caa110a1a4656088d0d4 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 28 Sep 2010 00:17:17 +0000
Subject: ipip: enable lockless xmits

IPIP tunnels can benefit from lockless xmits, using NETIF_F_LLTX

Bench on a 16 cpus machine (dual E5540 cpus), 16 threads sending
10000000 UDP frames via one ipip tunnel (size:200 bytes per frame)

Before patch :
real	2m53.321s
user	0m10.277s
sys	46m0.597s

After patch:
real	0m32.063s
user	0m9.237s
sys	8m16.255s

Last problem to solve is the contention on dst :

16118.00 28.3% __ip_route_output_key         vmlinux
 6135.00 10.8% dst_release                   vmlinux
 3220.00  5.6% ip_finish_output              vmlinux
 2149.00  3.8% ip_route_output_flow          vmlinux
 1575.00  2.8% ip_append_data                vmlinux
 1481.00  2.6% ip_push_pending_frames        vmlinux
 1349.00  2.4% __xfrm_lookup                 vmlinux
 1216.00  2.1% csum_partial_copy_generic     vmlinux
 1208.00  2.1% udp_sendmsg                   vmlinux

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipip.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 9e78f11da786..6ad46c28ede2 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -767,6 +767,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	dev->iflink		= 0;
 	dev->addr_len		= 4;
 	dev->features		|= NETIF_F_NETNS_LOCAL;
+	dev->features		|= NETIF_F_LLTX;
 	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
 }
 
-- 
cgit v1.2.3-58-ga151


From a64de47c091e4a337fa9763315cb6f2fbf0c583b Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Tue, 28 Sep 2010 17:08:02 +0000
Subject: arp: remove unnecessary export of arp_broken_ops

arp_broken_ops is only used in arp.c

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/arp.h | 2 --
 net/ipv4/arp.c    | 3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/arp.h b/include/net/arp.h
index 716f43c5c98e..f4cf6ce66586 100644
--- a/include/net/arp.h
+++ b/include/net/arp.h
@@ -26,6 +26,4 @@ extern struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip,
 				  const unsigned char *target_hw);
 extern void arp_xmit(struct sk_buff *skb);
 
-extern const struct neigh_ops arp_broken_ops;
-
 #endif	/* _ARP_H */
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 4083c186fd30..d9031ad67826 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = {
 	.queue_xmit =		dev_queue_xmit,
 };
 
-const struct neigh_ops arp_broken_ops = {
+static const struct neigh_ops arp_broken_ops = {
 	.family =		AF_INET,
 	.solicit =		arp_solicit,
 	.error_report =		arp_error_report,
@@ -170,7 +170,6 @@ const struct neigh_ops arp_broken_ops = {
 	.hh_output =		dev_queue_xmit,
 	.queue_xmit =		dev_queue_xmit,
 };
-EXPORT_SYMBOL(arp_broken_ops);
 
 struct neigh_table arp_tbl = {
 	.family		= AF_INET,
-- 
cgit v1.2.3-58-ga151


From 1b9f409293529da4630bfc5d6d8e7d7451a6ccb5 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Tue, 28 Sep 2010 19:30:14 +0000
Subject: tcp: tcp_enter_quickack_mode can be static

Function only used in tcp_input.c

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/tcp.h    | 2 --
 net/ipv4/tcp_input.c | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 914a60c7ad62..4fee0424af7e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -346,8 +346,6 @@ static inline void tcp_dec_quickack_mode(struct sock *sk,
 	}
 }
 
-extern void tcp_enter_quickack_mode(struct sock *sk);
-
 #define	TCP_ECN_OK		1
 #define	TCP_ECN_QUEUE_CWR	2
 #define	TCP_ECN_DEMAND_CWR	4
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fabc09a58d7f..eaf20e7e61da 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk)
 		icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
 }
 
-void tcp_enter_quickack_mode(struct sock *sk)
+static void tcp_enter_quickack_mode(struct sock *sk)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	tcp_incr_quickack(sk);
-- 
cgit v1.2.3-58-ga151


From 6d0722a2cec2c23db3b0855ff8bb433175a16b44 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 29 Sep 2010 23:35:10 -0700
Subject: ip_gre: comments change

HARD_TX_LOCK no longer protects tunnels from dead loops;
the xmit_recursion percpu counter does.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ip_gre.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 035db6350037..fbe2c473a06a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -64,13 +64,13 @@
    We cannot track such dead loops during route installation,
    it is infeasible task. The most general solutions would be
    to keep skb->encapsulation counter (sort of local ttl),
-   and silently drop packet when it expires. It is the best
+   and silently drop packet when it expires. It is a good
    solution, but it supposes maintaing new variable in ALL
    skb, even if no tunneling is used.
 
-   Current solution: HARD_TX_LOCK lock breaks dead loops.
-
-
+   Current solution: xmit_recursion breaks dead loops. This is a percpu
+   counter, since when we enter the first ndo_xmit(), cpu migration is
+   forbidden. We force an exit if this counter reaches RECURSION_LIMIT
 
    2. Networking dead loops would not kill routers, but would really
    kill network. IP hop limit plays role of "t->recursion" in this case,
-- 
cgit v1.2.3-58-ga151


From dd28d1a0b5ecc0f5512f658b1a8fd38bc4f4c98c Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 29 Sep 2010 11:53:50 +0000
Subject: ipv4: __mkroute_output() speedup

While doing stress tests with a disabled IP route cache, I found
__mkroute_output() was touching the in_device atomic refcount three times.

Use RCU to touch it once to reduce cache line ping pongs.

Before patch

time to perform the test
real	1m42.009s
user	0m12.545s
sys	25m0.726s

Profile :

16109.00 26.4% ip_route_output_slow   vmlinux
 7434.00 12.2% dst_destroy            vmlinux
 3280.00  5.4% fib_rules_lookup       vmlinux
 3252.00  5.3% fib_semantic_match     vmlinux
 2622.00  4.3% fib_table_lookup       vmlinux
 2535.00  4.1% dst_alloc              vmlinux
 1750.00  2.9% _raw_read_lock         vmlinux
 1532.00  2.5% rt_set_nexthop         vmlinux

After patch

real	1m36.503s
user	0m12.977s
sys	23m25.608s

14234.00 22.4% ip_route_output_slow   vmlinux
 8717.00 13.7% dst_destroy            vmlinux
 4052.00  6.4% fib_rules_lookup       vmlinux
 3951.00  6.2% fib_semantic_match     vmlinux
 3191.00  5.0% dst_alloc              vmlinux
 1764.00  2.8% fib_table_lookup       vmlinux
 1692.00  2.7% _raw_read_lock         vmlinux
 1605.00  2.5% rt_set_nexthop         vmlinux

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 98beda47bc99..ea895004caf3 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2358,9 +2358,8 @@ static int __mkroute_output(struct rtable **result,
 	struct rtable *rth;
 	struct in_device *in_dev;
 	u32 tos = RT_FL_TOS(oldflp);
-	int err = 0;
 
-	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
 		return -EINVAL;
 
 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
@@ -2373,11 +2372,12 @@ static int __mkroute_output(struct rtable **result,
 	if (dev_out->flags & IFF_LOOPBACK)
 		flags |= RTCF_LOCAL;
 
-	/* get work reference to inet device */
-	in_dev = in_dev_get(dev_out);
-	if (!in_dev)
+	rcu_read_lock();
+	in_dev = __in_dev_get_rcu(dev_out);
+	if (!in_dev) {
+		rcu_read_unlock();
 		return -EINVAL;
-
+	}
 	if (res->type == RTN_BROADCAST) {
 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
 		if (res->fi) {
@@ -2385,13 +2385,13 @@ static int __mkroute_output(struct rtable **result,
 			res->fi = NULL;
 		}
 	} else if (res->type == RTN_MULTICAST) {
-		flags |= RTCF_MULTICAST|RTCF_LOCAL;
+		flags |= RTCF_MULTICAST | RTCF_LOCAL;
 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
 				 oldflp->proto))
 			flags &= ~RTCF_LOCAL;
 		/* If multicast route do not exist use
-		   default one, but do not gateway in this case.
-		   Yes, it is hack.
+		 * default one, but do not gateway in this case.
+		 * Yes, it is hack.
 		 */
 		if (res->fi && res->prefixlen < 4) {
 			fib_info_put(res->fi);
@@ -2402,9 +2402,12 @@ static int __mkroute_output(struct rtable **result,
 
 	rth = dst_alloc(&ipv4_dst_ops);
 	if (!rth) {
-		err = -ENOBUFS;
-		goto cleanup;
+		rcu_read_unlock();
+		return -ENOBUFS;
 	}
+	in_dev_hold(in_dev);
+	rcu_read_unlock();
+	rth->idev = in_dev;
 
 	atomic_set(&rth->dst.__refcnt, 1);
 	rth->dst.flags= DST_HOST;
@@ -2425,7 +2428,6 @@ static int __mkroute_output(struct rtable **result,
 	   cache entry */
 	rth->dst.dev	= dev_out;
 	dev_hold(dev_out);
-	rth->idev	= in_dev_get(dev_out);
 	rth->rt_gateway = fl->fl4_dst;
 	rth->rt_spec_dst= fl->fl4_src;
 
@@ -2460,13 +2462,8 @@ static int __mkroute_output(struct rtable **result,
 	rt_set_nexthop(rth, res, 0);
 
 	rth->rt_flags = flags;
-
 	*result = rth;
- cleanup:
-	/* release work reference to inet device */
-	in_dev_put(in_dev);
-
-	return err;
+	return 0;
 }
 
 static int ip_mkroute_output(struct rtable **rp,
-- 
cgit v1.2.3-58-ga151


From 82efee1499a27c06f5afb11b07db384fdb3f7004 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 30 Sep 2010 03:31:56 +0000
Subject: ipv4: introduce __ip_dev_find()

ip_dev_find(net, addr) finds a device given an IPv4 source address and
takes a reference on it.

Introduce __ip_dev_find(), taking a third argument, to optionally take
the device reference. Callers not asking the reference to be taken
should be in an rcu_read_lock() protected section.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/inetdevice.h |  7 ++++++-
 net/ipv4/fib_frontend.c    | 32 +++++++++++++++++++-------------
 2 files changed, 25 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 1ec09bb4a3ab..ccd5b07d678d 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -159,7 +159,12 @@ struct in_ifaddr {
 extern int register_inetaddr_notifier(struct notifier_block *nb);
 extern int unregister_inetaddr_notifier(struct notifier_block *nb);
 
-extern struct net_device *ip_dev_find(struct net *net, __be32 addr);
+extern struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref);
+static inline struct net_device *ip_dev_find(struct net *net, __be32 addr)
+{
+	return __ip_dev_find(net, addr, true);
+}
+
 extern int		inet_addr_onlink(struct in_device *in_dev, __be32 a, __be32 b);
 extern int		devinet_ioctl(struct net *net, unsigned int cmd, void __user *);
 extern void		devinet_init(void);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 981f3c59b334..4a69a957872b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -147,34 +147,40 @@ static void fib_flush(struct net *net)
 		rt_cache_flush(net, -1);
 }
 
-/*
- *	Find the first device with a given source address.
+/**
+ * __ip_dev_find - find the first device with a given source address.
+ * @net: the net namespace
+ * @addr: the source address
+ * @devref: if true, take a reference on the found device
+ *
+ * If a caller uses devref=false, it should be protected by RCU
  */
-
-struct net_device * ip_dev_find(struct net *net, __be32 addr)
+struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
-	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } },
-			    .flags = FLOWI_FLAG_MATCH_ANY_IIF };
-	struct fib_result res;
+	struct flowi fl = {
+		.nl_u = {
+			.ip4_u = {
+				.daddr = addr
+			}
+		},
+		.flags = FLOWI_FLAG_MATCH_ANY_IIF
+	};
+	struct fib_result res = { 0 };
 	struct net_device *dev = NULL;
 
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	res.r = NULL;
-#endif
-
 	if (fib_lookup(net, &fl, &res))
 		return NULL;
 	if (res.type != RTN_LOCAL)
 		goto out;
 	dev = FIB_RES_DEV(res);
 
-	if (dev)
+	if (dev && devref)
 		dev_hold(dev);
 out:
 	fib_res_put(&res);
 	return dev;
 }
-EXPORT_SYMBOL(ip_dev_find);
+EXPORT_SYMBOL(__ip_dev_find);
 
 /*
  * Find address type as if only "dev" was present in the system. If
-- 
cgit v1.2.3-58-ga151


From 0197aa38df2ce550c0bfc96194b07ce6b68af814 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 30 Sep 2010 03:33:58 +0000
Subject: ipv4: rcu conversion in ip_route_output_slow

ip_route_output_slow() is enclosed in an rcu_read_lock() protected
section, so that no references are taken/released on device, thanks to
__ip_dev_find() & dev_get_by_index_rcu()

Tested with ip route cache disabled, and a stress test :

Before patch:

elapsed time :

real	1m38.347s
user	0m11.909s
sys	23m51.501s

Profile:

13788.00 22.7% ip_route_output_slow [kernel]
 7875.00 13.0% dst_destroy          [kernel]
 3925.00  6.5% fib_semantic_match   [kernel]
 3144.00  5.2% fib_rules_lookup     [kernel]
 3061.00  5.0% dst_alloc            [kernel]
 2276.00  3.7% rt_set_nexthop       [kernel]
 1762.00  2.9% fib_table_lookup     [kernel]
 1538.00  2.5% _raw_read_lock       [kernel]
 1358.00  2.2% ip_output            [kernel]

After patch:

real	1m28.808s
user	0m13.245s
sys	20m37.293s

10950.00 17.2% ip_route_output_slow [kernel]
10726.00 16.9% dst_destroy          [kernel]
 5170.00  8.1% fib_semantic_match   [kernel]
 3937.00  6.2% dst_alloc            [kernel]
 3635.00  5.7% rt_set_nexthop       [kernel]
 2900.00  4.6% fib_rules_lookup     [kernel]
 2240.00  3.5% fib_table_lookup     [kernel]
 1427.00  2.2% _raw_read_lock       [kernel]
 1157.00  1.8% kmem_cache_alloc     [kernel]

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 38 ++++++++++++--------------------------
 1 file changed, 12 insertions(+), 26 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ea895004caf3..a61acea975f1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2487,6 +2487,7 @@ static int ip_mkroute_output(struct rtable **rp,
 
 /*
  * Major route resolver routine.
+ * called with rcu_read_lock();
  */
 
 static int ip_route_output_slow(struct net *net, struct rtable **rp,
@@ -2505,7 +2506,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			    .iif = net->loopback_dev->ifindex,
 			    .oif = oldflp->oif };
 	struct fib_result res;
-	unsigned flags = 0;
+	unsigned int flags = 0;
 	struct net_device *dev_out = NULL;
 	int free_res = 0;
 	int err;
@@ -2535,7 +2536,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
 		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-			dev_out = ip_dev_find(net, oldflp->fl4_src);
+			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
 			if (dev_out == NULL)
 				goto out;
 
@@ -2560,26 +2561,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 
 		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-			dev_out = ip_dev_find(net, oldflp->fl4_src);
-			if (dev_out == NULL)
+			if (!__ip_dev_find(net, oldflp->fl4_src, false))
 				goto out;
-			dev_put(dev_out);
-			dev_out = NULL;
 		}
 	}
 
 
 	if (oldflp->oif) {
-		dev_out = dev_get_by_index(net, oldflp->oif);
+		dev_out = dev_get_by_index_rcu(net, oldflp->oif);
 		err = -ENODEV;
 		if (dev_out == NULL)
 			goto out;
 
 		/* RACE: Check return value of inet_select_addr instead. */
-		if (rcu_dereference_raw(dev_out->ip_ptr) == NULL) {
-			dev_put(dev_out);
+		if (rcu_dereference(dev_out->ip_ptr) == NULL)
 			goto out;	/* Wrong error code */
-		}
 
 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
 		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
@@ -2602,10 +2598,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 		fl.fl4_dst = fl.fl4_src;
 		if (!fl.fl4_dst)
 			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
-		if (dev_out)
-			dev_put(dev_out);
 		dev_out = net->loopback_dev;
-		dev_hold(dev_out);
 		fl.oif = net->loopback_dev->ifindex;
 		res.type = RTN_LOCAL;
 		flags |= RTCF_LOCAL;
@@ -2639,8 +2632,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			res.type = RTN_UNICAST;
 			goto make_route;
 		}
-		if (dev_out)
-			dev_put(dev_out);
 		err = -ENETUNREACH;
 		goto out;
 	}
@@ -2649,10 +2640,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 	if (res.type == RTN_LOCAL) {
 		if (!fl.fl4_src)
 			fl.fl4_src = fl.fl4_dst;
-		if (dev_out)
-			dev_put(dev_out);
 		dev_out = net->loopback_dev;
-		dev_hold(dev_out);
 		fl.oif = dev_out->ifindex;
 		if (res.fi)
 			fib_info_put(res.fi);
@@ -2672,28 +2660,23 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 	if (!fl.fl4_src)
 		fl.fl4_src = FIB_RES_PREFSRC(res);
 
-	if (dev_out)
-		dev_put(dev_out);
 	dev_out = FIB_RES_DEV(res);
-	dev_hold(dev_out);
 	fl.oif = dev_out->ifindex;
 
 
 make_route:
 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
 
-
 	if (free_res)
 		fib_res_put(&res);
-	if (dev_out)
-		dev_put(dev_out);
 out:	return err;
 }
 
 int __ip_route_output_key(struct net *net, struct rtable **rp,
 			  const struct flowi *flp)
 {
-	unsigned hash;
+	unsigned int hash;
+	int res;
 	struct rtable *rth;
 
 	if (!rt_caching(net))
@@ -2724,7 +2707,10 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
 	rcu_read_unlock_bh();
 
 slow_output:
-	return ip_route_output_slow(net, rp, flp);
+	rcu_read_lock();
+	res = ip_route_output_slow(net, rp, flp);
+	rcu_read_unlock();
+	return res;
 }
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
-- 
cgit v1.2.3-58-ga151


From ddcb4541e917780ef7ccc68dd8df18ca0bc055d0 Mon Sep 17 00:00:00 2001
From: stephen hemminger <shemminger@vyatta.com>
Date: Fri, 1 Oct 2010 13:58:00 +0000
Subject: gre: protocol table can be static

This table is only used in gre.c

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/gre.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c
index b546736da2e1..caea6885fdbd 100644
--- a/net/ipv4/gre.c
+++ b/net/ipv4/gre.c
@@ -22,7 +22,7 @@
 #include <net/gre.h>
 
 
-const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
+static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly;
 static DEFINE_SPINLOCK(gre_proto_lock);
 
 int gre_add_protocol(const struct gre_protocol *proto, u8 version)
-- 
cgit v1.2.3-58-ga151


From 55747a0a73ea74a25fcebb0731e8d3f13fe8c09d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 1 Oct 2010 16:14:55 +0000
Subject: ipmr: __pim_rcv() is called under rcu_read_lock

No need to get a reference on reg_dev and release it: we are in an
rcu_read_lock() protected section.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 10b24c02deb0..1a92ebd85196 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1805,6 +1805,7 @@ dont_forward:
 }
 
 #ifdef CONFIG_IP_PIMSM
+/* called with rcu_read_lock() */
 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
 		     unsigned int pimlen)
 {
@@ -1826,26 +1827,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
 	read_lock(&mrt_lock);
 	if (mrt->mroute_reg_vif_num >= 0)
 		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
-	if (reg_dev)
-		dev_hold(reg_dev);
 	read_unlock(&mrt_lock);
 
 	if (reg_dev == NULL)
 		return 1;
 
 	skb->mac_header = skb->network_header;
-	skb_pull(skb, (u8*)encap - skb->data);
+	skb_pull(skb, (u8 *)encap - skb->data);
 	skb_reset_network_header(skb);
 	skb->protocol = htons(ETH_P_IP);
-	skb->ip_summed = 0;
+	skb->ip_summed = CHECKSUM_NONE;
 	skb->pkt_type = PACKET_HOST;
 
 	skb_tunnel_rx(skb, reg_dev);
 
 	netif_rx(skb);
-	dev_put(reg_dev);
 
-	return 0;
+	return NET_RX_SUCCESS;
 }
 #endif
 
-- 
cgit v1.2.3-58-ga151


From 4c9687098f245601e9d94178715ee03afbcc6f80 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 1 Oct 2010 16:15:01 +0000
Subject: ipmr: RCU conversion of mroute_sk

Use RCU and RTNL to protect (struct mr_table)->mroute_sk

Readers use RCU, writers use RTNL.

ip_ra_control() already uses an RCU grace period before
ip_ra_destroy_rcu(), so we don't need synchronize_rcu() in
mrtsock_destruct().

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 91 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 49 insertions(+), 42 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 1a92ebd85196..e2db2ea616ff 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -75,7 +75,7 @@ struct mr_table {
 	struct net		*net;
 #endif
 	u32			id;
-	struct sock		*mroute_sk;
+	struct sock __rcu	*mroute_sk;
 	struct timer_list	ipmr_expire_timer;
 	struct list_head	mfc_unres_queue;
 	struct list_head	mfc_cache_array[MFC_LINES];
@@ -867,6 +867,7 @@ static int ipmr_cache_report(struct mr_table *mrt,
 	const int ihl = ip_hdrlen(pkt);
 	struct igmphdr *igmp;
 	struct igmpmsg *msg;
+	struct sock *mroute_sk;
 	int ret;
 
 #ifdef CONFIG_IP_PIMSM
@@ -925,7 +926,10 @@ static int ipmr_cache_report(struct mr_table *mrt,
 	skb->transport_header = skb->network_header;
 	}
 
-	if (mrt->mroute_sk == NULL) {
+	rcu_read_lock();
+	mroute_sk = rcu_dereference(mrt->mroute_sk);
+	if (mroute_sk == NULL) {
+		rcu_read_unlock();
 		kfree_skb(skb);
 		return -EINVAL;
 	}
@@ -933,7 +937,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
 	/*
 	 *	Deliver to mrouted
 	 */
-	ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
+	ret = sock_queue_rcv_skb(mroute_sk, skb);
+	rcu_read_unlock();
 	if (ret < 0) {
 		if (net_ratelimit())
 			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
@@ -1164,6 +1169,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
 	}
 }
 
+/* called from ip_ra_control(), before an RCU grace period,
+ * we dont need to call synchronize_rcu() here
+ */
 static void mrtsock_destruct(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
@@ -1171,13 +1179,9 @@ static void mrtsock_destruct(struct sock *sk)
 
 	rtnl_lock();
 	ipmr_for_each_table(mrt, net) {
-		if (sk == mrt->mroute_sk) {
+		if (sk == rtnl_dereference(mrt->mroute_sk)) {
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
-
-			write_lock_bh(&mrt_lock);
-			mrt->mroute_sk = NULL;
-			write_unlock_bh(&mrt_lock);
-
+			rcu_assign_pointer(mrt->mroute_sk, NULL);
 			mroute_clean_tables(mrt);
 		}
 	}
@@ -1204,7 +1208,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		return -ENOENT;
 
 	if (optname != MRT_INIT) {
-		if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
+		if (sk != rcu_dereference_raw(mrt->mroute_sk) &&
+		    !capable(CAP_NET_ADMIN))
 			return -EACCES;
 	}
 
@@ -1217,23 +1222,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -ENOPROTOOPT;
 
 		rtnl_lock();
-		if (mrt->mroute_sk) {
+		if (rtnl_dereference(mrt->mroute_sk)) {
 			rtnl_unlock();
 			return -EADDRINUSE;
 		}
 
 		ret = ip_ra_control(sk, 1, mrtsock_destruct);
 		if (ret == 0) {
-			write_lock_bh(&mrt_lock);
-			mrt->mroute_sk = sk;
-			write_unlock_bh(&mrt_lock);
-
+			rcu_assign_pointer(mrt->mroute_sk, sk);
 			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
 		}
 		rtnl_unlock();
 		return ret;
 	case MRT_DONE:
-		if (sk != mrt->mroute_sk)
+		if (sk != rcu_dereference_raw(mrt->mroute_sk))
 			return -EACCES;
 		return ip_ra_control(sk, 0, NULL);
 	case MRT_ADD_VIF:
@@ -1246,7 +1248,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -ENFILE;
 		rtnl_lock();
 		if (optname == MRT_ADD_VIF) {
-			ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
+			ret = vif_add(net, mrt, &vif,
+				      sk == rtnl_dereference(mrt->mroute_sk));
 		} else {
 			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
 		}
@@ -1267,7 +1270,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 		if (optname == MRT_DEL_MFC)
 			ret = ipmr_mfc_delete(mrt, &mfc);
 		else
-			ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
+			ret = ipmr_mfc_add(net, mrt, &mfc,
+					   sk == rtnl_dereference(mrt->mroute_sk));
 		rtnl_unlock();
 		return ret;
 		/*
@@ -1309,14 +1313,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 			return -EINVAL;
 		if (get_user(v, (u32 __user *)optval))
 			return -EFAULT;
-		if (sk == mrt->mroute_sk)
-			return -EBUSY;
 
 		rtnl_lock();
 		ret = 0;
-		if (!ipmr_new_table(net, v))
-			ret = -ENOMEM;
-		raw_sk(sk)->ipmr_table = v;
+		if (sk == rtnl_dereference(mrt->mroute_sk)) {
+			ret = -EBUSY;
+		} else {
+			if (!ipmr_new_table(net, v))
+				ret = -ENOMEM;
+			raw_sk(sk)->ipmr_table = v;
+		}
 		rtnl_unlock();
 		return ret;
 	}
@@ -1713,6 +1719,7 @@ dont_forward:
 
 /*
  *	Multicast packets for forwarding arrive here
+ *	Called with rcu_read_lock();
  */
 
 int ip_mr_input(struct sk_buff *skb)
@@ -1726,7 +1733,7 @@ int ip_mr_input(struct sk_buff *skb)
 	/* Packet is looped back after forward, it should not be
 	   forwarded second time, but still can be delivered locally.
 	 */
-	if (IPCB(skb)->flags&IPSKB_FORWARDED)
+	if (IPCB(skb)->flags & IPSKB_FORWARDED)
 		goto dont_forward;
 
 	err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
@@ -1736,24 +1743,24 @@ int ip_mr_input(struct sk_buff *skb)
 	}
 
 	if (!local) {
-		    if (IPCB(skb)->opt.router_alert) {
-			    if (ip_call_ra_chain(skb))
-				    return 0;
-		    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
-			    /* IGMPv1 (and broken IGMPv2 implementations sort of
-			       Cisco IOS <= 11.2(8)) do not put router alert
-			       option to IGMP packets destined to routable
-			       groups. It is very bad, because it means
-			       that we can forward NO IGMP messages.
-			     */
-			    read_lock(&mrt_lock);
-			    if (mrt->mroute_sk) {
-				    nf_reset(skb);
-				    raw_rcv(mrt->mroute_sk, skb);
-				    read_unlock(&mrt_lock);
-				    return 0;
-			    }
-			    read_unlock(&mrt_lock);
+		if (IPCB(skb)->opt.router_alert) {
+			if (ip_call_ra_chain(skb))
+				return 0;
+		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
+			/* IGMPv1 (and broken IGMPv2 implementations sort of
+			 * Cisco IOS <= 11.2(8)) do not put router alert
+			 * option to IGMP packets destined to routable
+			 * groups. It is very bad, because it means
+			 * that we can forward NO IGMP messages.
+			 */
+			struct sock *mroute_sk;
+
+			mroute_sk = rcu_dereference(mrt->mroute_sk);
+			if (mroute_sk) {
+				nf_reset(skb);
+				raw_rcv(mroute_sk, skb);
+				return 0;
+			}
 		    }
 	}
 
-- 
cgit v1.2.3-58-ga151


From a8c9486b816f74d4645144db9e8fa2f711c1fc4b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 1 Oct 2010 16:15:08 +0000
Subject: ipmr: RCU protection for mfc_cache_array

Use RCU & RTNL protection for mfc_cache_array[]

ipmr_cache_find() is called under rcu_read_lock();

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/mroute.h |  1 +
 net/ipv4/ipmr.c        | 87 +++++++++++++++++++++++++++-----------------------
 2 files changed, 48 insertions(+), 40 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/mroute.h b/include/linux/mroute.h
index fa04b246c9ae..0fa7a3a874c8 100644
--- a/include/linux/mroute.h
+++ b/include/linux/mroute.h
@@ -213,6 +213,7 @@ struct mfc_cache {
 			unsigned char ttls[MAXVIFS];	/* TTL thresholds		*/
 		} res;
 	} mfc_un;
+	struct rcu_head	rcu;
 };
 
 #define MFC_STATIC		1
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e2db2ea616ff..cbb6dabe024f 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -577,11 +577,18 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 	return 0;
 }
 
-static inline void ipmr_cache_free(struct mfc_cache *c)
+static void ipmr_cache_free_rcu(struct rcu_head *head)
 {
+	struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);
+
 	kmem_cache_free(mrt_cachep, c);
 }
 
+static inline void ipmr_cache_free(struct mfc_cache *c)
+{
+	call_rcu(&c->rcu, ipmr_cache_free_rcu);
+}
+
 /* Destroy an unresolved cache entry, killing queued skbs
    and reporting error to netlink readers.
  */
@@ -781,6 +788,7 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	return 0;
 }
 
+/* called with rcu_read_lock() */
 static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 					 __be32 origin,
 					 __be32 mcastgrp)
@@ -788,7 +796,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 	int line = MFC_HASH(mcastgrp, origin);
 	struct mfc_cache *c;
 
-	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
+	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
 		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
 			return c;
 	}
@@ -801,19 +809,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
 static struct mfc_cache *ipmr_cache_alloc(void)
 {
 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
-	if (c == NULL)
-		return NULL;
-	c->mfc_un.res.minvif = MAXVIFS;
+
+	if (c)
+		c->mfc_un.res.minvif = MAXVIFS;
 	return c;
 }
 
 static struct mfc_cache *ipmr_cache_alloc_unres(void)
 {
 	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
-	if (c == NULL)
-		return NULL;
-	skb_queue_head_init(&c->mfc_un.unres.unresolved);
-	c->mfc_un.unres.expires = jiffies + 10*HZ;
+
+	if (c) {
+		skb_queue_head_init(&c->mfc_un.unres.unresolved);
+		c->mfc_un.unres.expires = jiffies + 10*HZ;
+	}
 	return c;
 }
 
@@ -1040,9 +1049,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
 	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
 		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
 		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
-			write_lock_bh(&mrt_lock);
-			list_del(&c->list);
-			write_unlock_bh(&mrt_lock);
+			list_del_rcu(&c->list);
 
 			ipmr_cache_free(c);
 			return 0;
@@ -1095,9 +1102,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
 	if (!mrtsock)
 		c->mfc_flags |= MFC_STATIC;
 
-	write_lock_bh(&mrt_lock);
-	list_add(&c->list, &mrt->mfc_cache_array[line]);
-	write_unlock_bh(&mrt_lock);
+	list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
 
 	/*
 	 *	Check to see if we resolved a queued list. If so we
@@ -1149,12 +1154,9 @@ static void mroute_clean_tables(struct mr_table *mrt)
 	 */
 	for (i = 0; i < MFC_LINES; i++) {
 		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
-			if (c->mfc_flags&MFC_STATIC)
+			if (c->mfc_flags & MFC_STATIC)
 				continue;
-			write_lock_bh(&mrt_lock);
-			list_del(&c->list);
-			write_unlock_bh(&mrt_lock);
-
+			list_del_rcu(&c->list);
 			ipmr_cache_free(c);
 		}
 	}
@@ -1422,19 +1424,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
 		if (copy_from_user(&sr, arg, sizeof(sr)))
 			return -EFAULT;
 
-		read_lock(&mrt_lock);
+		rcu_read_lock();
 		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
 		if (c) {
 			sr.pktcnt = c->mfc_un.res.pkt;
 			sr.bytecnt = c->mfc_un.res.bytes;
 			sr.wrong_if = c->mfc_un.res.wrong_if;
-			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 
 			if (copy_to_user(arg, &sr, sizeof(sr)))
 				return -EFAULT;
 			return 0;
 		}
-		read_unlock(&mrt_lock);
+		rcu_read_unlock();
 		return -EADDRNOTAVAIL;
 	default:
 		return -ENOIOCTLCMD;
@@ -1764,7 +1766,7 @@ int ip_mr_input(struct sk_buff *skb)
 		    }
 	}
 
-	read_lock(&mrt_lock);
+	/* already under rcu_read_lock() */
 	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
 
 	/*
@@ -1776,13 +1778,12 @@ int ip_mr_input(struct sk_buff *skb)
 		if (local) {
 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
 			ip_local_deliver(skb);
-			if (skb2 == NULL) {
-				read_unlock(&mrt_lock);
+			if (skb2 == NULL)
 				return -ENOBUFS;
-			}
 			skb = skb2;
 		}
 
+		read_lock(&mrt_lock);
 		vif = ipmr_find_vif(mrt, skb->dev);
 		if (vif >= 0) {
 			int err2 = ipmr_cache_unresolved(mrt, vif, skb);
@@ -1795,8 +1796,8 @@ int ip_mr_input(struct sk_buff *skb)
 		return -ENODEV;
 	}
 
+	read_lock(&mrt_lock);
 	ip_mr_forward(net, mrt, skb, cache, local);
-
 	read_unlock(&mrt_lock);
 
 	if (local)
@@ -1963,7 +1964,7 @@ int ipmr_get_route(struct net *net,
 	if (mrt == NULL)
 		return -ENOENT;
 
-	read_lock(&mrt_lock);
+	rcu_read_lock();
 	cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
 
 	if (cache == NULL) {
@@ -1973,18 +1974,21 @@ int ipmr_get_route(struct net *net,
 		int vif;
 
 		if (nowait) {
-			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 			return -EAGAIN;
 		}
 
 		dev = skb->dev;
+		read_lock(&mrt_lock);
 		if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
 			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 			return -ENODEV;
 		}
 		skb2 = skb_clone(skb, GFP_ATOMIC);
 		if (!skb2) {
 			read_unlock(&mrt_lock);
+			rcu_read_unlock();
 			return -ENOMEM;
 		}
 
@@ -1997,13 +2001,16 @@ int ipmr_get_route(struct net *net,
 		iph->version = 0;
 		err = ipmr_cache_unresolved(mrt, vif, skb2);
 		read_unlock(&mrt_lock);
+		rcu_read_unlock();
 		return err;
 	}
 
-	if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
+	read_lock(&mrt_lock);
+	if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
 		cache->mfc_flags |= MFC_NOTIFY;
 	err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
 	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -2055,14 +2062,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
 	s_h = cb->args[1];
 	s_e = cb->args[2];
 
-	read_lock(&mrt_lock);
+	rcu_read_lock();
 	ipmr_for_each_table(mrt, net) {
 		if (t < s_t)
 			goto next_table;
 		if (t > s_t)
 			s_h = 0;
 		for (h = s_h; h < MFC_LINES; h++) {
-			list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
+			list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
 				if (e < s_e)
 					goto next_entry;
 				if (ipmr_fill_mroute(mrt, skb,
@@ -2080,7 +2087,7 @@ next_table:
 		t++;
 	}
 done:
-	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 
 	cb->args[2] = e;
 	cb->args[1] = h;
@@ -2213,14 +2220,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
 	struct mr_table *mrt = it->mrt;
 	struct mfc_cache *mfc;
 
-	read_lock(&mrt_lock);
+	rcu_read_lock();
 	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
 		it->cache = &mrt->mfc_cache_array[it->ct];
-		list_for_each_entry(mfc, it->cache, list)
+		list_for_each_entry_rcu(mfc, it->cache, list)
 			if (pos-- == 0)
 				return mfc;
 	}
-	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 
 	spin_lock_bh(&mfc_unres_lock);
 	it->cache = &mrt->mfc_unres_queue;
@@ -2279,7 +2286,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	}
 
 	/* exhausted cache_array, show unresolved */
-	read_unlock(&mrt_lock);
+	rcu_read_unlock();
 	it->cache = &mrt->mfc_unres_queue;
 	it->ct = 0;
 
@@ -2302,7 +2309,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
 	if (it->cache == &mrt->mfc_unres_queue)
 		spin_unlock_bh(&mfc_unres_lock);
 	else if (it->cache == &mrt->mfc_cache_array[it->ct])
-		read_unlock(&mrt_lock);
+		rcu_read_unlock();
 }
 
 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
@@ -2426,7 +2433,7 @@ int __init ip_mr_init(void)
 
 	mrt_cachep = kmem_cache_create("ip_mrt_cache",
 				       sizeof(struct mfc_cache),
-				       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+				       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
 				       NULL);
 	if (!mrt_cachep)
 		return -ENOMEM;
-- 
cgit v1.2.3-58-ga151


From a8cb16dd9cb571c45bb479a1e4721ce11220a216 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 1 Oct 2010 16:15:29 +0000
Subject: ipmr: cleanups

Various code style cleanups

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/ipmr.c | 238 +++++++++++++++++++++++++++++---------------------------
 1 file changed, 124 insertions(+), 114 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index cbb6dabe024f..86dd5691af46 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -98,7 +98,7 @@ struct ipmr_result {
 };
 
 /* Big lock, protecting vif table, mrt cache and mroute socket state.
-   Note that the changes are semaphored via rtnl_lock.
+ * Note that the changes are semaphored via rtnl_lock.
  */
 
 static DEFINE_RWLOCK(mrt_lock);
@@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock);
 static DEFINE_SPINLOCK(mfc_unres_lock);
 
 /* We return to original Alan's scheme. Hash table of resolved
-   entries is changed only in process context and protected
-   with weak lock mrt_lock. Queue of unresolved entries is protected
-   with strong spinlock mfc_unres_lock.
-
-   In this case data path is free of exclusive locks at all.
+ * entries is changed only in process context and protected
+ * with weak lock mrt_lock. Queue of unresolved entries is protected
+ * with strong spinlock mfc_unres_lock.
+ *
+ * In this case data path is free of exclusive locks at all.
  */
 
 static struct kmem_cache *mrt_cachep __read_mostly;
@@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
 			set_fs(KERNEL_DS);
 			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
 			set_fs(oldfs);
-		} else
+		} else {
 			err = -EOPNOTSUPP;
-
+		}
 		dev = NULL;
 
 		if (err == 0 &&
@@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
 	dev->iflink = 0;
 
 	rcu_read_lock();
-	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
+	in_dev = __in_dev_get_rcu(dev);
+	if (!in_dev) {
 		rcu_read_unlock();
 		goto failure;
 	}
@@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 		mrt->mroute_reg_vif_num = -1;
 #endif
 
-	if (vifi+1 == mrt->maxvif) {
+	if (vifi + 1 == mrt->maxvif) {
 		int tmp;
-		for (tmp=vifi-1; tmp>=0; tmp--) {
+
+		for (tmp = vifi - 1; tmp >= 0; tmp--) {
 			if (VIF_EXISTS(mrt, tmp))
 				break;
 		}
@@ -565,12 +567,13 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
 
 	dev_set_allmulti(dev, -1);
 
-	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
+	in_dev = __in_dev_get_rtnl(dev);
+	if (in_dev) {
 		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
 		ip_rt_multicast_event(in_dev);
 	}
 
-	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
+	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
 		unregister_netdevice_queue(dev, head);
 
 	dev_put(dev);
@@ -590,7 +593,7 @@ static inline void ipmr_cache_free(struct mfc_cache *c)
 }
 
 /* Destroy an unresolved cache entry, killing queued skbs
-   and reporting error to netlink readers.
+ * and reporting error to netlink readers.
  */
 
 static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
@@ -612,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
 			memset(&e->msg, 0, sizeof(e->msg));
 
 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
-		} else
+		} else {
 			kfree_skb(skb);
+		}
 	}
 
 	ipmr_cache_free(c);
@@ -735,9 +739,9 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 				dev_put(dev);
 				return -EADDRNOTAVAIL;
 			}
-		} else
+		} else {
 			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
-
+		}
 		if (!dev)
 			return -EADDRNOTAVAIL;
 		err = dev_set_allmulti(dev, 1);
@@ -750,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 		return -EINVAL;
 	}
 
-	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
+	in_dev = __in_dev_get_rtnl(dev);
+	if (!in_dev) {
 		dev_put(dev);
 		return -EADDRNOTAVAIL;
 	}
 	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
 	ip_rt_multicast_event(in_dev);
 
-	/*
-	 *	Fill in the VIF structures
-	 */
+	/* Fill in the VIF structures */
+
 	v->rate_limit = vifc->vifc_rate_limit;
 	v->local = vifc->vifc_lcl_addr.s_addr;
 	v->remote = vifc->vifc_rmt_addr.s_addr;
@@ -772,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt,
 	v->pkt_in = 0;
 	v->pkt_out = 0;
 	v->link = dev->ifindex;
-	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
+	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
 		v->link = dev->iflink;
 
 	/* And finish update writing critical data */
 	write_lock_bh(&mrt_lock);
 	v->dev = dev;
 #ifdef CONFIG_IP_PIMSM
-	if (v->flags&VIFF_REGISTER)
+	if (v->flags & VIFF_REGISTER)
 		mrt->mroute_reg_vif_num = vifi;
 #endif
 	if (vifi+1 > mrt->maxvif)
@@ -836,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 	struct sk_buff *skb;
 	struct nlmsgerr *e;
 
-	/*
-	 *	Play the pending entries through our router
-	 */
+	/* Play the pending entries through our router */
 
 	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
 		if (ip_hdr(skb)->version == 0) {
 			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
 
 			if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
-				nlh->nlmsg_len = (skb_tail_pointer(skb) -
-						  (u8 *)nlh);
+				nlh->nlmsg_len = skb_tail_pointer(skb) -
+						 (u8 *)nlh;
 			} else {
 				nlh->nlmsg_type = NLMSG_ERROR;
 				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
@@ -857,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
 			}
 
 			rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
-		} else
+		} else {
 			ip_mr_forward(net, mrt, skb, c, 0);
+		}
 	}
 }
 
@@ -892,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
 #ifdef CONFIG_IP_PIMSM
 	if (assert == IGMPMSG_WHOLEPKT) {
 		/* Ugly, but we have no choice with this interface.
-		   Duplicate old header, fix ihl, length etc.
-		   And all this only to mangle msg->im_msgtype and
-		   to set msg->im_mbz to "mbz" :-)
+		 * Duplicate old header, fix ihl, length etc.
+		 * And all this only to mangle msg->im_msgtype and
+		 * to set msg->im_mbz to "mbz" :-)
 		 */
 		skb_push(skb, sizeof(struct iphdr));
 		skb_reset_network_header(skb);
@@ -911,27 +914,23 @@ static int ipmr_cache_report(struct mr_table *mrt,
 #endif
 	{
 
-	/*
-	 *	Copy the IP header
-	 */
+	/* Copy the IP header */
 
 	skb->network_header = skb->tail;
 	skb_put(skb, ihl);
 	skb_copy_to_linear_data(skb, pkt->data, ihl);
-	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
+	ip_hdr(skb)->protocol = 0;	/* Flag to the kernel this is a route add */
 	msg = (struct igmpmsg *)skb_network_header(skb);
 	msg->im_vif = vifi;
 	skb_dst_set(skb, dst_clone(skb_dst(pkt)));
 
-	/*
-	 *	Add our header
-	 */
+	/* Add our header */
 
-	igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+	igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
 	igmp->type	=
 	msg->im_msgtype = assert;
-	igmp->code 	=	0;
-	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
+	igmp->code	= 0;
+	ip_hdr(skb)->tot_len = htons(skb->len);		/* Fix the length */
 	skb->transport_header = skb->network_header;
 	}
 
@@ -943,9 +942,8 @@ static int ipmr_cache_report(struct mr_table *mrt,
 		return -EINVAL;
 	}
 
-	/*
-	 *	Deliver to mrouted
-	 */
+	/* Deliver to mrouted */
+
 	ret = sock_queue_rcv_skb(mroute_sk, skb);
 	rcu_read_unlock();
 	if (ret < 0) {
@@ -979,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
 	}
 
 	if (!found) {
-		/*
-		 *	Create a new entry if allowable
-		 */
+		/* Create a new entry if allowable */
 
 		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
 		    (c = ipmr_cache_alloc_unres()) == NULL) {
@@ -991,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
 			return -ENOBUFS;
 		}
 
-		/*
-		 *	Fill in the new cache entry
-		 */
+		/* Fill in the new cache entry */
+
 		c->mfc_parent	= -1;
 		c->mfc_origin	= iph->saddr;
 		c->mfc_mcastgrp	= iph->daddr;
 
-		/*
-		 *	Reflect first query at mrouted.
-		 */
+		/* Reflect first query at mrouted. */
+
 		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
 		if (err < 0) {
 			/* If the report failed throw the cache entry
@@ -1020,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
 			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
 	}
 
-	/*
-	 *	See if we can append the packet
-	 */
-	if (c->mfc_un.unres.unresolved.qlen>3) {
+	/* See if we can append the packet */
+
+	if (c->mfc_un.unres.unresolved.qlen > 3) {
 		kfree_skb(skb);
 		err = -ENOBUFS;
 	} else {
@@ -1140,18 +1133,16 @@ static void mroute_clean_tables(struct mr_table *mrt)
 	LIST_HEAD(list);
 	struct mfc_cache *c, *next;
 
-	/*
-	 *	Shut down all active vif entries
-	 */
+	/* Shut down all active vif entries */
+
 	for (i = 0; i < mrt->maxvif; i++) {
-		if (!(mrt->vif_table[i].flags&VIFF_STATIC))
+		if (!(mrt->vif_table[i].flags & VIFF_STATIC))
 			vif_delete(mrt, i, 0, &list);
 	}
 	unregister_netdevice_many(&list);
 
-	/*
-	 *	Wipe the cache
-	 */
+	/* Wipe the cache */
+
 	for (i = 0; i < MFC_LINES; i++) {
 		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
 			if (c->mfc_flags & MFC_STATIC)
@@ -1282,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	case MRT_ASSERT:
 	{
 		int v;
-		if (get_user(v,(int __user *)optval))
+		if (get_user(v, (int __user *)optval))
 			return -EFAULT;
 		mrt->mroute_do_assert = (v) ? 1 : 0;
 		return 0;
@@ -1292,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
 	{
 		int v;
 
-		if (get_user(v,(int __user *)optval))
+		if (get_user(v, (int __user *)optval))
 			return -EFAULT;
 		v = (v) ? 1 : 0;
 
@@ -1355,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
 
 	if (optname != MRT_VERSION &&
 #ifdef CONFIG_IP_PIMSM
-	   optname!=MRT_PIM &&
+	   optname != MRT_PIM &&
 #endif
-	   optname!=MRT_ASSERT)
+	   optname != MRT_ASSERT)
 		return -ENOPROTOOPT;
 
 	if (get_user(olr, optlen))
@@ -1473,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = {
 };
 
 /*
- * 	Encapsulate a packet by attaching a valid IPIP header to it.
+ *	Encapsulate a packet by attaching a valid IPIP header to it.
  *	This avoids tunnel drivers and other mess and gives us the speed so
  *	important for multicast video.
  */
@@ -1488,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 	skb_reset_network_header(skb);
 	iph = ip_hdr(skb);
 
-	iph->version	= 	4;
+	iph->version	=	4;
 	iph->tos	=	old_iph->tos;
 	iph->ttl	=	old_iph->ttl;
 	iph->frag_off	=	0;
@@ -1506,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
 
 static inline int ipmr_forward_finish(struct sk_buff *skb)
 {
-	struct ip_options * opt	= &(IPCB(skb)->opt);
+	struct ip_options *opt = &(IPCB(skb)->opt);
 
 	IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
 
@@ -1543,22 +1534,34 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 	}
 #endif
 
-	if (vif->flags&VIFF_TUNNEL) {
-		struct flowi fl = { .oif = vif->link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = vif->remote,
-						.saddr = vif->local,
-						.tos = RT_TOS(iph->tos) } },
-				    .proto = IPPROTO_IPIP };
+	if (vif->flags & VIFF_TUNNEL) {
+		struct flowi fl = {
+			.oif = vif->link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = vif->remote,
+					.saddr = vif->local,
+					.tos = RT_TOS(iph->tos)
+				}
+			},
+			.proto = IPPROTO_IPIP
+		};
+
 		if (ip_route_output_key(net, &rt, &fl))
 			goto out_free;
 		encap = sizeof(struct iphdr);
 	} else {
-		struct flowi fl = { .oif = vif->link,
-				    .nl_u = { .ip4_u =
-					      { .daddr = iph->daddr,
-						.tos = RT_TOS(iph->tos) } },
-				    .proto = IPPROTO_IPIP };
+		struct flowi fl = {
+			.oif = vif->link,
+			.nl_u = {
+				.ip4_u = {
+					.daddr = iph->daddr,
+					.tos = RT_TOS(iph->tos)
+				}
+			},
+			.proto = IPPROTO_IPIP
+		};
+
 		if (ip_route_output_key(net, &rt, &fl))
 			goto out_free;
 	}
@@ -1567,8 +1570,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 
 	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
 		/* Do not fragment multicasts. Alas, IPv4 does not
-		   allow to send ICMP, so that packets will disappear
-		   to blackhole.
+		 * allow to send ICMP, so that packets will disappear
+		 * to blackhole.
 		 */
 
 		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
@@ -1591,7 +1594,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
 	ip_decrease_ttl(ip_hdr(skb));
 
 	/* FIXME: forward and output firewalls used to be called here.
-	 * What do we do with netfilter? -- RR */
+	 * What do we do with netfilter? -- RR
+	 */
 	if (vif->flags & VIFF_TUNNEL) {
 		ip_encap(skb, vif->local, vif->remote);
 		/* FIXME: extra output firewall step used to be here. --RR */
@@ -1652,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 
 		if (skb_rtable(skb)->fl.iif == 0) {
 			/* It is our own packet, looped back.
-			   Very complicated situation...
-
-			   The best workaround until routing daemons will be
-			   fixed is not to redistribute packet, if it was
-			   send through wrong interface. It means, that
-			   multicast applications WILL NOT work for
-			   (S,G), which have default multicast route pointing
-			   to wrong oif. In any case, it is not a good
-			   idea to use multicasting applications on router.
+			 * Very complicated situation...
+			 *
+			 * The best workaround until routing daemons will be
+			 * fixed is not to redistribute packet, if it was
+			 * send through wrong interface. It means, that
+			 * multicast applications WILL NOT work for
+			 * (S,G), which have default multicast route pointing
+			 * to wrong oif. In any case, it is not a good
+			 * idea to use multicasting applications on router.
 			 */
 			goto dont_forward;
 		}
@@ -1670,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 
 		if (true_vifi >= 0 && mrt->mroute_do_assert &&
 		    /* pimsm uses asserts, when switching from RPT to SPT,
-		       so that we cannot check that packet arrived on an oif.
-		       It is bad, but otherwise we would need to move pretty
-		       large chunk of pimd to kernel. Ough... --ANK
+		     * so that we cannot check that packet arrived on an oif.
+		     * It is bad, but otherwise we would need to move pretty
+		     * large chunk of pimd to kernel. Ough... --ANK
 		     */
 		    (mrt->mroute_do_pim ||
 		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
@@ -1690,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 	/*
 	 *	Forward the frame
 	 */
-	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
+	for (ct = cache->mfc_un.res.maxvif - 1;
+	     ct >= cache->mfc_un.res.minvif; ct--) {
 		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
 			if (psend != -1) {
 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
 				if (skb2)
 					ipmr_queue_xmit(net, mrt, skb2, cache,
 							psend);
@@ -1704,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt,
 	if (psend != -1) {
 		if (local) {
 			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
+
 			if (skb2)
 				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
 		} else {
@@ -1733,7 +1740,7 @@ int ip_mr_input(struct sk_buff *skb)
 	int err;
 
 	/* Packet is looped back after forward, it should not be
-	   forwarded second time, but still can be delivered locally.
+	 * forwarded second time, but still can be delivered locally.
 	 */
 	if (IPCB(skb)->flags & IPSKB_FORWARDED)
 		goto dont_forward;
@@ -1822,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
 
 	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
 	/*
-	   Check that:
-	   a. packet is really destinted to a multicast group
-	   b. packet is not a NULL-REGISTER
-	   c. packet is not truncated
+	 * Check that:
+	 * a. packet is really sent to a multicast group
+	 * b. packet is not a NULL-REGISTER
+	 * c. packet is not truncated
 	 */
 	if (!ipv4_is_multicast(encap->daddr) ||
 	    encap->tot_len == 0 ||
@@ -1860,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
  * Handle IGMP messages of PIMv1
  */
 
-int pim_rcv_v1(struct sk_buff * skb)
+int pim_rcv_v1(struct sk_buff *skb)
 {
 	struct igmphdr *pim;
 	struct net *net = dev_net(skb->dev);
@@ -1887,7 +1894,7 @@ drop:
 #endif
 
 #ifdef CONFIG_IP_PIMSM_V2
-static int pim_rcv(struct sk_buff * skb)
+static int pim_rcv(struct sk_buff *skb)
 {
 	struct pimreghdr *pim;
 	struct net *net = dev_net(skb->dev);
@@ -1897,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb)
 		goto drop;
 
 	pim = (struct pimreghdr *)skb_transport_header(skb);
-	if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
-	    (pim->flags&PIM_NULL_REGISTER) ||
+	if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
+	    (pim->flags & PIM_NULL_REGISTER) ||
 	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
 	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
 		goto drop;
@@ -1971,7 +1978,7 @@ int ipmr_get_route(struct net *net,
 		struct sk_buff *skb2;
 		struct iphdr *iph;
 		struct net_device *dev;
-		int vif;
+		int vif = -1;
 
 		if (nowait) {
 			rcu_read_unlock();
@@ -1980,7 +1987,9 @@ int ipmr_get_route(struct net *net,
 
 		dev = skb->dev;
 		read_lock(&mrt_lock);
-		if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
+		if (dev)
+			vif = ipmr_find_vif(mrt, dev);
+		if (vif < 0) {
 			read_unlock(&mrt_lock);
 			rcu_read_unlock();
 			return -ENODEV;
@@ -2098,7 +2107,8 @@ done:
 
 #ifdef CONFIG_PROC_FS
 /*
- *	The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif
+ *	The /proc interfaces to multicast routing :
+ *	/proc/net/ip_mr_cache & /proc/net/ip_mr_vif
  */
 struct ipmr_vif_iter {
 	struct seq_net_private p;
@@ -2294,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	if (!list_empty(it->cache))
 		return list_first_entry(it->cache, struct mfc_cache, list);
 
- end_of_list:
+end_of_list:
 	spin_unlock_bh(&mfc_unres_lock);
 	it->cache = NULL;
 
@@ -2335,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
 				   mfc->mfc_un.res.bytes,
 				   mfc->mfc_un.res.wrong_if);
 			for (n = mfc->mfc_un.res.minvif;
-			     n < mfc->mfc_un.res.maxvif; n++ ) {
+			     n < mfc->mfc_un.res.maxvif; n++) {
 				if (VIF_EXISTS(mrt, n) &&
 				    mfc->mfc_un.res.ttls[n] < 255)
 					seq_printf(seq,
-- 
cgit v1.2.3-58-ga151


From c7d4426a98a5f6654cd0b4b33d9dab2e77192c18 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Sun, 3 Oct 2010 22:17:54 -0700
Subject: net: introduce DST_NOCACHE flag

While doing stress tests with IP route cache disabled, and multi queue
devices, I noticed a very high contention on one rwlock used in
neighbour code.

When many cpus are trying to send frames (possibly using a high
performance multiqueue device) to the same neighbour, they fight for the
neigh->lock rwlock in order to call neigh_hh_init(), and fight on
hh->hh_refcnt (a pair of atomic_inc/atomic_dec_and_test())

But we don't need to call neigh_hh_init() for a dst that is used only
once. It costs at least four atomic operations, on two contended cache
lines, plus the high contention on the neigh->lock rwlock.

Introduce a new dst flag, DST_NOCACHE, that is set when dst was not
inserted in route cache.

With the stress test bench, sending 160000000 frames on one neighbour,
results are :

Before patch:

real	2m28.406s
user	0m11.781s
sys	36m17.964s


After patch:

real	1m26.532s
user	0m12.185s
sys	20m3.903s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst.h    | 9 +++++----
 net/core/neighbour.c | 4 +++-
 net/ipv4/route.c     | 1 +
 3 files changed, 9 insertions(+), 5 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/dst.h b/include/net/dst.h
index aa53fbc34b2b..a217c838ec0d 100644
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -43,10 +43,11 @@ struct dst_entry {
 	short			error;
 	short			obsolete;
 	int			flags;
-#define DST_HOST		1
-#define DST_NOXFRM		2
-#define DST_NOPOLICY		4
-#define DST_NOHASH		8
+#define DST_HOST		0x0001
+#define DST_NOXFRM		0x0002
+#define DST_NOPOLICY		0x0004
+#define DST_NOHASH		0x0008
+#define DST_NOCACHE		0x0010
 	unsigned long		expires;
 
 	unsigned short		header_len;	/* more space at head required */
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 96b1a749abb4..b142a0d76072 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1210,7 +1210,9 @@ int neigh_resolve_output(struct sk_buff *skb)
 	if (!neigh_event_send(neigh, skb)) {
 		int err;
 		struct net_device *dev = neigh->dev;
-		if (dev->header_ops->cache && !dst->hh) {
+		if (dev->header_ops->cache &&
+		    !dst->hh &&
+		    !(dst->flags & DST_NOCACHE)) {
 			write_lock_bh(&neigh->lock);
 			if (!dst->hh)
 				neigh_hh_init(neigh, dst, dst->ops->protocol);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a61acea975f1..c3cb8bd23638 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1107,6 +1107,7 @@ restart:
 		 * on the route gc list.
 		 */
 
+		rt->dst.flags |= DST_NOCACHE;
 		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 			int err = arp_bind_neighbour(&rt->dst);
 			if (err) {
-- 
cgit v1.2.3-58-ga151


From 0c200d935346fe0ebde9b6dffbb683dddd166fb9 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 4 Oct 2010 20:53:18 +0200
Subject: netfilter: nf_nat: make find/put static

The functions nf_nat_proto_find_get and nf_nat_proto_put are
only used internally in nf_nat_core, so make them static and drop
their exports. This might break some out-of-tree NAT modules.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/netfilter/nf_nat_protocol.h | 3 ---
 net/ipv4/netfilter/nf_nat_core.c        | 6 ++----
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/netfilter/nf_nat_protocol.h b/include/net/netfilter/nf_nat_protocol.h
index df17bac46bf5..93cc90d28e66 100644
--- a/include/net/netfilter/nf_nat_protocol.h
+++ b/include/net/netfilter/nf_nat_protocol.h
@@ -45,9 +45,6 @@ struct nf_nat_protocol {
 extern int nf_nat_protocol_register(const struct nf_nat_protocol *proto);
 extern void nf_nat_protocol_unregister(const struct nf_nat_protocol *proto);
 
-extern const struct nf_nat_protocol *nf_nat_proto_find_get(u_int8_t protocol);
-extern void nf_nat_proto_put(const struct nf_nat_protocol *proto);
-
 /* Built-in protocols. */
 extern const struct nf_nat_protocol nf_nat_protocol_tcp;
 extern const struct nf_nat_protocol nf_nat_protocol_udp;
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 2c084b3a8f0c..e2e00c4da883 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum)
 	return rcu_dereference(nf_nat_protos[protonum]);
 }
 
-const struct nf_nat_protocol *
+static const struct nf_nat_protocol *
 nf_nat_proto_find_get(u_int8_t protonum)
 {
 	const struct nf_nat_protocol *p;
@@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum)
 
 	return p;
 }
-EXPORT_SYMBOL_GPL(nf_nat_proto_find_get);
 
-void
+static void
 nf_nat_proto_put(const struct nf_nat_protocol *p)
 {
 	module_put(p->me);
 }
-EXPORT_SYMBOL_GPL(nf_nat_proto_put);
 
 /* We keep an extra hash for each conntrack, for fast searching. */
 static inline unsigned int
-- 
cgit v1.2.3-58-ga151


From a8defca048fd11eb2d1a17ab61a60a856292dd4e Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 4 Oct 2010 20:56:05 +0200
Subject: netfilter: ipt_LOG: add bufferisation to call printk() once

ipt_LOG & ip6t_LOG use a lot of calls to printk() and use a lock in the
hope that several cpus won't mix their output in syslog.

printk() being very expensive [1], it's better to call it once, on a
prebuilt and complete line. Also, with mixed IPv4 and IPv6 traffic,
separate IPv4/IPv6 locks don't avoid garbage.

I used an allocation of a 1024 bytes structure, sort of seq_printf() but
with a fixed size limit.
Use a static buffer if dynamic allocation failed.

Emit a one-time alert if the buffer size happens to be too short.

[1]: printk() has various features like printk_delay()...

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/ipt_LOG.c  | 145 +++++++++++++++++++-------------------
 net/ipv6/netfilter/ip6t_LOG.c | 157 +++++++++++++++++++++---------------------
 2 files changed, 152 insertions(+), 150 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c
index 915fc17d7ce2..72ffc8fda2e9 100644
--- a/net/ipv4/netfilter/ipt_LOG.c
+++ b/net/ipv4/netfilter/ipt_LOG.c
@@ -24,16 +24,15 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv4/ipt_LOG.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>");
 MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog");
 
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
 /* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+			const struct nf_loginfo *info,
 			const struct sk_buff *skb,
 			unsigned int iphoff)
 {
@@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info,
 
 	ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph);
 	if (ih == NULL) {
-		printk("TRUNCATED");
+		sb_add(m, "TRUNCATED");
 		return;
 	}
 
 	/* Important fields:
 	 * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */
 	/* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */
-	printk("SRC=%pI4 DST=%pI4 ",
+	sb_add(m, "SRC=%pI4 DST=%pI4 ",
 	       &ih->saddr, &ih->daddr);
 
 	/* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */
-	printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
+	sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ",
 	       ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK,
 	       ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id));
 
 	/* Max length: 6 "CE DF MF " */
 	if (ntohs(ih->frag_off) & IP_CE)
-		printk("CE ");
+		sb_add(m, "CE ");
 	if (ntohs(ih->frag_off) & IP_DF)
-		printk("DF ");
+		sb_add(m, "DF ");
 	if (ntohs(ih->frag_off) & IP_MF)
-		printk("MF ");
+		sb_add(m, "MF ");
 
 	/* Max length: 11 "FRAG:65535 " */
 	if (ntohs(ih->frag_off) & IP_OFFSET)
-		printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
+		sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET);
 
 	if ((logflags & IPT_LOG_IPOPT) &&
 	    ih->ihl * 4 > sizeof(struct iphdr)) {
@@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info,
 		op = skb_header_pointer(skb, iphoff+sizeof(_iph),
 					optsize, _opt);
 		if (op == NULL) {
-			printk("TRUNCATED");
+			sb_add(m, "TRUNCATED");
 			return;
 		}
 
 		/* Max length: 127 "OPT (" 15*4*2chars ") " */
-		printk("OPT (");
+		sb_add(m, "OPT (");
 		for (i = 0; i < optsize; i++)
-			printk("%02X", op[i]);
-		printk(") ");
+			sb_add(m, "%02X", op[i]);
+		sb_add(m, ") ");
 	}
 
 	switch (ih->protocol) {
@@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		const struct tcphdr *th;
 
 		/* Max length: 10 "PROTO=TCP " */
-		printk("PROTO=TCP ");
+		sb_add(m, "PROTO=TCP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info,
 		th = skb_header_pointer(skb, iphoff + ih->ihl * 4,
 					sizeof(_tcph), &_tcph);
 		if (th == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u ",
+		sb_add(m, "SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
 		if (logflags & IPT_LOG_TCPSEQ)
-			printk("SEQ=%u ACK=%u ",
+			sb_add(m, "SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
-		printk("WINDOW=%u ", ntohs(th->window));
+		sb_add(m, "WINDOW=%u ", ntohs(th->window));
 		/* Max length: 9 "RES=0x3F " */
-		printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
 		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
 		if (th->cwr)
-			printk("CWR ");
+			sb_add(m, "CWR ");
 		if (th->ece)
-			printk("ECE ");
+			sb_add(m, "ECE ");
 		if (th->urg)
-			printk("URG ");
+			sb_add(m, "URG ");
 		if (th->ack)
-			printk("ACK ");
+			sb_add(m, "ACK ");
 		if (th->psh)
-			printk("PSH ");
+			sb_add(m, "PSH ");
 		if (th->rst)
-			printk("RST ");
+			sb_add(m, "RST ");
 		if (th->syn)
-			printk("SYN ");
+			sb_add(m, "SYN ");
 		if (th->fin)
-			printk("FIN ");
+			sb_add(m, "FIN ");
 		/* Max length: 11 "URGP=65535 " */
-		printk("URGP=%u ", ntohs(th->urg_ptr));
+		sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
 
 		if ((logflags & IPT_LOG_TCPOPT) &&
 		    th->doff * 4 > sizeof(struct tcphdr)) {
@@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info,
 						iphoff+ih->ihl*4+sizeof(_tcph),
 						optsize, _opt);
 			if (op == NULL) {
-				printk("TRUNCATED");
+				sb_add(m, "TRUNCATED");
 				return;
 			}
 
 			/* Max length: 127 "OPT (" 15*4*2chars ") " */
-			printk("OPT (");
+			sb_add(m, "OPT (");
 			for (i = 0; i < optsize; i++)
-				printk("%02X", op[i]);
-			printk(") ");
+				sb_add(m, "%02X", op[i]);
+			sb_add(m, ") ");
 		}
 		break;
 	}
@@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info,
 
 		if (ih->protocol == IPPROTO_UDP)
 			/* Max length: 10 "PROTO=UDP "     */
-			printk("PROTO=UDP " );
+			sb_add(m, "PROTO=UDP " );
 		else	/* Max length: 14 "PROTO=UDPLITE " */
-			printk("PROTO=UDPLITE ");
+			sb_add(m, "PROTO=UDPLITE ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info,
 		uh = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_udph), &_udph);
 		if (uh == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u LEN=%u ",
+		sb_add(m, "SPT=%u DPT=%u LEN=%u ",
 		       ntohs(uh->source), ntohs(uh->dest),
 		       ntohs(uh->len));
 		break;
@@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info,
 			    [ICMP_ADDRESSREPLY] = 12 };
 
 		/* Max length: 11 "PROTO=ICMP " */
-		printk("PROTO=ICMP ");
+		sb_add(m, "PROTO=ICMP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info,
 		ich = skb_header_pointer(skb, iphoff + ih->ihl * 4,
 					 sizeof(_icmph), &_icmph);
 		if (ich == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Max length: 18 "TYPE=255 CODE=255 " */
-		printk("TYPE=%u CODE=%u ", ich->type, ich->code);
+		sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code);
 
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		if (ich->type <= NR_ICMP_TYPES &&
 		    required_len[ich->type] &&
 		    skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
@@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info,
 		case ICMP_ECHOREPLY:
 		case ICMP_ECHO:
 			/* Max length: 19 "ID=65535 SEQ=65535 " */
-			printk("ID=%u SEQ=%u ",
+			sb_add(m, "ID=%u SEQ=%u ",
 			       ntohs(ich->un.echo.id),
 			       ntohs(ich->un.echo.sequence));
 			break;
 
 		case ICMP_PARAMETERPROB:
 			/* Max length: 14 "PARAMETER=255 " */
-			printk("PARAMETER=%u ",
+			sb_add(m, "PARAMETER=%u ",
 			       ntohl(ich->un.gateway) >> 24);
 			break;
 		case ICMP_REDIRECT:
 			/* Max length: 24 "GATEWAY=255.255.255.255 " */
-			printk("GATEWAY=%pI4 ", &ich->un.gateway);
+			sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway);
 			/* Fall through */
 		case ICMP_DEST_UNREACH:
 		case ICMP_SOURCE_QUENCH:
 		case ICMP_TIME_EXCEEDED:
 			/* Max length: 3+maxlen */
 			if (!iphoff) { /* Only recurse once. */
-				printk("[");
-				dump_packet(info, skb,
+				sb_add(m, "[");
+				dump_packet(m, info, skb,
 					    iphoff + ih->ihl*4+sizeof(_icmph));
-				printk("] ");
+				sb_add(m, "] ");
 			}
 
 			/* Max length: 10 "MTU=65535 " */
 			if (ich->type == ICMP_DEST_UNREACH &&
 			    ich->code == ICMP_FRAG_NEEDED)
-				printk("MTU=%u ", ntohs(ich->un.frag.mtu));
+				sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu));
 		}
 		break;
 	}
@@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info,
 			break;
 
 		/* Max length: 9 "PROTO=AH " */
-		printk("PROTO=AH ");
+		sb_add(m, "PROTO=AH ");
 
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		ah = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_ahdr), &_ahdr);
 		if (ah == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Length: 15 "SPI=0xF1234567 " */
-		printk("SPI=0x%x ", ntohl(ah->spi));
+		sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
 		break;
 	}
 	case IPPROTO_ESP: {
@@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		const struct ip_esp_hdr *eh;
 
 		/* Max length: 10 "PROTO=ESP " */
-		printk("PROTO=ESP ");
+		sb_add(m, "PROTO=ESP ");
 
 		if (ntohs(ih->frag_off) & IP_OFFSET)
 			break;
@@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info,
 		eh = skb_header_pointer(skb, iphoff+ih->ihl*4,
 					sizeof(_esph), &_esph);
 		if (eh == NULL) {
-			printk("INCOMPLETE [%u bytes] ",
+			sb_add(m, "INCOMPLETE [%u bytes] ",
 			       skb->len - iphoff - ih->ihl*4);
 			break;
 		}
 
 		/* Length: 15 "SPI=0xF1234567 " */
-		printk("SPI=0x%x ", ntohl(eh->spi));
+		sb_add(m, "SPI=0x%x ", ntohl(eh->spi));
 		break;
 	}
 	/* Max length: 10 "PROTO 255 " */
 	default:
-		printk("PROTO=%u ", ih->protocol);
+		sb_add(m, "PROTO=%u ", ih->protocol);
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
 	if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-			printk("UID=%u GID=%u ",
+			sb_add(m, "UID=%u GID=%u ",
 				skb->sk->sk_socket->file->f_cred->fsuid,
 				skb->sk->sk_socket->file->f_cred->fsgid);
 		read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info,
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */
 	if (!iphoff && skb->mark)
-		printk("MARK=0x%x ", skb->mark);
+		sb_add(m, "MARK=0x%x ", skb->mark);
 
 	/* Proto    Max log string length */
 	/* IP:      40+46+6+11+127 = 230 */
@@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info,
 	/* maxlen = 230+   91  + 230 + 252 = 803 */
 }
 
-static void dump_mac_header(const struct nf_loginfo *info,
+static void dump_mac_header(struct sbuff *m,
+			    const struct nf_loginfo *info,
 			    const struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
@@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
 
 	switch (dev->type) {
 	case ARPHRD_ETHER:
-		printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
 		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
 		       ntohs(eth_hdr(skb)->h_proto));
 		return;
@@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info,
 	}
 
 fallback:
-	printk("MAC=");
+	sb_add(m, "MAC=");
 	if (dev->hard_header_len &&
 	    skb->mac_header != skb->network_header) {
 		const unsigned char *p = skb_mac_header(skb);
 		unsigned int i;
 
-		printk("%02x", *p++);
+		sb_add(m, "%02x", *p++);
 		for (i = 1; i < dev->hard_header_len; i++, p++)
-			printk(":%02x", *p);
+			sb_add(m, ":%02x", *p);
 	}
-	printk(" ");
+	sb_add(m, " ");
 }
 
 static struct nf_loginfo default_loginfo = {
@@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf,
 	       const struct nf_loginfo *loginfo,
 	       const char *prefix)
 {
+	struct sbuff *m = sb_open();
+
 	if (!loginfo)
 		loginfo = &default_loginfo;
 
-	spin_lock_bh(&log_lock);
-	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
 	       prefix,
 	       in ? in->name : "",
 	       out ? out->name : "");
@@ -434,20 +435,20 @@ ipt_log_packet(u_int8_t pf,
 
 		physindev = skb->nf_bridge->physindev;
 		if (physindev && in != physindev)
-			printk("PHYSIN=%s ", physindev->name);
+			sb_add(m, "PHYSIN=%s ", physindev->name);
 		physoutdev = skb->nf_bridge->physoutdev;
 		if (physoutdev && out != physoutdev)
-			printk("PHYSOUT=%s ", physoutdev->name);
+			sb_add(m, "PHYSOUT=%s ", physoutdev->name);
 	}
 #endif
 
 	/* MAC logging for input path only. */
 	if (in && !out)
-		dump_mac_header(loginfo, skb);
+		dump_mac_header(m, loginfo, skb);
+
+	dump_packet(m, loginfo, skb, 0);
 
-	dump_packet(loginfo, skb, 0);
-	printk("\n");
-	spin_unlock_bh(&log_lock);
+	sb_close(m);
 }
 
 static unsigned int
diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c
index 0a07ae7b933f..09c88891a753 100644
--- a/net/ipv6/netfilter/ip6t_LOG.c
+++ b/net/ipv6/netfilter/ip6t_LOG.c
@@ -23,6 +23,7 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_ipv6/ip6_tables.h>
 #include <net/netfilter/nf_log.h>
+#include <net/netfilter/xt_log.h>
 
 MODULE_AUTHOR("Jan Rekorajski <baggins@pld.org.pl>");
 MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog");
@@ -32,11 +33,9 @@ struct in_device;
 #include <net/route.h>
 #include <linux/netfilter_ipv6/ip6t_LOG.h>
 
-/* Use lock to serialize, so printks don't overlap */
-static DEFINE_SPINLOCK(log_lock);
-
 /* One level of recursion won't kill us */
-static void dump_packet(const struct nf_loginfo *info,
+static void dump_packet(struct sbuff *m,
+			const struct nf_loginfo *info,
 			const struct sk_buff *skb, unsigned int ip6hoff,
 			int recurse)
 {
@@ -55,15 +54,15 @@ static void dump_packet(const struct nf_loginfo *info,
 
 	ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h);
 	if (ih == NULL) {
-		printk("TRUNCATED");
+		sb_add(m, "TRUNCATED");
 		return;
 	}
 
 	/* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */
-	printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
+	sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr);
 
 	/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
-	printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
+	sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
 	       ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
 	       (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
 	       ih->hop_limit,
@@ -78,35 +77,35 @@ static void dump_packet(const struct nf_loginfo *info,
 
 		hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr);
 		if (hp == NULL) {
-			printk("TRUNCATED");
+			sb_add(m, "TRUNCATED");
 			return;
 		}
 
 		/* Max length: 48 "OPT (...) " */
 		if (logflags & IP6T_LOG_IPOPT)
-			printk("OPT ( ");
+			sb_add(m, "OPT ( ");
 
 		switch (currenthdr) {
 		case IPPROTO_FRAGMENT: {
 			struct frag_hdr _fhdr;
 			const struct frag_hdr *fh;
 
-			printk("FRAG:");
+			sb_add(m, "FRAG:");
 			fh = skb_header_pointer(skb, ptr, sizeof(_fhdr),
 						&_fhdr);
 			if (fh == NULL) {
-				printk("TRUNCATED ");
+				sb_add(m, "TRUNCATED ");
 				return;
 			}
 
 			/* Max length: 6 "65535 " */
-			printk("%u ", ntohs(fh->frag_off) & 0xFFF8);
+			sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8);
 
 			/* Max length: 11 "INCOMPLETE " */
 			if (fh->frag_off & htons(0x0001))
-				printk("INCOMPLETE ");
+				sb_add(m, "INCOMPLETE ");
 
-			printk("ID:%08x ", ntohl(fh->identification));
+			sb_add(m, "ID:%08x ", ntohl(fh->identification));
 
 			if (ntohs(fh->frag_off) & 0xFFF8)
 				fragment = 1;
@@ -120,7 +119,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		case IPPROTO_HOPOPTS:
 			if (fragment) {
 				if (logflags & IP6T_LOG_IPOPT)
-					printk(")");
+					sb_add(m, ")");
 				return;
 			}
 			hdrlen = ipv6_optlen(hp);
@@ -132,10 +131,10 @@ static void dump_packet(const struct nf_loginfo *info,
 				const struct ip_auth_hdr *ah;
 
 				/* Max length: 3 "AH " */
-				printk("AH ");
+				sb_add(m, "AH ");
 
 				if (fragment) {
-					printk(")");
+					sb_add(m, ")");
 					return;
 				}
 
@@ -146,13 +145,13 @@ static void dump_packet(const struct nf_loginfo *info,
 					 * Max length: 26 "INCOMPLETE [65535
 					 *  bytes] )"
 					 */
-					printk("INCOMPLETE [%u bytes] )",
+					sb_add(m, "INCOMPLETE [%u bytes] )",
 					       skb->len - ptr);
 					return;
 				}
 
 				/* Length: 15 "SPI=0xF1234567 */
-				printk("SPI=0x%x ", ntohl(ah->spi));
+				sb_add(m, "SPI=0x%x ", ntohl(ah->spi));
 
 			}
 
@@ -164,10 +163,10 @@ static void dump_packet(const struct nf_loginfo *info,
 				const struct ip_esp_hdr *eh;
 
 				/* Max length: 4 "ESP " */
-				printk("ESP ");
+				sb_add(m, "ESP ");
 
 				if (fragment) {
-					printk(")");
+					sb_add(m, ")");
 					return;
 				}
 
@@ -177,23 +176,23 @@ static void dump_packet(const struct nf_loginfo *info,
 				eh = skb_header_pointer(skb, ptr, sizeof(_esph),
 							&_esph);
 				if (eh == NULL) {
-					printk("INCOMPLETE [%u bytes] )",
+					sb_add(m, "INCOMPLETE [%u bytes] )",
 					       skb->len - ptr);
 					return;
 				}
 
 				/* Length: 16 "SPI=0xF1234567 )" */
-				printk("SPI=0x%x )", ntohl(eh->spi) );
+				sb_add(m, "SPI=0x%x )", ntohl(eh->spi) );
 
 			}
 			return;
 		default:
 			/* Max length: 20 "Unknown Ext Hdr 255" */
-			printk("Unknown Ext Hdr %u", currenthdr);
+			sb_add(m, "Unknown Ext Hdr %u", currenthdr);
 			return;
 		}
 		if (logflags & IP6T_LOG_IPOPT)
-			printk(") ");
+			sb_add(m, ") ");
 
 		currenthdr = hp->nexthdr;
 		ptr += hdrlen;
@@ -205,7 +204,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		const struct tcphdr *th;
 
 		/* Max length: 10 "PROTO=TCP " */
-		printk("PROTO=TCP ");
+		sb_add(m, "PROTO=TCP ");
 
 		if (fragment)
 			break;
@@ -213,40 +212,40 @@ static void dump_packet(const struct nf_loginfo *info,
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph);
 		if (th == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u ",
+		sb_add(m, "SPT=%u DPT=%u ",
 		       ntohs(th->source), ntohs(th->dest));
 		/* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */
 		if (logflags & IP6T_LOG_TCPSEQ)
-			printk("SEQ=%u ACK=%u ",
+			sb_add(m, "SEQ=%u ACK=%u ",
 			       ntohl(th->seq), ntohl(th->ack_seq));
 		/* Max length: 13 "WINDOW=65535 " */
-		printk("WINDOW=%u ", ntohs(th->window));
+		sb_add(m, "WINDOW=%u ", ntohs(th->window));
 		/* Max length: 9 "RES=0x3C " */
-		printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
+		sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22));
 		/* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */
 		if (th->cwr)
-			printk("CWR ");
+			sb_add(m, "CWR ");
 		if (th->ece)
-			printk("ECE ");
+			sb_add(m, "ECE ");
 		if (th->urg)
-			printk("URG ");
+			sb_add(m, "URG ");
 		if (th->ack)
-			printk("ACK ");
+			sb_add(m, "ACK ");
 		if (th->psh)
-			printk("PSH ");
+			sb_add(m, "PSH ");
 		if (th->rst)
-			printk("RST ");
+			sb_add(m, "RST ");
 		if (th->syn)
-			printk("SYN ");
+			sb_add(m, "SYN ");
 		if (th->fin)
-			printk("FIN ");
+			sb_add(m, "FIN ");
 		/* Max length: 11 "URGP=65535 " */
-		printk("URGP=%u ", ntohs(th->urg_ptr));
+		sb_add(m, "URGP=%u ", ntohs(th->urg_ptr));
 
 		if ((logflags & IP6T_LOG_TCPOPT) &&
 		    th->doff * 4 > sizeof(struct tcphdr)) {
@@ -260,15 +259,15 @@ static void dump_packet(const struct nf_loginfo *info,
 						ptr + sizeof(struct tcphdr),
 						optsize, _opt);
 			if (op == NULL) {
-				printk("OPT (TRUNCATED)");
+				sb_add(m, "OPT (TRUNCATED)");
 				return;
 			}
 
 			/* Max length: 127 "OPT (" 15*4*2chars ") " */
-			printk("OPT (");
+			sb_add(m, "OPT (");
 			for (i =0; i < optsize; i++)
-				printk("%02X", op[i]);
-			printk(") ");
+				sb_add(m, "%02X", op[i]);
+			sb_add(m, ") ");
 		}
 		break;
 	}
@@ -279,9 +278,9 @@ static void dump_packet(const struct nf_loginfo *info,
 
 		if (currenthdr == IPPROTO_UDP)
 			/* Max length: 10 "PROTO=UDP "     */
-			printk("PROTO=UDP " );
+			sb_add(m, "PROTO=UDP " );
 		else	/* Max length: 14 "PROTO=UDPLITE " */
-			printk("PROTO=UDPLITE ");
+			sb_add(m, "PROTO=UDPLITE ");
 
 		if (fragment)
 			break;
@@ -289,12 +288,12 @@ static void dump_packet(const struct nf_loginfo *info,
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph);
 		if (uh == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 20 "SPT=65535 DPT=65535 " */
-		printk("SPT=%u DPT=%u LEN=%u ",
+		sb_add(m, "SPT=%u DPT=%u LEN=%u ",
 		       ntohs(uh->source), ntohs(uh->dest),
 		       ntohs(uh->len));
 		break;
@@ -304,7 +303,7 @@ static void dump_packet(const struct nf_loginfo *info,
 		const struct icmp6hdr *ic;
 
 		/* Max length: 13 "PROTO=ICMPv6 " */
-		printk("PROTO=ICMPv6 ");
+		sb_add(m, "PROTO=ICMPv6 ");
 
 		if (fragment)
 			break;
@@ -312,18 +311,18 @@ static void dump_packet(const struct nf_loginfo *info,
 		/* Max length: 25 "INCOMPLETE [65535 bytes] " */
 		ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h);
 		if (ic == NULL) {
-			printk("INCOMPLETE [%u bytes] ", skb->len - ptr);
+			sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr);
 			return;
 		}
 
 		/* Max length: 18 "TYPE=255 CODE=255 " */
-		printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
+		sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code);
 
 		switch (ic->icmp6_type) {
 		case ICMPV6_ECHO_REQUEST:
 		case ICMPV6_ECHO_REPLY:
 			/* Max length: 19 "ID=65535 SEQ=65535 " */
-			printk("ID=%u SEQ=%u ",
+			sb_add(m, "ID=%u SEQ=%u ",
 				ntohs(ic->icmp6_identifier),
 				ntohs(ic->icmp6_sequence));
 			break;
@@ -334,35 +333,35 @@ static void dump_packet(const struct nf_loginfo *info,
 
 		case ICMPV6_PARAMPROB:
 			/* Max length: 17 "POINTER=ffffffff " */
-			printk("POINTER=%08x ", ntohl(ic->icmp6_pointer));
+			sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer));
 			/* Fall through */
 		case ICMPV6_DEST_UNREACH:
 		case ICMPV6_PKT_TOOBIG:
 		case ICMPV6_TIME_EXCEED:
 			/* Max length: 3+maxlen */
 			if (recurse) {
-				printk("[");
-				dump_packet(info, skb, ptr + sizeof(_icmp6h),
-					    0);
-				printk("] ");
+				sb_add(m, "[");
+				dump_packet(m, info, skb,
+					    ptr + sizeof(_icmp6h), 0);
+				sb_add(m, "] ");
 			}
 
 			/* Max length: 10 "MTU=65535 " */
 			if (ic->icmp6_type == ICMPV6_PKT_TOOBIG)
-				printk("MTU=%u ", ntohl(ic->icmp6_mtu));
+				sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu));
 		}
 		break;
 	}
 	/* Max length: 10 "PROTO=255 " */
 	default:
-		printk("PROTO=%u ", currenthdr);
+		sb_add(m, "PROTO=%u ", currenthdr);
 	}
 
 	/* Max length: 15 "UID=4294967295 " */
 	if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) {
 		read_lock_bh(&skb->sk->sk_callback_lock);
 		if (skb->sk->sk_socket && skb->sk->sk_socket->file)
-			printk("UID=%u GID=%u ",
+			sb_add(m, "UID=%u GID=%u ",
 				skb->sk->sk_socket->file->f_cred->fsuid,
 				skb->sk->sk_socket->file->f_cred->fsgid);
 		read_unlock_bh(&skb->sk->sk_callback_lock);
@@ -370,10 +369,11 @@ static void dump_packet(const struct nf_loginfo *info,
 
 	/* Max length: 16 "MARK=0xFFFFFFFF " */
 	if (!recurse && skb->mark)
-		printk("MARK=0x%x ", skb->mark);
+		sb_add(m, "MARK=0x%x ", skb->mark);
 }
 
-static void dump_mac_header(const struct nf_loginfo *info,
+static void dump_mac_header(struct sbuff *m,
+			    const struct nf_loginfo *info,
 			    const struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
@@ -387,7 +387,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
 
 	switch (dev->type) {
 	case ARPHRD_ETHER:
-		printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
+		sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ",
 		       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
 		       ntohs(eth_hdr(skb)->h_proto));
 		return;
@@ -396,7 +396,7 @@ static void dump_mac_header(const struct nf_loginfo *info,
 	}
 
 fallback:
-	printk("MAC=");
+	sb_add(m, "MAC=");
 	if (dev->hard_header_len &&
 	    skb->mac_header != skb->network_header) {
 		const unsigned char *p = skb_mac_header(skb);
@@ -408,19 +408,19 @@ fallback:
 			p = NULL;
 
 		if (p != NULL) {
-			printk("%02x", *p++);
+			sb_add(m, "%02x", *p++);
 			for (i = 1; i < len; i++)
-				printk(":%02x", p[i]);
+				sb_add(m, ":%02x", p[i]);
 		}
-		printk(" ");
+		sb_add(m, " ");
 
 		if (dev->type == ARPHRD_SIT) {
 			const struct iphdr *iph =
 				(struct iphdr *)skb_mac_header(skb);
-			printk("TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
+			sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr);
 		}
 	} else
-		printk(" ");
+		sb_add(m, " ");
 }
 
 static struct nf_loginfo default_loginfo = {
@@ -442,22 +442,23 @@ ip6t_log_packet(u_int8_t pf,
 		const struct nf_loginfo *loginfo,
 		const char *prefix)
 {
+	struct sbuff *m = sb_open();
+
 	if (!loginfo)
 		loginfo = &default_loginfo;
 
-	spin_lock_bh(&log_lock);
-	printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
-		prefix,
-		in ? in->name : "",
-		out ? out->name : "");
+	sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level,
+	       prefix,
+	       in ? in->name : "",
+	       out ? out->name : "");
 
 	/* MAC logging for input path only. */
 	if (in && !out)
-		dump_mac_header(loginfo, skb);
+		dump_mac_header(m, loginfo, skb);
+
+	dump_packet(m, loginfo, skb, skb_network_offset(skb), 1);
 
-	dump_packet(loginfo, skb, skb_network_offset(skb), 1);
-	printk("\n");
-	spin_unlock_bh(&log_lock);
+	sb_close(m);
 }
 
 static unsigned int
-- 
cgit v1.2.3-58-ga151


From 6a31d2a97c04ffe9b161ec0177a2296366ff9249 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 4 Oct 2010 20:00:18 +0000
Subject: fib: cleanups

Code style cleanups before upcoming functional changes.
C99 initializer for fib_props array.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c  | 121 ++++++++++++----------
 net/ipv4/fib_rules.c     |  10 +-
 net/ipv4/fib_semantics.c | 257 +++++++++++++++++++++++++----------------------
 3 files changed, 206 insertions(+), 182 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 4a69a957872b..b05c23b05a9f 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -225,30 +225,33 @@ EXPORT_SYMBOL(inet_addr_type);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
 				__be32 addr)
 {
-       return __inet_dev_addr_type(net, dev, addr);
+	return __inet_dev_addr_type(net, dev, addr);
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
 /* Given (packet source, input interface) and optional (dst, oif, tos):
-   - (main) check, that source is valid i.e. not broadcast or our local
-     address.
-   - figure out what "logical" interface this packet arrived
-     and calculate "specific destination" address.
-   - check, that packet arrived from expected physical interface.
+ * - (main) check, that source is valid i.e. not broadcast or our local
+ *   address.
+ * - figure out what "logical" interface this packet arrived
+ *   and calculate "specific destination" address.
+ * - check, that packet arrived from expected physical interface.
  */
-
 int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			struct net_device *dev, __be32 *spec_dst,
 			u32 *itag, u32 mark)
 {
 	struct in_device *in_dev;
-	struct flowi fl = { .nl_u = { .ip4_u =
-				      { .daddr = src,
-					.saddr = dst,
-					.tos = tos } },
-			    .mark = mark,
-			    .iif = oif };
-
+	struct flowi fl = {
+		.nl_u = {
+			.ip4_u = {
+				.daddr = src,
+				.saddr = dst,
+				.tos = tos
+			}
+		},
+		.mark = mark,
+		.iif = oif
+	};
 	struct fib_result res;
 	int no_addr, rpf, accept_local;
 	bool dev_match;
@@ -477,9 +480,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt,
 }
 
 /*
- *	Handle IP routing ioctl calls. These are used to manipulate the routing tables
+ * Handle IP routing ioctl calls.
+ * These are used to manipulate the routing tables
  */
-
 int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 {
 	struct fib_config cfg;
@@ -523,7 +526,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 	return -EINVAL;
 }
 
-const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
+const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = {
 	[RTA_DST]		= { .type = NLA_U32 },
 	[RTA_SRC]		= { .type = NLA_U32 },
 	[RTA_IIF]		= { .type = NLA_U32 },
@@ -537,7 +540,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = {
 };
 
 static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
-			    struct nlmsghdr *nlh, struct fib_config *cfg)
+			     struct nlmsghdr *nlh, struct fib_config *cfg)
 {
 	struct nlattr *attr;
 	int err, remaining;
@@ -692,12 +695,11 @@ out:
 }
 
 /* Prepare and feed intra-kernel routing request.
-   Really, it should be netlink message, but :-( netlink
-   can be not configured, so that we feed it directly
-   to fib engine. It is legal, because all events occur
-   only when netlink is already locked.
+ * Really, it should be netlink message, but :-( netlink
+ * can be not configured, so that we feed it directly
+ * to fib engine. It is legal, because all events occur
+ * only when netlink is already locked.
  */
-
 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
 {
 	struct net *net = dev_net(ifa->ifa_dev->dev);
@@ -743,9 +745,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 	struct in_ifaddr *prim = ifa;
 	__be32 mask = ifa->ifa_mask;
 	__be32 addr = ifa->ifa_local;
-	__be32 prefix = ifa->ifa_address&mask;
+	__be32 prefix = ifa->ifa_address & mask;
 
-	if (ifa->ifa_flags&IFA_F_SECONDARY) {
+	if (ifa->ifa_flags & IFA_F_SECONDARY) {
 		prim = inet_ifa_byprefix(in_dev, prefix, mask);
 		if (prim == NULL) {
 			printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n");
@@ -755,22 +757,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
 
 	fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
 
-	if (!(dev->flags&IFF_UP))
+	if (!(dev->flags & IFF_UP))
 		return;
 
 	/* Add broadcast address, if it is explicitly assigned. */
 	if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
 		fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
 
-	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) &&
+	if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
 	    (prefix != addr || ifa->ifa_prefixlen < 32)) {
-		fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
-			  RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim);
+		fib_magic(RTM_NEWROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  prefix, ifa->ifa_prefixlen, prim);
 
 		/* Add network specific broadcasts, when it takes a sense */
 		if (ifa->ifa_prefixlen < 31) {
 			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
-			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim);
+			fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
+				  32, prim);
 		}
 	}
 }
@@ -781,17 +785,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 	struct net_device *dev = in_dev->dev;
 	struct in_ifaddr *ifa1;
 	struct in_ifaddr *prim = ifa;
-	__be32 brd = ifa->ifa_address|~ifa->ifa_mask;
-	__be32 any = ifa->ifa_address&ifa->ifa_mask;
+	__be32 brd = ifa->ifa_address | ~ifa->ifa_mask;
+	__be32 any = ifa->ifa_address & ifa->ifa_mask;
 #define LOCAL_OK	1
 #define BRD_OK		2
 #define BRD0_OK		4
 #define BRD1_OK		8
 	unsigned ok = 0;
 
-	if (!(ifa->ifa_flags&IFA_F_SECONDARY))
-		fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL :
-			  RTN_UNICAST, any, ifa->ifa_prefixlen, prim);
+	if (!(ifa->ifa_flags & IFA_F_SECONDARY))
+		fib_magic(RTM_DELROUTE,
+			  dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
+			  any, ifa->ifa_prefixlen, prim);
 	else {
 		prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask);
 		if (prim == NULL) {
@@ -801,9 +806,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 	}
 
 	/* Deletion is more complicated than add.
-	   We should take care of not to delete too much :-)
-
-	   Scan address list to be sure that addresses are really gone.
+	 * We should take care of not to delete too much :-)
+	 *
+	 * Scan address list to be sure that addresses are really gone.
 	 */
 
 	for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) {
@@ -817,23 +822,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 			ok |= BRD0_OK;
 	}
 
-	if (!(ok&BRD_OK))
+	if (!(ok & BRD_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
-	if (!(ok&BRD1_OK))
+	if (!(ok & BRD1_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
-	if (!(ok&BRD0_OK))
+	if (!(ok & BRD0_OK))
 		fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
-	if (!(ok&LOCAL_OK)) {
+	if (!(ok & LOCAL_OK)) {
 		fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
 
 		/* Check, that this local address finally disappeared. */
 		if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) {
 			/* And the last, but not the least thing.
-			   We must flush stray FIB entries.
-
-			   First of all, we scan fib_info list searching
-			   for stray nexthop entries, then ignite fib_flush.
-			*/
+			 * We must flush stray FIB entries.
+			 *
+			 * First of all, we scan fib_info list searching
+			 * for stray nexthop entries, then ignite fib_flush.
+			 */
 			if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local))
 				fib_flush(dev_net(dev));
 		}
@@ -844,14 +849,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa)
 #undef BRD1_OK
 }
 
-static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb )
+static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 {
 
 	struct fib_result       res;
-	struct flowi            fl = { .mark = frn->fl_mark,
-				       .nl_u = { .ip4_u = { .daddr = frn->fl_addr,
-							    .tos = frn->fl_tos,
-							    .scope = frn->fl_scope } } };
+	struct flowi            fl = {
+		.mark = frn->fl_mark,
+		.nl_u = {
+			.ip4_u = {
+				.daddr = frn->fl_addr,
+				.tos = frn->fl_tos,
+				.scope = frn->fl_scope
+			}
+		}
+	};
 
 #ifdef CONFIG_IP_MULTIPLE_TABLES
 	res.r = NULL;
@@ -899,8 +910,8 @@ static void nl_fib_input(struct sk_buff *skb)
 
 	nl_fib_lookup(frn, tb);
 
-	pid = NETLINK_CB(skb).pid;       /* pid of sending process */
-	NETLINK_CB(skb).pid = 0;         /* from kernel */
+	pid = NETLINK_CB(skb).pid;      /* pid of sending process */
+	NETLINK_CB(skb).pid = 0;        /* from kernel */
 	NETLINK_CB(skb).dst_group = 0;  /* unicast */
 	netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT);
 }
@@ -947,7 +958,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
 		fib_del_ifaddr(ifa);
 		if (ifa->ifa_dev->ifa_list == NULL) {
 			/* Last address was deleted from this interface.
-			   Disable IP.
+			 * Disable IP.
 			 */
 			fib_disable_ip(dev, 1, 0);
 		} else {
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 76daeb5ff564..32300521e32c 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -6,7 +6,7 @@
  *		IPv4 Forwarding Information Base: policy rules.
  *
  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
- * 		Thomas Graf <tgraf@suug.ch>
+ *		Thomas Graf <tgraf@suug.ch>
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -14,7 +14,7 @@
  *		2 of the License, or (at your option) any later version.
  *
  * Fixes:
- * 		Rani Assaf	:	local_rule cannot be deleted
+ *		Rani Assaf	:	local_rule cannot be deleted
  *		Marc Boucher	:	routing by fwmark
  */
 
@@ -32,8 +32,7 @@
 #include <net/ip_fib.h>
 #include <net/fib_rules.h>
 
-struct fib4_rule
-{
+struct fib4_rule {
 	struct fib_rule		common;
 	u8			dst_len;
 	u8			src_len;
@@ -91,7 +90,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 		goto errout;
 	}
 
-	if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL)
+	tbl = fib_get_table(rule->fr_net, rule->table);
+	if (!tbl)
 		goto errout;
 
 	err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 20f09c5b31e8..ba52f399a898 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
 
 static DEFINE_SPINLOCK(fib_multipath_lock);
 
-#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
-for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
-
-#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
-for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh;				\
+	for (nhsel = 0, nh = (fi)->fib_nh;				\
+	     nhsel < (fi)->fib_nhs;					\
+	     nh++, nhsel++)
+
+#define change_nexthops(fi) {						\
+	int nhsel; struct fib_nh *nexthop_nh;				\
+	for (nhsel = 0,	nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	     nhsel < (fi)->fib_nhs;					\
+	     nexthop_nh++, nhsel++)
 
 #else /* CONFIG_IP_ROUTE_MULTIPATH */
 
 /* Hope, that gcc will optimize it to get rid of dummy loop */
 
-#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define for_nexthops(fi) {						\
+	int nhsel; const struct fib_nh *nh = (fi)->fib_nh;		\
+	for (nhsel = 0; nhsel < 1; nhsel++)
 
-#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
-for (nhsel=0; nhsel < 1; nhsel++)
+#define change_nexthops(fi) {						\
+	int nhsel;							\
+	struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);	\
+	for (nhsel = 0; nhsel < 1; nhsel++)
 
 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
 
@@ -86,54 +95,54 @@ static const struct
 	int	error;
 	u8	scope;
 } fib_props[RTN_MAX + 1] = {
-	{
+	[RTN_UNSPEC] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_UNSPEC */
-	{
+	},
+	[RTN_UNICAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_UNICAST */
-	{
+	},
+	[RTN_LOCAL] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_HOST,
-	},	/* RTN_LOCAL */
-	{
+	},
+	[RTN_BROADCAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_LINK,
-	},	/* RTN_BROADCAST */
-	{
+	},
+	[RTN_ANYCAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_LINK,
-	},	/* RTN_ANYCAST */
-	{
+	},
+	[RTN_MULTICAST] = {
 		.error	= 0,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_MULTICAST */
-	{
+	},
+	[RTN_BLACKHOLE] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_BLACKHOLE */
-	{
+	},
+	[RTN_UNREACHABLE] = {
 		.error	= -EHOSTUNREACH,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_UNREACHABLE */
-	{
+	},
+	[RTN_PROHIBIT] = {
 		.error	= -EACCES,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_PROHIBIT */
-	{
+	},
+	[RTN_THROW] = {
 		.error	= -EAGAIN,
 		.scope	= RT_SCOPE_UNIVERSE,
-	},	/* RTN_THROW */
-	{
+	},
+	[RTN_NAT] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_NAT */
-	{
+	},
+	[RTN_XRESOLVE] = {
 		.error	= -EINVAL,
 		.scope	= RT_SCOPE_NOWHERE,
-	},	/* RTN_XRESOLVE */
+	},
 };
 
 
@@ -142,7 +151,7 @@ static const struct
 void free_fib_info(struct fib_info *fi)
 {
 	if (fi->fib_dead == 0) {
-		printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
+		pr_warning("Freeing alive fib_info %p\n", fi);
 		return;
 	}
 	change_nexthops(fi) {
@@ -173,7 +182,7 @@ void fib_release_info(struct fib_info *fi)
 	spin_unlock_bh(&fib_info_lock);
 }
 
-static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
+static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
 {
 	const struct fib_nh *onh = ofi->fib_nh;
 
@@ -187,7 +196,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *
 #ifdef CONFIG_NET_CLS_ROUTE
 		    nh->nh_tclassid != onh->nh_tclassid ||
 #endif
-		    ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
+		    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD))
 			return -1;
 		onh++;
 	} endfor_nexthops(fi);
@@ -238,7 +247,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 		    nfi->fib_priority == fi->fib_priority &&
 		    memcmp(nfi->fib_metrics, fi->fib_metrics,
 			   sizeof(fi->fib_metrics)) == 0 &&
-		    ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
+		    ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 &&
 		    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
 			return fi;
 	}
@@ -247,9 +256,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi)
 }
 
 /* Check, that the gateway is already configured.
-   Used only by redirect accept routine.
+ * Used only by redirect accept routine.
  */
-
 int ip_fib_check_default(__be32 gw, struct net_device *dev)
 {
 	struct hlist_head *head;
@@ -264,7 +272,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev)
 	hlist_for_each_entry(nh, node, head, nh_hash) {
 		if (nh->nh_dev == dev &&
 		    nh->nh_gw == gw &&
-		    !(nh->nh_flags&RTNH_F_DEAD)) {
+		    !(nh->nh_flags & RTNH_F_DEAD)) {
 			spin_unlock(&fib_info_lock);
 			return 0;
 		}
@@ -362,10 +370,10 @@ int fib_detect_death(struct fib_info *fi, int order,
 	}
 	if (state == NUD_REACHABLE)
 		return 0;
-	if ((state&NUD_VALID) && order != dflt)
+	if ((state & NUD_VALID) && order != dflt)
 		return 0;
-	if ((state&NUD_VALID) ||
-	    (*last_idx<0 && order > dflt)) {
+	if ((state & NUD_VALID) ||
+	    (*last_idx < 0 && order > dflt)) {
 		*last_resort = fi;
 		*last_idx = order;
 	}
@@ -476,69 +484,69 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
 
 
 /*
-   Picture
-   -------
-
-   Semantics of nexthop is very messy by historical reasons.
-   We have to take into account, that:
-   a) gateway can be actually local interface address,
-      so that gatewayed route is direct.
-   b) gateway must be on-link address, possibly
-      described not by an ifaddr, but also by a direct route.
-   c) If both gateway and interface are specified, they should not
-      contradict.
-   d) If we use tunnel routes, gateway could be not on-link.
-
-   Attempt to reconcile all of these (alas, self-contradictory) conditions
-   results in pretty ugly and hairy code with obscure logic.
-
-   I chose to generalized it instead, so that the size
-   of code does not increase practically, but it becomes
-   much more general.
-   Every prefix is assigned a "scope" value: "host" is local address,
-   "link" is direct route,
-   [ ... "site" ... "interior" ... ]
-   and "universe" is true gateway route with global meaning.
-
-   Every prefix refers to a set of "nexthop"s (gw, oif),
-   where gw must have narrower scope. This recursion stops
-   when gw has LOCAL scope or if "nexthop" is declared ONLINK,
-   which means that gw is forced to be on link.
-
-   Code is still hairy, but now it is apparently logically
-   consistent and very flexible. F.e. as by-product it allows
-   to co-exists in peace independent exterior and interior
-   routing processes.
-
-   Normally it looks as following.
-
-   {universe prefix}  -> (gw, oif) [scope link]
-			  |
-			  |-> {link prefix} -> (gw, oif) [scope local]
-						|
-						|-> {local prefix} (terminal node)
+ * Picture
+ * -------
+ *
+ * Semantics of nexthop is very messy by historical reasons.
+ * We have to take into account, that:
+ * a) gateway can be actually local interface address,
+ *    so that gatewayed route is direct.
+ * b) gateway must be on-link address, possibly
+ *    described not by an ifaddr, but also by a direct route.
+ * c) If both gateway and interface are specified, they should not
+ *    contradict.
+ * d) If we use tunnel routes, gateway could be not on-link.
+ *
+ * Attempt to reconcile all of these (alas, self-contradictory) conditions
+ * results in pretty ugly and hairy code with obscure logic.
+ *
+ * I chose to generalized it instead, so that the size
+ * of code does not increase practically, but it becomes
+ * much more general.
+ * Every prefix is assigned a "scope" value: "host" is local address,
+ * "link" is direct route,
+ * [ ... "site" ... "interior" ... ]
+ * and "universe" is true gateway route with global meaning.
+ *
+ * Every prefix refers to a set of "nexthop"s (gw, oif),
+ * where gw must have narrower scope. This recursion stops
+ * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
+ * which means that gw is forced to be on link.
+ *
+ * Code is still hairy, but now it is apparently logically
+ * consistent and very flexible. F.e. as by-product it allows
+ * to co-exists in peace independent exterior and interior
+ * routing processes.
+ *
+ * Normally it looks as following.
+ *
+ * {universe prefix}  -> (gw, oif) [scope link]
+ *		  |
+ *		  |-> {link prefix} -> (gw, oif) [scope local]
+ *					|
+ *					|-> {local prefix} (terminal node)
  */
-
 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			struct fib_nh *nh)
 {
 	int err;
 	struct net *net;
+	struct net_device *dev;
 
 	net = cfg->fc_nlinfo.nl_net;
 	if (nh->nh_gw) {
 		struct fib_result res;
 
-		if (nh->nh_flags&RTNH_F_ONLINK) {
-			struct net_device *dev;
+		if (nh->nh_flags & RTNH_F_ONLINK) {
 
 			if (cfg->fc_scope >= RT_SCOPE_LINK)
 				return -EINVAL;
 			if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
 				return -EINVAL;
-			if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
+			dev = __dev_get_by_index(net, nh->nh_oif);
+			if (!dev)
 				return -ENODEV;
-			if (!(dev->flags&IFF_UP))
+			if (!(dev->flags & IFF_UP))
 				return -ENETDOWN;
 			nh->nh_dev = dev;
 			dev_hold(dev);
@@ -559,7 +567,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			/* It is not necessary, but requires a bit of thinking */
 			if (fl.fl4_scope < RT_SCOPE_LINK)
 				fl.fl4_scope = RT_SCOPE_LINK;
-			if ((err = fib_lookup(net, &fl, &res)) != 0)
+			err = fib_lookup(net, &fl, &res);
+			if (err)
 				return err;
 		}
 		err = -EINVAL;
@@ -567,11 +576,12 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			goto out;
 		nh->nh_scope = res.scope;
 		nh->nh_oif = FIB_RES_OIF(res);
-		if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
+		nh->nh_dev = dev = FIB_RES_DEV(res);
+		if (!dev)
 			goto out;
-		dev_hold(nh->nh_dev);
+		dev_hold(dev);
 		err = -ENETDOWN;
-		if (!(nh->nh_dev->flags & IFF_UP))
+		if (!(dev->flags & IFF_UP))
 			goto out;
 		err = 0;
 out:
@@ -580,13 +590,13 @@ out:
 	} else {
 		struct in_device *in_dev;
 
-		if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
+		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
 			return -EINVAL;
 
 		in_dev = inetdev_by_index(net, nh->nh_oif);
 		if (in_dev == NULL)
 			return -ENODEV;
-		if (!(in_dev->dev->flags&IFF_UP)) {
+		if (!(in_dev->dev->flags & IFF_UP)) {
 			in_dev_put(in_dev);
 			return -ENETDOWN;
 		}
@@ -602,7 +612,9 @@ static inline unsigned int fib_laddr_hashfn(__be32 val)
 {
 	unsigned int mask = (fib_hash_size - 1);
 
-	return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
+	return ((__force u32)val ^
+		((__force u32)val >> 7) ^
+		((__force u32)val >> 14)) & mask;
 }
 
 static struct hlist_head *fib_hash_alloc(int bytes)
@@ -611,7 +623,8 @@ static struct hlist_head *fib_hash_alloc(int bytes)
 		return kzalloc(bytes, GFP_KERNEL);
 	else
 		return (struct hlist_head *)
-			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
+			__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+					 get_order(bytes));
 }
 
 static void fib_hash_free(struct hlist_head *hash, int bytes)
@@ -806,7 +819,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 			goto failure;
 	} else {
 		change_nexthops(fi) {
-			if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
+			err = fib_check_nh(cfg, fi, nexthop_nh);
+			if (err != 0)
 				goto failure;
 		} endfor_nexthops(fi)
 	}
@@ -819,7 +833,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
 	}
 
 link_it:
-	if ((ofi = fib_find_info(fi)) != NULL) {
+	ofi = fib_find_info(fi);
+	if (ofi) {
 		fi->fib_dead = 1;
 		free_fib_info(fi);
 		ofi->fib_treeref++;
@@ -895,7 +910,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 			case RTN_ANYCAST:
 			case RTN_MULTICAST:
 				for_nexthops(fi) {
-					if (nh->nh_flags&RTNH_F_DEAD)
+					if (nh->nh_flags & RTNH_F_DEAD)
 						continue;
 					if (!flp->oif || flp->oif == nh->nh_oif)
 						break;
@@ -906,16 +921,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 					goto out_fill_res;
 				}
 #else
-				if (nhsel < 1) {
+				if (nhsel < 1)
 					goto out_fill_res;
-				}
 #endif
 				endfor_nexthops(fi);
 				continue;
 
 			default:
-				printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
-					fa->fa_type);
+				pr_warning("fib_semantic_match bad type %#x\n",
+					   fa->fa_type);
 				return -EINVAL;
 			}
 		}
@@ -1028,10 +1042,10 @@ nla_put_failure:
 }
 
 /*
-   Update FIB if:
-   - local address disappeared -> we must delete all the entries
-     referring to it.
-   - device went down -> we must shutdown all nexthops going via it.
+ * Update FIB if:
+ * - local address disappeared -> we must delete all the entries
+ *   referring to it.
+ * - device went down -> we must shutdown all nexthops going via it.
  */
 int fib_sync_down_addr(struct net *net, __be32 local)
 {
@@ -1078,7 +1092,7 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 		prev_fi = fi;
 		dead = 0;
 		change_nexthops(fi) {
-			if (nexthop_nh->nh_flags&RTNH_F_DEAD)
+			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
 				dead++;
 			else if (nexthop_nh->nh_dev == dev &&
 				 nexthop_nh->nh_scope != scope) {
@@ -1110,10 +1124,9 @@ int fib_sync_down_dev(struct net_device *dev, int force)
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 
 /*
-   Dead device goes up. We wake up dead nexthops.
-   It takes sense only on multipath routes.
+ * Dead device goes up. We wake up dead nexthops.
+ * It takes sense only on multipath routes.
  */
-
 int fib_sync_up(struct net_device *dev)
 {
 	struct fib_info *prev_fi;
@@ -1123,7 +1136,7 @@ int fib_sync_up(struct net_device *dev)
 	struct fib_nh *nh;
 	int ret;
 
-	if (!(dev->flags&IFF_UP))
+	if (!(dev->flags & IFF_UP))
 		return 0;
 
 	prev_fi = NULL;
@@ -1142,12 +1155,12 @@ int fib_sync_up(struct net_device *dev)
 		prev_fi = fi;
 		alive = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
 				alive++;
 				continue;
 			}
 			if (nexthop_nh->nh_dev == NULL ||
-			    !(nexthop_nh->nh_dev->flags&IFF_UP))
+			    !(nexthop_nh->nh_dev->flags & IFF_UP))
 				continue;
 			if (nexthop_nh->nh_dev != dev ||
 			    !__in_dev_get_rtnl(dev))
@@ -1169,10 +1182,9 @@ int fib_sync_up(struct net_device *dev)
 }
 
 /*
-   The algorithm is suboptimal, but it provides really
-   fair weighted route distribution.
+ * The algorithm is suboptimal, but it provides really
+ * fair weighted route distribution.
  */
-
 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 {
 	struct fib_info *fi = res->fi;
@@ -1182,7 +1194,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 	if (fi->fib_power <= 0) {
 		int power = 0;
 		change_nexthops(fi) {
-			if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
+			if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) {
 				power += nexthop_nh->nh_weight;
 				nexthop_nh->nh_power = nexthop_nh->nh_weight;
 			}
@@ -1198,15 +1210,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
 
 
 	/* w should be random number [0..fi->fib_power-1],
-	   it is pretty bad approximation.
+	 * it is pretty bad approximation.
 	 */
 
 	w = jiffies % fi->fib_power;
 
 	change_nexthops(fi) {
-		if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
+		if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
 		    nexthop_nh->nh_power) {
-			if ((w -= nexthop_nh->nh_power) <= 0) {
+			w -= nexthop_nh->nh_power;
+			if (w <= 0) {
 				nexthop_nh->nh_power--;
 				fi->fib_power--;
 				res->nh_sel = nhsel;
-- 
cgit v1.2.3-58-ga151


From caf586e5f23cebb2a68cbaf288d59dbbf2d74052 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 30 Sep 2010 21:06:55 +0000
Subject: net: add a core netdev->rx_dropped counter

In various situations, a device provides a packet to our stack and we
drop it before it enters the protocol stack:
- softnet backlog full (accounted in /proc/net/softnet_stat)
- bad vlan tag (not accounted)
- unknown/unregistered protocol (not accounted)

We can maintain a per-device counter of such dropped frames at core level,
and automatically add it to the device-provided stats (rx_dropped), so
that standard tools can be used (ifconfig, ip link, cat /proc/net/dev).

This is a generalization of commit 8990f468a (net: rx_dropped
accounting), thus reverting it.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/loopback.c    |  8 +-------
 include/linux/netdevice.h |  3 +++
 net/8021q/vlan.h          |  2 --
 net/8021q/vlan_core.c     |  2 ++
 net/8021q/vlan_dev.c      | 11 ++++-------
 net/core/dev.c            | 19 +++++++++++--------
 net/ipv4/ip_gre.c         |  3 +--
 net/ipv4/ipip.c           |  3 +--
 net/ipv6/ip6_tunnel.c     |  3 +--
 net/ipv6/ip6mr.c          |  3 +--
 net/ipv6/sit.c            |  3 +--
 11 files changed, 26 insertions(+), 34 deletions(-)

(limited to 'net/ipv4')

diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c
index 4b0e30b564e5..2d9663a1c54d 100644
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -64,7 +64,6 @@ struct pcpu_lstats {
 	u64			packets;
 	u64			bytes;
 	struct u64_stats_sync	syncp;
-	unsigned long		drops;
 };
 
 /*
@@ -90,8 +89,7 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 		lb_stats->bytes += len;
 		lb_stats->packets++;
 		u64_stats_update_end(&lb_stats->syncp);
-	} else
-		lb_stats->drops++;
+	}
 
 	return NETDEV_TX_OK;
 }
@@ -101,7 +99,6 @@ static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
 {
 	u64 bytes = 0;
 	u64 packets = 0;
-	u64 drops = 0;
 	int i;
 
 	for_each_possible_cpu(i) {
@@ -115,14 +112,11 @@ static struct rtnl_link_stats64 *loopback_get_stats64(struct net_device *dev,
 			tbytes = lb_stats->bytes;
 			tpackets = lb_stats->packets;
 		} while (u64_stats_fetch_retry(&lb_stats->syncp, start));
-		drops   += lb_stats->drops;
 		bytes   += tbytes;
 		packets += tpackets;
 	}
 	stats->rx_packets = packets;
 	stats->tx_packets = packets;
-	stats->rx_dropped = drops;
-	stats->rx_errors  = drops;
 	stats->rx_bytes   = bytes;
 	stats->tx_bytes   = bytes;
 	return stats;
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 92d81edd5808..6abcef67b178 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -884,6 +884,9 @@ struct net_device {
 	int			iflink;
 
 	struct net_device_stats	stats;
+	atomic_long_t		rx_dropped; /* dropped packets by core network
+					     * Do not use this in drivers.
+					     */
 
 #ifdef CONFIG_WIRELESS_EXT
 	/* List of functions to handle Wireless Extensions (instead of ioctl).
diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h
index b26ce343072c..8d9503ad01da 100644
--- a/net/8021q/vlan.h
+++ b/net/8021q/vlan.h
@@ -25,7 +25,6 @@ struct vlan_priority_tci_mapping {
  *	@rx_multicast: number of received multicast packets
  *	@syncp: synchronization point for 64bit counters
  *	@rx_errors: number of errors
- *	@rx_dropped: number of dropped packets
  */
 struct vlan_rx_stats {
 	u64			rx_packets;
@@ -33,7 +32,6 @@ struct vlan_rx_stats {
 	u64			rx_multicast;
 	struct u64_stats_sync	syncp;
 	unsigned long		rx_errors;
-	unsigned long		rx_dropped;
 };
 
 /**
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index b6d55a9304f2..dee727ce0291 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -33,6 +33,7 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
 	return polling ? netif_receive_skb(skb) : netif_rx(skb);
 
 drop:
+	atomic_long_inc(&skb->dev->rx_dropped);
 	dev_kfree_skb_any(skb);
 	return NET_RX_DROP;
 }
@@ -123,6 +124,7 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
 	return dev_gro_receive(napi, skb);
 
 drop:
+	atomic_long_inc(&skb->dev->rx_dropped);
 	return GRO_DROP;
 }
 
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index f6fbcc0f1af9..f54251edd40d 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -225,16 +225,15 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev,
 		}
 	}
 
-	if (unlikely(netif_rx(skb) == NET_RX_DROP)) {
-		if (rx_stats)
-			rx_stats->rx_dropped++;
-	}
+	netif_rx(skb);
+
 	rcu_read_unlock();
 	return NET_RX_SUCCESS;
 
 err_unlock:
 	rcu_read_unlock();
 err_free:
+	atomic_long_inc(&dev->rx_dropped);
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
@@ -846,15 +845,13 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st
 			accum.rx_packets += rxpackets;
 			accum.rx_bytes   += rxbytes;
 			accum.rx_multicast += rxmulticast;
-			/* rx_errors, rx_dropped are ulong, not protected by syncp */
+			/* rx_errors is ulong, not protected by syncp */
 			accum.rx_errors  += p->rx_errors;
-			accum.rx_dropped += p->rx_dropped;
 		}
 		stats->rx_packets = accum.rx_packets;
 		stats->rx_bytes   = accum.rx_bytes;
 		stats->rx_errors  = accum.rx_errors;
 		stats->multicast  = accum.rx_multicast;
-		stats->rx_dropped = accum.rx_dropped;
 	}
 	return stats;
 }
diff --git a/net/core/dev.c b/net/core/dev.c
index ce6ad88c980b..7d149550e8d6 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1483,8 +1483,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
 	skb_orphan(skb);
 	nf_reset(skb);
 
-	if (!(dev->flags & IFF_UP) ||
-	    (skb->len > (dev->mtu + dev->hard_header_len))) {
+	if (unlikely(!(dev->flags & IFF_UP) ||
+		     (skb->len > (dev->mtu + dev->hard_header_len)))) {
+		atomic_long_inc(&dev->rx_dropped);
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
@@ -2548,6 +2549,7 @@ enqueue:
 
 	local_irq_restore(flags);
 
+	atomic_long_inc(&skb->dev->rx_dropped);
 	kfree_skb(skb);
 	return NET_RX_DROP;
 }
@@ -2995,6 +2997,7 @@ ncls:
 	if (pt_prev) {
 		ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
 	} else {
+		atomic_long_inc(&skb->dev->rx_dropped);
 		kfree_skb(skb);
 		/* Jamal, now you will not able to escape explaining
 		 * me how you were going to use this. :-)
@@ -5429,14 +5432,14 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
 
 	if (ops->ndo_get_stats64) {
 		memset(storage, 0, sizeof(*storage));
-		return ops->ndo_get_stats64(dev, storage);
-	}
-	if (ops->ndo_get_stats) {
+		ops->ndo_get_stats64(dev, storage);
+	} else if (ops->ndo_get_stats) {
 		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
-		return storage;
+	} else {
+		netdev_stats_to_stats64(storage, &dev->stats);
+		dev_txq_stats_fold(dev, storage);
 	}
-	netdev_stats_to_stats64(storage, &dev->stats);
-	dev_txq_stats_fold(dev, storage);
+	storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
 	return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index fbe2c473a06a..9d421f4cf3ef 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -679,8 +679,7 @@ static int ipgre_rcv(struct sk_buff *skb)
 		skb_reset_network_header(skb);
 		ipgre_ecn_decapsulate(iph, skb);
 
-		if (netif_rx(skb) == NET_RX_DROP)
-			tunnel->dev->stats.rx_dropped++;
+		netif_rx(skb);
 
 		rcu_read_unlock();
 		return 0;
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 6ad46c28ede2..e9b816e6cd73 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -414,8 +414,7 @@ static int ipip_rcv(struct sk_buff *skb)
 
 		ipip_ecn_decapsulate(iph, skb);
 
-		if (netif_rx(skb) == NET_RX_DROP)
-			tunnel->dev->stats.rx_dropped++;
+		netif_rx(skb);
 
 		rcu_read_unlock();
 		return 0;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 8be3c452af90..c2c0f89397b1 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -768,8 +768,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol,
 
 		dscp_ecn_decapsulate(t, ipv6h, skb);
 
-		if (netif_rx(skb) == NET_RX_DROP)
-			t->dev->stats.rx_dropped++;
+		netif_rx(skb);
 
 		rcu_read_unlock();
 		return 0;
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 2640c9be589d..6f32ffce7022 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -666,8 +666,7 @@ static int pim6_rcv(struct sk_buff *skb)
 
 	skb_tunnel_rx(skb, reg_dev);
 
-	if (netif_rx(skb) == NET_RX_DROP)
-		reg_dev->stats.rx_dropped++;
+	netif_rx(skb);
 
 	dev_put(reg_dev);
 	return 0;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index d7701782b639..367a6cc584cc 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -600,8 +600,7 @@ static int ipip6_rcv(struct sk_buff *skb)
 
 		ipip6_ecn_decapsulate(iph, skb);
 
-		if (netif_rx(skb) == NET_RX_DROP)
-			tunnel->dev->stats.rx_dropped++;
+		netif_rx(skb);
 
 		rcu_read_unlock();
 		return 0;
-- 
cgit v1.2.3-58-ga151


From d6bf781712a1d25cc8987036b3a48535b331eb91 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Mon, 4 Oct 2010 06:15:44 +0000
Subject: net neigh: RCU conversion of neigh hash table

David

This is the first step for RCU conversion of neigh code.

Next patches will convert hash_buckets[] and "struct neighbour" to RCU
protected objects.

Thanks

[PATCH net-next] net neigh: RCU conversion of neigh hash table

Instead of storing hash_buckets, hash_mask and hash_rnd in "struct
neigh_table", a new structure is defined :

struct neigh_hash_table {
       struct neighbour        **hash_buckets;
       unsigned int            hash_mask;
       __u32                   hash_rnd;
       struct rcu_head         rcu;
};

And "struct neigh_table" has an RCU protected pointer to such a
neigh_hash_table.

This means the signature of the (*hash)() function changed: we need to add
a third parameter with the actual hash_rnd value, since this is no longer
a neigh_table field.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h |  16 +++-
 net/atm/clip.c          |   4 +-
 net/core/neighbour.c    | 219 ++++++++++++++++++++++++++++++------------------
 net/decnet/dn_neigh.c   |  13 +--
 net/ipv4/arp.c          |   8 +-
 net/ipv6/ndisc.c        |  10 ++-
 6 files changed, 170 insertions(+), 100 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index 7d08fd1062f0..37845dae6488 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -138,13 +138,22 @@ struct pneigh_entry {
  *	neighbour table manipulation
  */
 
+struct neigh_hash_table {
+	struct neighbour	**hash_buckets;
+	unsigned int		hash_mask;
+	__u32			hash_rnd;
+	struct rcu_head		rcu;
+};
+
 
 struct neigh_table {
 	struct neigh_table	*next;
 	int			family;
 	int			entry_size;
 	int			key_len;
-	__u32			(*hash)(const void *pkey, const struct net_device *);
+	__u32			(*hash)(const void *pkey,
+					const struct net_device *dev,
+					__u32 hash_rnd);
 	int			(*constructor)(struct neighbour *);
 	int			(*pconstructor)(struct pneigh_entry *);
 	void			(*pdestructor)(struct pneigh_entry *);
@@ -165,9 +174,7 @@ struct neigh_table {
 	unsigned long		last_rand;
 	struct kmem_cache	*kmem_cachep;
 	struct neigh_statistics	__percpu *stats;
-	struct neighbour	**hash_buckets;
-	unsigned int		hash_mask;
-	__u32			hash_rnd;
+	struct neigh_hash_table __rcu *nht;
 	struct pneigh_entry	**phash_buckets;
 };
 
@@ -237,6 +244,7 @@ extern void pneigh_for_each(struct neigh_table *tbl, void (*cb)(struct pneigh_en
 struct neigh_seq_state {
 	struct seq_net_private p;
 	struct neigh_table *tbl;
+	struct neigh_hash_table *nht;
 	void *(*neigh_sub_iter)(struct neigh_seq_state *state,
 				struct neighbour *n, loff_t *pos);
 	unsigned int bucket;
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 95fdd1185067..ff956d1115bc 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -310,9 +310,9 @@ static int clip_constructor(struct neighbour *neigh)
 	return 0;
 }
 
-static u32 clip_hash(const void *pkey, const struct net_device *dev)
+static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd)
 {
-	return jhash_2words(*(u32 *) pkey, dev->ifindex, clip_tbl.hash_rnd);
+	return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd);
 }
 
 static struct neigh_table clip_tbl = {
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index d6996e072a41..dd8920e4f508 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -131,14 +131,17 @@ static int neigh_forced_gc(struct neigh_table *tbl)
 {
 	int shrunk = 0;
 	int i;
+	struct neigh_hash_table *nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs);
 
 	write_lock_bh(&tbl->lock);
-	for (i = 0; i <= tbl->hash_mask; i++) {
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+	for (i = 0; i <= nht->hash_mask; i++) {
 		struct neighbour *n, **np;
 
-		np = &tbl->hash_buckets[i];
+		np = &nht->hash_buckets[i];
 		while ((n = *np) != NULL) {
 			/* Neighbour record may be discarded if:
 			 * - nobody refers to it.
@@ -199,9 +202,13 @@ static void pneigh_queue_purge(struct sk_buff_head *list)
 static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
 {
 	int i;
+	struct neigh_hash_table *nht;
 
-	for (i = 0; i <= tbl->hash_mask; i++) {
-		struct neighbour *n, **np = &tbl->hash_buckets[i];
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+
+	for (i = 0; i <= nht->hash_mask; i++) {
+		struct neighbour *n, **np = &nht->hash_buckets[i];
 
 		while ((n = *np) != NULL) {
 			if (dev && n->dev != dev) {
@@ -297,64 +304,81 @@ out_entries:
 	goto out;
 }
 
-static struct neighbour **neigh_hash_alloc(unsigned int entries)
+static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries)
 {
-	unsigned long size = entries * sizeof(struct neighbour *);
-	struct neighbour **ret;
+	size_t size = entries * sizeof(struct neighbour *);
+	struct neigh_hash_table *ret;
+	struct neighbour **buckets;
 
-	if (size <= PAGE_SIZE) {
-		ret = kzalloc(size, GFP_ATOMIC);
-	} else {
-		ret = (struct neighbour **)
-		      __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size));
+	ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
+	if (!ret)
+		return NULL;
+	if (size <= PAGE_SIZE)
+		buckets = kzalloc(size, GFP_ATOMIC);
+	else
+		buckets = (struct neighbour **)
+			  __get_free_pages(GFP_ATOMIC | __GFP_ZERO,
+					   get_order(size));
+	if (!buckets) {
+		kfree(ret);
+		return NULL;
 	}
+	ret->hash_buckets = buckets;
+	ret->hash_mask = entries - 1;
+	get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
 	return ret;
 }
 
-static void neigh_hash_free(struct neighbour **hash, unsigned int entries)
+static void neigh_hash_free_rcu(struct rcu_head *head)
 {
-	unsigned long size = entries * sizeof(struct neighbour *);
+	struct neigh_hash_table *nht = container_of(head,
+						    struct neigh_hash_table,
+						    rcu);
+	size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *);
+	struct neighbour **buckets = nht->hash_buckets;
 
 	if (size <= PAGE_SIZE)
-		kfree(hash);
+		kfree(buckets);
 	else
-		free_pages((unsigned long)hash, get_order(size));
+		free_pages((unsigned long)buckets, get_order(size));
+	kfree(nht);
 }
 
-static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries)
+static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl,
+						unsigned long new_entries)
 {
-	struct neighbour **new_hash, **old_hash;
-	unsigned int i, new_hash_mask, old_entries;
+	unsigned int i, hash;
+	struct neigh_hash_table *new_nht, *old_nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, hash_grows);
 
 	BUG_ON(!is_power_of_2(new_entries));
-	new_hash = neigh_hash_alloc(new_entries);
-	if (!new_hash)
-		return;
+	old_nht = rcu_dereference_protected(tbl->nht,
+					    lockdep_is_held(&tbl->lock));
+	new_nht = neigh_hash_alloc(new_entries);
+	if (!new_nht)
+		return old_nht;
 
-	old_entries = tbl->hash_mask + 1;
-	new_hash_mask = new_entries - 1;
-	old_hash = tbl->hash_buckets;
-
-	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
-	for (i = 0; i < old_entries; i++) {
+	for (i = 0; i <= old_nht->hash_mask; i++) {
 		struct neighbour *n, *next;
 
-		for (n = old_hash[i]; n; n = next) {
-			unsigned int hash_val = tbl->hash(n->primary_key, n->dev);
+		for (n = old_nht->hash_buckets[i];
+		     n != NULL;
+		     n = next) {
+			hash = tbl->hash(n->primary_key, n->dev,
+					 new_nht->hash_rnd);
 
-			hash_val &= new_hash_mask;
+			hash &= new_nht->hash_mask;
 			next = n->next;
 
-			n->next = new_hash[hash_val];
-			new_hash[hash_val] = n;
+			n->next = new_nht->hash_buckets[hash];
+			new_nht->hash_buckets[hash] = n;
 		}
 	}
-	tbl->hash_buckets = new_hash;
-	tbl->hash_mask = new_hash_mask;
 
-	neigh_hash_free(old_hash, old_entries);
+	rcu_assign_pointer(tbl->nht, new_nht);
+	call_rcu(&old_nht->rcu, neigh_hash_free_rcu);
+	return new_nht;
 }
 
 struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
@@ -363,19 +387,23 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey,
 	struct neighbour *n;
 	int key_len = tbl->key_len;
 	u32 hash_val;
+	struct neigh_hash_table *nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hash_val = tbl->hash(pkey, dev);
-	for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
+	read_lock(&tbl->lock);
+	for (n = nht->hash_buckets[hash_val]; n; n = n->next) {
 		if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) {
 			neigh_hold(n);
 			NEIGH_CACHE_STAT_INC(tbl, hits);
 			break;
 		}
 	}
-	read_unlock_bh(&tbl->lock);
+	read_unlock(&tbl->lock);
+	rcu_read_unlock_bh();
 	return n;
 }
 EXPORT_SYMBOL(neigh_lookup);
@@ -386,12 +414,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
 	struct neighbour *n;
 	int key_len = tbl->key_len;
 	u32 hash_val;
+	struct neigh_hash_table *nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, lookups);
 
-	read_lock_bh(&tbl->lock);
-	hash_val = tbl->hash(pkey, NULL);
-	for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) {
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+	hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask;
+	read_lock(&tbl->lock);
+	for (n = nht->hash_buckets[hash_val]; n; n = n->next) {
 		if (!memcmp(n->primary_key, pkey, key_len) &&
 		    net_eq(dev_net(n->dev), net)) {
 			neigh_hold(n);
@@ -399,7 +430,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net,
 			break;
 		}
 	}
-	read_unlock_bh(&tbl->lock);
+	read_unlock(&tbl->lock);
+	rcu_read_unlock_bh();
 	return n;
 }
 EXPORT_SYMBOL(neigh_lookup_nodev);
@@ -411,6 +443,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 	int key_len = tbl->key_len;
 	int error;
 	struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+	struct neigh_hash_table *nht;
 
 	if (!n) {
 		rc = ERR_PTR(-ENOBUFS);
@@ -437,18 +470,20 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 	n->confirmed = jiffies - (n->parms->base_reachable_time << 1);
 
 	write_lock_bh(&tbl->lock);
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
 
-	if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1))
-		neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1);
+	if (atomic_read(&tbl->entries) > (nht->hash_mask + 1))
+		nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1);
 
-	hash_val = tbl->hash(pkey, dev) & tbl->hash_mask;
+	hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask;
 
 	if (n->parms->dead) {
 		rc = ERR_PTR(-EINVAL);
 		goto out_tbl_unlock;
 	}
 
-	for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) {
+	for (n1 = nht->hash_buckets[hash_val]; n1; n1 = n1->next) {
 		if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) {
 			neigh_hold(n1);
 			rc = n1;
@@ -456,8 +491,8 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
 		}
 	}
 
-	n->next = tbl->hash_buckets[hash_val];
-	tbl->hash_buckets[hash_val] = n;
+	n->next = nht->hash_buckets[hash_val];
+	nht->hash_buckets[hash_val] = n;
 	n->dead = 0;
 	neigh_hold(n);
 	write_unlock_bh(&tbl->lock);
@@ -698,10 +733,13 @@ static void neigh_periodic_work(struct work_struct *work)
 	struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
 	struct neighbour *n, **np;
 	unsigned int i;
+	struct neigh_hash_table *nht;
 
 	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
 
 	write_lock_bh(&tbl->lock);
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
 
 	/*
 	 *	periodically recompute ReachableTime from random function
@@ -715,8 +753,8 @@ static void neigh_periodic_work(struct work_struct *work)
 				neigh_rand_reach_time(p->base_reachable_time);
 	}
 
-	for (i = 0 ; i <= tbl->hash_mask; i++) {
-		np = &tbl->hash_buckets[i];
+	for (i = 0 ; i <= nht->hash_mask; i++) {
+		np = &nht->hash_buckets[i];
 
 		while ((n = *np) != NULL) {
 			unsigned int state;
@@ -1438,17 +1476,14 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
 		panic("cannot create neighbour proc dir entry");
 #endif
 
-	tbl->hash_mask = 1;
-	tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1);
+	tbl->nht = neigh_hash_alloc(8);
 
 	phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *);
 	tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL);
 
-	if (!tbl->hash_buckets || !tbl->phash_buckets)
+	if (!tbl->nht || !tbl->phash_buckets)
 		panic("cannot allocate neighbour cache hashes");
 
-	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
-
 	rwlock_init(&tbl->lock);
 	INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
 	schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
@@ -1504,8 +1539,8 @@ int neigh_table_clear(struct neigh_table *tbl)
 	}
 	write_unlock(&neigh_tbl_lock);
 
-	neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1);
-	tbl->hash_buckets = NULL;
+	call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu);
+	tbl->nht = NULL;
 
 	kfree(tbl->phash_buckets);
 	tbl->phash_buckets = NULL;
@@ -1745,18 +1780,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
 		unsigned long now = jiffies;
 		unsigned int flush_delta = now - tbl->last_flush;
 		unsigned int rand_delta = now - tbl->last_rand;
-
+		struct neigh_hash_table *nht;
 		struct ndt_config ndc = {
 			.ndtc_key_len		= tbl->key_len,
 			.ndtc_entry_size	= tbl->entry_size,
 			.ndtc_entries		= atomic_read(&tbl->entries),
 			.ndtc_last_flush	= jiffies_to_msecs(flush_delta),
 			.ndtc_last_rand		= jiffies_to_msecs(rand_delta),
-			.ndtc_hash_rnd		= tbl->hash_rnd,
-			.ndtc_hash_mask		= tbl->hash_mask,
 			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
 		};
 
+		rcu_read_lock_bh();
+		nht = rcu_dereference_bh(tbl->nht);
+		ndc.ndtc_hash_rnd = nht->hash_rnd;
+		ndc.ndtc_hash_mask = nht->hash_mask;
+		rcu_read_unlock_bh();
+
 		NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc);
 	}
 
@@ -2088,14 +2127,18 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 	struct neighbour *n;
 	int rc, h, s_h = cb->args[1];
 	int idx, s_idx = idx = cb->args[2];
+	struct neigh_hash_table *nht;
 
-	read_lock_bh(&tbl->lock);
-	for (h = 0; h <= tbl->hash_mask; h++) {
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+
+	read_lock(&tbl->lock);
+	for (h = 0; h <= nht->hash_mask; h++) {
 		if (h < s_h)
 			continue;
 		if (h > s_h)
 			s_idx = 0;
-		for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) {
+		for (n = nht->hash_buckets[h], idx = 0; n; n = n->next) {
 			if (!net_eq(dev_net(n->dev), net))
 				continue;
 			if (idx < s_idx)
@@ -2104,7 +2147,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 					    cb->nlh->nlmsg_seq,
 					    RTM_NEWNEIGH,
 					    NLM_F_MULTI) <= 0) {
-				read_unlock_bh(&tbl->lock);
 				rc = -1;
 				goto out;
 			}
@@ -2112,9 +2154,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
 			idx++;
 		}
 	}
-	read_unlock_bh(&tbl->lock);
 	rc = skb->len;
 out:
+	read_unlock(&tbl->lock);
+	rcu_read_unlock_bh();
 	cb->args[1] = h;
 	cb->args[2] = idx;
 	return rc;
@@ -2147,15 +2190,20 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb)
 void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie)
 {
 	int chain;
+	struct neigh_hash_table *nht;
 
-	read_lock_bh(&tbl->lock);
-	for (chain = 0; chain <= tbl->hash_mask; chain++) {
+	rcu_read_lock_bh();
+	nht = rcu_dereference_bh(tbl->nht);
+
+	read_lock(&tbl->lock);
+	for (chain = 0; chain <= nht->hash_mask; chain++) {
 		struct neighbour *n;
 
-		for (n = tbl->hash_buckets[chain]; n; n = n->next)
+		for (n = nht->hash_buckets[chain]; n; n = n->next)
 			cb(n, cookie);
 	}
-	read_unlock_bh(&tbl->lock);
+	read_unlock(&tbl->lock);
+	rcu_read_unlock_bh();
 }
 EXPORT_SYMBOL(neigh_for_each);
 
@@ -2164,11 +2212,14 @@ void __neigh_for_each_release(struct neigh_table *tbl,
 			      int (*cb)(struct neighbour *))
 {
 	int chain;
+	struct neigh_hash_table *nht;
 
-	for (chain = 0; chain <= tbl->hash_mask; chain++) {
+	nht = rcu_dereference_protected(tbl->nht,
+					lockdep_is_held(&tbl->lock));
+	for (chain = 0; chain <= nht->hash_mask; chain++) {
 		struct neighbour *n, **np;
 
-		np = &tbl->hash_buckets[chain];
+		np = &nht->hash_buckets[chain];
 		while ((n = *np) != NULL) {
 			int release;
 
@@ -2193,13 +2244,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq)
 {
 	struct neigh_seq_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct neigh_table *tbl = state->tbl;
+	struct neigh_hash_table *nht = state->nht;
 	struct neighbour *n = NULL;
 	int bucket = state->bucket;
 
 	state->flags &= ~NEIGH_SEQ_IS_PNEIGH;
-	for (bucket = 0; bucket <= tbl->hash_mask; bucket++) {
-		n = tbl->hash_buckets[bucket];
+	for (bucket = 0; bucket <= nht->hash_mask; bucket++) {
+		n = nht->hash_buckets[bucket];
 
 		while (n) {
 			if (!net_eq(dev_net(n->dev), net))
@@ -2234,7 +2285,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 {
 	struct neigh_seq_state *state = seq->private;
 	struct net *net = seq_file_net(seq);
-	struct neigh_table *tbl = state->tbl;
+	struct neigh_hash_table *nht = state->nht;
 
 	if (state->neigh_sub_iter) {
 		void *v = state->neigh_sub_iter(state, n, pos);
@@ -2265,10 +2316,10 @@ static struct neighbour *neigh_get_next(struct seq_file *seq,
 		if (n)
 			break;
 
-		if (++state->bucket > tbl->hash_mask)
+		if (++state->bucket > nht->hash_mask)
 			break;
 
-		n = tbl->hash_buckets[state->bucket];
+		n = nht->hash_buckets[state->bucket];
 	}
 
 	if (n && pos)
@@ -2367,6 +2418,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos)
 
 void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags)
 	__acquires(tbl->lock)
+	__acquires(rcu_bh)
 {
 	struct neigh_seq_state *state = seq->private;
 
@@ -2374,8 +2426,9 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl
 	state->bucket = 0;
 	state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH);
 
-	read_lock_bh(&tbl->lock);
-
+	rcu_read_lock_bh();
+	state->nht = rcu_dereference_bh(tbl->nht);
+	read_lock(&tbl->lock);
 	return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN;
 }
 EXPORT_SYMBOL(neigh_seq_start);
@@ -2409,11 +2462,13 @@ EXPORT_SYMBOL(neigh_seq_next);
 
 void neigh_seq_stop(struct seq_file *seq, void *v)
 	__releases(tbl->lock)
+	__releases(rcu_bh)
 {
 	struct neigh_seq_state *state = seq->private;
 	struct neigh_table *tbl = state->tbl;
 
-	read_unlock_bh(&tbl->lock);
+	read_unlock(&tbl->lock);
+	rcu_read_unlock_bh();
 }
 EXPORT_SYMBOL(neigh_seq_stop);
 
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index 0363bb95cc7d..a085dbcf5c7f 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -48,7 +48,6 @@
 #include <net/dn_neigh.h>
 #include <net/dn_route.h>
 
-static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev);
 static int dn_neigh_construct(struct neighbour *);
 static void dn_long_error_report(struct neighbour *, struct sk_buff *);
 static void dn_short_error_report(struct neighbour *, struct sk_buff *);
@@ -93,6 +92,13 @@ static const struct neigh_ops dn_phase3_ops = {
 	.queue_xmit =		dev_queue_xmit
 };
 
+static u32 dn_neigh_hash(const void *pkey,
+			 const struct net_device *dev,
+			 __u32 hash_rnd)
+{
+	return jhash_2words(*(__u16 *)pkey, 0, hash_rnd);
+}
+
 struct neigh_table dn_neigh_table = {
 	.family =			PF_DECnet,
 	.entry_size =			sizeof(struct dn_neigh),
@@ -122,11 +128,6 @@ struct neigh_table dn_neigh_table = {
 	.gc_thresh3 =			1024,
 };
 
-static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev)
-{
-	return jhash_2words(*(__u16 *)pkey, 0, dn_neigh_table.hash_rnd);
-}
-
 static int dn_neigh_construct(struct neighbour *neigh)
 {
 	struct net_device *dev = neigh->dev;
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index d9031ad67826..f35309578170 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -127,7 +127,7 @@ EXPORT_SYMBOL(clip_tbl_hook);
 /*
  *	Interface to generic neighbour cache.
  */
-static u32 arp_hash(const void *pkey, const struct net_device *dev);
+static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd);
 static int arp_constructor(struct neighbour *neigh);
 static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb);
 static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -225,9 +225,11 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir)
 }
 
 
-static u32 arp_hash(const void *pkey, const struct net_device *dev)
+static u32 arp_hash(const void *pkey,
+		    const struct net_device *dev,
+		    __u32 hash_rnd)
 {
-	return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd);
+	return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd);
 }
 
 static int arp_constructor(struct neighbour *neigh)
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index b3dd844cd34f..998d6d27e7cf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -91,7 +91,9 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
 
-static u32 ndisc_hash(const void *pkey, const struct net_device *dev);
+static u32 ndisc_hash(const void *pkey,
+		      const struct net_device *dev,
+		      __u32 rnd);
 static int ndisc_constructor(struct neighbour *neigh);
 static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb);
 static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb);
@@ -350,7 +352,9 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d
 
 EXPORT_SYMBOL(ndisc_mc_map);
 
-static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
+static u32 ndisc_hash(const void *pkey,
+		      const struct net_device *dev,
+		      __u32 hash_rnd)
 {
 	const u32 *p32 = pkey;
 	u32 addr_hash, i;
@@ -359,7 +363,7 @@ static u32 ndisc_hash(const void *pkey, const struct net_device *dev)
 	for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++)
 		addr_hash ^= *p32++;
 
-	return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd);
+	return jhash_2words(addr_hash, dev->ifindex, hash_rnd);
 }
 
 static int ndisc_constructor(struct neighbour *neigh)
-- 
cgit v1.2.3-58-ga151


From e12b453904c54bbdc515778ff664d87a7f9473af Mon Sep 17 00:00:00 2001
From: Flavio Leitner <fleitner@redhat.com>
Date: Tue, 5 Oct 2010 14:23:58 +0000
Subject: bonding: fix to rejoin multicast groups immediately

The IGMP spec states that if the system receives a
membership report, it shouldn't send another for the
next minute. However, if a link failure happens right
after that, the backup slave and the switch connected
to this slave will not know about the multicast and
the traffic will hang for about a minute.

This patch fixes it to rejoin multicast groups immediately
after a failover restoring the multicast traffic.

Signed-off-by: Flavio Leitner <fleitner@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/igmp.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 2a4bb76f2132..25f339672b28 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1269,14 +1269,14 @@ void ip_mc_rejoin_group(struct ip_mc_list *im)
 	if (im->multiaddr == IGMP_ALL_HOSTS)
 		return;
 
-	if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) {
-		igmp_mod_timer(im, IGMP_Initial_Report_Delay);
-		return;
-	}
-	/* else, v3 */
-	im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv :
-		IGMP_Unsolicited_Report_Count;
-	igmp_ifc_event(in_dev);
+	/* a failover is happening and switches
+	 * must be notified immediately */
+	if (IGMP_V1_SEEN(in_dev))
+		igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT);
+	else if (IGMP_V2_SEEN(in_dev))
+		igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT);
+	else
+		igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT);
 #endif
 }
 EXPORT_SYMBOL(ip_mc_rejoin_group);
-- 
cgit v1.2.3-58-ga151


From ebc0ffae5dfb4447e0a431ffe7fe1d467c48bbb9 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 5 Oct 2010 10:41:36 +0000
Subject: fib: RCU conversion of fib_lookup()

fib_lookup() is converted to be called in an RCU protected context, so no
reference is taken and released on a contended cache line (fib_clntref).

fib_table_lookup() and fib_semantic_match() get an additional parameter.

struct fib_info gets an rcu_head field, and is freed after an rcu grace
period.

Stress test :
(Sending 160.000.000 UDP frames on same neighbour,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_HASH) (about same results for FIB_TRIE)

Before patch :

real	1m31.199s
user	0m13.761s
sys	23m24.780s

After patch:

real	1m5.375s
user	0m14.997s
sys	15m50.115s

Before patch Profile :

13044.00 15.4% __ip_route_output_key vmlinux
 8438.00 10.0% dst_destroy           vmlinux
 5983.00  7.1% fib_semantic_match    vmlinux
 5410.00  6.4% fib_rules_lookup      vmlinux
 4803.00  5.7% neigh_lookup          vmlinux
 4420.00  5.2% _raw_spin_lock        vmlinux
 3883.00  4.6% rt_set_nexthop        vmlinux
 3261.00  3.9% _raw_read_lock        vmlinux
 2794.00  3.3% fib_table_lookup      vmlinux
 2374.00  2.8% neigh_resolve_output  vmlinux
 2153.00  2.5% dst_alloc             vmlinux
 1502.00  1.8% _raw_read_lock_bh     vmlinux
 1484.00  1.8% kmem_cache_alloc      vmlinux
 1407.00  1.7% eth_header            vmlinux
 1406.00  1.7% ipv4_dst_destroy      vmlinux
 1298.00  1.5% __copy_from_user_ll   vmlinux
 1174.00  1.4% dev_queue_xmit        vmlinux
 1000.00  1.2% ip_output             vmlinux

After patch Profile :

13712.00 15.8% dst_destroy             vmlinux
 8548.00  9.9% __ip_route_output_key   vmlinux
 7017.00  8.1% neigh_lookup            vmlinux
 4554.00  5.3% fib_semantic_match      vmlinux
 4067.00  4.7% _raw_read_lock          vmlinux
 3491.00  4.0% dst_alloc               vmlinux
 3186.00  3.7% neigh_resolve_output    vmlinux
 3103.00  3.6% fib_table_lookup        vmlinux
 2098.00  2.4% _raw_read_lock_bh       vmlinux
 2081.00  2.4% kmem_cache_alloc        vmlinux
 2013.00  2.3% _raw_spin_lock          vmlinux
 1763.00  2.0% __copy_from_user_ll     vmlinux
 1763.00  2.0% ip_output               vmlinux
 1761.00  2.0% ipv4_dst_destroy        vmlinux
 1631.00  1.9% eth_header              vmlinux
 1440.00  1.7% _raw_read_unlock_bh     vmlinux

Reference results, if IP route cache is enabled :

real	0m29.718s
user	0m10.845s
sys	7m37.341s

25213.00 29.5% __ip_route_output_key   vmlinux
 9011.00 10.5% dst_release             vmlinux
 4817.00  5.6% ip_push_pending_frames  vmlinux
 4232.00  5.0% ip_finish_output        vmlinux
 3940.00  4.6% udp_sendmsg             vmlinux
 3730.00  4.4% __copy_from_user_ll     vmlinux
 3716.00  4.4% ip_route_output_flow    vmlinux
 2451.00  2.9% __xfrm_lookup           vmlinux
 2221.00  2.6% ip_append_data          vmlinux
 1718.00  2.0% _raw_spin_lock_bh       vmlinux
 1655.00  1.9% __alloc_skb             vmlinux
 1572.00  1.8% sock_wfree              vmlinux
 1345.00  1.6% kfree                   vmlinux

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/fib_rules.h  |  2 ++
 include/net/ip_fib.h     | 17 ++++----------
 net/core/fib_rules.c     |  3 ++-
 net/ipv4/fib_frontend.c  | 27 +++++++++++-----------
 net/ipv4/fib_hash.c      |  5 ++--
 net/ipv4/fib_lookup.h    |  2 +-
 net/ipv4/fib_rules.c     |  3 ++-
 net/ipv4/fib_semantics.c | 21 +++++++++++++----
 net/ipv4/fib_trie.c      | 10 ++++----
 net/ipv4/route.c         | 59 ++++++++++++++++++++----------------------------
 10 files changed, 72 insertions(+), 77 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h
index ac2fd002812e..106f3097d384 100644
--- a/include/net/fib_rules.h
+++ b/include/net/fib_rules.h
@@ -31,6 +31,8 @@ struct fib_lookup_arg {
 	void			*lookup_ptr;
 	void			*result;
 	struct fib_rule		*rule;
+	int			flags;
+#define FIB_LOOKUP_NOREF	1
 };
 
 struct fib_rules_ops {
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index c93f94edc610..ba3666d31766 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -86,6 +86,7 @@ struct fib_info {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
 	int			fib_power;
 #endif
+	struct rcu_head		rcu;
 	struct fib_nh		fib_nh[0];
 #define fib_dev		fib_nh[0].nh_dev
 };
@@ -148,7 +149,7 @@ struct fib_table {
 };
 
 extern int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
-			    struct fib_result *res);
+			    struct fib_result *res, int fib_flags);
 extern int fib_table_insert(struct fib_table *, struct fib_config *);
 extern int fib_table_delete(struct fib_table *, struct fib_config *);
 extern int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
@@ -185,11 +186,11 @@ static inline int fib_lookup(struct net *net, const struct flowi *flp,
 	struct fib_table *table;
 
 	table = fib_get_table(net, RT_TABLE_LOCAL);
-	if (!fib_table_lookup(table, flp, res))
+	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
 		return 0;
 
 	table = fib_get_table(net, RT_TABLE_MAIN);
-	if (!fib_table_lookup(table, flp, res))
+	if (!fib_table_lookup(table, flp, res, FIB_LOOKUP_NOREF))
 		return 0;
 	return -ENETUNREACH;
 }
@@ -254,16 +255,6 @@ static inline void fib_info_put(struct fib_info *fi)
 		free_fib_info(fi);
 }
 
-static inline void fib_res_put(struct fib_result *res)
-{
-	if (res->fi)
-		fib_info_put(res->fi);
-#ifdef CONFIG_IP_MULTIPLE_TABLES
-	if (res->r)
-		fib_rule_put(res->r);
-#endif
-}
-
 #ifdef CONFIG_PROC_FS
 extern int __net_init  fib_proc_init(struct net *net);
 extern void __net_exit fib_proc_exit(struct net *net);
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index cfb7d25c172d..21698f8c49ee 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -225,7 +225,8 @@ jumped:
 			err = ops->action(rule, fl, flags, arg);
 
 		if (err != -EAGAIN) {
-			if (likely(atomic_inc_not_zero(&rule->refcnt))) {
+			if ((arg->flags & FIB_LOOKUP_NOREF) ||
+			    likely(atomic_inc_not_zero(&rule->refcnt))) {
 				arg->rule = rule;
 				goto out;
 			}
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b05c23b05a9f..919f2ad19b49 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -168,8 +168,11 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 	struct fib_result res = { 0 };
 	struct net_device *dev = NULL;
 
-	if (fib_lookup(net, &fl, &res))
+	rcu_read_lock();
+	if (fib_lookup(net, &fl, &res)) {
+		rcu_read_unlock();
 		return NULL;
+	}
 	if (res.type != RTN_LOCAL)
 		goto out;
 	dev = FIB_RES_DEV(res);
@@ -177,7 +180,7 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 	if (dev && devref)
 		dev_hold(dev);
 out:
-	fib_res_put(&res);
+	rcu_read_unlock();
 	return dev;
 }
 EXPORT_SYMBOL(__ip_dev_find);
@@ -207,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net,
 	local_table = fib_get_table(net, RT_TABLE_LOCAL);
 	if (local_table) {
 		ret = RTN_UNICAST;
-		if (!fib_table_lookup(local_table, &fl, &res)) {
+		rcu_read_lock();
+		if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) {
 			if (!dev || dev == res.fi->fib_dev)
 				ret = res.type;
-			fib_res_put(&res);
 		}
+		rcu_read_unlock();
 	}
 	return ret;
 }
@@ -235,6 +239,7 @@ EXPORT_SYMBOL(inet_dev_addr_type);
  * - figure out what "logical" interface this packet arrived
  *   and calculate "specific destination" address.
  * - check, that packet arrived from expected physical interface.
+ * called with rcu_read_lock()
  */
 int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			struct net_device *dev, __be32 *spec_dst,
@@ -259,7 +264,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 	struct net *net;
 
 	no_addr = rpf = accept_local = 0;
-	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dev);
 	if (in_dev) {
 		no_addr = in_dev->ifa_list == NULL;
@@ -268,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		if (mark && !IN_DEV_SRC_VMARK(in_dev))
 			fl.mark = 0;
 	}
-	rcu_read_unlock();
 
 	if (in_dev == NULL)
 		goto e_inval;
@@ -278,7 +281,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 		goto last_resort;
 	if (res.type != RTN_UNICAST) {
 		if (res.type != RTN_LOCAL || !accept_local)
-			goto e_inval_res;
+			goto e_inval;
 	}
 	*spec_dst = FIB_RES_PREFSRC(res);
 	fib_combine_itag(itag, &res);
@@ -299,10 +302,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 #endif
 	if (dev_match) {
 		ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
-		fib_res_put(&res);
 		return ret;
 	}
-	fib_res_put(&res);
 	if (no_addr)
 		goto last_resort;
 	if (rpf == 1)
@@ -315,7 +316,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif,
 			*spec_dst = FIB_RES_PREFSRC(res);
 			ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST;
 		}
-		fib_res_put(&res);
 	}
 	return ret;
 
@@ -326,8 +326,6 @@ last_resort:
 	*itag = 0;
 	return 0;
 
-e_inval_res:
-	fib_res_put(&res);
 e_inval:
 	return -EINVAL;
 e_rpf:
@@ -873,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb)
 		local_bh_disable();
 
 		frn->tb_id = tb->tb_id;
-		frn->err = fib_table_lookup(tb, &fl, &res);
+		rcu_read_lock();
+		frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF);
 
 		if (!frn->err) {
 			frn->prefixlen = res.prefixlen;
 			frn->nh_sel = res.nh_sel;
 			frn->type = res.type;
 			frn->scope = res.scope;
-			fib_res_put(&res);
 		}
+		rcu_read_unlock();
 		local_bh_enable();
 	}
 }
diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4ed7e0dea1bc..83cca68e259c 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -244,7 +244,8 @@ fn_new_zone(struct fn_hash *table, int z)
 }
 
 int fib_table_lookup(struct fib_table *tb,
-		     const struct flowi *flp, struct fib_result *res)
+		     const struct flowi *flp, struct fib_result *res,
+		     int fib_flags)
 {
 	int err;
 	struct fn_zone *fz;
@@ -264,7 +265,7 @@ int fib_table_lookup(struct fib_table *tb,
 
 			err = fib_semantic_match(&f->fn_alias,
 						 flp, res,
-						 fz->fz_order);
+						 fz->fz_order, fib_flags);
 			if (err <= 0)
 				goto out;
 		}
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 637b133973bd..b9c9a9f2aee5 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -22,7 +22,7 @@ struct fib_alias {
 /* Exported by fib_semantics.c */
 extern int fib_semantic_match(struct list_head *head,
 			      const struct flowi *flp,
-			      struct fib_result *res, int prefixlen);
+			      struct fib_result *res, int prefixlen, int fib_flags);
 extern void fib_release_info(struct fib_info *);
 extern struct fib_info *fib_create_info(struct fib_config *cfg);
 extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi);
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index 32300521e32c..7981a24f5c7b 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -57,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res)
 {
 	struct fib_lookup_arg arg = {
 		.result = res,
+		.flags = FIB_LOOKUP_NOREF,
 	};
 	int err;
 
@@ -94,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp,
 	if (!tbl)
 		goto errout;
 
-	err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result);
+	err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags);
 	if (err > 0)
 		err = -EAGAIN;
 errout:
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index ba52f399a898..0f80dfc2f7fb 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -148,6 +148,13 @@ static const struct
 
 /* Release a nexthop info record */
 
+static void free_fib_info_rcu(struct rcu_head *head)
+{
+	struct fib_info *fi = container_of(head, struct fib_info, rcu);
+
+	kfree(fi);
+}
+
 void free_fib_info(struct fib_info *fi)
 {
 	if (fi->fib_dead == 0) {
@@ -161,7 +168,7 @@ void free_fib_info(struct fib_info *fi)
 	} endfor_nexthops(fi);
 	fib_info_cnt--;
 	release_net(fi->fib_net);
-	kfree(fi);
+	call_rcu(&fi->rcu, free_fib_info_rcu);
 }
 
 void fib_release_info(struct fib_info *fi)
@@ -553,6 +560,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			nh->nh_scope = RT_SCOPE_LINK;
 			return 0;
 		}
+		rcu_read_lock();
 		{
 			struct flowi fl = {
 				.nl_u = {
@@ -568,8 +576,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			if (fl.fl4_scope < RT_SCOPE_LINK)
 				fl.fl4_scope = RT_SCOPE_LINK;
 			err = fib_lookup(net, &fl, &res);
-			if (err)
+			if (err) {
+				rcu_read_unlock();
 				return err;
+			}
 		}
 		err = -EINVAL;
 		if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
@@ -585,7 +595,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 			goto out;
 		err = 0;
 out:
-		fib_res_put(&res);
+		rcu_read_unlock();
 		return err;
 	} else {
 		struct in_device *in_dev;
@@ -879,7 +889,7 @@ failure:
 
 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
-		       struct fib_result *res, int prefixlen)
+		       struct fib_result *res, int prefixlen, int fib_flags)
 {
 	struct fib_alias *fa;
 	int nh_sel = 0;
@@ -943,7 +953,8 @@ out_fill_res:
 	res->type = fa->fa_type;
 	res->scope = fa->fa_scope;
 	res->fi = fa->fa_info;
-	atomic_inc(&res->fi->fib_clntref);
+	if (!(fib_flags & FIB_LOOKUP_NOREF))
+		atomic_inc(&res->fi->fib_clntref);
 	return 0;
 }
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index a96e5ec211a0..271c89bdf049 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1342,7 +1342,7 @@ err:
 /* should be called with rcu_read_lock */
 static int check_leaf(struct trie *t, struct leaf *l,
 		      t_key key,  const struct flowi *flp,
-		      struct fib_result *res)
+		      struct fib_result *res, int fib_flags)
 {
 	struct leaf_info *li;
 	struct hlist_head *hhead = &l->list;
@@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
 		if (l->key != (key & ntohl(mask)))
 			continue;
 
-		err = fib_semantic_match(&li->falh, flp, res, plen);
+		err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags);
 
 #ifdef CONFIG_IP_FIB_TRIE_STATS
 		if (err <= 0)
@@ -1372,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l,
 }
 
 int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
-		     struct fib_result *res)
+		     struct fib_result *res, int fib_flags)
 {
 	struct trie *t = (struct trie *) tb->tb_data;
 	int ret;
@@ -1399,7 +1399,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 
 	/* Just a leaf? */
 	if (IS_LEAF(n)) {
-		ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+		ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
 		goto found;
 	}
 
@@ -1424,7 +1424,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 		}
 
 		if (IS_LEAF(n)) {
-			ret = check_leaf(t, (struct leaf *)n, key, flp, res);
+			ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags);
 			if (ret > 0)
 				goto backtrace;
 			goto found;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 04e0df82b88c..7864d0c48968 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1773,12 +1773,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
 
 	if (rt->fl.iif == 0)
 		src = rt->rt_src;
-	else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) {
-		src = FIB_RES_PREFSRC(res);
-		fib_res_put(&res);
-	} else
-		src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
+	else {
+		rcu_read_lock();
+		if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0)
+			src = FIB_RES_PREFSRC(res);
+		else
+			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
 					RT_SCOPE_UNIVERSE);
+		rcu_read_unlock();
+	}
 	memcpy(addr, &src, 4);
 }
 
@@ -2081,6 +2084,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
  *	Such approach solves two big problems:
  *	1. Not simplex devices are handled properly.
  *	2. IP spoofing attempts are filtered with 100% of guarantee.
+ *	called with rcu_read_lock()
  */
 
 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2102,7 +2106,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	unsigned	hash;
 	__be32		spec_dst;
 	int		err = -EINVAL;
-	int		free_res = 0;
 	struct net    * net = dev_net(dev);
 
 	/* IP on this device is disabled. */
@@ -2134,12 +2137,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	/*
 	 *	Now we are ready to route packet.
 	 */
-	if ((err = fib_lookup(net, &fl, &res)) != 0) {
+	err = fib_lookup(net, &fl, &res);
+	if (err != 0) {
 		if (!IN_DEV_FORWARD(in_dev))
 			goto e_hostunreach;
 		goto no_route;
 	}
-	free_res = 1;
 
 	RT_CACHE_STAT_INC(in_slow_tot);
 
@@ -2148,8 +2151,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
 	if (res.type == RTN_LOCAL) {
 		err = fib_validate_source(saddr, daddr, tos,
-					     net->loopback_dev->ifindex,
-					     dev, &spec_dst, &itag, skb->mark);
+					  net->loopback_dev->ifindex,
+					  dev, &spec_dst, &itag, skb->mark);
 		if (err < 0)
 			goto martian_source_keep_err;
 		if (err)
@@ -2164,9 +2167,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 		goto martian_destination;
 
 	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
-done:
-	if (free_res)
-		fib_res_put(&res);
 out:	return err;
 
 brd_input:
@@ -2226,7 +2226,7 @@ local_input:
 	rth->rt_type	= res.type;
 	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
 	err = rt_intern_hash(hash, rth, NULL, skb, fl.iif);
-	goto done;
+	goto out;
 
 no_route:
 	RT_CACHE_STAT_INC(in_no_route);
@@ -2249,21 +2249,21 @@ martian_destination:
 
 e_hostunreach:
 	err = -EHOSTUNREACH;
-	goto done;
+	goto out;
 
 e_inval:
 	err = -EINVAL;
-	goto done;
+	goto out;
 
 e_nobufs:
 	err = -ENOBUFS;
-	goto done;
+	goto out;
 
 martian_source:
 	err = -EINVAL;
 martian_source_keep_err:
 	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
-	goto done;
+	goto out;
 }
 
 int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
@@ -2349,6 +2349,7 @@ skip_cache:
 }
 EXPORT_SYMBOL(ip_route_input_common);
 
+/* called with rcu_read_lock() */
 static int __mkroute_output(struct rtable **result,
 			    struct fib_result *res,
 			    const struct flowi *fl,
@@ -2373,18 +2374,13 @@ static int __mkroute_output(struct rtable **result,
 	if (dev_out->flags & IFF_LOOPBACK)
 		flags |= RTCF_LOCAL;
 
-	rcu_read_lock();
 	in_dev = __in_dev_get_rcu(dev_out);
-	if (!in_dev) {
-		rcu_read_unlock();
+	if (!in_dev)
 		return -EINVAL;
-	}
+
 	if (res->type == RTN_BROADCAST) {
 		flags |= RTCF_BROADCAST | RTCF_LOCAL;
-		if (res->fi) {
-			fib_info_put(res->fi);
-			res->fi = NULL;
-		}
+		res->fi = NULL;
 	} else if (res->type == RTN_MULTICAST) {
 		flags |= RTCF_MULTICAST | RTCF_LOCAL;
 		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
@@ -2394,10 +2390,8 @@ static int __mkroute_output(struct rtable **result,
 		 * default one, but do not gateway in this case.
 		 * Yes, it is hack.
 		 */
-		if (res->fi && res->prefixlen < 4) {
-			fib_info_put(res->fi);
+		if (res->fi && res->prefixlen < 4)
 			res->fi = NULL;
-		}
 	}
 
 
@@ -2467,6 +2461,7 @@ static int __mkroute_output(struct rtable **result,
 	return 0;
 }
 
+/* called with rcu_read_lock() */
 static int ip_mkroute_output(struct rtable **rp,
 			     struct fib_result *res,
 			     const struct flowi *fl,
@@ -2509,7 +2504,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 	struct fib_result res;
 	unsigned int flags = 0;
 	struct net_device *dev_out = NULL;
-	int free_res = 0;
 	int err;
 
 
@@ -2636,15 +2630,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 		err = -ENETUNREACH;
 		goto out;
 	}
-	free_res = 1;
 
 	if (res.type == RTN_LOCAL) {
 		if (!fl.fl4_src)
 			fl.fl4_src = fl.fl4_dst;
 		dev_out = net->loopback_dev;
 		fl.oif = dev_out->ifindex;
-		if (res.fi)
-			fib_info_put(res.fi);
 		res.fi = NULL;
 		flags |= RTCF_LOCAL;
 		goto make_route;
@@ -2668,8 +2659,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 make_route:
 	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
 
-	if (free_res)
-		fib_res_put(&res);
 out:	return err;
 }
 
-- 
cgit v1.2.3-58-ga151


From 9c6d5e5537e3997c47b65925a235392c1968fb51 Mon Sep 17 00:00:00 2001
From: John Heffner <johnwheffner@gmail.com>
Date: Wed, 6 Oct 2010 21:18:02 -0700
Subject: TCP: Fix setting of snd_ssthresh in tcp_mtup_probe_success

This looks like a simple typo that has gone unnoticed for some time.  The
impact is relatively low but it's clearly wrong.

Signed-off-by: John Heffner <johnwheffner@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f6fdd727a23d..e4fbdae066d5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2874,7 +2874,7 @@ static void tcp_mtup_probe_success(struct sock *sk)
 		       icsk->icsk_mtup.probe_size;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
-	tp->rcv_ssthresh = tcp_current_ssthresh(sk);
+	tp->snd_ssthresh = tcp_current_ssthresh(sk);
 
 	icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size;
 	icsk->icsk_mtup.probe_size = 0;
-- 
cgit v1.2.3-58-ga151


From 8391d07b80e8da957cd888870e23f8e218438622 Mon Sep 17 00:00:00 2001
From: Dimitris Michailidis <dm@chelsio.com>
Date: Thu, 7 Oct 2010 14:48:38 +0000
Subject: ipv4: Remove leftover rcu_read_unlock calls from __mkroute_output()

Commit "fib: RCU conversion of fib_lookup()" removed rcu_read_lock() from
__mkroute_output but left a couple of calls to rcu_read_unlock() in there.
This causes lockdep to complain that the rcu_read_unlock() call in
__ip_route_output_key causes a lock imbalance and quickly crashes the
kernel. The below fixes this for me.

Signed-off-by: Dimitris Michailidis <dm@chelsio.com>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7864d0c48968..3888f6ba0a5c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2396,12 +2396,10 @@ static int __mkroute_output(struct rtable **result,
 
 
 	rth = dst_alloc(&ipv4_dst_ops);
-	if (!rth) {
-		rcu_read_unlock();
+	if (!rth)
 		return -ENOBUFS;
-	}
+
 	in_dev_hold(in_dev);
-	rcu_read_unlock();
 	rth->idev = in_dev;
 
 	atomic_set(&rth->dst.__refcnt, 1);
-- 
cgit v1.2.3-58-ga151


From 0ed8ddf4045fcfcac36bad753dc4046118c603ec Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 7 Oct 2010 10:44:07 +0000
Subject: neigh: Protect neigh->ha[] with a seqlock

Add a seqlock in struct neighbour to protect neigh->ha[], and avoid
dirtying neighbour in stress situation (many different flows / dsts)

Dirtying takes place because of read_lock(&n->lock) and n->used writes.

Switching to a seqlock, and writing n->used only on jiffies changes
permits less dirtying.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/neighbour.h | 16 +++++++++++++++-
 net/core/neighbour.c    | 47 ++++++++++++++++++++++++++++++-----------------
 net/ipv4/arp.c          |  6 ++----
 net/sched/sch_teql.c    |  8 ++++----
 4 files changed, 51 insertions(+), 26 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index a4538d553704..f04e7a2522c5 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -105,6 +105,7 @@ struct neighbour {
 	atomic_t		refcnt;
 	atomic_t		probes;
 	rwlock_t		lock;
+	seqlock_t		ha_lock;
 	unsigned char		ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))];
 	struct hh_cache		*hh;
 	int			(*output)(struct sk_buff *skb);
@@ -302,7 +303,10 @@ static inline void neigh_confirm(struct neighbour *neigh)
 
 static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
 {
-	neigh->used = jiffies;
+	unsigned long now = ACCESS_ONCE(jiffies);
+	
+	if (neigh->used != now)
+		neigh->used = now;
 	if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE)))
 		return __neigh_event_send(neigh, skb);
 	return 0;
@@ -373,4 +377,14 @@ struct neighbour_cb {
 
 #define NEIGH_CB(skb)	((struct neighbour_cb *)(skb)->cb)
 
+static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n,
+				     const struct net_device *dev)
+{
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&n->ha_lock);
+		memcpy(dst, n->ha, dev->addr_len);
+	} while (read_seqretry(&n->ha_lock, seq));
+}
 #endif
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 2044906ecd1a..b165b96355bf 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -294,6 +294,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
 
 	skb_queue_head_init(&n->arp_queue);
 	rwlock_init(&n->lock);
+	seqlock_init(&n->ha_lock);
 	n->updated	  = n->used = now;
 	n->nud_state	  = NUD_NONE;
 	n->output	  = neigh_blackhole;
@@ -1015,7 +1016,7 @@ out_unlock_bh:
 }
 EXPORT_SYMBOL(__neigh_event_send);
 
-static void neigh_update_hhs(struct neighbour *neigh)
+static void neigh_update_hhs(const struct neighbour *neigh)
 {
 	struct hh_cache *hh;
 	void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *)
@@ -1151,7 +1152,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
 	}
 
 	if (lladdr != neigh->ha) {
+		write_seqlock(&neigh->ha_lock);
 		memcpy(&neigh->ha, lladdr, dev->addr_len);
+		write_sequnlock(&neigh->ha_lock);
 		neigh_update_hhs(neigh);
 		if (!(new & NUD_CONNECTED))
 			neigh->confirmed = jiffies -
@@ -1214,6 +1217,7 @@ static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst,
 {
 	struct hh_cache *hh;
 
+	smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */
 	for (hh = n->hh; hh; hh = hh->hh_next) {
 		if (hh->hh_type == protocol) {
 			atomic_inc(&hh->hh_refcnt);
@@ -1248,8 +1252,8 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 		kfree(hh);
 		return;
 	}
-	read_unlock(&n->lock);
-	write_lock(&n->lock);
+
+	write_lock_bh(&n->lock);
 
 	/* must check if another thread already did the insert */
 	if (neigh_hh_lookup(n, dst, protocol)) {
@@ -1263,13 +1267,13 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst,
 		hh->hh_output = n->ops->output;
 
 	hh->hh_next = n->hh;
+	smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */
 	n->hh	    = hh;
 
 	if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL))
 		hh_cache_put(hh);
 end:
-	write_unlock(&n->lock);
-	read_lock(&n->lock);
+	write_unlock_bh(&n->lock);
 }
 
 /* This function can be used in contexts, where only old dev_queue_xmit
@@ -1308,16 +1312,18 @@ int neigh_resolve_output(struct sk_buff *skb)
 	if (!neigh_event_send(neigh, skb)) {
 		int err;
 		struct net_device *dev = neigh->dev;
+		unsigned int seq;
 
-		read_lock_bh(&neigh->lock);
 		if (dev->header_ops->cache &&
 		    !dst->hh &&
 		    !(dst->flags & DST_NOCACHE))
 			neigh_hh_init(neigh, dst, dst->ops->protocol);
 
-		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-				      neigh->ha, NULL, skb->len);
-		read_unlock_bh(&neigh->lock);
+		do {
+			seq = read_seqbegin(&neigh->ha_lock);
+			err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+					      neigh->ha, NULL, skb->len);
+		} while (read_seqretry(&neigh->ha_lock, seq));
 
 		if (err >= 0)
 			rc = neigh->ops->queue_xmit(skb);
@@ -1344,13 +1350,16 @@ int neigh_connected_output(struct sk_buff *skb)
 	struct dst_entry *dst = skb_dst(skb);
 	struct neighbour *neigh = dst->neighbour;
 	struct net_device *dev = neigh->dev;
+	unsigned int seq;
 
 	__skb_pull(skb, skb_network_offset(skb));
 
-	read_lock_bh(&neigh->lock);
-	err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-			      neigh->ha, NULL, skb->len);
-	read_unlock_bh(&neigh->lock);
+	do {
+		seq = read_seqbegin(&neigh->ha_lock);
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
+				      neigh->ha, NULL, skb->len);
+	} while (read_seqretry(&neigh->ha_lock, seq));
+
 	if (err >= 0)
 		err = neigh->ops->queue_xmit(skb);
 	else {
@@ -2148,10 +2157,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
 
 	read_lock_bh(&neigh->lock);
 	ndm->ndm_state	 = neigh->nud_state;
-	if ((neigh->nud_state & NUD_VALID) &&
-	    nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) {
-		read_unlock_bh(&neigh->lock);
-		goto nla_put_failure;
+	if (neigh->nud_state & NUD_VALID) {
+		char haddr[MAX_ADDR_LEN];
+
+		neigh_ha_snapshot(haddr, neigh, neigh->dev);
+		if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
+			read_unlock_bh(&neigh->lock);
+			goto nla_put_failure;
+		}
 	}
 
 	ci.ndm_used	 = jiffies_to_clock_t(now - neigh->used);
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index f35309578170..d8e540c5b071 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -502,10 +502,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb)
 
 	if (n) {
 		n->used = jiffies;
-		if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) {
-			read_lock_bh(&n->lock);
-			memcpy(haddr, n->ha, dev->addr_len);
-			read_unlock_bh(&n->lock);
+		if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) {
+			neigh_ha_snapshot(haddr, n, dev);
 			neigh_release(n);
 			return 0;
 		}
diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c
index feaabc103ce6..401af9596709 100644
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -241,11 +241,11 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *
 	}
 	if (neigh_event_send(n, skb_res) == 0) {
 		int err;
+		char haddr[MAX_ADDR_LEN];
 
-		read_lock(&n->lock);
-		err = dev_hard_header(skb, dev, ntohs(skb->protocol),
-				      n->ha, NULL, skb->len);
-		read_unlock(&n->lock);
+		neigh_ha_snapshot(haddr, n, dev);
+		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
+				      NULL, skb->len);
 
 		if (err < 0) {
 			neigh_release(n);
-- 
cgit v1.2.3-58-ga151


From fc66f95c68b6d4535a0ea2ea15d5cf626e310956 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 8 Oct 2010 06:37:34 +0000
Subject: net dst: use a percpu_counter to track entries

struct dst_ops tracks the number of allocated dst entries in an atomic_t
field, which is subject to high cache line contention under stress workloads.

Switch to a percpu_counter to reduce the number of times we need to dirty a
central location. Place it on a separate cache line to avoid dirtying
read-only fields.

Stress test :

(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE, SLUB/NUMA)

Before:

real    0m51.179s
user    0m15.329s
sys     10m15.942s

After:

real	0m45.570s
user	0m15.525s
sys	9m56.669s

With a small reordering of struct neighbour fields (to separate refcnt from
other read-mostly fields), which is the subject of a following patch:

real	0m41.841s
user	0m15.261s
sys	8m45.949s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/dst_ops.h     | 37 ++++++++++++++++++++++++++++++++++++-
 net/bridge/br_netfilter.c | 11 +++++++++--
 net/core/dst.c            |  6 +++---
 net/decnet/dn_route.c     |  3 ++-
 net/ipv4/route.c          | 36 ++++++++++++++++++++++--------------
 net/ipv4/xfrm4_policy.c   |  4 ++--
 net/ipv6/route.c          | 28 ++++++++++++++++++++--------
 net/ipv6/xfrm6_policy.c   | 10 ++++++----
 8 files changed, 100 insertions(+), 35 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index d1ff9b7e99b8..1fa5306e3e23 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -1,6 +1,7 @@
 #ifndef _NET_DST_OPS_H
 #define _NET_DST_OPS_H
 #include <linux/types.h>
+#include <linux/percpu_counter.h>
 
 struct dst_entry;
 struct kmem_cachep;
@@ -22,7 +23,41 @@ struct dst_ops {
 	void			(*update_pmtu)(struct dst_entry *dst, u32 mtu);
 	int			(*local_out)(struct sk_buff *skb);
 
-	atomic_t		entries;
 	struct kmem_cache	*kmem_cachep;
+
+	struct percpu_counter	pcpuc_entries ____cacheline_aligned_in_smp;
 };
+
+static inline int dst_entries_get_fast(struct dst_ops *dst)
+{
+	return percpu_counter_read_positive(&dst->pcpuc_entries);
+}
+
+static inline int dst_entries_get_slow(struct dst_ops *dst)
+{
+	int res;
+
+	local_bh_disable();
+	res = percpu_counter_sum_positive(&dst->pcpuc_entries);
+	local_bh_enable();
+	return res;
+}
+
+static inline void dst_entries_add(struct dst_ops *dst, int val)
+{
+	local_bh_disable();
+	percpu_counter_add(&dst->pcpuc_entries, val);
+	local_bh_enable();
+}
+
+static inline int dst_entries_init(struct dst_ops *dst)
+{
+	return percpu_counter_init(&dst->pcpuc_entries, 0);
+}
+
+static inline void dst_entries_destroy(struct dst_ops *dst)
+{
+	percpu_counter_destroy(&dst->pcpuc_entries);
+}
+
 #endif
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 77f7b5fda45a..7f9ce9600ef3 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -106,7 +106,6 @@ static struct dst_ops fake_dst_ops = {
 	.family =		AF_INET,
 	.protocol =		cpu_to_be16(ETH_P_IP),
 	.update_pmtu =		fake_update_pmtu,
-	.entries =		ATOMIC_INIT(0),
 };
 
 /*
@@ -1003,15 +1002,22 @@ int __init br_netfilter_init(void)
 {
 	int ret;
 
-	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	ret = dst_entries_init(&fake_dst_ops);
 	if (ret < 0)
 		return ret;
+
+	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	if (ret < 0) {
+		dst_entries_destroy(&fake_dst_ops);
+		return ret;
+	}
 #ifdef CONFIG_SYSCTL
 	brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table);
 	if (brnf_sysctl_header == NULL) {
 		printk(KERN_WARNING
 		       "br_netfilter: can't register to sysctl.\n");
 		nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+		dst_entries_destroy(&fake_dst_ops);
 		return -ENOMEM;
 	}
 #endif
@@ -1025,4 +1031,5 @@ void br_netfilter_fini(void)
 #ifdef CONFIG_SYSCTL
 	unregister_sysctl_table(brnf_sysctl_header);
 #endif
+	dst_entries_destroy(&fake_dst_ops);
 }
diff --git a/net/core/dst.c b/net/core/dst.c
index 978a1ee1f7d0..32e542d7f472 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -168,7 +168,7 @@ void *dst_alloc(struct dst_ops *ops)
 {
 	struct dst_entry *dst;
 
-	if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
+	if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) {
 		if (ops->gc(ops))
 			return NULL;
 	}
@@ -183,7 +183,7 @@ void *dst_alloc(struct dst_ops *ops)
 #if RT_CACHE_DEBUG >= 2
 	atomic_inc(&dst_total);
 #endif
-	atomic_inc(&ops->entries);
+	dst_entries_add(ops, 1);
 	return dst;
 }
 EXPORT_SYMBOL(dst_alloc);
@@ -236,7 +236,7 @@ again:
 		neigh_release(neigh);
 	}
 
-	atomic_dec(&dst->ops->entries);
+	dst_entries_add(dst->ops, -1);
 
 	if (dst->ops->destroy)
 		dst->ops->destroy(dst);
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 6585ea6d1182..df0f3e54ff8a 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -132,7 +132,6 @@ static struct dst_ops dn_dst_ops = {
 	.negative_advice =	dn_dst_negative_advice,
 	.link_failure =		dn_dst_link_failure,
 	.update_pmtu =		dn_dst_update_pmtu,
-	.entries =		ATOMIC_INIT(0),
 };
 
 static __inline__ unsigned dn_hash(__le16 src, __le16 dst)
@@ -1758,6 +1757,7 @@ void __init dn_route_init(void)
 	dn_dst_ops.kmem_cachep =
 		kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0,
 				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	dst_entries_init(&dn_dst_ops);
 	setup_timer(&dn_route_timer, dn_dst_check_expire, 0);
 	dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ;
 	add_timer(&dn_route_timer);
@@ -1816,5 +1816,6 @@ void __exit dn_route_cleanup(void)
 	dn_run_flush(0);
 
 	proc_net_remove(&init_net, "decnet_cache");
+	dst_entries_destroy(&dn_dst_ops);
 }
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 3888f6ba0a5c..0755aa4af86c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = {
 	.link_failure =		ipv4_link_failure,
 	.update_pmtu =		ip_rt_update_pmtu,
 	.local_out =		__ip_local_out,
-	.entries =		ATOMIC_INIT(0),
 };
 
 #define ECN_OR_COST(class)	TC_PRIO_##class
@@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 
 	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
-		   atomic_read(&ipv4_dst_ops.entries),
+		   dst_entries_get_slow(&ipv4_dst_ops),
 		   st->in_hit,
 		   st->in_slow_tot,
 		   st->in_slow_mc,
@@ -945,6 +944,7 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	struct rtable *rth, **rthp;
 	unsigned long now = jiffies;
 	int goal;
+	int entries = dst_entries_get_fast(&ipv4_dst_ops);
 
 	/*
 	 * Garbage collection is pretty expensive,
@@ -954,28 +954,28 @@ static int rt_garbage_collect(struct dst_ops *ops)
 	RT_CACHE_STAT_INC(gc_total);
 
 	if (now - last_gc < ip_rt_gc_min_interval &&
-	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
+	    entries < ip_rt_max_size) {
 		RT_CACHE_STAT_INC(gc_ignored);
 		goto out;
 	}
 
+	entries = dst_entries_get_slow(&ipv4_dst_ops);
 	/* Calculate number of entries, which we want to expire now. */
-	goal = atomic_read(&ipv4_dst_ops.entries) -
-		(ip_rt_gc_elasticity << rt_hash_log);
+	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
 	if (goal <= 0) {
 		if (equilibrium < ipv4_dst_ops.gc_thresh)
 			equilibrium = ipv4_dst_ops.gc_thresh;
-		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+		goal = entries - equilibrium;
 		if (goal > 0) {
 			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
+			goal = entries - equilibrium;
 		}
 	} else {
 		/* We are in dangerous area. Try to reduce cache really
 		 * aggressively.
 		 */
 		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
-		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
+		equilibrium = entries - goal;
 	}
 
 	if (now - last_gc >= ip_rt_gc_min_interval)
@@ -1032,14 +1032,16 @@ static int rt_garbage_collect(struct dst_ops *ops)
 		expire >>= 1;
 #if RT_CACHE_DEBUG >= 2
 		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
-				atomic_read(&ipv4_dst_ops.entries), goal, i);
+				dst_entries_get_fast(&ipv4_dst_ops), goal, i);
 #endif
 
-		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
 			goto out;
 	} while (!in_softirq() && time_before_eq(jiffies, now));
 
-	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
+	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
+		goto out;
+	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
 		goto out;
 	if (net_ratelimit())
 		printk(KERN_WARNING "dst cache overflow\n");
@@ -1049,11 +1051,12 @@ static int rt_garbage_collect(struct dst_ops *ops)
 work_done:
 	expire += ip_rt_gc_min_interval;
 	if (expire > ip_rt_gc_timeout ||
-	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
+	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
+	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
 		expire = ip_rt_gc_timeout;
 #if RT_CACHE_DEBUG >= 2
 	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
-			atomic_read(&ipv4_dst_ops.entries), goal, rover);
+			dst_entries_get_fast(&ipv4_dst_ops), goal, rover);
 #endif
 out:	return 0;
 }
@@ -2717,7 +2720,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = {
 	.destroy		=	ipv4_dst_destroy,
 	.check			=	ipv4_blackhole_dst_check,
 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
-	.entries		=	ATOMIC_INIT(0),
 };
 
 
@@ -3287,6 +3289,12 @@ int __init ip_rt_init(void)
 
 	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
 
+	if (dst_entries_init(&ipv4_dst_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_ops counter\n");
+
+	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
+		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
+
 	rt_hash_table = (struct rt_hash_bucket *)
 		alloc_large_system_hash("IP route cache",
 					sizeof(struct rt_hash_bucket),
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index a580349f0b8a..4464f3bff6a7 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops)
 	struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops);
 
 	xfrm4_policy_afinfo.garbage_collect(net);
-	return (atomic_read(&ops->entries) > ops->gc_thresh * 2);
+	return (dst_entries_get_slow(ops) > ops->gc_thresh * 2);
 }
 
 static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = {
 	.ifdown =		xfrm4_dst_ifdown,
 	.local_out =		__ip_local_out,
 	.gc_thresh =		1024,
-	.entries =		ATOMIC_INIT(0),
 };
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
@@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size)
 	 * and start cleaning when were 1/2 full
 	 */
 	xfrm4_dst_ops.gc_thresh = rt_max_size/2;
+	dst_entries_init(&xfrm4_dst_ops);
 
 	xfrm4_state_init();
 	xfrm4_policy_init();
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 17e217933885..25661f968f3f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -109,7 +109,6 @@ static struct dst_ops ip6_dst_ops_template = {
 	.link_failure		=	ip6_link_failure,
 	.update_pmtu		=	ip6_rt_update_pmtu,
 	.local_out		=	__ip6_local_out,
-	.entries		=	ATOMIC_INIT(0),
 };
 
 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -122,7 +121,6 @@ static struct dst_ops ip6_dst_blackhole_ops = {
 	.destroy		=	ip6_dst_destroy,
 	.check			=	ip6_dst_check,
 	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
-	.entries		=	ATOMIC_INIT(0),
 };
 
 static struct rt6_info ip6_null_entry_template = {
@@ -1058,19 +1056,22 @@ static int ip6_dst_gc(struct dst_ops *ops)
 	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
 	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
 	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+	int entries;
 
+	entries = dst_entries_get_fast(ops);
 	if (time_after(rt_last_gc + rt_min_interval, now) &&
-	    atomic_read(&ops->entries) <= rt_max_size)
+	    entries <= rt_max_size)
 		goto out;
 
 	net->ipv6.ip6_rt_gc_expire++;
 	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
 	net->ipv6.ip6_rt_last_gc = now;
-	if (atomic_read(&ops->entries) < ops->gc_thresh)
+	entries = dst_entries_get_slow(ops);
+	if (entries < ops->gc_thresh)
 		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
 out:
 	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
-	return atomic_read(&ops->entries) > rt_max_size;
+	return entries > rt_max_size;
 }
 
 /* Clean host part of a prefix. Not necessary in radix tree,
@@ -2524,7 +2525,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v)
 		   net->ipv6.rt6_stats->fib_rt_alloc,
 		   net->ipv6.rt6_stats->fib_rt_entries,
 		   net->ipv6.rt6_stats->fib_rt_cache,
-		   atomic_read(&net->ipv6.ip6_dst_ops.entries),
+		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
 		   net->ipv6.rt6_stats->fib_discarded_routes);
 
 	return 0;
@@ -2666,11 +2667,14 @@ static int __net_init ip6_route_net_init(struct net *net)
 	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
 	       sizeof(net->ipv6.ip6_dst_ops));
 
+	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
+		goto out_ip6_dst_ops;
+
 	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
 					   sizeof(*net->ipv6.ip6_null_entry),
 					   GFP_KERNEL);
 	if (!net->ipv6.ip6_null_entry)
-		goto out_ip6_dst_ops;
+		goto out_ip6_dst_entries;
 	net->ipv6.ip6_null_entry->dst.path =
 		(struct dst_entry *)net->ipv6.ip6_null_entry;
 	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
@@ -2720,6 +2724,8 @@ out_ip6_prohibit_entry:
 out_ip6_null_entry:
 	kfree(net->ipv6.ip6_null_entry);
 #endif
+out_ip6_dst_entries:
+	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
 out_ip6_dst_ops:
 	goto out;
 }
@@ -2758,10 +2764,14 @@ int __init ip6_route_init(void)
 	if (!ip6_dst_ops_template.kmem_cachep)
 		goto out;
 
-	ret = register_pernet_subsys(&ip6_route_net_ops);
+	ret = dst_entries_init(&ip6_dst_blackhole_ops);
 	if (ret)
 		goto out_kmem_cache;
 
+	ret = register_pernet_subsys(&ip6_route_net_ops);
+	if (ret)
+		goto out_dst_entries;
+
 	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
 
 	/* Registering of the loopback is done before this portion of code,
@@ -2808,6 +2818,8 @@ out_fib6_init:
 	fib6_gc_cleanup();
 out_register_subsys:
 	unregister_pernet_subsys(&ip6_route_net_ops);
+out_dst_entries:
+	dst_entries_destroy(&ip6_dst_blackhole_ops);
 out_kmem_cache:
 	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
 	goto out;
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 39676eac3a37..7e74023ea6e4 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -199,7 +199,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops)
 	struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops);
 
 	xfrm6_policy_afinfo.garbage_collect(net);
-	return atomic_read(&ops->entries) > ops->gc_thresh * 2;
+	return dst_entries_get_fast(ops) > ops->gc_thresh * 2;
 }
 
 static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -255,7 +255,6 @@ static struct dst_ops xfrm6_dst_ops = {
 	.ifdown =		xfrm6_dst_ifdown,
 	.local_out =		__ip6_local_out,
 	.gc_thresh =		1024,
-	.entries =		ATOMIC_INIT(0),
 };
 
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
@@ -312,11 +311,13 @@ int __init xfrm6_init(void)
 	 */
 	gc_thresh = FIB6_TABLE_HASHSZ * 8;
 	xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 1024 : gc_thresh;
+	dst_entries_init(&xfrm6_dst_ops);
 
 	ret = xfrm6_policy_init();
-	if (ret)
+	if (ret) {
+		dst_entries_destroy(&xfrm6_dst_ops);
 		goto out;
-
+	}
 	ret = xfrm6_state_init();
 	if (ret)
 		goto out_policy;
@@ -341,4 +342,5 @@ void xfrm6_fini(void)
 	//xfrm6_input_fini();
 	xfrm6_policy_fini();
 	xfrm6_state_fini();
+	dst_entries_destroy(&xfrm6_dst_ops);
 }
-- 
cgit v1.2.3-58-ga151


From 12b00c2c025b8af697d9a022ea2e928cad889ef1 Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 13 Oct 2010 15:56:56 +0200
Subject: netfilter: xtables: resolve indirect macros 1/3

Many of the macros in use are just there for userspace compatibility.
Change the in-kernel code to use the terminal macros directly and
move the defines into #ifndef __KERNEL__ sections.

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 include/linux/netfilter_arp/arp_tables.h  | 10 ++++++----
 include/linux/netfilter_ipv4/ip_tables.h  | 10 ++++++----
 include/linux/netfilter_ipv6/ip6_tables.h | 11 ++++++-----
 net/ipv4/netfilter/arp_tables.c           | 10 +++++-----
 net/ipv4/netfilter/ip_tables.c            | 12 ++++++------
 net/ipv6/netfilter/ip6_tables.c           | 12 ++++++------
 6 files changed, 35 insertions(+), 30 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index e9948c0560f6..81938600470d 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -21,8 +21,10 @@
 
 #include <linux/netfilter/x_tables.h>
 
+#ifndef __KERNEL__
 #define ARPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
 #define ARPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
+#endif
 
 #define ARPT_DEV_ADDR_LEN_MAX 16
 
@@ -134,7 +136,7 @@ struct arpt_entry
 /* The argument to ARPT_SO_GET_INFO */
 struct arpt_getinfo {
 	/* Which table: caller fills this in. */
-	char name[ARPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* Kernel fills these in. */
 	/* Which hook entry points are valid: bitmask */
@@ -156,7 +158,7 @@ struct arpt_getinfo {
 /* The argument to ARPT_SO_SET_REPLACE. */
 struct arpt_replace {
 	/* Which table. */
-	char name[ARPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* Which hook entry points are valid: bitmask.  You can't
            change this. */
@@ -191,7 +193,7 @@ struct arpt_replace {
 /* The argument to ARPT_SO_GET_ENTRIES. */
 struct arpt_get_entries {
 	/* Which table: user fills this in. */
-	char name[ARPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* User fills this in: total entry size. */
 	unsigned int size;
@@ -230,7 +232,7 @@ struct arpt_standard {
 
 struct arpt_error_target {
 	struct arpt_entry_target target;
-	char errorname[ARPT_FUNCTION_MAXNAMELEN];
+	char errorname[XT_FUNCTION_MAXNAMELEN];
 };
 
 struct arpt_error {
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 704a7b6e8169..1b7cdf1137e3 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -27,12 +27,14 @@
 
 #include <linux/netfilter/x_tables.h>
 
+#ifndef __KERNEL__
 #define IPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
 #define IPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
 #define ipt_match xt_match
 #define ipt_target xt_target
 #define ipt_table xt_table
 #define ipt_get_revision xt_get_revision
+#endif
 
 /* Yes, Virginia, you have to zero the padding. */
 struct ipt_ip {
@@ -146,7 +148,7 @@ struct ipt_icmp {
 /* The argument to IPT_SO_GET_INFO */
 struct ipt_getinfo {
 	/* Which table: caller fills this in. */
-	char name[IPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* Kernel fills these in. */
 	/* Which hook entry points are valid: bitmask */
@@ -168,7 +170,7 @@ struct ipt_getinfo {
 /* The argument to IPT_SO_SET_REPLACE. */
 struct ipt_replace {
 	/* Which table. */
-	char name[IPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* Which hook entry points are valid: bitmask.  You can't
            change this. */
@@ -202,7 +204,7 @@ struct ipt_replace {
 /* The argument to IPT_SO_GET_ENTRIES. */
 struct ipt_get_entries {
 	/* Which table: user fills this in. */
-	char name[IPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* User fills this in: total entry size. */
 	unsigned int size;
@@ -254,7 +256,7 @@ struct ipt_standard {
 
 struct ipt_error_target {
 	struct ipt_entry_target target;
-	char errorname[IPT_FUNCTION_MAXNAMELEN];
+	char errorname[XT_FUNCTION_MAXNAMELEN];
 };
 
 struct ipt_error {
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 18442ff19c07..abe31d020e3c 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -27,13 +27,14 @@
 
 #include <linux/netfilter/x_tables.h>
 
+#ifndef __KERNEL__
 #define IP6T_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
 #define IP6T_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
-
 #define ip6t_match xt_match
 #define ip6t_target xt_target
 #define ip6t_table xt_table
 #define ip6t_get_revision xt_get_revision
+#endif
 
 /* Yes, Virginia, you have to zero the padding. */
 struct ip6t_ip6 {
@@ -117,7 +118,7 @@ struct ip6t_standard {
 
 struct ip6t_error_target {
 	struct ip6t_entry_target target;
-	char errorname[IP6T_FUNCTION_MAXNAMELEN];
+	char errorname[XT_FUNCTION_MAXNAMELEN];
 };
 
 struct ip6t_error {
@@ -203,7 +204,7 @@ struct ip6t_icmp {
 /* The argument to IP6T_SO_GET_INFO */
 struct ip6t_getinfo {
 	/* Which table: caller fills this in. */
-	char name[IP6T_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* Kernel fills these in. */
 	/* Which hook entry points are valid: bitmask */
@@ -225,7 +226,7 @@ struct ip6t_getinfo {
 /* The argument to IP6T_SO_SET_REPLACE. */
 struct ip6t_replace {
 	/* Which table. */
-	char name[IP6T_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* Which hook entry points are valid: bitmask.  You can't
            change this. */
@@ -259,7 +260,7 @@ struct ip6t_replace {
 /* The argument to IP6T_SO_GET_ENTRIES. */
 struct ip6t_get_entries {
 	/* Which table: user fills this in. */
-	char name[IP6T_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 
 	/* User fills this in: total entry size. */
 	unsigned int size;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e8f4f9a57f12..e427a9e3c489 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -895,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-	char name[ARPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	struct xt_table *t;
 	int ret;
 
@@ -908,7 +908,7 @@ static int get_info(struct net *net, void __user *user,
 	if (copy_from_user(name, user, sizeof(name)) != 0)
 		return -EFAULT;
 
-	name[ARPT_TABLE_MAXNAMELEN-1] = '\0';
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_lock(NFPROTO_ARP);
@@ -1474,7 +1474,7 @@ out_unlock:
 }
 
 struct compat_arpt_replace {
-	char				name[ARPT_TABLE_MAXNAMELEN];
+	char				name[XT_TABLE_MAXNAMELEN];
 	u32				valid_hooks;
 	u32				num_entries;
 	u32				size;
@@ -1628,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
 }
 
 struct compat_arpt_get_entries {
-	char name[ARPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	compat_uint_t size;
 	struct compat_arpt_entry entrytable[0];
 };
@@ -1840,7 +1840,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
 	{
 		.name             = ARPT_ERROR_TARGET,
 		.target           = arpt_error,
-		.targetsize       = ARPT_FUNCTION_MAXNAMELEN,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_ARP,
 	},
 };
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d163f2e3b2e9..2efd41bef452 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1092,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-	char name[IPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	struct xt_table *t;
 	int ret;
 
@@ -1105,7 +1105,7 @@ static int get_info(struct net *net, void __user *user,
 	if (copy_from_user(name, user, sizeof(name)) != 0)
 		return -EFAULT;
 
-	name[IPT_TABLE_MAXNAMELEN-1] = '\0';
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_lock(AF_INET);
@@ -1400,7 +1400,7 @@ do_add_counters(struct net *net, const void __user *user,
 
 #ifdef CONFIG_COMPAT
 struct compat_ipt_replace {
-	char			name[IPT_TABLE_MAXNAMELEN];
+	char			name[XT_TABLE_MAXNAMELEN];
 	u32			valid_hooks;
 	u32			num_entries;
 	u32			size;
@@ -1884,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
 }
 
 struct compat_ipt_get_entries {
-	char name[IPT_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	compat_uint_t size;
 	struct compat_ipt_entry entrytable[0];
 };
@@ -2039,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 
 	case IPT_SO_GET_REVISION_MATCH:
 	case IPT_SO_GET_REVISION_TARGET: {
-		struct ipt_get_revision rev;
+		struct xt_get_revision rev;
 		int target;
 
 		if (*len != sizeof(rev)) {
@@ -2188,7 +2188,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
 	{
 		.name             = IPT_ERROR_TARGET,
 		.target           = ipt_error,
-		.targetsize       = IPT_FUNCTION_MAXNAMELEN,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_IPV4,
 	},
 };
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 8e754be92c24..4b973e13952d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1105,7 +1105,7 @@ static int compat_table_info(const struct xt_table_info *info,
 static int get_info(struct net *net, void __user *user,
                     const int *len, int compat)
 {
-	char name[IP6T_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	struct xt_table *t;
 	int ret;
 
@@ -1118,7 +1118,7 @@ static int get_info(struct net *net, void __user *user,
 	if (copy_from_user(name, user, sizeof(name)) != 0)
 		return -EFAULT;
 
-	name[IP6T_TABLE_MAXNAMELEN-1] = '\0';
+	name[XT_TABLE_MAXNAMELEN-1] = '\0';
 #ifdef CONFIG_COMPAT
 	if (compat)
 		xt_compat_lock(AF_INET6);
@@ -1415,7 +1415,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 
 #ifdef CONFIG_COMPAT
 struct compat_ip6t_replace {
-	char			name[IP6T_TABLE_MAXNAMELEN];
+	char			name[XT_TABLE_MAXNAMELEN];
 	u32			valid_hooks;
 	u32			num_entries;
 	u32			size;
@@ -1899,7 +1899,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
 }
 
 struct compat_ip6t_get_entries {
-	char name[IP6T_TABLE_MAXNAMELEN];
+	char name[XT_TABLE_MAXNAMELEN];
 	compat_uint_t size;
 	struct compat_ip6t_entry entrytable[0];
 };
@@ -2054,7 +2054,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 
 	case IP6T_SO_GET_REVISION_MATCH:
 	case IP6T_SO_GET_REVISION_TARGET: {
-		struct ip6t_get_revision rev;
+		struct xt_get_revision rev;
 		int target;
 
 		if (*len != sizeof(rev)) {
@@ -2203,7 +2203,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
 	{
 		.name             = IP6T_ERROR_TARGET,
 		.target           = ip6t_error,
-		.targetsize       = IP6T_FUNCTION_MAXNAMELEN,
+		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_IPV6,
 	},
 };
-- 
cgit v1.2.3-58-ga151


From 87a2e70db62fec7348c6e5545eb7b7650c33d81b Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 13 Oct 2010 16:11:22 +0200
Subject: netfilter: xtables: resolve indirect macros 2/3

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 include/linux/netfilter_arp/arp_tables.h  | 15 ++++-----
 include/linux/netfilter_ipv4/ip_tables.h  | 18 +++++------
 include/linux/netfilter_ipv6/ip6_tables.h | 20 ++++++------
 net/ipv4/netfilter/arp_tables.c           | 38 +++++++++++-----------
 net/ipv4/netfilter/ip_tables.c            | 54 +++++++++++++++----------------
 net/ipv6/netfilter/ip6_tables.c           | 54 +++++++++++++++----------------
 net/sched/act_ipt.c                       | 12 +++----
 7 files changed, 103 insertions(+), 108 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 81938600470d..7e193c9241b3 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -24,6 +24,8 @@
 #ifndef __KERNEL__
 #define ARPT_FUNCTION_MAXNAMELEN XT_FUNCTION_MAXNAMELEN
 #define ARPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
+#define arpt_entry_target xt_entry_target
+#define arpt_standard_target xt_standard_target
 #endif
 
 #define ARPT_DEV_ADDR_LEN_MAX 16
@@ -65,9 +67,6 @@ struct arpt_arp {
 	u_int16_t invflags;
 };
 
-#define arpt_entry_target xt_entry_target
-#define arpt_standard_target xt_standard_target
-
 /* Values for "flag" field in struct arpt_ip (general arp structure).
  * No flags defined yet.
  */
@@ -208,7 +207,7 @@ struct arpt_get_entries {
 #define ARPT_ERROR_TARGET XT_ERROR_TARGET
 
 /* Helper functions */
-static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e)
+static __inline__ struct xt_entry_target *arpt_get_target(struct arpt_entry *e)
 {
 	return (void *)e + e->target_offset;
 }
@@ -227,11 +226,11 @@ static __inline__ struct arpt_entry_target *arpt_get_target(struct arpt_entry *e
 /* Standard entry. */
 struct arpt_standard {
 	struct arpt_entry entry;
-	struct arpt_standard_target target;
+	struct xt_standard_target target;
 };
 
 struct arpt_error_target {
-	struct arpt_entry_target target;
+	struct xt_entry_target target;
 	char errorname[XT_FUNCTION_MAXNAMELEN];
 };
 
@@ -250,7 +249,7 @@ struct arpt_error {
 {									       \
 	.entry		= ARPT_ENTRY_INIT(sizeof(struct arpt_standard)),       \
 	.target		= XT_TARGET_INIT(ARPT_STANDARD_TARGET,		       \
-					 sizeof(struct arpt_standard_target)), \
+					 sizeof(struct xt_standard_target)), \
 	.target.verdict	= -(__verdict) - 1,				       \
 }
 
@@ -287,7 +286,7 @@ struct compat_arpt_entry {
 	unsigned char elems[0];
 };
 
-static inline struct arpt_entry_target *
+static inline struct xt_entry_target *
 compat_arpt_get_target(struct compat_arpt_entry *e)
 {
 	return (void *)e + e->target_offset;
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index 1b7cdf1137e3..ec506918a9b9 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -34,6 +34,10 @@
 #define ipt_target xt_target
 #define ipt_table xt_table
 #define ipt_get_revision xt_get_revision
+#define ipt_entry_match xt_entry_match
+#define ipt_entry_target xt_entry_target
+#define ipt_standard_target xt_standard_target
+#define ipt_counters xt_counters
 #endif
 
 /* Yes, Virginia, you have to zero the padding. */
@@ -54,12 +58,6 @@ struct ipt_ip {
 	u_int8_t invflags;
 };
 
-#define ipt_entry_match xt_entry_match
-#define ipt_entry_target xt_entry_target
-#define ipt_standard_target xt_standard_target
-
-#define ipt_counters xt_counters
-
 /* Values for "flag" field in struct ipt_ip (general ip structure). */
 #define IPT_F_FRAG		0x01	/* Set if rule is a fragment rule */
 #define IPT_F_GOTO		0x02	/* Set if jump is a goto */
@@ -219,7 +217,7 @@ struct ipt_get_entries {
 #define IPT_ERROR_TARGET XT_ERROR_TARGET
 
 /* Helper functions */
-static __inline__ struct ipt_entry_target *
+static __inline__ struct xt_entry_target *
 ipt_get_target(struct ipt_entry *e)
 {
 	return (void *)e + e->target_offset;
@@ -251,11 +249,11 @@ extern void ipt_unregister_table(struct net *net, struct xt_table *table);
 /* Standard entry. */
 struct ipt_standard {
 	struct ipt_entry entry;
-	struct ipt_standard_target target;
+	struct xt_standard_target target;
 };
 
 struct ipt_error_target {
-	struct ipt_entry_target target;
+	struct xt_entry_target target;
 	char errorname[XT_FUNCTION_MAXNAMELEN];
 };
 
@@ -309,7 +307,7 @@ struct compat_ipt_entry {
 };
 
 /* Helper functions */
-static inline struct ipt_entry_target *
+static inline struct xt_entry_target *
 compat_ipt_get_target(struct compat_ipt_entry *e)
 {
 	return (void *)e + e->target_offset;
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index abe31d020e3c..40d11fa05840 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -34,6 +34,10 @@
 #define ip6t_target xt_target
 #define ip6t_table xt_table
 #define ip6t_get_revision xt_get_revision
+#define ip6t_entry_match xt_entry_match
+#define ip6t_entry_target xt_entry_target
+#define ip6t_standard_target xt_standard_target
+#define ip6t_counters xt_counters
 #endif
 
 /* Yes, Virginia, you have to zero the padding. */
@@ -63,12 +67,6 @@ struct ip6t_ip6 {
 	u_int8_t invflags;
 };
 
-#define ip6t_entry_match xt_entry_match
-#define ip6t_entry_target xt_entry_target
-#define ip6t_standard_target xt_standard_target
-
-#define ip6t_counters	xt_counters
-
 /* Values for "flag" field in struct ip6t_ip6 (general ip6 structure). */
 #define IP6T_F_PROTO		0x01	/* Set if rule cares about upper 
 					   protocols */
@@ -113,11 +111,11 @@ struct ip6t_entry {
 /* Standard entry */
 struct ip6t_standard {
 	struct ip6t_entry entry;
-	struct ip6t_standard_target target;
+	struct xt_standard_target target;
 };
 
 struct ip6t_error_target {
-	struct ip6t_entry_target target;
+	struct xt_entry_target target;
 	char errorname[XT_FUNCTION_MAXNAMELEN];
 };
 
@@ -136,7 +134,7 @@ struct ip6t_error {
 {									       \
 	.entry		= IP6T_ENTRY_INIT(sizeof(struct ip6t_standard)),       \
 	.target		= XT_TARGET_INIT(IP6T_STANDARD_TARGET,		       \
-					 sizeof(struct ip6t_standard_target)), \
+					 sizeof(struct xt_standard_target)),   \
 	.target.verdict	= -(__verdict) - 1,				       \
 }
 
@@ -275,7 +273,7 @@ struct ip6t_get_entries {
 #define IP6T_ERROR_TARGET XT_ERROR_TARGET
 
 /* Helper functions */
-static __inline__ struct ip6t_entry_target *
+static __inline__ struct xt_entry_target *
 ip6t_get_target(struct ip6t_entry *e)
 {
 	return (void *)e + e->target_offset;
@@ -332,7 +330,7 @@ struct compat_ip6t_entry {
 	unsigned char elems[0];
 };
 
-static inline struct ip6t_entry_target *
+static inline struct xt_entry_target *
 compat_ip6t_get_target(struct compat_ip6t_entry *e)
 {
 	return (void *)e + e->target_offset;
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index e427a9e3c489..ed178cbe6626 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par)
 	return NF_DROP;
 }
 
-static inline const struct arpt_entry_target *
+static inline const struct xt_entry_target *
 arpt_get_target_c(const struct arpt_entry *e)
 {
 	return arpt_get_target((struct arpt_entry *)e);
@@ -282,7 +282,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 
 	arp = arp_hdr(skb);
 	do {
-		const struct arpt_entry_target *t;
+		const struct xt_entry_target *t;
 
 		if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
 			e = arpt_next_entry(e);
@@ -297,7 +297,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 		if (!t->u.kernel.target->target) {
 			int v;
 
-			v = ((struct arpt_standard_target *)t)->verdict;
+			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
 				if (v != ARPT_RETURN) {
@@ -377,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 		e->counters.pcnt = pos;
 
 		for (;;) {
-			const struct arpt_standard_target *t
+			const struct xt_standard_target *t
 				= (void *)arpt_get_target_c(e);
 			int visited = e->comefrom & (1 << hook);
 
@@ -464,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 
 static inline int check_entry(const struct arpt_entry *e, const char *name)
 {
-	const struct arpt_entry_target *t;
+	const struct xt_entry_target *t;
 
 	if (!arp_checkentry(&e->arp)) {
 		duprintf("arp_tables: arp check failed %p %s.\n", e, name);
 		return -EINVAL;
 	}
 
-	if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset)
+	if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
 		return -EINVAL;
 
 	t = arpt_get_target_c(e);
@@ -483,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name)
 
 static inline int check_target(struct arpt_entry *e, const char *name)
 {
-	struct arpt_entry_target *t = arpt_get_target(e);
+	struct xt_entry_target *t = arpt_get_target(e);
 	int ret;
 	struct xt_tgchk_param par = {
 		.table     = name,
@@ -506,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name)
 static inline int
 find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 
@@ -536,7 +536,7 @@ out:
 
 static bool check_underflow(const struct arpt_entry *e)
 {
-	const struct arpt_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int verdict;
 
 	if (!unconditional(&e->arp))
@@ -544,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e)
 	t = arpt_get_target_c(e);
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
 		return false;
-	verdict = ((struct arpt_standard_target *)t)->verdict;
+	verdict = ((struct xt_standard_target *)t)->verdict;
 	verdict = -verdict - 1;
 	return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -566,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 	}
 
 	if (e->next_offset
-	    < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) {
+	    < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) {
 		duprintf("checking: element %p size %u\n",
 			 e, e->next_offset);
 		return -EINVAL;
@@ -598,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 static inline void cleanup_entry(struct arpt_entry *e)
 {
 	struct xt_tgdtor_param par;
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 
 	t = arpt_get_target(e);
 	par.target   = t->u.kernel.target;
@@ -794,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size,
 	/* FIXME: use iterator macros --RR */
 	/* ... then go back and fix counters and names */
 	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
-		const struct arpt_entry_target *t;
+		const struct xt_entry_target *t;
 
 		e = (struct arpt_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
@@ -807,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size,
 
 		t = arpt_get_target_c(e);
 		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct arpt_entry_target,
+				 + offsetof(struct xt_entry_target,
 					    u.user.name),
 				 t->u.kernel.target->name,
 				 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -844,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e,
 			     const struct xt_table_info *info,
 			     const void *base, struct xt_table_info *newinfo)
 {
-	const struct arpt_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int entry_offset;
 	int off, i, ret;
 
@@ -1204,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user,
 #ifdef CONFIG_COMPAT
 static inline void compat_release_entry(struct compat_arpt_entry *e)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 
 	t = compat_arpt_get_target(e);
 	module_put(t->u.kernel.target->me);
@@ -1220,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 				  const unsigned int *underflows,
 				  const char *name)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
 	int ret, off, h;
@@ -1288,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr,
 			    unsigned int *size, const char *name,
 			    struct xt_table_info *newinfo, unsigned char *base)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	struct arpt_entry *de;
 	unsigned int origsize;
@@ -1567,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr,
 				     struct xt_counters *counters,
 				     unsigned int i)
 {
-	struct arpt_entry_target *t;
+	struct xt_entry_target *t;
 	struct compat_arpt_entry __user *ce;
 	u_int16_t target_offset, next_offset;
 	compat_uint_t origsize;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 2efd41bef452..cb108880050a 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip)
 }
 
 /* for const-correctness */
-static inline const struct ipt_entry_target *
+static inline const struct xt_entry_target *
 ipt_get_target_c(const struct ipt_entry *e)
 {
 	return ipt_get_target((struct ipt_entry *)e);
@@ -230,7 +230,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
 		      const char *hookname, const char **chainname,
 		      const char **comment, unsigned int *rulenum)
 {
-	const struct ipt_standard_target *t = (void *)ipt_get_target_c(s);
+	const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
 
 	if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
 		/* Head of user chain: ERROR target with chainname */
@@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb,
 		 get_entry(table_base, private->underflow[hook]));
 
 	do {
-		const struct ipt_entry_target *t;
+		const struct xt_entry_target *t;
 		const struct xt_entry_match *ematch;
 
 		IP_NF_ASSERT(e);
@@ -380,7 +380,7 @@ ipt_do_table(struct sk_buff *skb,
 		if (!t->u.kernel.target->target) {
 			int v;
 
-			v = ((struct ipt_standard_target *)t)->verdict;
+			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
 				if (v != IPT_RETURN) {
@@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 		e->counters.pcnt = pos;
 
 		for (;;) {
-			const struct ipt_standard_target *t
+			const struct xt_standard_target *t
 				= (void *)ipt_get_target_c(e);
 			int visited = e->comefrom & (1 << hook);
 
@@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 	return 1;
 }
 
-static void cleanup_match(struct ipt_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
 {
 	struct xt_mtdtor_param par;
 
@@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net)
 static int
 check_entry(const struct ipt_entry *e, const char *name)
 {
-	const struct ipt_entry_target *t;
+	const struct xt_entry_target *t;
 
 	if (!ip_checkentry(&e->ip)) {
 		duprintf("ip check failed %p %s.\n", e, par->match->name);
 		return -EINVAL;
 	}
 
-	if (e->target_offset + sizeof(struct ipt_entry_target) >
+	if (e->target_offset + sizeof(struct xt_entry_target) >
 	    e->next_offset)
 		return -EINVAL;
 
@@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name)
 }
 
 static int
-check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	const struct ipt_ip *ip = par->entryinfo;
 	int ret;
@@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
 }
 
 static int
-find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	struct xt_match *match;
 	int ret;
@@ -630,7 +630,7 @@ err:
 
 static int check_target(struct ipt_entry *e, struct net *net, const char *name)
 {
-	struct ipt_entry_target *t = ipt_get_target(e);
+	struct xt_entry_target *t = ipt_get_target(e);
 	struct xt_tgchk_param par = {
 		.net       = net,
 		.table     = name,
@@ -656,7 +656,7 @@ static int
 find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 		 unsigned int size)
 {
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 	unsigned int j;
@@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 
 static bool check_underflow(const struct ipt_entry *e)
 {
-	const struct ipt_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int verdict;
 
 	if (!unconditional(&e->ip))
@@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e)
 	t = ipt_get_target_c(e);
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
 		return false;
-	verdict = ((struct ipt_standard_target *)t)->verdict;
+	verdict = ((struct xt_standard_target *)t)->verdict;
 	verdict = -verdict - 1;
 	return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 	}
 
 	if (e->next_offset
-	    < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) {
+	    < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) {
 		duprintf("checking: element %p size %u\n",
 			 e, e->next_offset);
 		return -EINVAL;
@@ -771,7 +771,7 @@ static void
 cleanup_entry(struct ipt_entry *e, struct net *net)
 {
 	struct xt_tgdtor_param par;
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
@@ -972,8 +972,8 @@ copy_entries_to_user(unsigned int total_size,
 	/* ... then go back and fix counters and names */
 	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
 		unsigned int i;
-		const struct ipt_entry_match *m;
-		const struct ipt_entry_target *t;
+		const struct xt_entry_match *m;
+		const struct xt_entry_target *t;
 
 		e = (struct ipt_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
@@ -990,7 +990,7 @@ copy_entries_to_user(unsigned int total_size,
 			m = (void *)e + i;
 
 			if (copy_to_user(userptr + off + i
-					 + offsetof(struct ipt_entry_match,
+					 + offsetof(struct xt_entry_match,
 						    u.user.name),
 					 m->u.kernel.match->name,
 					 strlen(m->u.kernel.match->name)+1)
@@ -1002,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size,
 
 		t = ipt_get_target_c(e);
 		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct ipt_entry_target,
+				 + offsetof(struct xt_entry_target,
 					    u.user.name),
 				 t->u.kernel.target->name,
 				 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1040,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e,
 			     const void *base, struct xt_table_info *newinfo)
 {
 	const struct xt_entry_match *ematch;
-	const struct ipt_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int entry_offset;
 	int off, i, ret;
 
@@ -1407,7 +1407,7 @@ struct compat_ipt_replace {
 	u32			hook_entry[NF_INET_NUMHOOKS];
 	u32			underflow[NF_INET_NUMHOOKS];
 	u32			num_counters;
-	compat_uptr_t		counters;	/* struct ipt_counters * */
+	compat_uptr_t		counters;	/* struct xt_counters * */
 	struct compat_ipt_entry	entries[0];
 };
 
@@ -1416,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
 			  unsigned int *size, struct xt_counters *counters,
 			  unsigned int i)
 {
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct compat_ipt_entry __user *ce;
 	u_int16_t target_offset, next_offset;
 	compat_uint_t origsize;
@@ -1451,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr,
 }
 
 static int
-compat_find_calc_match(struct ipt_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
 		       const char *name,
 		       const struct ipt_ip *ip,
 		       unsigned int hookmask,
@@ -1473,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m,
 
 static void compat_release_entry(struct compat_ipt_entry *e)
 {
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
@@ -1494,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 				  const char *name)
 {
 	struct xt_entry_match *ematch;
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
 	unsigned int j;
@@ -1576,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr,
 			    unsigned int *size, const char *name,
 			    struct xt_table_info *newinfo, unsigned char *base)
 {
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	struct ipt_entry *de;
 	unsigned int origsize;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 4b973e13952d..c7334c10a4b3 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -215,7 +215,7 @@ static inline bool unconditional(const struct ip6t_ip6 *ipv6)
 	return memcmp(ipv6, &uncond, sizeof(uncond)) == 0;
 }
 
-static inline const struct ip6t_entry_target *
+static inline const struct xt_entry_target *
 ip6t_get_target_c(const struct ip6t_entry *e)
 {
 	return ip6t_get_target((struct ip6t_entry *)e);
@@ -260,7 +260,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
 		      const char *hookname, const char **chainname,
 		      const char **comment, unsigned int *rulenum)
 {
-	const struct ip6t_standard_target *t = (void *)ip6t_get_target_c(s);
+	const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
 
 	if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) {
 		/* Head of user chain: ERROR target with chainname */
@@ -369,7 +369,7 @@ ip6t_do_table(struct sk_buff *skb,
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	do {
-		const struct ip6t_entry_target *t;
+		const struct xt_entry_target *t;
 		const struct xt_entry_match *ematch;
 
 		IP_NF_ASSERT(e);
@@ -403,7 +403,7 @@ ip6t_do_table(struct sk_buff *skb,
 		if (!t->u.kernel.target->target) {
 			int v;
 
-			v = ((struct ip6t_standard_target *)t)->verdict;
+			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
 				if (v != IP6T_RETURN) {
@@ -474,7 +474,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 		e->counters.pcnt = pos;
 
 		for (;;) {
-			const struct ip6t_standard_target *t
+			const struct xt_standard_target *t
 				= (void *)ip6t_get_target_c(e);
 			int visited = e->comefrom & (1 << hook);
 
@@ -565,7 +565,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 	return 1;
 }
 
-static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
+static void cleanup_match(struct xt_entry_match *m, struct net *net)
 {
 	struct xt_mtdtor_param par;
 
@@ -581,14 +581,14 @@ static void cleanup_match(struct ip6t_entry_match *m, struct net *net)
 static int
 check_entry(const struct ip6t_entry *e, const char *name)
 {
-	const struct ip6t_entry_target *t;
+	const struct xt_entry_target *t;
 
 	if (!ip6_checkentry(&e->ipv6)) {
 		duprintf("ip_tables: ip check failed %p %s.\n", e, name);
 		return -EINVAL;
 	}
 
-	if (e->target_offset + sizeof(struct ip6t_entry_target) >
+	if (e->target_offset + sizeof(struct xt_entry_target) >
 	    e->next_offset)
 		return -EINVAL;
 
@@ -599,7 +599,7 @@ check_entry(const struct ip6t_entry *e, const char *name)
 	return 0;
 }
 
-static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
+static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	const struct ip6t_ip6 *ipv6 = par->entryinfo;
 	int ret;
@@ -618,7 +618,7 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
 }
 
 static int
-find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par)
+find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par)
 {
 	struct xt_match *match;
 	int ret;
@@ -643,7 +643,7 @@ err:
 
 static int check_target(struct ip6t_entry *e, struct net *net, const char *name)
 {
-	struct ip6t_entry_target *t = ip6t_get_target(e);
+	struct xt_entry_target *t = ip6t_get_target(e);
 	struct xt_tgchk_param par = {
 		.net       = net,
 		.table     = name,
@@ -670,7 +670,7 @@ static int
 find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 		 unsigned int size)
 {
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	int ret;
 	unsigned int j;
@@ -721,7 +721,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 
 static bool check_underflow(const struct ip6t_entry *e)
 {
-	const struct ip6t_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int verdict;
 
 	if (!unconditional(&e->ipv6))
@@ -729,7 +729,7 @@ static bool check_underflow(const struct ip6t_entry *e)
 	t = ip6t_get_target_c(e);
 	if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
 		return false;
-	verdict = ((struct ip6t_standard_target *)t)->verdict;
+	verdict = ((struct xt_standard_target *)t)->verdict;
 	verdict = -verdict - 1;
 	return verdict == NF_DROP || verdict == NF_ACCEPT;
 }
@@ -752,7 +752,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 	}
 
 	if (e->next_offset
-	    < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) {
+	    < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) {
 		duprintf("checking: element %p size %u\n",
 			 e, e->next_offset);
 		return -EINVAL;
@@ -784,7 +784,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 static void cleanup_entry(struct ip6t_entry *e, struct net *net)
 {
 	struct xt_tgdtor_param par;
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
@@ -985,8 +985,8 @@ copy_entries_to_user(unsigned int total_size,
 	/* ... then go back and fix counters and names */
 	for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){
 		unsigned int i;
-		const struct ip6t_entry_match *m;
-		const struct ip6t_entry_target *t;
+		const struct xt_entry_match *m;
+		const struct xt_entry_target *t;
 
 		e = (struct ip6t_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
@@ -1003,7 +1003,7 @@ copy_entries_to_user(unsigned int total_size,
 			m = (void *)e + i;
 
 			if (copy_to_user(userptr + off + i
-					 + offsetof(struct ip6t_entry_match,
+					 + offsetof(struct xt_entry_match,
 						    u.user.name),
 					 m->u.kernel.match->name,
 					 strlen(m->u.kernel.match->name)+1)
@@ -1015,7 +1015,7 @@ copy_entries_to_user(unsigned int total_size,
 
 		t = ip6t_get_target_c(e);
 		if (copy_to_user(userptr + off + e->target_offset
-				 + offsetof(struct ip6t_entry_target,
+				 + offsetof(struct xt_entry_target,
 					    u.user.name),
 				 t->u.kernel.target->name,
 				 strlen(t->u.kernel.target->name)+1) != 0) {
@@ -1053,7 +1053,7 @@ static int compat_calc_entry(const struct ip6t_entry *e,
 			     const void *base, struct xt_table_info *newinfo)
 {
 	const struct xt_entry_match *ematch;
-	const struct ip6t_entry_target *t;
+	const struct xt_entry_target *t;
 	unsigned int entry_offset;
 	int off, i, ret;
 
@@ -1422,7 +1422,7 @@ struct compat_ip6t_replace {
 	u32			hook_entry[NF_INET_NUMHOOKS];
 	u32			underflow[NF_INET_NUMHOOKS];
 	u32			num_counters;
-	compat_uptr_t		counters;	/* struct ip6t_counters * */
+	compat_uptr_t		counters;	/* struct xt_counters * */
 	struct compat_ip6t_entry entries[0];
 };
 
@@ -1431,7 +1431,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
 			  unsigned int *size, struct xt_counters *counters,
 			  unsigned int i)
 {
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct compat_ip6t_entry __user *ce;
 	u_int16_t target_offset, next_offset;
 	compat_uint_t origsize;
@@ -1466,7 +1466,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr,
 }
 
 static int
-compat_find_calc_match(struct ip6t_entry_match *m,
+compat_find_calc_match(struct xt_entry_match *m,
 		       const char *name,
 		       const struct ip6t_ip6 *ipv6,
 		       unsigned int hookmask,
@@ -1488,7 +1488,7 @@ compat_find_calc_match(struct ip6t_entry_match *m,
 
 static void compat_release_entry(struct compat_ip6t_entry *e)
 {
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_entry_match *ematch;
 
 	/* Cleanup all matches */
@@ -1509,7 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 				  const char *name)
 {
 	struct xt_entry_match *ematch;
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	unsigned int entry_offset;
 	unsigned int j;
@@ -1591,7 +1591,7 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr,
 			    unsigned int *size, const char *name,
 			    struct xt_table_info *newinfo, unsigned char *base)
 {
-	struct ip6t_entry_target *t;
+	struct xt_entry_target *t;
 	struct xt_target *target;
 	struct ip6t_entry *de;
 	unsigned int origsize;
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index c7e59e6ec349..f6d464f993ef 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -39,7 +39,7 @@ static struct tcf_hashinfo ipt_hash_info = {
 	.lock	=	&ipt_lock,
 };
 
-static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook)
+static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
 {
 	struct xt_tgchk_param par;
 	struct xt_target *target;
@@ -66,7 +66,7 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int
 	return 0;
 }
 
-static void ipt_destroy_target(struct ipt_entry_target *t)
+static void ipt_destroy_target(struct xt_entry_target *t)
 {
 	struct xt_tgdtor_param par = {
 		.target   = t->u.kernel.target,
@@ -99,7 +99,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
 	[TCA_IPT_TABLE]	= { .type = NLA_STRING, .len = IFNAMSIZ },
 	[TCA_IPT_HOOK]	= { .type = NLA_U32 },
 	[TCA_IPT_INDEX]	= { .type = NLA_U32 },
-	[TCA_IPT_TARG]	= { .len = sizeof(struct ipt_entry_target) },
+	[TCA_IPT_TARG]	= { .len = sizeof(struct xt_entry_target) },
 };
 
 static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
@@ -108,7 +108,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
 	struct nlattr *tb[TCA_IPT_MAX + 1];
 	struct tcf_ipt *ipt;
 	struct tcf_common *pc;
-	struct ipt_entry_target *td, *t;
+	struct xt_entry_target *td, *t;
 	char *tname;
 	int ret = 0, err;
 	u32 hook = 0;
@@ -126,7 +126,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est,
 	if (tb[TCA_IPT_TARG] == NULL)
 		return -EINVAL;
 
-	td = (struct ipt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
+	td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]);
 	if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size)
 		return -EINVAL;
 
@@ -249,7 +249,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_ipt *ipt = a->priv;
-	struct ipt_entry_target *t;
+	struct xt_entry_target *t;
 	struct tcf_t tm;
 	struct tc_cnt c;
 
-- 
cgit v1.2.3-58-ga151


From 243bf6e29eef642de0ff62f1ebf58bc2396d6d6e Mon Sep 17 00:00:00 2001
From: Jan Engelhardt <jengelh@medozas.de>
Date: Wed, 13 Oct 2010 16:28:00 +0200
Subject: netfilter: xtables: resolve indirect macros 3/3

---
 include/linux/netfilter_arp/arp_tables.h  | 33 +++++----------
 include/linux/netfilter_ipv4/ip_tables.h  | 69 ++++++++++++++-----------------
 include/linux/netfilter_ipv6/ip6_tables.h | 69 +++++++++++--------------------
 net/ipv4/netfilter/arp_tables.c           | 14 +++----
 net/ipv4/netfilter/arpt_mangle.c          |  2 +-
 net/ipv4/netfilter/ip_tables.c            | 18 ++++----
 net/ipv6/netfilter/ip6_tables.c           | 18 ++++----
 net/sched/act_ipt.c                       |  2 +-
 8 files changed, 94 insertions(+), 131 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/netfilter_arp/arp_tables.h b/include/linux/netfilter_arp/arp_tables.h
index 7e193c9241b3..6e2341a811d6 100644
--- a/include/linux/netfilter_arp/arp_tables.h
+++ b/include/linux/netfilter_arp/arp_tables.h
@@ -26,6 +26,14 @@
 #define ARPT_TABLE_MAXNAMELEN XT_TABLE_MAXNAMELEN
 #define arpt_entry_target xt_entry_target
 #define arpt_standard_target xt_standard_target
+#define ARPT_CONTINUE XT_CONTINUE
+#define ARPT_RETURN XT_RETURN
+#define arpt_counters_info xt_counters_info
+#define arpt_counters xt_counters
+#define ARPT_STANDARD_TARGET XT_STANDARD_TARGET
+#define ARPT_ERROR_TARGET XT_ERROR_TARGET
+#define ARPT_ENTRY_ITERATE(entries, size, fn, args...) \
+	XT_ENTRY_ITERATE(struct arpt_entry, entries, size, fn, ## args)
 #endif
 
 #define ARPT_DEV_ADDR_LEN_MAX 16
@@ -126,12 +134,6 @@ struct arpt_entry
 #define ARPT_SO_GET_REVISION_TARGET	(ARPT_BASE_CTL + 3)
 #define ARPT_SO_GET_MAX			(ARPT_SO_GET_REVISION_TARGET)
 
-/* CONTINUE verdict for targets */
-#define ARPT_CONTINUE XT_CONTINUE
-
-/* For standard target */
-#define ARPT_RETURN XT_RETURN
-
 /* The argument to ARPT_SO_GET_INFO */
 struct arpt_getinfo {
 	/* Which table: caller fills this in. */
@@ -185,10 +187,6 @@ struct arpt_replace {
 	struct arpt_entry entries[0];
 };
 
-/* The argument to ARPT_SO_ADD_COUNTERS. */
-#define arpt_counters_info xt_counters_info
-#define arpt_counters xt_counters
-
 /* The argument to ARPT_SO_GET_ENTRIES. */
 struct arpt_get_entries {
 	/* Which table: user fills this in. */
@@ -201,23 +199,12 @@ struct arpt_get_entries {
 	struct arpt_entry entrytable[0];
 };
 
-/* Standard return verdict, or do jump. */
-#define ARPT_STANDARD_TARGET XT_STANDARD_TARGET
-/* Error verdict. */
-#define ARPT_ERROR_TARGET XT_ERROR_TARGET
-
 /* Helper functions */
 static __inline__ struct xt_entry_target *arpt_get_target(struct arpt_entry *e)
 {
 	return (void *)e + e->target_offset;
 }
 
-#ifndef __KERNEL__
-/* fn returns 0 to continue iteration */
-#define ARPT_ENTRY_ITERATE(entries, size, fn, args...) \
-	XT_ENTRY_ITERATE(struct arpt_entry, entries, size, fn, ## args)
-#endif
-
 /*
  *	Main firewall chains definitions and global var's definitions.
  */
@@ -248,7 +235,7 @@ struct arpt_error {
 #define ARPT_STANDARD_INIT(__verdict)					       \
 {									       \
 	.entry		= ARPT_ENTRY_INIT(sizeof(struct arpt_standard)),       \
-	.target		= XT_TARGET_INIT(ARPT_STANDARD_TARGET,		       \
+	.target		= XT_TARGET_INIT(XT_STANDARD_TARGET,		       \
 					 sizeof(struct xt_standard_target)), \
 	.target.verdict	= -(__verdict) - 1,				       \
 }
@@ -256,7 +243,7 @@ struct arpt_error {
 #define ARPT_ERROR_INIT							       \
 {									       \
 	.entry		= ARPT_ENTRY_INIT(sizeof(struct arpt_error)),	       \
-	.target		= XT_TARGET_INIT(ARPT_ERROR_TARGET,		       \
+	.target		= XT_TARGET_INIT(XT_ERROR_TARGET,		       \
 					 sizeof(struct arpt_error_target)),    \
 	.target.errorname = "ERROR",					       \
 }
diff --git a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
index ec506918a9b9..ee54b3b7e237 100644
--- a/include/linux/netfilter_ipv4/ip_tables.h
+++ b/include/linux/netfilter_ipv4/ip_tables.h
@@ -38,6 +38,36 @@
 #define ipt_entry_target xt_entry_target
 #define ipt_standard_target xt_standard_target
 #define ipt_counters xt_counters
+#define IPT_CONTINUE XT_CONTINUE
+#define IPT_RETURN XT_RETURN
+
+/* This group is older than old (iptables < v1.4.0-rc1~89) */
+#include <linux/netfilter/xt_tcpudp.h>
+#define ipt_udp xt_udp
+#define ipt_tcp xt_tcp
+#define IPT_TCP_INV_SRCPT	XT_TCP_INV_SRCPT
+#define IPT_TCP_INV_DSTPT	XT_TCP_INV_DSTPT
+#define IPT_TCP_INV_FLAGS	XT_TCP_INV_FLAGS
+#define IPT_TCP_INV_OPTION	XT_TCP_INV_OPTION
+#define IPT_TCP_INV_MASK	XT_TCP_INV_MASK
+#define IPT_UDP_INV_SRCPT	XT_UDP_INV_SRCPT
+#define IPT_UDP_INV_DSTPT	XT_UDP_INV_DSTPT
+#define IPT_UDP_INV_MASK	XT_UDP_INV_MASK
+
+/* The argument to IPT_SO_ADD_COUNTERS. */
+#define ipt_counters_info xt_counters_info
+/* Standard return verdict, or do jump. */
+#define IPT_STANDARD_TARGET XT_STANDARD_TARGET
+/* Error verdict. */
+#define IPT_ERROR_TARGET XT_ERROR_TARGET
+
+/* fn returns 0 to continue iteration */
+#define IPT_MATCH_ITERATE(e, fn, args...) \
+	XT_MATCH_ITERATE(struct ipt_entry, e, fn, ## args)
+
+/* fn returns 0 to continue iteration */
+#define IPT_ENTRY_ITERATE(entries, size, fn, args...) \
+	XT_ENTRY_ITERATE(struct ipt_entry, entries, size, fn, ## args)
 #endif
 
 /* Yes, Virginia, you have to zero the padding. */
@@ -116,23 +146,6 @@ struct ipt_entry {
 #define IPT_SO_GET_REVISION_TARGET	(IPT_BASE_CTL + 3)
 #define IPT_SO_GET_MAX			IPT_SO_GET_REVISION_TARGET
 
-#define IPT_CONTINUE XT_CONTINUE
-#define IPT_RETURN XT_RETURN
-
-#include <linux/netfilter/xt_tcpudp.h>
-#define ipt_udp xt_udp
-#define ipt_tcp xt_tcp
-
-#define IPT_TCP_INV_SRCPT	XT_TCP_INV_SRCPT
-#define IPT_TCP_INV_DSTPT	XT_TCP_INV_DSTPT
-#define IPT_TCP_INV_FLAGS	XT_TCP_INV_FLAGS
-#define IPT_TCP_INV_OPTION	XT_TCP_INV_OPTION
-#define IPT_TCP_INV_MASK	XT_TCP_INV_MASK
-
-#define IPT_UDP_INV_SRCPT	XT_UDP_INV_SRCPT
-#define IPT_UDP_INV_DSTPT	XT_UDP_INV_DSTPT
-#define IPT_UDP_INV_MASK	XT_UDP_INV_MASK
-
 /* ICMP matching stuff */
 struct ipt_icmp {
 	u_int8_t type;				/* type to match */
@@ -196,9 +209,6 @@ struct ipt_replace {
 	struct ipt_entry entries[0];
 };
 
-/* The argument to IPT_SO_ADD_COUNTERS. */
-#define ipt_counters_info xt_counters_info
-
 /* The argument to IPT_SO_GET_ENTRIES. */
 struct ipt_get_entries {
 	/* Which table: user fills this in. */
@@ -211,11 +221,6 @@ struct ipt_get_entries {
 	struct ipt_entry entrytable[0];
 };
 
-/* Standard return verdict, or do jump. */
-#define IPT_STANDARD_TARGET XT_STANDARD_TARGET
-/* Error verdict. */
-#define IPT_ERROR_TARGET XT_ERROR_TARGET
-
 /* Helper functions */
 static __inline__ struct xt_entry_target *
 ipt_get_target(struct ipt_entry *e)
@@ -223,16 +228,6 @@ ipt_get_target(struct ipt_entry *e)
 	return (void *)e + e->target_offset;
 }
 
-#ifndef __KERNEL__
-/* fn returns 0 to continue iteration */
-#define IPT_MATCH_ITERATE(e, fn, args...) \
-	XT_MATCH_ITERATE(struct ipt_entry, e, fn, ## args)
-
-/* fn returns 0 to continue iteration */
-#define IPT_ENTRY_ITERATE(entries, size, fn, args...) \
-	XT_ENTRY_ITERATE(struct ipt_entry, entries, size, fn, ## args)
-#endif
-
 /*
  *	Main firewall chains definitions and global var's definitions.
  */
@@ -271,7 +266,7 @@ struct ipt_error {
 #define IPT_STANDARD_INIT(__verdict)					       \
 {									       \
 	.entry		= IPT_ENTRY_INIT(sizeof(struct ipt_standard)),	       \
-	.target		= XT_TARGET_INIT(IPT_STANDARD_TARGET,		       \
+	.target		= XT_TARGET_INIT(XT_STANDARD_TARGET,		       \
 					 sizeof(struct xt_standard_target)),   \
 	.target.verdict	= -(__verdict) - 1,				       \
 }
@@ -279,7 +274,7 @@ struct ipt_error {
 #define IPT_ERROR_INIT							       \
 {									       \
 	.entry		= IPT_ENTRY_INIT(sizeof(struct ipt_error)),	       \
-	.target		= XT_TARGET_INIT(IPT_ERROR_TARGET,		       \
+	.target		= XT_TARGET_INIT(XT_ERROR_TARGET,		       \
 					 sizeof(struct ipt_error_target)),     \
 	.target.errorname = "ERROR",					       \
 }
diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h
index 40d11fa05840..ac2b411ea63a 100644
--- a/include/linux/netfilter_ipv6/ip6_tables.h
+++ b/include/linux/netfilter_ipv6/ip6_tables.h
@@ -38,6 +38,29 @@
 #define ip6t_entry_target xt_entry_target
 #define ip6t_standard_target xt_standard_target
 #define ip6t_counters xt_counters
+#define IP6T_CONTINUE XT_CONTINUE
+#define IP6T_RETURN XT_RETURN
+
+/* Pre-iptables-1.4.0 */
+#include <linux/netfilter/xt_tcpudp.h>
+#define ip6t_tcp xt_tcp
+#define ip6t_udp xt_udp
+#define IP6T_TCP_INV_SRCPT	XT_TCP_INV_SRCPT
+#define IP6T_TCP_INV_DSTPT	XT_TCP_INV_DSTPT
+#define IP6T_TCP_INV_FLAGS	XT_TCP_INV_FLAGS
+#define IP6T_TCP_INV_OPTION	XT_TCP_INV_OPTION
+#define IP6T_TCP_INV_MASK	XT_TCP_INV_MASK
+#define IP6T_UDP_INV_SRCPT	XT_UDP_INV_SRCPT
+#define IP6T_UDP_INV_DSTPT	XT_UDP_INV_DSTPT
+#define IP6T_UDP_INV_MASK	XT_UDP_INV_MASK
+
+#define ip6t_counters_info xt_counters_info
+#define IP6T_STANDARD_TARGET XT_STANDARD_TARGET
+#define IP6T_ERROR_TARGET XT_ERROR_TARGET
+#define IP6T_MATCH_ITERATE(e, fn, args...) \
+	XT_MATCH_ITERATE(struct ip6t_entry, e, fn, ## args)
+#define IP6T_ENTRY_ITERATE(entries, size, fn, args...) \
+	XT_ENTRY_ITERATE(struct ip6t_entry, entries, size, fn, ## args)
 #endif
 
 /* Yes, Virginia, you have to zero the padding. */
@@ -133,7 +156,7 @@ struct ip6t_error {
 #define IP6T_STANDARD_INIT(__verdict)					       \
 {									       \
 	.entry		= IP6T_ENTRY_INIT(sizeof(struct ip6t_standard)),       \
-	.target		= XT_TARGET_INIT(IP6T_STANDARD_TARGET,		       \
+	.target		= XT_TARGET_INIT(XT_STANDARD_TARGET,		       \
 					 sizeof(struct xt_standard_target)),   \
 	.target.verdict	= -(__verdict) - 1,				       \
 }
@@ -141,7 +164,7 @@ struct ip6t_error {
 #define IP6T_ERROR_INIT							       \
 {									       \
 	.entry		= IP6T_ENTRY_INIT(sizeof(struct ip6t_error)),	       \
-	.target		= XT_TARGET_INIT(IP6T_ERROR_TARGET,		       \
+	.target		= XT_TARGET_INIT(XT_ERROR_TARGET,		       \
 					 sizeof(struct ip6t_error_target)),    \
 	.target.errorname = "ERROR",					       \
 }
@@ -165,30 +188,6 @@ struct ip6t_error {
 #define IP6T_SO_GET_REVISION_TARGET	(IP6T_BASE_CTL + 5)
 #define IP6T_SO_GET_MAX			IP6T_SO_GET_REVISION_TARGET
 
-/* CONTINUE verdict for targets */
-#define IP6T_CONTINUE XT_CONTINUE
-
-/* For standard target */
-#define IP6T_RETURN XT_RETURN
-
-/* TCP/UDP matching stuff */
-#include <linux/netfilter/xt_tcpudp.h>
-
-#define ip6t_tcp xt_tcp
-#define ip6t_udp xt_udp
-
-/* Values for "inv" field in struct ipt_tcp. */
-#define IP6T_TCP_INV_SRCPT	XT_TCP_INV_SRCPT
-#define IP6T_TCP_INV_DSTPT	XT_TCP_INV_DSTPT
-#define IP6T_TCP_INV_FLAGS	XT_TCP_INV_FLAGS
-#define IP6T_TCP_INV_OPTION	XT_TCP_INV_OPTION
-#define IP6T_TCP_INV_MASK	XT_TCP_INV_MASK
-
-/* Values for "invflags" field in struct ipt_udp. */
-#define IP6T_UDP_INV_SRCPT	XT_UDP_INV_SRCPT
-#define IP6T_UDP_INV_DSTPT	XT_UDP_INV_DSTPT
-#define IP6T_UDP_INV_MASK	XT_UDP_INV_MASK
-
 /* ICMP matching stuff */
 struct ip6t_icmp {
 	u_int8_t type;				/* type to match */
@@ -252,9 +251,6 @@ struct ip6t_replace {
 	struct ip6t_entry entries[0];
 };
 
-/* The argument to IP6T_SO_ADD_COUNTERS. */
-#define ip6t_counters_info xt_counters_info
-
 /* The argument to IP6T_SO_GET_ENTRIES. */
 struct ip6t_get_entries {
 	/* Which table: user fills this in. */
@@ -267,11 +263,6 @@ struct ip6t_get_entries {
 	struct ip6t_entry entrytable[0];
 };
 
-/* Standard return verdict, or do jump. */
-#define IP6T_STANDARD_TARGET XT_STANDARD_TARGET
-/* Error verdict. */
-#define IP6T_ERROR_TARGET XT_ERROR_TARGET
-
 /* Helper functions */
 static __inline__ struct xt_entry_target *
 ip6t_get_target(struct ip6t_entry *e)
@@ -279,16 +270,6 @@ ip6t_get_target(struct ip6t_entry *e)
 	return (void *)e + e->target_offset;
 }
 
-#ifndef __KERNEL__
-/* fn returns 0 to continue iteration */
-#define IP6T_MATCH_ITERATE(e, fn, args...) \
-	XT_MATCH_ITERATE(struct ip6t_entry, e, fn, ## args)
-
-/* fn returns 0 to continue iteration */
-#define IP6T_ENTRY_ITERATE(entries, size, fn, args...) \
-	XT_ENTRY_ITERATE(struct ip6t_entry, entries, size, fn, ## args)
-#endif
-
 /*
  *	Main firewall chains definitions and global var's definitions.
  */
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index ed178cbe6626..d756edae59ec 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -300,7 +300,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
-				if (v != ARPT_RETURN) {
+				if (v != XT_RETURN) {
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
@@ -332,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 		/* Target might have changed stuff. */
 		arp = arp_hdr(skb);
 
-		if (verdict == ARPT_CONTINUE)
+		if (verdict == XT_CONTINUE)
 			e = arpt_next_entry(e);
 		else
 			/* Verdict */
@@ -392,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 			/* Unconditional return/END. */
 			if ((e->target_offset == sizeof(struct arpt_entry) &&
 			     (strcmp(t->target.u.user.name,
-				     ARPT_STANDARD_TARGET) == 0) &&
+				     XT_STANDARD_TARGET) == 0) &&
 			     t->verdict < 0 && unconditional(&e->arp)) ||
 			    visited) {
 				unsigned int oldpos, size;
 
 				if ((strcmp(t->target.u.user.name,
-					    ARPT_STANDARD_TARGET) == 0) &&
+					    XT_STANDARD_TARGET) == 0) &&
 				    t->verdict < -NF_MAX_VERDICT - 1) {
 					duprintf("mark_source_chains: bad "
 						"negative verdict (%i)\n",
@@ -433,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
 				int newpos = t->verdict;
 
 				if (strcmp(t->target.u.user.name,
-					   ARPT_STANDARD_TARGET) == 0 &&
+					   XT_STANDARD_TARGET) == 0 &&
 				    newpos >= 0) {
 					if (newpos > newinfo->size -
 						sizeof(struct arpt_entry)) {
@@ -1828,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table)
 /* The built-in targets: standard (NULL) and error. */
 static struct xt_target arpt_builtin_tg[] __read_mostly = {
 	{
-		.name             = ARPT_STANDARD_TARGET,
+		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_ARP,
 #ifdef CONFIG_COMPAT
@@ -1838,7 +1838,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = {
 #endif
 	},
 	{
-		.name             = ARPT_ERROR_TARGET,
+		.name             = XT_ERROR_TARGET,
 		.target           = arpt_error,
 		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_ARP,
diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c
index e1be7dd1171b..b8ddcc480ed9 100644
--- a/net/ipv4/netfilter/arpt_mangle.c
+++ b/net/ipv4/netfilter/arpt_mangle.c
@@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par)
 		return false;
 
 	if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT &&
-	   mangle->target != ARPT_CONTINUE)
+	   mangle->target != XT_CONTINUE)
 		return false;
 	return true;
 }
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index cb108880050a..d31b007a6d80 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -232,7 +232,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
 {
 	const struct xt_standard_target *t = (void *)ipt_get_target_c(s);
 
-	if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) {
+	if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
 		/* Head of user chain: ERROR target with chainname */
 		*chainname = t->target.data;
 		(*rulenum) = 0;
@@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
 
 		if (s->target_offset == sizeof(struct ipt_entry) &&
 		    strcmp(t->target.u.kernel.target->name,
-			   IPT_STANDARD_TARGET) == 0 &&
+			   XT_STANDARD_TARGET) == 0 &&
 		   t->verdict < 0 &&
 		   unconditional(&s->ip)) {
 			/* Tail of chains: STANDARD target (return/policy) */
@@ -383,7 +383,7 @@ ipt_do_table(struct sk_buff *skb,
 			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
-				if (v != IPT_RETURN) {
+				if (v != XT_RETURN) {
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
@@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb,
 		verdict = t->u.kernel.target->target(skb, &acpar);
 		/* Target might have changed stuff. */
 		ip = ip_hdr(skb);
-		if (verdict == IPT_CONTINUE)
+		if (verdict == XT_CONTINUE)
 			e = ipt_next_entry(e);
 		else
 			/* Verdict */
@@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
 			/* Unconditional return/END. */
 			if ((e->target_offset == sizeof(struct ipt_entry) &&
 			     (strcmp(t->target.u.user.name,
-				     IPT_STANDARD_TARGET) == 0) &&
+				     XT_STANDARD_TARGET) == 0) &&
 			     t->verdict < 0 && unconditional(&e->ip)) ||
 			    visited) {
 				unsigned int oldpos, size;
 
 				if ((strcmp(t->target.u.user.name,
-			    		    IPT_STANDARD_TARGET) == 0) &&
+			    		    XT_STANDARD_TARGET) == 0) &&
 				    t->verdict < -NF_MAX_VERDICT - 1) {
 					duprintf("mark_source_chains: bad "
 						"negative verdict (%i)\n",
@@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				int newpos = t->verdict;
 
 				if (strcmp(t->target.u.user.name,
-					   IPT_STANDARD_TARGET) == 0 &&
+					   XT_STANDARD_TARGET) == 0 &&
 				    newpos >= 0) {
 					if (newpos > newinfo->size -
 						sizeof(struct ipt_entry)) {
@@ -2176,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par)
 
 static struct xt_target ipt_builtin_tg[] __read_mostly = {
 	{
-		.name             = IPT_STANDARD_TARGET,
+		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_IPV4,
 #ifdef CONFIG_COMPAT
@@ -2186,7 +2186,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = {
 #endif
 	},
 	{
-		.name             = IPT_ERROR_TARGET,
+		.name             = XT_ERROR_TARGET,
 		.target           = ipt_error,
 		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_IPV4,
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index c7334c10a4b3..c683e9e7023b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -262,7 +262,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
 {
 	const struct xt_standard_target *t = (void *)ip6t_get_target_c(s);
 
-	if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) {
+	if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) {
 		/* Head of user chain: ERROR target with chainname */
 		*chainname = t->target.data;
 		(*rulenum) = 0;
@@ -271,7 +271,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
 
 		if (s->target_offset == sizeof(struct ip6t_entry) &&
 		    strcmp(t->target.u.kernel.target->name,
-			   IP6T_STANDARD_TARGET) == 0 &&
+			   XT_STANDARD_TARGET) == 0 &&
 		    t->verdict < 0 &&
 		    unconditional(&s->ipv6)) {
 			/* Tail of chains: STANDARD target (return/policy) */
@@ -406,7 +406,7 @@ ip6t_do_table(struct sk_buff *skb,
 			v = ((struct xt_standard_target *)t)->verdict;
 			if (v < 0) {
 				/* Pop from stack? */
-				if (v != IP6T_RETURN) {
+				if (v != XT_RETURN) {
 					verdict = (unsigned)(-v) - 1;
 					break;
 				}
@@ -434,7 +434,7 @@ ip6t_do_table(struct sk_buff *skb,
 		acpar.targinfo = t->data;
 
 		verdict = t->u.kernel.target->target(skb, &acpar);
-		if (verdict == IP6T_CONTINUE)
+		if (verdict == XT_CONTINUE)
 			e = ip6t_next_entry(e);
 		else
 			/* Verdict */
@@ -488,13 +488,13 @@ mark_source_chains(const struct xt_table_info *newinfo,
 			/* Unconditional return/END. */
 			if ((e->target_offset == sizeof(struct ip6t_entry) &&
 			     (strcmp(t->target.u.user.name,
-				     IP6T_STANDARD_TARGET) == 0) &&
+				     XT_STANDARD_TARGET) == 0) &&
 			     t->verdict < 0 &&
 			     unconditional(&e->ipv6)) || visited) {
 				unsigned int oldpos, size;
 
 				if ((strcmp(t->target.u.user.name,
-					    IP6T_STANDARD_TARGET) == 0) &&
+					    XT_STANDARD_TARGET) == 0) &&
 				    t->verdict < -NF_MAX_VERDICT - 1) {
 					duprintf("mark_source_chains: bad "
 						"negative verdict (%i)\n",
@@ -537,7 +537,7 @@ mark_source_chains(const struct xt_table_info *newinfo,
 				int newpos = t->verdict;
 
 				if (strcmp(t->target.u.user.name,
-					   IP6T_STANDARD_TARGET) == 0 &&
+					   XT_STANDARD_TARGET) == 0 &&
 				    newpos >= 0) {
 					if (newpos > newinfo->size -
 						sizeof(struct ip6t_entry)) {
@@ -2191,7 +2191,7 @@ static int icmp6_checkentry(const struct xt_mtchk_param *par)
 /* The built-in targets: standard (NULL) and error. */
 static struct xt_target ip6t_builtin_tg[] __read_mostly = {
 	{
-		.name             = IP6T_STANDARD_TARGET,
+		.name             = XT_STANDARD_TARGET,
 		.targetsize       = sizeof(int),
 		.family           = NFPROTO_IPV6,
 #ifdef CONFIG_COMPAT
@@ -2201,7 +2201,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = {
 #endif
 	},
 	{
-		.name             = IP6T_ERROR_TARGET,
+		.name             = XT_ERROR_TARGET,
 		.target           = ip6t_error,
 		.targetsize       = XT_FUNCTION_MAXNAMELEN,
 		.family           = NFPROTO_IPV6,
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index f6d464f993ef..8daef9632255 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -230,7 +230,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a,
 		result = TC_ACT_SHOT;
 		ipt->tcf_qstats.drops++;
 		break;
-	case IPT_CONTINUE:
+	case XT_CONTINUE:
 		result = TC_ACT_PIPE;
 		break;
 	default:
-- 
cgit v1.2.3-58-ga151


From 874ffa8f72444d6253d2669fed304875c128f86b Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 13 Oct 2010 06:56:11 +0000
Subject: fib_trie: use fls() instead of open coded loop

fib_table_lookup() might use fls() to speed up an open-coded loop.

Noticed while doing a profile analysis.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 271c89bdf049..31494f335686 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1384,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 	t_key cindex = 0;
 	int current_prefix_length = KEYLENGTH;
 	struct tnode *cn;
-	t_key node_prefix, key_prefix, pref_mismatch;
-	int mp;
+	t_key pref_mismatch;
 
 	rcu_read_lock();
 
@@ -1500,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 		 * matching prefix.
 		 */
 
-		node_prefix = mask_pfx(cn->key, cn->pos);
-		key_prefix = mask_pfx(key, cn->pos);
-		pref_mismatch = key_prefix^node_prefix;
-		mp = 0;
+		pref_mismatch = mask_pfx(cn->key ^ key, cn->pos);
 
 		/*
 		 * In short: If skipped bits in this node do not match
@@ -1511,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
 		 * state.directly.
 		 */
 		if (pref_mismatch) {
-			while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
-				mp++;
-				pref_mismatch = pref_mismatch << 1;
-			}
-			key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
+			int mp = KEYLENGTH - fls(pref_mismatch);
 
-			if (key_prefix != 0)
+			if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0)
 				goto backtrace;
 
 			if (current_prefix_length >= cn->pos)
-- 
cgit v1.2.3-58-ga151


From 10da66f7552b3c7966c2f4f1f72009fb0b5539ec Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 13 Oct 2010 08:22:03 +0000
Subject: fib: avoid false sharing on fib_table_hash

While doing profile analysis, I found fib_table_hash was sometimes in a
cache line shared by a possibly often-written kernel structure.

(CONFIG_IP_ROUTE_MULTIPATH || !CONFIG_IPV6_MULTIPLE_TABLES)

It's hard to detect because it is not easily reproducible.

Make sure we allocate a full cache line to keep this shared in all cpus
caches.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 11 +++++------
 net/ipv6/ip6_fib.c      |  9 ++++++---
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 919f2ad19b49..3df057e89640 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -1016,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = {
 static int __net_init ip_fib_net_init(struct net *net)
 {
 	int err;
-	unsigned int i;
+	size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ;
+
+	/* Avoid false sharing : Use at least a full cache line */
+	size = max_t(size_t, size, L1_CACHE_BYTES);
 
-	net->ipv4.fib_table_hash = kzalloc(
-			sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL);
+	net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL);
 	if (net->ipv4.fib_table_hash == NULL)
 		return -ENOMEM;
 
-	for (i = 0; i < FIB_TABLE_HASHSZ; i++)
-		INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]);
-
 	err = fib4_rules_init(net);
 	if (err < 0)
 		goto fail;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index b6a585909d35..de382114609b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1500,15 +1500,18 @@ static void fib6_gc_timer_cb(unsigned long arg)
 
 static int __net_init fib6_net_init(struct net *net)
 {
+	size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+
 	setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
 
 	net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
 	if (!net->ipv6.rt6_stats)
 		goto out_timer;
 
-	net->ipv6.fib_table_hash = kcalloc(FIB6_TABLE_HASHSZ,
-					   sizeof(*net->ipv6.fib_table_hash),
-					   GFP_KERNEL);
+	/* Avoid false sharing : Use at least a full cache line */
+	size = max_t(size_t, size, L1_CACHE_BYTES);
+
+	net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
 	if (!net->ipv6.fib_table_hash)
 		goto out_rt6_stats;
 
-- 
cgit v1.2.3-58-ga151


From 1fdb936101637c91819efea47e921bb592e07e34 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Thu, 14 Oct 2010 01:42:30 +0000
Subject: tcp: sack lost marking fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When only fast rexmit should be done, tcp_mark_head_lost marks
L too far. Also, sacked_upto below 1 is a perfectly valid number;
the packets == 0 case then needs to be trapped elsewhere.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index e4fbdae066d5..ee0df4817498 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2495,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk)
 /* Mark head of queue up as lost. With RFC3517 SACK, the packets is
  * is against sacked "cnt", otherwise it's against facked "cnt"
  */
-static void tcp_mark_head_lost(struct sock *sk, int packets)
+static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct sk_buff *skb;
@@ -2503,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 	int err;
 	unsigned int mss;
 
-	if (packets == 0)
-		return;
-
 	WARN_ON(packets > tp->packets_out);
 	if (tp->lost_skb_hint) {
 		skb = tp->lost_skb_hint;
 		cnt = tp->lost_cnt_hint;
+		/* Head already handled? */
+		if (mark_head && skb != tcp_write_queue_head(sk))
+			return;
 	} else {
 		skb = tcp_write_queue_head(sk);
 		cnt = 0;
@@ -2544,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
 		}
 
 		tcp_skb_mark_lost(tp, skb);
+
+		if (mark_head)
+			break;
 	}
 	tcp_verify_left_out(tp);
 }
@@ -2555,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
 	struct tcp_sock *tp = tcp_sk(sk);
 
 	if (tcp_is_reno(tp)) {
-		tcp_mark_head_lost(sk, 1);
+		tcp_mark_head_lost(sk, 1, 1);
 	} else if (tcp_is_fack(tp)) {
 		int lost = tp->fackets_out - tp->reordering;
 		if (lost <= 0)
 			lost = 1;
-		tcp_mark_head_lost(sk, lost);
+		tcp_mark_head_lost(sk, lost, 0);
 	} else {
 		int sacked_upto = tp->sacked_out - tp->reordering;
-		if (sacked_upto < fast_rexmit)
-			sacked_upto = fast_rexmit;
-		tcp_mark_head_lost(sk, sacked_upto);
+		if (sacked_upto >= 0)
+			tcp_mark_head_lost(sk, sacked_upto, 0);
+		else if (fast_rexmit)
+			tcp_mark_head_lost(sk, 1, 1);
 	}
 
 	tcp_timeout_skbs(sk);
@@ -2971,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	    before(tp->snd_una, tp->high_seq) &&
 	    icsk->icsk_ca_state != TCP_CA_Open &&
 	    tp->fackets_out > tp->reordering) {
-		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
+		tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS);
 	}
 
-- 
cgit v1.2.3-58-ga151


From c60ce4e265404ca42ba860401f4b0f1e97332a67 Mon Sep 17 00:00:00 2001
From: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Date: Thu, 14 Oct 2010 01:52:09 +0000
Subject: tcp: use correct counters in CA_CWR state too
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As CWR is stronger than CA_Disorder state, we can miscount
SACK/Reno failure into other timeouts. Not a bad problem as
it can happen only due to ECN, FRTO detecting spurious RTO
or xmit error which are the only callers of tcp_enter_cwr.
And even then losses and RTO must still follow thereafter
to actually end up into the relevant code paths.

Compile tested.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_timer.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index f3c8c6c019ae..74a6aa003657 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -367,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk)
 	if (icsk->icsk_retransmits == 0) {
 		int mib_idx;
 
-		if (icsk->icsk_ca_state == TCP_CA_Disorder) {
-			if (tcp_is_sack(tp))
-				mib_idx = LINUX_MIB_TCPSACKFAILURES;
-			else
-				mib_idx = LINUX_MIB_TCPRENOFAILURES;
-		} else if (icsk->icsk_ca_state == TCP_CA_Recovery) {
+		if (icsk->icsk_ca_state == TCP_CA_Recovery) {
 			if (tcp_is_sack(tp))
 				mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL;
 			else
 				mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL;
 		} else if (icsk->icsk_ca_state == TCP_CA_Loss) {
 			mib_idx = LINUX_MIB_TCPLOSSFAILURES;
+		} else if ((icsk->icsk_ca_state == TCP_CA_Disorder) ||
+			   tp->sacked_out) {
+			if (tcp_is_sack(tp))
+				mib_idx = LINUX_MIB_TCPSACKFAILURES;
+			else
+				mib_idx = LINUX_MIB_TCPRENOFAILURES;
 		} else {
 			mib_idx = LINUX_MIB_TCPTIMEOUTS;
 		}
-- 
cgit v1.2.3-58-ga151


From 9bef83edfba72ba58b42c14fb046da2199574bc0 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 14 Oct 2010 20:53:04 +0000
Subject: fib_hash: embed initial hash table in fn_zone

While looking for false sharing problems, I noticed
sizeof(struct fn_zone) was small (28 bytes) and possibly sharing a cache
line with an often written kernel structure.

Most of the time, fn_zone uses its initial hash table of 16 slots.

We can avoid the false sharing problem by embedding this initial hash
table in fn_zone itself, so that sizeof(fn_zone) > L1_CACHE_BYTES

We did a similar optimization in commit a6501e080c (Reduce memory needs
and speedup lookups)

Add a fz_revorder field to speed up fn_hash() a bit.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_hash.c | 52 +++++++++++++++++++++++-----------------------------
 1 file changed, 23 insertions(+), 29 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 83cca68e259c..10001aa40692 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -54,23 +54,23 @@ struct fib_node {
 	struct fib_alias        fn_embedded_alias;
 };
 
+#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
+
 struct fn_zone {
 	struct fn_zone		*fz_next;	/* Next not empty zone	*/
 	struct hlist_head	*fz_hash;	/* Hash table pointer	*/
-	int			fz_nent;	/* Number of entries	*/
-
-	int			fz_divisor;	/* Hash divisor		*/
 	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
-#define FZ_HASHMASK(fz)		((fz)->fz_hashmask)
 
-	int			fz_order;	/* Zone order		*/
-	__be32			fz_mask;
+	u8			fz_order;	/* Zone order (0..32)	*/
+	u8			fz_revorder;	/* 32 - fz_order	*/
+	__be32			fz_mask;	/* inet_make_mask(order) */
 #define FZ_MASK(fz)		((fz)->fz_mask)
-};
 
-/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask
- * can be cheaper than memory lookup, so that FZ_* macros are used.
- */
+	struct hlist_head	fz_embedded_hash[EMBEDDED_HASH_SIZE];
+
+	int			fz_nent;	/* Number of entries	*/
+	int			fz_divisor;	/* Hash size (mask+1)	*/
+};
 
 struct fn_hash {
 	struct fn_zone	*fn_zones[33];
@@ -79,11 +79,11 @@ struct fn_hash {
 
 static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
 {
-	u32 h = ntohl(key)>>(32 - fz->fz_order);
+	u32 h = ntohl(key) >> fz->fz_revorder;
 	h ^= (h>>20);
 	h ^= (h>>10);
 	h ^= (h>>5);
-	h &= FZ_HASHMASK(fz);
+	h &= fz->fz_hashmask;
 	return h;
 }
 
@@ -147,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz)
 	int old_divisor, new_divisor;
 	u32 new_hashmask;
 
-	old_divisor = fz->fz_divisor;
+	new_divisor = old_divisor = fz->fz_divisor;
 
 	switch (old_divisor) {
-	case 16:
-		new_divisor = 256;
+	case EMBEDDED_HASH_SIZE:
+		new_divisor *= EMBEDDED_HASH_SIZE;
 		break;
-	case 256:
-		new_divisor = 1024;
+	case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE:
+		new_divisor *= (EMBEDDED_HASH_SIZE/2);
 		break;
 	default:
 		if ((old_divisor << 1) > FZ_MAX_DIVISOR) {
@@ -184,7 +184,8 @@ static void fn_rehash_zone(struct fn_zone *fz)
 		fib_hash_genid++;
 		write_unlock_bh(&fib_hash_lock);
 
-		fz_hash_free(old_ht, old_divisor);
+		if (old_ht != fz->fz_embedded_hash)
+			fz_hash_free(old_ht, old_divisor);
 	}
 }
 
@@ -210,18 +211,11 @@ fn_new_zone(struct fn_hash *table, int z)
 	if (!fz)
 		return NULL;
 
-	if (z) {
-		fz->fz_divisor = 16;
-	} else {
-		fz->fz_divisor = 1;
-	}
-	fz->fz_hashmask = (fz->fz_divisor - 1);
-	fz->fz_hash = fz_hash_alloc(fz->fz_divisor);
-	if (!fz->fz_hash) {
-		kfree(fz);
-		return NULL;
-	}
+	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
+	fz->fz_hashmask = fz->fz_divisor - 1;
+	fz->fz_hash = fz->fz_embedded_hash;
 	fz->fz_order = z;
+	fz->fz_revorder = 32 - z;
 	fz->fz_mask = inet_make_mask(z);
 
 	/* Find the first not empty zone with more specific mask */
-- 
cgit v1.2.3-58-ga151


From 117a8cdea3647e8e11fac10d14eafefc20f9bda5 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 14 Oct 2010 20:53:34 +0000
Subject: fib_hash: RCU conversion phase 1

First step for RCU conversion of fib_hash :

struct fn_zone are created and never deleted.

Very classic conversion, using rcu_assign_pointer(), rcu_dereference()
and rtnl_dereference() verbs.

__rcu markers on fz_next and fn_zone_list

They are created under RTNL, so we don't need fib_hash_lock anymore in
fn_new_zone().

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_hash.c | 57 +++++++++++++++++++++++++++++++++++------------------
 1 file changed, 38 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 10001aa40692..04f05a96b75b 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -57,7 +57,7 @@ struct fib_node {
 #define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head))
 
 struct fn_zone {
-	struct fn_zone		*fz_next;	/* Next not empty zone	*/
+	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/
 	struct hlist_head	*fz_hash;	/* Hash table pointer	*/
 	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
 
@@ -73,8 +73,8 @@ struct fn_zone {
 };
 
 struct fn_hash {
-	struct fn_zone	*fn_zones[33];
-	struct fn_zone	*fn_zone_list;
+	struct fn_zone		*fn_zones[33];
+	struct fn_zone __rcu	*fn_zone_list;
 };
 
 static inline u32 fn_hash(__be32 key, struct fn_zone *fz)
@@ -219,21 +219,21 @@ fn_new_zone(struct fn_hash *table, int z)
 	fz->fz_mask = inet_make_mask(z);
 
 	/* Find the first not empty zone with more specific mask */
-	for (i=z+1; i<=32; i++)
+	for (i = z + 1; i <= 32; i++)
 		if (table->fn_zones[i])
 			break;
-	write_lock_bh(&fib_hash_lock);
-	if (i>32) {
+	if (i > 32) {
 		/* No more specific masks, we are the first. */
-		fz->fz_next = table->fn_zone_list;
-		table->fn_zone_list = fz;
+		rcu_assign_pointer(fz->fz_next,
+				   rtnl_dereference(table->fn_zone_list));
+		rcu_assign_pointer(table->fn_zone_list, fz);
 	} else {
-		fz->fz_next = table->fn_zones[i]->fz_next;
-		table->fn_zones[i]->fz_next = fz;
+		rcu_assign_pointer(fz->fz_next,
+				   rtnl_dereference(table->fn_zones[i]->fz_next));
+		rcu_assign_pointer(table->fn_zones[i]->fz_next, fz);
 	}
 	table->fn_zones[z] = fz;
 	fib_hash_genid++;
-	write_unlock_bh(&fib_hash_lock);
 	return fz;
 }
 
@@ -245,8 +245,11 @@ int fib_table_lookup(struct fib_table *tb,
 	struct fn_zone *fz;
 	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 
+	rcu_read_lock();
 	read_lock(&fib_hash_lock);
-	for (fz = t->fn_zone_list; fz; fz = fz->fz_next) {
+	for (fz = rcu_dereference(t->fn_zone_list);
+	     fz != NULL;
+	     fz = rcu_dereference(fz->fz_next)) {
 		struct hlist_head *head;
 		struct hlist_node *node;
 		struct fib_node *f;
@@ -267,6 +270,7 @@ int fib_table_lookup(struct fib_table *tb,
 	err = 1;
 out:
 	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 	return err;
 }
 
@@ -362,6 +366,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 	return NULL;
 }
 
+/* Caller must hold RTNL. */
 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
@@ -657,13 +662,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 	return found;
 }
 
+/* caller must hold RTNL. */
 int fib_table_flush(struct fib_table *tb)
 {
 	struct fn_hash *table = (struct fn_hash *) tb->tb_data;
 	struct fn_zone *fz;
 	int found = 0;
 
-	for (fz = table->fn_zone_list; fz; fz = fz->fz_next) {
+	for (fz = rtnl_dereference(table->fn_zone_list);
+	     fz != NULL;
+	     fz = rtnl_dereference(fz->fz_next)) {
 		int i;
 
 		for (i = fz->fz_divisor - 1; i >= 0; i--)
@@ -741,23 +749,29 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb,
 int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 		   struct netlink_callback *cb)
 {
-	int m, s_m;
+	int m = 0, s_m;
 	struct fn_zone *fz;
 	struct fn_hash *table = (struct fn_hash *)tb->tb_data;
 
 	s_m = cb->args[2];
+	rcu_read_lock();
 	read_lock(&fib_hash_lock);
-	for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) {
-		if (m < s_m) continue;
+	for (fz = rcu_dereference(table->fn_zone_list);
+	     fz != NULL;
+	     fz = rcu_dereference(fz->fz_next), m++) {
+		if (m < s_m)
+			continue;
 		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
 			cb->args[2] = m;
 			read_unlock(&fib_hash_lock);
+			rcu_read_unlock();
 			return -1;
 		}
 		memset(&cb->args[3], 0,
 		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 	}
 	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 	cb->args[2] = m;
 	return skb->len;
 }
@@ -820,8 +834,9 @@ static struct fib_alias *fib_get_first(struct seq_file *seq)
 	iter->genid	= fib_hash_genid;
 	iter->valid	= 1;
 
-	for (iter->zone = table->fn_zone_list; iter->zone;
-	     iter->zone = iter->zone->fz_next) {
+	for (iter->zone = rcu_dereference(table->fn_zone_list);
+	     iter->zone != NULL;
+	     iter->zone = rcu_dereference(iter->zone->fz_next)) {
 		int maxslot;
 
 		if (!iter->zone->fz_nent)
@@ -906,7 +921,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq)
 			}
 		}
 
-		iter->zone = iter->zone->fz_next;
+		iter->zone = rcu_dereference(iter->zone->fz_next);
 
 		if (!iter->zone)
 			goto out;
@@ -946,9 +961,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
 
 static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
 	__acquires(fib_hash_lock)
+	__acquires(RCU)
 {
 	void *v = NULL;
 
+	rcu_read_lock();
 	read_lock(&fib_hash_lock);
 	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
 		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
@@ -963,8 +980,10 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
 static void fib_seq_stop(struct seq_file *seq, void *v)
 	__releases(fib_hash_lock)
+	__releases(RCU)
 {
 	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 }
 
 static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
-- 
cgit v1.2.3-58-ga151


From 19f572565ef66a0439574fd2299a7c804147e133 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Thu, 14 Oct 2010 20:56:39 +0000
Subject: fib_hash: RCU conversion phase 2

Get rid of fib_hash_lock rwlock.

The fn_zone hash table resize is the noticeable part of this patch.

I added a seqlock per fn_zone, so that readers can restart their lookup
in the (very rare) case a writer expanded the hash table.

Add rcu heads in fib_alias and fib_node, use call_rcu() to defer their
freeing, and use appropriate _rcu list manipulations.

Stress test (160.000.000 udp frames sent, IP route cache disabled to
mimic DDOS attack, FIB_HASH)

Before:
real	0m41.191s
user	0m13.137s
sys	8m55.241s

After:
real	0m38.091s
user	0m13.189s
sys	7m53.018s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_hash.c   | 176 +++++++++++++++++++++++++++++---------------------
 net/ipv4/fib_lookup.h |   2 -
 2 files changed, 101 insertions(+), 77 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 04f05a96b75b..4f1aafd3ba89 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -58,7 +58,8 @@ struct fib_node {
 
 struct fn_zone {
 	struct fn_zone __rcu	*fz_next;	/* Next not empty zone	*/
-	struct hlist_head	*fz_hash;	/* Hash table pointer	*/
+	struct hlist_head __rcu	*fz_hash;	/* Hash table pointer	*/
+	seqlock_t		fz_lock;
 	u32			fz_hashmask;	/* (fz_divisor - 1)	*/
 
 	u8			fz_order;	/* Zone order (0..32)	*/
@@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz)
 	return dst & FZ_MASK(fz);
 }
 
-static DEFINE_RWLOCK(fib_hash_lock);
 static unsigned int fib_hash_genid;
 
 #define FZ_MAX_DIVISOR ((PAGE_SIZE<<MAX_ORDER) / sizeof(struct hlist_head))
@@ -101,12 +101,11 @@ static struct hlist_head *fz_hash_alloc(int divisor)
 {
 	unsigned long size = divisor * sizeof(struct hlist_head);
 
-	if (size <= PAGE_SIZE) {
+	if (size <= PAGE_SIZE)
 		return kzalloc(size, GFP_KERNEL);
-	} else {
-		return (struct hlist_head *)
-			__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
-	}
+
+	return (struct hlist_head *)
+		__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(size));
 }
 
 /* The fib hash lock must be held when this is called. */
@@ -121,12 +120,12 @@ static inline void fn_rebuild_zone(struct fn_zone *fz,
 		struct fib_node *f;
 
 		hlist_for_each_entry_safe(f, node, n, &old_ht[i], fn_hash) {
-			struct hlist_head *new_head;
+			struct hlist_head __rcu *new_head;
 
-			hlist_del(&f->fn_hash);
+			hlist_del_rcu(&f->fn_hash);
 
 			new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
-			hlist_add_head(&f->fn_hash, new_head);
+			hlist_add_head_rcu(&f->fn_hash, new_head);
 		}
 	}
 }
@@ -175,32 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz)
 	ht = fz_hash_alloc(new_divisor);
 
 	if (ht)	{
-		write_lock_bh(&fib_hash_lock);
+		struct fn_zone nfz;
+
+		memcpy(&nfz, fz, sizeof(nfz));
+
+		write_seqlock_bh(&fz->fz_lock);
 		old_ht = fz->fz_hash;
-		fz->fz_hash = ht;
+		nfz.fz_hash = ht;
+		nfz.fz_hashmask = new_hashmask;
+		nfz.fz_divisor = new_divisor;
+		fn_rebuild_zone(&nfz, old_ht, old_divisor);
+		fib_hash_genid++;
+		rcu_assign_pointer(fz->fz_hash, ht);
 		fz->fz_hashmask = new_hashmask;
 		fz->fz_divisor = new_divisor;
-		fn_rebuild_zone(fz, old_ht, old_divisor);
-		fib_hash_genid++;
-		write_unlock_bh(&fib_hash_lock);
+		write_sequnlock_bh(&fz->fz_lock);
 
-		if (old_ht != fz->fz_embedded_hash)
+		if (old_ht != fz->fz_embedded_hash) {
+			synchronize_rcu();
 			fz_hash_free(old_ht, old_divisor);
+		}
 	}
 }
 
-static inline void fn_free_node(struct fib_node * f)
+static void fn_free_node_rcu(struct rcu_head *head)
 {
+	struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu);
+
 	kmem_cache_free(fn_hash_kmem, f);
 }
 
+static inline void fn_free_node(struct fib_node *f)
+{
+	call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu);
+}
+
+static void fn_free_alias_rcu(struct rcu_head *head)
+{
+	struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
+
+	kmem_cache_free(fn_alias_kmem, fa);
+}
+
 static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f)
 {
 	fib_release_info(fa->fa_info);
 	if (fa == &f->fn_embedded_alias)
 		fa->fa_info = NULL;
 	else
-		kmem_cache_free(fn_alias_kmem, fa);
+		call_rcu(&fa->rcu, fn_free_alias_rcu);
 }
 
 static struct fn_zone *
@@ -211,6 +233,7 @@ fn_new_zone(struct fn_hash *table, int z)
 	if (!fz)
 		return NULL;
 
+	seqlock_init(&fz->fz_lock);
 	fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1;
 	fz->fz_hashmask = fz->fz_divisor - 1;
 	fz->fz_hash = fz->fz_embedded_hash;
@@ -246,30 +269,34 @@ int fib_table_lookup(struct fib_table *tb,
 	struct fn_hash *t = (struct fn_hash *)tb->tb_data;
 
 	rcu_read_lock();
-	read_lock(&fib_hash_lock);
 	for (fz = rcu_dereference(t->fn_zone_list);
 	     fz != NULL;
 	     fz = rcu_dereference(fz->fz_next)) {
-		struct hlist_head *head;
+		struct hlist_head __rcu *head;
 		struct hlist_node *node;
 		struct fib_node *f;
-		__be32 k = fz_key(flp->fl4_dst, fz);
+		__be32 k;
+		unsigned int seq;
 
-		head = &fz->fz_hash[fn_hash(k, fz)];
-		hlist_for_each_entry(f, node, head, fn_hash) {
-			if (f->fn_key != k)
-				continue;
+		do {
+			seq = read_seqbegin(&fz->fz_lock);
+			k = fz_key(flp->fl4_dst, fz);
+
+			head = &fz->fz_hash[fn_hash(k, fz)];
+			hlist_for_each_entry_rcu(f, node, head, fn_hash) {
+				if (f->fn_key != k)
+					continue;
 
-			err = fib_semantic_match(&f->fn_alias,
+				err = fib_semantic_match(&f->fn_alias,
 						 flp, res,
 						 fz->fz_order, fib_flags);
-			if (err <= 0)
-				goto out;
-		}
+				if (err <= 0)
+					goto out;
+			}
+		} while (read_seqretry(&fz->fz_lock, seq));
 	}
 	err = 1;
 out:
-	read_unlock(&fib_hash_lock);
 	rcu_read_unlock();
 	return err;
 }
@@ -292,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb,
 	last_resort = NULL;
 	order = -1;
 
-	read_lock(&fib_hash_lock);
-	hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) {
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) {
 		struct fib_alias *fa;
 
-		list_for_each_entry(fa, &f->fn_alias, fa_list) {
+		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
 			struct fib_info *next_fi = fa->fa_info;
 
 			if (fa->fa_scope != res->scope ||
@@ -340,7 +367,7 @@ void fib_table_select_default(struct fib_table *tb,
 		fib_result_assign(res, last_resort);
 	tb->tb_default = last_idx;
 out:
-	read_unlock(&fib_hash_lock);
+	rcu_read_unlock();
 }
 
 /* Insert node F to FZ. */
@@ -348,7 +375,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f)
 {
 	struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)];
 
-	hlist_add_head(&f->fn_hash, head);
+	hlist_add_head_rcu(&f->fn_hash, head);
 }
 
 /* Return the node in FZ matching KEY. */
@@ -358,7 +385,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 	struct hlist_node *node;
 	struct fib_node *f;
 
-	hlist_for_each_entry(f, node, head, fn_hash) {
+	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 		if (f->fn_key == key)
 			return f;
 	}
@@ -366,6 +393,16 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key)
 	return NULL;
 }
 
+
+static struct fib_alias *fib_fast_alloc(struct fib_node *f)
+{
+	struct fib_alias *fa = &f->fn_embedded_alias;
+
+	if (fa->fa_info != NULL)
+		fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
+	return fa;
+}
+
 /* Caller must hold RTNL. */
 int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 {
@@ -451,7 +488,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 		}
 
 		if (cfg->fc_nlflags & NLM_F_REPLACE) {
-			struct fib_info *fi_drop;
 			u8 state;
 
 			fa = fa_first;
@@ -460,21 +496,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 					err = 0;
 				goto out;
 			}
-			write_lock_bh(&fib_hash_lock);
-			fi_drop = fa->fa_info;
-			fa->fa_info = fi;
-			fa->fa_type = cfg->fc_type;
-			fa->fa_scope = cfg->fc_scope;
+			err = -ENOBUFS;
+			new_fa = fib_fast_alloc(f);
+			if (new_fa == NULL)
+				goto out;
+
+			new_fa->fa_tos = fa->fa_tos;
+			new_fa->fa_info = fi;
+			new_fa->fa_type = cfg->fc_type;
+			new_fa->fa_scope = cfg->fc_scope;
 			state = fa->fa_state;
-			fa->fa_state &= ~FA_S_ACCESSED;
+			new_fa->fa_state = state & ~FA_S_ACCESSED;
 			fib_hash_genid++;
-			write_unlock_bh(&fib_hash_lock);
+			list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
 
-			fib_release_info(fi_drop);
+			fn_free_alias(fa, f);
 			if (state & FA_S_ACCESSED)
 				rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
-			rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id,
-				  &cfg->fc_nlinfo, NLM_F_REPLACE);
+			rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len,
+				  tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
 			return 0;
 		}
 
@@ -506,12 +546,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 		f = new_f;
 	}
 
-	new_fa = &f->fn_embedded_alias;
-	if (new_fa->fa_info != NULL) {
-		new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
-		if (new_fa == NULL)
-			goto out;
-	}
+	new_fa = fib_fast_alloc(f);
+	if (new_fa == NULL)
+		goto out;
+
 	new_fa->fa_info = fi;
 	new_fa->fa_tos = tos;
 	new_fa->fa_type = cfg->fc_type;
@@ -522,13 +560,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 	 * Insert new entry to the list.
 	 */
 
-	write_lock_bh(&fib_hash_lock);
 	if (new_f)
 		fib_insert_node(fz, new_f);
-	list_add_tail(&new_fa->fa_list,
+	list_add_tail_rcu(&new_fa->fa_list,
 		 (fa ? &fa->fa_list : &f->fn_alias));
 	fib_hash_genid++;
-	write_unlock_bh(&fib_hash_lock);
 
 	if (new_f)
 		fz->fz_nent++;
@@ -603,14 +639,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
 			  tb->tb_id, &cfg->fc_nlinfo, 0);
 
 		kill_fn = 0;
-		write_lock_bh(&fib_hash_lock);
-		list_del(&fa->fa_list);
+		list_del_rcu(&fa->fa_list);
 		if (list_empty(&f->fn_alias)) {
-			hlist_del(&f->fn_hash);
+			hlist_del_rcu(&f->fn_hash);
 			kill_fn = 1;
 		}
 		fib_hash_genid++;
-		write_unlock_bh(&fib_hash_lock);
 
 		if (fa->fa_state & FA_S_ACCESSED)
 			rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
@@ -641,14 +675,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx)
 			struct fib_info *fi = fa->fa_info;
 
 			if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
-				write_lock_bh(&fib_hash_lock);
-				list_del(&fa->fa_list);
+				list_del_rcu(&fa->fa_list);
 				if (list_empty(&f->fn_alias)) {
-					hlist_del(&f->fn_hash);
+					hlist_del_rcu(&f->fn_hash);
 					kill_f = 1;
 				}
 				fib_hash_genid++;
-				write_unlock_bh(&fib_hash_lock);
 
 				fn_free_alias(fa, f);
 				found++;
@@ -693,10 +725,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 
 	s_i = cb->args[4];
 	i = 0;
-	hlist_for_each_entry(f, node, head, fn_hash) {
+	hlist_for_each_entry_rcu(f, node, head, fn_hash) {
 		struct fib_alias *fa;
 
-		list_for_each_entry(fa, &f->fn_alias, fa_list) {
+		list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) {
 			if (i < s_i)
 				goto next;
 
@@ -714,7 +746,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb,
 				cb->args[4] = i;
 				return -1;
 			}
-		next:
+next:
 			i++;
 		}
 	}
@@ -755,7 +787,6 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 
 	s_m = cb->args[2];
 	rcu_read_lock();
-	read_lock(&fib_hash_lock);
 	for (fz = rcu_dereference(table->fn_zone_list);
 	     fz != NULL;
 	     fz = rcu_dereference(fz->fz_next), m++) {
@@ -763,14 +794,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
 			continue;
 		if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) {
 			cb->args[2] = m;
-			read_unlock(&fib_hash_lock);
 			rcu_read_unlock();
 			return -1;
 		}
 		memset(&cb->args[3], 0,
 		       sizeof(cb->args) - 3*sizeof(cb->args[0]));
 	}
-	read_unlock(&fib_hash_lock);
 	rcu_read_unlock();
 	cb->args[2] = m;
 	return skb->len;
@@ -960,13 +989,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *fib_seq_start(struct seq_file *seq, loff_t *pos)
-	__acquires(fib_hash_lock)
 	__acquires(RCU)
 {
 	void *v = NULL;
 
 	rcu_read_lock();
-	read_lock(&fib_hash_lock);
 	if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN))
 		v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 	return v;
@@ -979,17 +1006,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void fib_seq_stop(struct seq_file *seq, void *v)
-	__releases(fib_hash_lock)
 	__releases(RCU)
 {
-	read_unlock(&fib_hash_lock);
 	rcu_read_unlock();
 }
 
 static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi)
 {
 	static const unsigned type2flags[RTN_MAX + 1] = {
-		[7] = RTF_REJECT, [8] = RTF_REJECT,
+		[7] = RTF_REJECT,
+		[8] = RTF_REJECT,
 	};
 	unsigned flags = type2flags[type];
 
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index b9c9a9f2aee5..5072d8effd5d 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -12,9 +12,7 @@ struct fib_alias {
 	u8			fa_type;
 	u8			fa_scope;
 	u8			fa_state;
-#ifdef CONFIG_IP_FIB_TRIE
 	struct rcu_head		rcu;
-#endif
 };
 
 #define FA_S_ACCESSED	0x01
-- 
cgit v1.2.3-58-ga151


From 76b6717bc6ccb715b04e36efc26566a6313ede5f Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 18 Oct 2010 11:13:30 +0200
Subject: netfilter: fix kconfig unmet dependency warning

Fix netfilter kconfig unmet dependencies warning & spell out
"compatible" while there.

warning: (IP_NF_TARGET_TTL && NET && INET && NETFILTER && IP_NF_IPTABLES && NETFILTER_ADVANCED || IP6_NF_TARGET_HL && NET && INET && IPV6 && NETFILTER && IP6_NF_IPTABLES && NETFILTER_ADVANCED) selects NETFILTER_XT_TARGET_HL which has unmet direct dependencies ((IP_NF_MANGLE || IP6_NF_MANGLE) && NETFILTER_ADVANCED)

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 net/ipv4/netfilter/Kconfig | 4 ++--
 net/ipv6/netfilter/Kconfig | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 1833bdbf9805..8e3350643b63 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN
 
 config IP_NF_TARGET_TTL
 	tristate '"TTL" target support'
-	depends on NETFILTER_ADVANCED
+	depends on NETFILTER_ADVANCED && IP_NF_MANGLE
 	select NETFILTER_XT_TARGET_HL
 	---help---
-	This is a backwards-compat option for the user's convenience
+	This is a backwards-compatible option for the user's convenience
 	(e.g. when running oldconfig). It selects
 	CONFIG_NETFILTER_XT_TARGET_HL.
 
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 29d643bcafa4..44d2eeac089b 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -132,10 +132,10 @@ config IP6_NF_MATCH_RT
 # The targets
 config IP6_NF_TARGET_HL
 	tristate '"HL" hoplimit target support'
-	depends on NETFILTER_ADVANCED
+	depends on NETFILTER_ADVANCED && IP6_NF_MANGLE
 	select NETFILTER_XT_TARGET_HL
 	---help---
-	This is a backwards-compat option for the user's convenience
+	This is a backwards-compatible option for the user's convenience
 	(e.g. when running oldconfig). It selects
 	CONFIG_NETFILTER_XT_TARGET_HL.
 
-- 
cgit v1.2.3-58-ga151


From 27a954bd560f3e385bbed38fde3051fe718b8d75 Mon Sep 17 00:00:00 2001
From: Andy Walls <awalls@md.metrocast.net>
Date: Sun, 17 Oct 2010 15:11:22 +0000
Subject: IPv4: route.c: Change checks against 0xffffffff to ipv4_is_lbcast()

Change a few checks against the hardcoded broadcast address,
0xffffffff, to ipv4_is_lbcast().  Remove some existing checks
using ipv4_is_lbcast() that are now obviously superfluous.

Signed-off-by: Andy Walls <awalls@md.metrocast.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 0755aa4af86c..ff98983d2a45 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2124,7 +2124,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	    ipv4_is_loopback(saddr))
 		goto martian_source;
 
-	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
+	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
 		goto brd_input;
 
 	/* Accept zero addresses only to limited broadcast;
@@ -2133,8 +2133,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (ipv4_is_zeronet(saddr))
 		goto martian_source;
 
-	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
-	    ipv4_is_loopback(daddr))
+	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
 		goto martian_destination;
 
 	/*
@@ -2367,11 +2366,11 @@ static int __mkroute_output(struct rtable **result,
 	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
 		return -EINVAL;
 
-	if (fl->fl4_dst == htonl(0xFFFFFFFF))
+	if (ipv4_is_lbcast(fl->fl4_dst))
 		res->type = RTN_BROADCAST;
 	else if (ipv4_is_multicast(fl->fl4_dst))
 		res->type = RTN_MULTICAST;
-	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
+	else if (ipv4_is_zeronet(fl->fl4_dst))
 		return -EINVAL;
 
 	if (dev_out->flags & IFF_LOOPBACK)
@@ -2530,7 +2529,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 
 		if (oldflp->oif == 0 &&
 		    (ipv4_is_multicast(oldflp->fl4_dst) ||
-		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+		     ipv4_is_lbcast(oldflp->fl4_dst))) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
 			dev_out = __ip_dev_find(net, oldflp->fl4_src, false);
 			if (dev_out == NULL)
@@ -2574,7 +2573,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
 			goto out;	/* Wrong error code */
 
 		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
-		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
+		    ipv4_is_lbcast(oldflp->fl4_dst)) {
 			if (!fl.fl4_src)
 				fl.fl4_src = inet_select_addr(dev_out, 0,
 							      RT_SCOPE_LINK);
-- 
cgit v1.2.3-58-ga151


From 9e917dca74138cccf398ce8bb924c7fd2980ec1d Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Oct 2010 00:39:18 +0000
Subject: net: avoid a dev refcount in ip_mc_find_dev()

We hold RTNL in ip_mc_find_dev(), no need to touch device refcount.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_frontend.c | 2 +-
 net/ipv4/igmp.c         | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 3df057e89640..36e27c2107de 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -153,7 +153,7 @@ static void fib_flush(struct net *net)
  * @addr: the source address
  * @devref: if true, take a reference on the found device
  *
- * If a caller uses devref=false, it should be protected by RCU
+ * If a caller uses devref=false, it should be protected by RCU, or RTNL
  */
 struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref)
 {
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 25f339672b28..a525328ec372 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1418,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev)
 	write_unlock_bh(&in_dev->mc_list_lock);
 }
 
+/* RTNL is locked */
 static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 {
 	struct flowi fl = { .nl_u = { .ip4_u =
@@ -1433,10 +1434,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 		return idev;
 	}
 	if (imr->imr_address.s_addr) {
-		dev = ip_dev_find(net, imr->imr_address.s_addr);
+		dev = __ip_dev_find(net, imr->imr_address.s_addr, false);
 		if (!dev)
 			return NULL;
-		dev_put(dev);
 	}
 
 	if (!dev && !ip_route_output_key(net, &rt, &fl)) {
-- 
cgit v1.2.3-58-ga151


From 8723e1b4ad9be4444423b4d41509ce859a629649 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Oct 2010 00:39:26 +0000
Subject: inet: RCU changes in inetdev_by_index()

Convert inetdev_by_index() to not increment in_dev refcount.

Callers hold RCU or RTNL, and should not decrement in_dev refcount.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/devinet.c       |  7 ++++---
 net/ipv4/fib_semantics.c | 25 +++++++++++--------------
 net/ipv4/igmp.c          |  2 --
 net/ipv4/ip_gre.c        |  4 +---
 4 files changed, 16 insertions(+), 22 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index c2ff48fa18c7..dc94b0316b78 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa)
 	return inet_insert_ifa(ifa);
 }
 
+/* Caller must hold RCU or RTNL :
+ * We dont take a reference on found in_device
+ */
 struct in_device *inetdev_by_index(struct net *net, int ifindex)
 {
 	struct net_device *dev;
@@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex)
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, ifindex);
 	if (dev)
-		in_dev = in_dev_get(dev);
+		in_dev = rcu_dereference_rtnl(dev->ip_ptr);
 	rcu_read_unlock();
 	return in_dev;
 }
@@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg
 		goto errout;
 	}
 
-	__in_dev_put(in_dev);
-
 	for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL;
 	     ifap = &ifa->ifa_next) {
 		if (tb[IFA_LOCAL] &&
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 0f80dfc2f7fb..6734c9cab248 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -590,32 +590,29 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
 		if (!dev)
 			goto out;
 		dev_hold(dev);
-		err = -ENETDOWN;
-		if (!(dev->flags & IFF_UP))
-			goto out;
-		err = 0;
-out:
-		rcu_read_unlock();
-		return err;
+		err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
 	} else {
 		struct in_device *in_dev;
 
 		if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
 			return -EINVAL;
 
+		rcu_read_lock();
+		err = -ENODEV;
 		in_dev = inetdev_by_index(net, nh->nh_oif);
 		if (in_dev == NULL)
-			return -ENODEV;
-		if (!(in_dev->dev->flags & IFF_UP)) {
-			in_dev_put(in_dev);
-			return -ENETDOWN;
-		}
+			goto out;
+		err = -ENETDOWN;
+		if (!(in_dev->dev->flags & IFF_UP))
+			goto out;
 		nh->nh_dev = in_dev->dev;
 		dev_hold(nh->nh_dev);
 		nh->nh_scope = RT_SCOPE_HOST;
-		in_dev_put(in_dev);
+		err = 0;
 	}
-	return 0;
+out:
+	rcu_read_unlock();
+	return err;
 }
 
 static inline unsigned int fib_laddr_hashfn(__be32 val)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index a525328ec372..c8877c6c7216 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -1429,8 +1429,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
 
 	if (imr->imr_ifindex) {
 		idev = inetdev_by_index(net, imr->imr_ifindex);
-		if (idev)
-			__in_dev_put(idev);
 		return idev;
 	}
 	if (imr->imr_address.s_addr) {
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 9d421f4cf3ef..d0ffcbe369b7 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1245,10 +1245,8 @@ static int ipgre_close(struct net_device *dev)
 	if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
 		struct in_device *in_dev;
 		in_dev = inetdev_by_index(dev_net(dev), t->mlink);
-		if (in_dev) {
+		if (in_dev)
 			ip_mc_dec_group(in_dev, t->parms.iph.daddr);
-			in_dev_put(in_dev);
-		}
 	}
 	return 0;
 }
-- 
cgit v1.2.3-58-ga151


From 27b75c95f10d249574d9c4cb9dab878107faede8 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Fri, 15 Oct 2010 05:44:11 +0000
Subject: net: avoid RCU for NOCACHE dst

There is no point using RCU for dst we allocate for a very short time
(used once).

Change dst_release() to take DST_NOCACHE into account, but also change
skb_dst_set_noref() to force a refcount increment for such dst.

This is a _huge_ gain, because we don't waste memory storing thousands
of dsts. Instead of queueing them to RCU, we can free them instantly.

CPU caches can stay hot, re-using same memory blocks to hold temporary
dsts.

Note : remove unneeded smp_mb__before_atomic_dec(); in dst_release(),
since atomic_dec_return() implies a full memory barrier.

Stress test, 160.000.000 udp frames sent, IP route cache disabled
(DDOS).

Before:

real    0m38.091s
user    0m13.189s
sys     7m53.018s

After:

real	0m29.946s
user	0m12.157s
sys	7m40.605s

For reference, if IP route cache was enabled :

real	0m32.030s
user	0m10.521s
sys	8m15.243s

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/skbuff.h | 14 +-------------
 net/core/dst.c         | 29 ++++++++++++++++++++++++++++-
 net/ipv4/route.c       |  9 ++++-----
 3 files changed, 33 insertions(+), 19 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 05a358f1ba11..e6ba898de61c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -460,19 +460,7 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst)
 	skb->_skb_refdst = (unsigned long)dst;
 }
 
-/**
- * skb_dst_set_noref - sets skb dst, without a reference
- * @skb: buffer
- * @dst: dst entry
- *
- * Sets skb dst, assuming a reference was not taken on dst
- * skb_dst_drop() should not dst_release() this dst
- */
-static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
-{
-	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
-	skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
-}
+extern void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst);
 
 /**
  * skb_dst_is_noref - Test if skb dst isnt refcounted
diff --git a/net/core/dst.c b/net/core/dst.c
index 32e542d7f472..8abe628b79f1 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -271,13 +271,40 @@ void dst_release(struct dst_entry *dst)
 	if (dst) {
 		int newrefcnt;
 
-		smp_mb__before_atomic_dec();
 		newrefcnt = atomic_dec_return(&dst->__refcnt);
 		WARN_ON(newrefcnt < 0);
+		if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) {
+			dst = dst_destroy(dst);
+			if (dst)
+				__dst_free(dst);
+		}
 	}
 }
 EXPORT_SYMBOL(dst_release);
 
+/**
+ * skb_dst_set_noref - sets skb dst, without a reference
+ * @skb: buffer
+ * @dst: dst entry
+ *
+ * Sets skb dst, assuming a reference was not taken on dst
+ * skb_dst_drop() should not dst_release() this dst
+ */
+void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst)
+{
+	WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+	/* If dst not in cache, we must take a reference, because
+	 * dst_release() will destroy dst as soon as its refcount becomes zero
+	 */
+	if (unlikely(dst->flags & DST_NOCACHE)) {
+		dst_hold(dst);
+		skb_dst_set(skb, dst);
+	} else {
+		skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF;
+	}
+}
+EXPORT_SYMBOL(skb_dst_set_noref);
+
 /* Dirty hack. We did it in 2.2 (in __dst_free),
  * we have _very_ good reasons not to repeat
  * this mistake in 2.3, but we have no choice
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ff98983d2a45..d6cb2bfcd8e1 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1105,9 +1105,9 @@ restart:
 		 * Note that we do rt_free on this new route entry, so that
 		 * once its refcount hits zero, we are still able to reap it
 		 * (Thanks Alexey)
-		 * Note also the rt_free uses call_rcu.  We don't actually
-		 * need rcu protection here, this is just our path to get
-		 * on the route gc list.
+		 * Note: To avoid expensive rcu stuff for this uncached dst,
+		 * we set DST_NOCACHE so that dst_release() can free dst without
+		 * waiting a grace period.
 		 */
 
 		rt->dst.flags |= DST_NOCACHE;
@@ -1117,12 +1117,11 @@ restart:
 				if (net_ratelimit())
 					printk(KERN_WARNING
 					    "Neighbour table failure & not caching routes.\n");
-				rt_drop(rt);
+				ip_rt_put(rt);
 				return err;
 			}
 		}
 
-		rt_free(rt);
 		goto skip_hashing;
 	}
 
-- 
cgit v1.2.3-58-ga151


From 9b0c290e78d667e6a483bde8c7cef7dd15f49017 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 20 Oct 2010 22:03:38 +0000
Subject: fib: introduce fib_alias_accessed() helper

A perf tools session at NFWS 2010 pointed out false sharing on struct
fib_alias that can be avoided pretty easily if we set the FA_S_ACCESSED bit
only when needed (i.e. when it is not already set).

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_hash.c      | 3 ++-
 net/ipv4/fib_lookup.h    | 7 +++++++
 net/ipv4/fib_semantics.c | 2 +-
 net/ipv4/fib_trie.c      | 3 ++-
 4 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c
index 4f1aafd3ba89..43e1c594ce8f 100644
--- a/net/ipv4/fib_hash.c
+++ b/net/ipv4/fib_hash.c
@@ -335,7 +335,8 @@ void fib_table_select_default(struct fib_table *tb,
 			if (!next_fi->fib_nh[0].nh_gw ||
 			    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 				continue;
-			fa->fa_state |= FA_S_ACCESSED;
+
+			fib_alias_accessed(fa);
 
 			if (fi == NULL) {
 				if (next_fi != res->fi)
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index 5072d8effd5d..a29edf2219c8 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -17,6 +17,13 @@ struct fib_alias {
 
 #define FA_S_ACCESSED	0x01
 
+/* Dont write on fa_state unless needed, to keep it shared on all cpus */
+static inline void fib_alias_accessed(struct fib_alias *fa)
+{
+	if (!(fa->fa_state & FA_S_ACCESSED))
+		fa->fa_state |= FA_S_ACCESSED;
+}
+
 /* Exported by fib_semantics.c */
 extern int fib_semantic_match(struct list_head *head,
 			      const struct flowi *flp,
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 6734c9cab248..3e0da3ef6116 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -901,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp,
 		if (fa->fa_scope < flp->fl4_scope)
 			continue;
 
-		fa->fa_state |= FA_S_ACCESSED;
+		fib_alias_accessed(fa);
 
 		err = fib_props[fa->fa_type].error;
 		if (err == 0) {
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 31494f335686..cd5e13aee7d5 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1838,7 +1838,8 @@ void fib_table_select_default(struct fib_table *tb,
 		if (!next_fi->fib_nh[0].nh_gw ||
 		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
 			continue;
-		fa->fa_state |= FA_S_ACCESSED;
+
+		fib_alias_accessed(fa);
 
 		if (fi == NULL) {
 			if (next_fi != res->fi)
-- 
cgit v1.2.3-58-ga151


From 093d282321daeb19c107e5f1f16d7f68484f3ade Mon Sep 17 00:00:00 2001
From: Balazs Scheidler <bazsi@balabit.hu>
Date: Thu, 21 Oct 2010 13:06:43 +0200
Subject: tproxy: fix hash locking issue when using port redirection in
 __inet_inherit_port()

When __inet_inherit_port() is called on a tproxy connection the wrong locks are
held for the inet_bind_bucket it is added to. __inet_inherit_port() made an
implicit assumption that the listener's port number (and thus its bind bucket)
is the same as that of the child socket. Unfortunately, if you're using the
TPROXY target to redirect skbs to a transparent proxy that assumption is not
true anymore and things break.

This patch adds code to __inet_inherit_port() so that it can handle this case
by looking up or creating a new bind bucket for the child socket and updates
callers of __inet_inherit_port() to gracefully handle __inet_inherit_port()
failing.

Reported by and original patch from Stephen Buck <stephen.buck@exinda.com>.
See http://marc.info/?t=128169268200001&r=1&w=2 for the original discussion.

Signed-off-by: KOVACS Krisztian <hidden@balabit.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>
---
 include/net/inet_hashtables.h |  2 +-
 net/dccp/ipv4.c               | 10 +++++++---
 net/dccp/ipv6.c               | 10 +++++++---
 net/ipv4/inet_hashtables.c    | 28 ++++++++++++++++++++++++++--
 net/ipv4/tcp_ipv4.c           | 10 +++++++---
 net/ipv6/tcp_ipv6.c           | 12 ++++++++----
 6 files changed, 56 insertions(+), 16 deletions(-)

(limited to 'net/ipv4')

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 74358d1b3f43..e9c2ed8af864 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -245,7 +245,7 @@ static inline int inet_sk_listen_hashfn(const struct sock *sk)
 }
 
 /* Caller must disable local BH processing. */
-extern void __inet_inherit_port(struct sock *sk, struct sock *child);
+extern int __inet_inherit_port(struct sock *sk, struct sock *child);
 
 extern void inet_put_port(struct sock *sk);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index d4a166f0f391..3f69ea114829 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -392,7 +392,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newsk = dccp_create_openreq_child(sk, req, skb);
 	if (newsk == NULL)
-		goto exit;
+		goto exit_nonewsk;
 
 	sk_setup_caps(newsk, dst);
 
@@ -409,16 +409,20 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	dccp_sync_mss(newsk, dst_mtu(dst));
 
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto exit;
+	}
 	__inet_hash_nolisten(newsk, NULL);
-	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
 exit_overflow:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+	dst_release(dst);
 exit:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-	dst_release(dst);
 	return NULL;
 }
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 6e3f32575df7..dca711df9b60 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -564,7 +564,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 
 	newsk = dccp_create_openreq_child(sk, req, skb);
 	if (newsk == NULL)
-		goto out;
+		goto out_nonewsk;
 
 	/*
 	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -632,18 +632,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 	newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6;
 	newinet->inet_rcv_saddr = LOOPBACK4_IPV6;
 
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto out;
+	}
 	__inet6_hash(newsk, NULL);
-	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
 out_overflow:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+out_nonewsk:
+	dst_release(dst);
 out:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 	if (opt != NULL && opt != np->opt)
 		sock_kfree_s(sk, opt, opt->tot_len);
-	dst_release(dst);
 	return NULL;
 }
 
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index fb7ad5a21ff3..1b344f30b463 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -101,19 +101,43 @@ void inet_put_port(struct sock *sk)
 }
 EXPORT_SYMBOL(inet_put_port);
 
-void __inet_inherit_port(struct sock *sk, struct sock *child)
+int __inet_inherit_port(struct sock *sk, struct sock *child)
 {
 	struct inet_hashinfo *table = sk->sk_prot->h.hashinfo;
-	const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num,
+	unsigned short port = inet_sk(child)->inet_num;
+	const int bhash = inet_bhashfn(sock_net(sk), port,
 			table->bhash_size);
 	struct inet_bind_hashbucket *head = &table->bhash[bhash];
 	struct inet_bind_bucket *tb;
 
 	spin_lock(&head->lock);
 	tb = inet_csk(sk)->icsk_bind_hash;
+	if (tb->port != port) {
+		/* NOTE: using tproxy and redirecting skbs to a proxy
+		 * on a different listener port breaks the assumption
+		 * that the listener socket's icsk_bind_hash is the same
+		 * as that of the child socket. We have to look up or
+		 * create a new bind bucket for the child here. */
+		struct hlist_node *node;
+		inet_bind_bucket_for_each(tb, node, &head->chain) {
+			if (net_eq(ib_net(tb), sock_net(sk)) &&
+			    tb->port == port)
+				break;
+		}
+		if (!node) {
+			tb = inet_bind_bucket_create(table->bind_bucket_cachep,
+						     sock_net(sk), head, port);
+			if (!tb) {
+				spin_unlock(&head->lock);
+				return -ENOMEM;
+			}
+		}
+	}
 	sk_add_bind_node(child, &tb->owners);
 	inet_csk(child)->icsk_bind_hash = tb;
 	spin_unlock(&head->lock);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(__inet_inherit_port);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index a0232f3a358b..8f8527d41682 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1422,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newsk = tcp_create_openreq_child(sk, req, skb);
 	if (!newsk)
-		goto exit;
+		goto exit_nonewsk;
 
 	newsk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(newsk, dst);
@@ -1469,16 +1469,20 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto exit;
+	}
 	__inet_hash_nolisten(newsk, NULL);
-	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
 exit_overflow:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
+exit_nonewsk:
+	dst_release(dst);
 exit:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
-	dst_release(dst);
 	return NULL;
 }
 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index fe6d40418c0b..ba5258ef1c57 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1409,7 +1409,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 
 	newsk = tcp_create_openreq_child(sk, req, skb);
 	if (newsk == NULL)
-		goto out;
+		goto out_nonewsk;
 
 	/*
 	 * No need to charge this sock to the relevant IPv6 refcnt debug socks
@@ -1497,18 +1497,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	}
 #endif
 
+	if (__inet_inherit_port(sk, newsk) < 0) {
+		sock_put(newsk);
+		goto out;
+	}
 	__inet6_hash(newsk, NULL);
-	__inet_inherit_port(sk, newsk);
 
 	return newsk;
 
 out_overflow:
 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
-out:
-	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
+out_nonewsk:
 	if (opt && opt != np->opt)
 		sock_kfree_s(sk, opt, opt->tot_len);
 	dst_release(dst);
+out:
+	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
 	return NULL;
 }
 
-- 
cgit v1.2.3-58-ga151


From b0aeef30433ea6854e985c2e9842fa19f51b95cc Mon Sep 17 00:00:00 2001
From: Julian Anastasov <ja@ssi.bg>
Date: Mon, 11 Oct 2010 11:23:07 +0300
Subject: nf_nat: restrict ICMP translation for embedded header

Skip ICMP translation of the embedded protocol header if the NAT bits
are not set. This is needed for IPVS to see the original embedded
addresses, because for IPVS traffic the IPS_SRC_NAT_BIT and
IPS_DST_NAT_BIT bits are not set. This happens when IPVS performs
DNAT for client packets after using nf_conntrack_alter_reply
to expect replies from the real server.

Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/ipv4/netfilter/nf_nat_core.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'net/ipv4')

diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index e2e00c4da883..0047923c1f22 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -462,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 			return 0;
 	}
 
+	if (manip == IP_NAT_MANIP_SRC)
+		statusbit = IPS_SRC_NAT;
+	else
+		statusbit = IPS_DST_NAT;
+
+	/* Invert if this is reply dir. */
+	if (dir == IP_CT_DIR_REPLY)
+		statusbit ^= IPS_NAT_MASK;
+
+	if (!(ct->status & statusbit))
+		return 1;
+
 	pr_debug("icmp_reply_translation: translating error %p manip %u "
 		 "dir %s\n", skb, manip,
 		 dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY");
@@ -496,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct,
 
 	/* Change outer to look the reply to an incoming packet
 	 * (proto 0 means don't invert per-proto part). */
-	if (manip == IP_NAT_MANIP_SRC)
-		statusbit = IPS_SRC_NAT;
-	else
-		statusbit = IPS_DST_NAT;
-
-	/* Invert if this is reply dir. */
-	if (dir == IP_CT_DIR_REPLY)
-		statusbit ^= IPS_NAT_MASK;
-
-	if (ct->status & statusbit) {
-		nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
-		if (!manip_pkt(0, skb, 0, &target, manip))
-			return 0;
-	}
+	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);
+	if (!manip_pkt(0, skb, 0, &target, manip))
+		return 0;
 
 	return 1;
 }
-- 
cgit v1.2.3-58-ga151