From 75e8d06d4308436055d1a78a2c02bf6328ba724d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 14 Jan 2015 15:33:57 +0100 Subject: netfilter: nf_tables: validate hooks in NAT expressions A user can crash the kernel by using any of the existing NAT expressions from the wrong hook, so add code to validate the hook when the rule is loaded. This patch introduces nft_chain_validate_hooks(), which is based on an existing function in the bridge version of the reject expression. Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 3ae969e3acf0..9eaaa7884586 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -530,6 +530,8 @@ enum nft_chain_type { int nft_chain_validate_dependency(const struct nft_chain *chain, enum nft_chain_type type); +int nft_chain_validate_hooks(const struct nft_chain *chain, + unsigned int hook_flags); struct nft_stats { u64 bytes; -- cgit v1.2.3-58-ga151 From e73ebb0881ea5534ce606c1d71b4ac44db5c6930 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 28 Jan 2015 20:01:35 -0500 Subject: tcp: stretch ACK fixes prep LRO, GRO, delayed ACKs, and middleboxes can cause "stretch ACKs" that cover more than the RFC-specified maximum of 2 packets. These stretch ACKs can cause serious performance shortfalls in common congestion control algorithms that were designed and tuned years ago with receiver hosts that were not using LRO or GRO, and were instead politely ACKing every other packet. This patch series fixes Reno and CUBIC to handle stretch ACKs. This patch prepares for the upcoming stretch ACK bug fix patches. It adds an "acked" parameter to tcp_cong_avoid_ai() so that future fixes can make it handle stretch ACKs correctly, and changes all congestion control algorithms to pass in 1 for the ACKed count for now. It also changes tcp_slow_start() to return the number of packet ACK "credits" that were not consumed in slow start mode and can therefore be processed by the congestion control module in additive increase mode. In future patches we will fix tcp_cong_avoid_ai() to handle stretch ACKs, and fix Reno and CUBIC handling of stretch ACKs in slow start and additive increase mode. Reported-by: Eyal Perry Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller
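With the new signatures, a congestion control module can let tcp_slow_start() consume what it can and feed the remainder into additive increase. A minimal sketch of the intended call pattern (an editor's illustration of where the series is heading; the modules touched by this patch still pass 1):

	/* Hypothetical cong_avoid handler built on the new return value. */
	static void example_cong_avoid(struct sock *sk, u32 ack, u32 acked)
	{
		struct tcp_sock *tp = tcp_sk(sk);

		if (!tcp_is_cwnd_limited(sk))
			return;

		if (tp->snd_cwnd <= tp->snd_ssthresh) {
			/* Grow cwnd exponentially; keep whatever ACK credits
			 * slow start could not use once cwnd hit ssthresh.
			 */
			acked = tcp_slow_start(tp, acked);
			if (!acked)
				return;
		}
		/* Spend the leftover credits in additive increase. */
		tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked);
	}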
--- include/net/tcp.h | 4 ++-- net/ipv4/tcp_bic.c | 2 +- net/ipv4/tcp_cong.c | 11 +++++++---- net/ipv4/tcp_cubic.c | 2 +- net/ipv4/tcp_scalable.c | 3 ++- net/ipv4/tcp_veno.c | 2 +- net/ipv4/tcp_yeah.c | 2 +- 7 files changed, 15 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index f50f29faf76f..9d9111ef43ae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -834,8 +834,8 @@ void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name); -void tcp_slow_start(struct tcp_sock *tp, u32 acked); -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w); +u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index bb395d46a389..c037644eafb7 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_slow_start(tp, acked); else { bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt); + tcp_cong_avoid_ai(tp, ca->cnt, 1); } } diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 27ead0dd16bc..6826017c12d1 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -291,25 +291,28 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) * ABC caps N to 2. Slow start exits when cwnd grows over ssthresh and * returns the leftover acks to adjust cwnd in congestion avoidance mode. */ -void tcp_slow_start(struct tcp_sock *tp, u32 acked) +u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { u32 cwnd = tp->snd_cwnd + acked; if (cwnd > tp->snd_ssthresh) cwnd = tp->snd_ssthresh + 1; + acked -= cwnd - tp->snd_cwnd; tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + + return acked; } EXPORT_SYMBOL_GPL(tcp_slow_start); /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd (or alternative w) */ -void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w) +void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) { if (tp->snd_cwnd_cnt >= w) { if (tp->snd_cwnd < tp->snd_cwnd_clamp) tp->snd_cwnd++; tp->snd_cwnd_cnt = 0; } else { - tp->snd_cwnd_cnt++; + tp->snd_cwnd_cnt += acked; } } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); @@ -333,7 +336,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_slow_start(tp, acked); /* In dangerous area, increase slowly.
*/ else - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 6b6002416a73..df4bc4d87e58 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -320,7 +320,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) tcp_slow_start(tp, acked); } else { bictcp_update(ca, tp->snd_cwnd); - tcp_cong_avoid_ai(tp, ca->cnt); + tcp_cong_avoid_ai(tp, ca->cnt, 1); } } diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 6824afb65d93..333bcb2415ff 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -25,7 +25,8 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (tp->snd_cwnd <= tp->snd_ssthresh) tcp_slow_start(tp, acked); else - tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT)); + tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + 1); } static u32 tcp_scalable_ssthresh(struct sock *sk) diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index a4d2d2d88dca..112151eeee45 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -159,7 +159,7 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* In the "non-congestive state", increase cwnd * every rtt. */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } else { /* In the "congestive state", increase cwnd * every other rtt. diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index cd7273218598..17d35662930d 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -92,7 +92,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) } else { /* Reno */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd); + tcp_cong_avoid_ai(tp, tp->snd_cwnd, 1); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. -- cgit v1.2.3-58-ga151 From 0d32ef8cef9aa8f375e128f78b77caceaa7e8da0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Jan 2015 17:30:12 -0800 Subject: net: sched: fix panic in rate estimators Doing the following commands on a non-idle network device panics the box instantly, because cpu_bstats gets overwritten by stats. tc qdisc add dev eth0 root ... some traffic (one packet is enough) ... tc qdisc replace dev eth0 root est 1sec 4sec [ 325.355596] BUG: unable to handle kernel paging request at ffff8841dc5a074c [ 325.362609] IP: [] __gnet_stats_copy_basic+0x3e/0x90 [ 325.369158] PGD 1fa7067 PUD 0 [ 325.372254] Oops: 0000 [#1] SMP [ 325.375514] Modules linked in: ... [ 325.398346] CPU: 13 PID: 14313 Comm: tc Not tainted 3.19.0-smp-DEV #1163 [ 325.412042] task: ffff8800793ab5d0 ti: ffff881ff2fa4000 task.ti: ffff881ff2fa4000 [ 325.419518] RIP: 0010:[] [] __gnet_stats_copy_basic+0x3e/0x90 [ 325.428506] RSP: 0018:ffff881ff2fa7928 EFLAGS: 00010286 [ 325.433824] RAX: 000000000000000c RBX: ffff881ff2fa796c RCX: 000000000000000c [ 325.440988] RDX: ffff8841dc5a0744 RSI: 0000000000000060 RDI: 0000000000000060 [ 325.448120] RBP: ffff881ff2fa7948 R08: ffffffff81cd4f80 R09: 0000000000000000 [ 325.455268] R10: ffff883ff223e400 R11: 0000000000000000 R12: 000000015cba0744 [ 325.462405] R13: ffffffff81cd4f80 R14: ffff883ff223e460 R15: ffff883feea0722c [ 325.469536] FS: 00007f2ee30fa700(0000) GS:ffff88407fa20000(0000) knlGS:0000000000000000 [ 325.477630] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 325.483380] CR2: ffff8841dc5a074c CR3: 0000003feeae9000 CR4: 00000000001407e0 [ 325.490510] Stack: [ 325.492524] ffff883feea0722c ffff883fef719dc0 ffff883feea0722c ffff883ff223e4a0 [ 325.499990] ffff881ff2fa79a8 ffffffff815424ee ffff883ff223e49c 000000015cba0744 [ 325.507460] 00000000f2fa7978 0000000000000000 ffff881ff2fa79a8 ffff883ff223e4a0 [ 325.514956] Call Trace: [ 325.517412] [] gen_new_estimator+0x8e/0x230 [ 325.523250] [] gen_replace_estimator+0x4a/0x60 [ 325.529349] [] tc_modify_qdisc+0x52b/0x590 [ 325.535117] [] rtnetlink_rcv_msg+0xa0/0x240 [ 325.540963] [] ? __rtnl_unlock+0x20/0x20 [ 325.546532] [] netlink_rcv_skb+0xb1/0xc0 [ 325.552145] [] rtnetlink_rcv+0x25/0x40 [ 325.557558] [] netlink_unicast+0x168/0x220 [ 325.563317] [] netlink_sendmsg+0x2ec/0x3e0 Let's play it safe and not use a union: percpu pointers are mostly read anyway, and we typically have few qdiscs per host. Signed-off-by: Eric Dumazet Cc: John Fastabend Fixes: 22e0f8b9322c ("net: sched: make bstats per cpu and estimator RCU safe") Signed-off-by: David S. Miller
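The crash is an aliasing bug: with the union, a qdisc using per-cpu counters keeps cpu_bstats in the same bytes as the embedded bstats, so estimator code that writes through the embedded view corrupts the percpu pointer. A stripped-down illustration of the hazard (hypothetical stand-in types, not the kernel structs):

	struct stats { unsigned long bytes, packets; };

	struct qdisc_like {
		union {					/* before the fix: overlapping storage */
			struct stats bstats;		/* legacy qdiscs */
			struct stats *cpu_bstats;	/* per-cpu qdiscs */
		};
	};

	/* Estimator code that assumes the non-percpu layout... */
	static void estimator_touch(struct qdisc_like *q)
	{
		q->bstats.bytes = 0;	/* ...silently corrupts cpu_bstats */
		q->bstats.packets = 0;	/* when the qdisc is per-cpu. */
	}

Splitting the members, as the patch below does, costs a few extra bytes per Qdisc but makes such writes harmless.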
--- include/net/sch_generic.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 3d282cbb66bf..c605d305c577 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -79,6 +79,9 @@ struct Qdisc { struct netdev_queue *dev_queue; struct gnet_stats_rate_est64 rate_est; + struct gnet_stats_basic_cpu __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + struct Qdisc *next_sched; struct sk_buff *gso_skb; /* @@ -86,15 +89,9 @@ struct Qdisc { */ unsigned long state; struct sk_buff_head q; - union { - struct gnet_stats_basic_packed bstats; - struct gnet_stats_basic_cpu __percpu *cpu_bstats; - } __packed; + struct gnet_stats_basic_packed bstats; unsigned int __state; - union { - struct gnet_stats_queue qstats; - struct gnet_stats_queue __percpu *cpu_qstats; - } __packed; + struct gnet_stats_queue qstats; struct rcu_head rcu_head; int padded; atomic_t refcnt; -- cgit v1.2.3-58-ga151 From bdbbb8527b6f6a358dbcb70dac247034d665b8e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 29 Jan 2015 21:35:05 -0800 Subject: ipv4: tcp: get rid of ugly unicast_sock In commit be9f4a44e7d41 ("ipv4: tcp: remove per net tcp_sock") I tried to address contention on a socket lock, but the solution I chose was horrible: commit 3a7c384ffd57e ("ipv4: tcp: unicast_sock should not land outside of TCP stack") addressed a selinux regression. commit 0980e56e506b ("ipv4: tcp: set unicast_sock uc_ttl to -1") took care of another regression. commit b5ec8eeac46 ("ipv4: fix ip_send_skb()") fixed another regression. commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate") was another shot in the dark. Really, just use a proper socket per cpu, and remove the skb_orphan() call, to re-enable flow control. This solves a serious problem with the FQ packet scheduler when used in hostile environments, as we do not want to allocate a flow structure for every RST packet sent in response to a spoofed packet. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller
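In sketch form, the access pattern changes like this (compare the removed and added lines in the diff below): the old code had to pin the CPU with get_cpu_var()/put_cpu_var() around the whole send and shared one fake socket per CPU across all netns; the new reply paths run from the TCP receive path in softirq context, so a plain this_cpu_ptr() dereference of a real per-netns, per-CPU kernel socket is enough.

	/* old: fake shared inet_sock, explicit CPU pinning */
	inet = &get_cpu_var(unicast_sock);
	...
	put_cpu_var(unicast_sock);

	/* new: one real kernel socket per CPU and per netns */
	sk = *this_cpu_ptr(net->ipv4.tcp_sk);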
--- include/net/ip.h | 2 +- include/net/netns/ipv4.h | 1 + net/ipv4/ip_output.c | 30 +++--------------------------- net/ipv4/tcp_ipv4.c | 37 ++++++++++++++++++++++++++++++++----- 4 files changed, 37 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index f7cbd703d15d..09cf5aebb283 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -181,7 +181,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg) return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0; } -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 24945cefc4fd..0ffef1a38efc 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -52,6 +52,7 @@ struct netns_ipv4 { struct inet_peer_base *peers; struct tcpm_hash_bucket *tcp_metrics_hash; unsigned int tcp_metrics_hash_log; + struct sock * __percpu *tcp_sk; struct netns_frags frags; #ifdef CONFIG_NETFILTER struct xt_table *iptable_filter; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 38a20a9cca1a..c373c0708d97 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1506,24 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset, /* * Generic function to send a packet as reply to another packet. * Used to send some TCP resets/acks so far. - * - * Use a fake percpu inet socket to avoid false sharing and contention.
*/ -static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = { - .sk = { - .__sk_common = { - .skc_refcnt = ATOMIC_INIT(1), - }, - .sk_wmem_alloc = ATOMIC_INIT(1), - .sk_allocation = GFP_ATOMIC, - .sk_flags = (1UL << SOCK_USE_WRITE_QUEUE), - .sk_pacing_rate = ~0U, - }, - .pmtudisc = IP_PMTUDISC_WANT, - .uc_ttl = -1, -}; - -void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, +void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, const struct ip_options *sopt, __be32 daddr, __be32 saddr, const struct ip_reply_arg *arg, @@ -1533,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); + struct net *net = sock_net(sk); struct sk_buff *nskb; - struct sock *sk; - struct inet_sock *inet; int err; if (__ip_options_echo(&replyopts.opt.opt, skb, sopt)) @@ -1566,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, if (IS_ERR(rt)) return; - inet = &get_cpu_var(unicast_sock); + inet_sk(sk)->tos = arg->tos; - inet->tos = arg->tos; - sk = &inet->sk; sk->sk_priority = skb->priority; sk->sk_protocol = ip_hdr(skb)->protocol; sk->sk_bound_dev_if = arg->bound_dev_if; - sock_net_set(sk, net); - __skb_queue_head_init(&sk->sk_write_queue); sk->sk_sndbuf = sysctl_wmem_default; err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base, len, 0, &ipc, &rt, MSG_DONTWAIT); @@ -1590,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - skb_orphan(nskb); skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb)); ip_push_pending_frames(sk, &fl4); } out: - put_cpu_var(unicast_sock); - ip_rt_put(rt); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a3f72d7fc06c..d22f54482bab 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.bound_dev_if = sk->sk_bound_dev_if; arg.tos = ip_hdr(skb)->tos; - ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); @@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, if (oif) arg.bound_dev_if = oif; arg.tos = tos; - ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt, + ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), + skb, &TCP_SKB_CB(skb)->header.h4.opt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, &arg, arg.iov[0].iov_len); @@ -2428,14 +2430,39 @@ struct proto tcp_prot = { }; EXPORT_SYMBOL(tcp_prot); +static void __net_exit tcp_sk_exit(struct net *net) +{ + int cpu; + + for_each_possible_cpu(cpu) + inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); + free_percpu(net->ipv4.tcp_sk); +} + static int __net_init tcp_sk_init(struct net *net) { + int res, cpu; + + net->ipv4.tcp_sk = alloc_percpu(struct sock *); + if (!net->ipv4.tcp_sk) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct sock *sk; + + res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW, + IPPROTO_TCP, net); + if (res) + goto fail; + *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; + } net->ipv4.sysctl_tcp_ecn = 2; return 0; -} -static void __net_exit tcp_sk_exit(struct net *net) -{ +fail: + tcp_sk_exit(net); + + return res; } static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) -- cgit 
v1.2.3-58-ga151 From 0508c07f5e0c94f38afd5434e8b2a55b84553077 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 3 Feb 2015 16:36:15 -0500 Subject: ipv6: Select fragment id during UFO segmentation if not set. If the IPv6 fragment id has not been set and we perform fragmentation due to UFO, select a new fragment id. We now consider a fragment id of 0 as unset, and if the id selection process returns 0 (after all the perturbations), we set it to 0x80000000, thus giving us ample space to avoid collisions with the next packet we may have to fragment. When doing UFO integrity checking, we also select the fragment id if it has not been set yet. It is stored in skb_shinfo(), thus allowing UFO to function correctly. This patch also removes duplicate fragment id generation code and moves ipv6_select_ident() into the header as it may be used during GSO. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller
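The selection rule the patch centralizes, as a standalone sketch (editor's illustration; hash_addrs() stands in for the __ipv6_addr_jhash() mixing visible in the diff below):

	static u32 select_ident(u32 hashrnd, const struct in6_addr *dst,
				const struct in6_addr *src)
	{
		u32 hash = hash_addrs(dst, src, hashrnd);
		u32 id = ip_idents_reserve(hash, 1);

		/* 0 now means "no id chosen yet", so remap a reserved id of 0
		 * to the top bit (0x80000000) -- far away from the ids that
		 * will be handed out next, minimizing future collisions.
		 */
		return id ? id : 1U << 31;
	}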
--- include/net/ipv6.h | 3 +++ net/ipv6/ip6_output.c | 14 -------------- net/ipv6/output_core.c | 41 +++++++++++++++++++++++++++++++++++------ net/ipv6/udp_offload.c | 10 +++++++++- 4 files changed, 47 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 4292929392b0..9bf85d34c024 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -671,6 +671,9 @@ static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_add return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } +u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst, + struct in6_addr *src); +void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt); void ipv6_proxy_select_ident(struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ce69a12ae48c..d28f2a2efb32 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -537,20 +537,6 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) skb_copy_secmark(to, from); } -static void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) -{ - static u32 ip6_idents_hashrnd __read_mostly; - u32 hash, id; - - net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); - - hash = __ipv6_addr_jhash(&rt->rt6i_dst.addr, ip6_idents_hashrnd); - hash = __ipv6_addr_jhash(&rt->rt6i_src.addr, hash); - - id = ip_idents_reserve(hash, 1); - fhdr->identification = htonl(id); -} - int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { struct sk_buff *frag; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 97f41a3e68d9..54520a0bd5e3 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -9,6 +9,24 @@ #include #include +u32 __ipv6_select_ident(u32 hashrnd, struct in6_addr *dst, struct in6_addr *src) +{ + u32 hash, id; + + hash = __ipv6_addr_jhash(dst, hashrnd); + hash = __ipv6_addr_jhash(src, hash); + + /* Treat id of 0 as unset and if we get 0 back from ip_idents_reserve, + * set the high order bit instead, thus minimizing possible future + * collisions. + */ + id = ip_idents_reserve(hash, 1); + if (unlikely(!id)) + id = 1 << 31; + + return id; +} + /* This function exists only for tap drivers that must support broken * clients requesting UFO without specifying an IPv6 fragment ID. * @@ -22,7 +40,7 @@ void ipv6_proxy_select_ident(struct sk_buff *skb) static u32 ip6_proxy_idents_hashrnd __read_mostly; struct in6_addr buf[2]; struct in6_addr *addrs; - u32 hash, id; + u32 id; addrs = skb_header_pointer(skb, skb_network_offset(skb) + @@ -34,14 +52,25 @@ void ipv6_proxy_select_ident(struct sk_buff *skb) net_get_random_once(&ip6_proxy_idents_hashrnd, sizeof(ip6_proxy_idents_hashrnd)); - hash = __ipv6_addr_jhash(&addrs[1], ip6_proxy_idents_hashrnd); - hash = __ipv6_addr_jhash(&addrs[0], hash); - - id = ip_idents_reserve(hash, 1); - skb_shinfo(skb)->ip6_frag_id = htonl(id); + id = __ipv6_select_ident(ip6_proxy_idents_hashrnd, + &addrs[1], &addrs[0]); + skb_shinfo(skb)->ip6_frag_id = id; } EXPORT_SYMBOL_GPL(ipv6_proxy_select_ident); +void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt) +{ + static u32 ip6_idents_hashrnd __read_mostly; + u32 id; + + net_get_random_once(&ip6_idents_hashrnd, sizeof(ip6_idents_hashrnd)); + + id = __ipv6_select_ident(ip6_idents_hashrnd, &rt->rt6i_dst.addr, + &rt->rt6i_src.addr); + fhdr->identification = htonl(id); +} +EXPORT_SYMBOL(ipv6_select_ident); + int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) { u16 offset = sizeof(struct ipv6hdr); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index b6aa8ed18257..a56276996b72 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -52,6 +52,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); + /* Set the IPv6 fragment id if not set yet */ + if (!skb_shinfo(skb)->ip6_frag_id) + ipv6_proxy_select_ident(skb); + segs = NULL; goto out; } @@ -108,7 +112,11 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, fptr = (struct frag_hdr *)(skb_network_header(skb) + unfrag_ip6hlen); fptr->nexthdr = nexthdr; fptr->reserved = 0; - fptr->identification = skb_shinfo(skb)->ip6_frag_id; + if (skb_shinfo(skb)->ip6_frag_id) + fptr->identification = skb_shinfo(skb)->ip6_frag_id; + else + ipv6_select_ident(fptr, + (struct rt6_info *)skb_dst(skb)); /* Fragment the skb. ipv6 header and the remaining fields of the * fragment header are updated in ipv6_gso_segment() -- cgit v1.2.3-58-ga151 From f4575d3534617eec98c7eb8701185cec96b4374b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Feb 2015 13:31:54 -0800 Subject: flow_keys: n_proto type should be __be16 (struct flow_keys)->n_proto is in network byte order; use the proper type for it. This fixes the following sparse errors: net/core/flow_dissector.c:139:39: warning: incorrect type in assignment (different base types) net/core/flow_dissector.c:139:39: expected unsigned short [unsigned] [usertype] n_proto net/core/flow_dissector.c:139:39: got restricted __be16 [assigned] [usertype] proto net/core/flow_dissector.c:237:23: warning: incorrect type in assignment (different base types) net/core/flow_dissector.c:237:23: expected unsigned short [unsigned] [usertype] n_proto net/core/flow_dissector.c:237:23: got restricted __be16 [assigned] [usertype] proto Signed-off-by: Eric Dumazet Fixes: e0f31d849867 ("flow_keys: Record IP layer protocol in skb_flow_dissect()") Signed-off-by: David S. Miller
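Nothing changes at runtime; the __be16 annotation lets sparse check byte order at the type level. A caller-side sketch of why it matters (handle_ipv4()/handle_ipv6() are hypothetical helpers):

	struct flow_keys keys;

	if (skb_flow_dissect(skb, &keys)) {
		if (keys.n_proto == htons(ETH_P_IP))	/* ok: __be16 vs __be16 */
			handle_ipv4(skb);
		else if (keys.n_proto == ETH_P_IPV6)	/* sparse now warns: host-order constant */
			handle_ipv6(skb);
	}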
--- include/net/flow_keys.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/flow_keys.h b/include/net/flow_keys.h index 7ee2df083542..dc8fd81412bf 100644 --- a/include/net/flow_keys.h +++ b/include/net/flow_keys.h @@ -22,9 +22,9 @@ struct flow_keys { __be32 ports; __be16 port16[2]; }; - u16 thoff; - u16 n_proto; - u8 ip_proto; + u16 thoff; + __be16 n_proto; + u8 ip_proto; }; bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow, -- cgit v1.2.3-58-ga151 From 677651462c774b5866be2bc42601303a76b021a0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Feb 2015 15:03:25 -0800 Subject: ipv6: fix sparse errors in ip6_make_flowlabel() include/net/ipv6.h:713:22: warning: incorrect type in assignment (different base types) include/net/ipv6.h:713:22: expected restricted __be32 [usertype] hash include/net/ipv6.h:713:22: got unsigned int include/net/ipv6.h:719:25: warning: restricted __be32 degrades to integer include/net/ipv6.h:719:22: warning: invalid assignment: ^= include/net/ipv6.h:719:22: left side has type restricted __be32 include/net/ipv6.h:719:22: right side has type unsigned int Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/ipv6.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 9bf85d34c024..6e416f6d3e3c 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -711,7 +711,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel) { if (!flowlabel && (autolabel || net->ipv6.sysctl.auto_flowlabels)) { - __be32 hash; + u32 hash; hash = skb_get_hash(skb); @@ -721,7 +721,7 @@ static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, */ hash ^= hash >> 12; - flowlabel = hash & IPV6_FLOWLABEL_MASK; + flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; } return flowlabel; -- cgit v1.2.3-58-ga151
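To close on the ip6_make_flowlabel() fix above: the idiom is to do all hash arithmetic on a host-order u32 and make the host-to-wire conversion one explicit, sparse-visible cast at the point where the value becomes protocol data. A sketch of the same shape (IPV6_FLOWLABEL_MASK is itself a __be32 constant, so the AND is type-correct after the cast):

	static __be32 hash_to_flowlabel(u32 hash)
	{
		/* Fold high-order entropy into the low 20 bits (host order). */
		hash ^= hash >> 12;

		/* One explicit conversion point to wire format: */
		return (__force __be32)hash & IPV6_FLOWLABEL_MASK;
	}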