summaryrefslogtreecommitdiff
path: root/net/ipv4
diff options
context:
space:
mode:
authorPablo Neira Ayuso <pablo@netfilter.org>2015-10-17 14:11:08 +0200
committerPablo Neira Ayuso <pablo@netfilter.org>2015-10-17 14:28:03 +0200
commitf0a0a978b66fea782a52b0a7075b3fa9ab27ad0a (patch)
tree52ecc0eafbac697c6afaa542efe324984484120c /net/ipv4
parentc8d71d08aa23679f56e7072358383442c6ede352 (diff)
parent4be3158abe1e02d24f82b34101e41d662fae2185 (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
This merge resolves conflicts with 75aec9df3a78 ("bridge: Remove br_nf_push_frag_xmit_sk") as part of Eric Biederman's effort to improve netns support in the network stack that reached upstream via David's net-next tree. Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> Conflicts: net/bridge/br_netfilter_hooks.c
Diffstat (limited to 'net/ipv4')
-rw-r--r--net/ipv4/fib_semantics.c161
-rw-r--r--net/ipv4/icmp.c19
-rw-r--r--net/ipv4/igmp.c4
-rw-r--r--net/ipv4/inet_connection_sock.c87
-rw-r--r--net/ipv4/inet_hashtables.c6
-rw-r--r--net/ipv4/ip_forward.c2
-rw-r--r--net/ipv4/ip_fragment.c7
-rw-r--r--net/ipv4/ip_input.c10
-rw-r--r--net/ipv4/ip_output.c86
-rw-r--r--net/ipv4/ip_tunnel_core.c6
-rw-r--r--net/ipv4/ip_vti.c2
-rw-r--r--net/ipv4/ipmr.c2
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c2
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c7
-rw-r--r--net/ipv4/netfilter/nf_dup_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c2
-rw-r--r--net/ipv4/raw.c10
-rw-r--r--net/ipv4/route.c81
-rw-r--r--net/ipv4/syncookies.c7
-rw-r--r--net/ipv4/tcp.c24
-rw-r--r--net/ipv4/tcp_fastopen.c26
-rw-r--r--net/ipv4/tcp_input.c16
-rw-r--r--net/ipv4/tcp_ipv4.c13
-rw-r--r--net/ipv4/tcp_minisocks.c19
-rw-r--r--net/ipv4/tcp_output.c2
-rw-r--r--net/ipv4/tcp_timer.c6
-rw-r--r--net/ipv4/udp.c28
-rw-r--r--net/ipv4/xfrm4_output.c6
28 files changed, 382 insertions, 261 deletions
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 064bd3caaa4f..42778d9d71e5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -57,8 +57,7 @@ static unsigned int fib_info_cnt;
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-
-static DEFINE_SPINLOCK(fib_multipath_lock);
+u32 fib_multipath_secret __read_mostly;
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
@@ -532,7 +531,67 @@ errout:
return ret;
}
-#endif
+static void fib_rebalance(struct fib_info *fi)
+{
+ int total;
+ int w;
+ struct in_device *in_dev;
+
+ if (fi->fib_nhs < 2)
+ return;
+
+ total = 0;
+ for_nexthops(fi) {
+ if (nh->nh_flags & RTNH_F_DEAD)
+ continue;
+
+ in_dev = __in_dev_get_rtnl(nh->nh_dev);
+
+ if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nh->nh_flags & RTNH_F_LINKDOWN)
+ continue;
+
+ total += nh->nh_weight;
+ } endfor_nexthops(fi);
+
+ w = 0;
+ change_nexthops(fi) {
+ int upper_bound;
+
+ in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);
+
+ if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
+ upper_bound = -1;
+ } else if (in_dev &&
+ IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
+ nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
+ upper_bound = -1;
+ } else {
+ w += nexthop_nh->nh_weight;
+ upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+ total) - 1;
+ }
+
+ atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
+ } endfor_nexthops(fi);
+
+ net_get_random_once(&fib_multipath_secret,
+ sizeof(fib_multipath_secret));
+}
+
+static inline void fib_add_weight(struct fib_info *fi,
+ const struct fib_nh *nh)
+{
+ fi->fib_weight += nh->nh_weight;
+}
+
+#else /* CONFIG_IP_ROUTE_MULTIPATH */
+
+#define fib_rebalance(fi) do { } while (0)
+#define fib_add_weight(fi, nh) do { } while (0)
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
static int fib_encap_match(struct net *net, u16 encap_type,
struct nlattr *encap,
@@ -1094,8 +1153,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
+ fib_add_weight(fi, nexthop_nh);
} endfor_nexthops(fi)
+ fib_rebalance(fi);
+
link_it:
ofi = fib_find_info(fi);
if (ofi) {
@@ -1317,12 +1379,6 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
break;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- fi->fib_power -= nexthop_nh->nh_power;
- nexthop_nh->nh_power = 0;
- spin_unlock_bh(&fib_multipath_lock);
-#endif
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
@@ -1345,6 +1401,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
}
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
@@ -1467,20 +1525,15 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
!__in_dev_get_rtnl(dev))
continue;
alive++;
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- spin_lock_bh(&fib_multipath_lock);
- nexthop_nh->nh_power = 0;
nexthop_nh->nh_flags &= ~nh_flags;
- spin_unlock_bh(&fib_multipath_lock);
-#else
- nexthop_nh->nh_flags &= ~nh_flags;
-#endif
} endfor_nexthops(fi)
if (alive > 0) {
fi->fib_flags &= ~nh_flags;
ret++;
}
+
+ fib_rebalance(fi);
}
return ret;
@@ -1488,62 +1541,40 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
-/*
- * The algorithm is suboptimal, but it provides really
- * fair weighted route distribution.
- */
-void fib_select_multipath(struct fib_result *res)
+void fib_select_multipath(struct fib_result *res, int hash)
{
struct fib_info *fi = res->fi;
- struct in_device *in_dev;
- int w;
-
- spin_lock_bh(&fib_multipath_lock);
- if (fi->fib_power <= 0) {
- int power = 0;
- change_nexthops(fi) {
- in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);
- if (nexthop_nh->nh_flags & RTNH_F_DEAD)
- continue;
- if (in_dev &&
- IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
- nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
- continue;
- power += nexthop_nh->nh_weight;
- nexthop_nh->nh_power = nexthop_nh->nh_weight;
- } endfor_nexthops(fi);
- fi->fib_power = power;
- if (power <= 0) {
- spin_unlock_bh(&fib_multipath_lock);
- /* Race condition: route has just become dead. */
- res->nh_sel = 0;
- return;
- }
- }
-
- /* w should be random number [0..fi->fib_power-1],
- * it is pretty bad approximation.
- */
-
- w = jiffies % fi->fib_power;
+ for_nexthops(fi) {
+ if (hash > atomic_read(&nh->nh_upper_bound))
+ continue;
- change_nexthops(fi) {
- if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
- nexthop_nh->nh_power) {
- w -= nexthop_nh->nh_power;
- if (w <= 0) {
- nexthop_nh->nh_power--;
- fi->fib_power--;
- res->nh_sel = nhsel;
- spin_unlock_bh(&fib_multipath_lock);
- return;
- }
- }
+ res->nh_sel = nhsel;
+ return;
} endfor_nexthops(fi);
/* Race condition: route has just become dead. */
res->nh_sel = 0;
- spin_unlock_bh(&fib_multipath_lock);
}
#endif
+
+void fib_select_path(struct net *net, struct fib_result *res,
+ struct flowi4 *fl4, int mp_hash)
+{
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+ if (res->fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
+ if (mp_hash < 0)
+ mp_hash = fib_multipath_hash(fl4->saddr, fl4->daddr);
+ fib_select_multipath(res, mp_hash);
+ }
+ else
+#endif
+ if (!res->prefixlen &&
+ res->table->tb_num_default > 1 &&
+ res->type == RTN_UNICAST && !fl4->flowi4_oif)
+ fib_select_default(fl4, res);
+
+ if (!fl4->saddr)
+ fl4->saddr = FIB_RES_PREFSRC(net, *res);
+}
+EXPORT_SYMBOL_GPL(fib_select_path);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 6b96dee2800b..36e26977c908 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -440,6 +440,22 @@ out_unlock:
icmp_xmit_unlock(sk);
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* Source and destination is swapped. See ip_multipath_icmp_hash */
+static int icmp_multipath_hash_skb(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+
+ return fib_multipath_hash(iph->daddr, iph->saddr);
+}
+
+#else
+
+#define icmp_multipath_hash_skb(skb) (-1)
+
+#endif
+
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
@@ -464,7 +480,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
- rt = __ip_route_output_key(net, fl4);
+ rt = __ip_route_output_key_hash(net, fl4,
+ icmp_multipath_hash_skb(skb_in));
if (IS_ERR(rt))
return rt;
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index de6d4c8ba600..64aaf3522a59 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -397,7 +397,7 @@ static int igmpv3_sendpack(struct sk_buff *skb)
pig->csum = ip_compute_csum(igmp_hdr(skb), igmplen);
- return ip_local_out(skb);
+ return ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
}
static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel)
@@ -739,7 +739,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
ih->group = group;
ih->csum = ip_compute_csum((void *)ih, sizeof(struct igmphdr));
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
static void igmp_gq_timer_expire(unsigned long data)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 89eedfbd4ad5..8430bc8ccd58 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -546,6 +546,13 @@ void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req)
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_drop);
+void inet_csk_reqsk_queue_drop_and_put(struct sock *sk, struct request_sock *req)
+{
+ inet_csk_reqsk_queue_drop(sk, req);
+ reqsk_put(req);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_drop_and_put);
+
static void reqsk_timer_handler(unsigned long data)
{
struct request_sock *req = (struct request_sock *)data;
@@ -579,7 +586,7 @@ static void reqsk_timer_handler(unsigned long data)
* ones are about to clog our table.
*/
qlen = reqsk_queue_len(queue);
- if ((qlen << 1) > sk_listener->sk_max_ack_backlog) {
+ if ((qlen << 1) > max(8U, sk_listener->sk_max_ack_backlog)) {
int young = reqsk_queue_len_young(queue) << 1;
while (thresh > 2) {
@@ -608,8 +615,7 @@ static void reqsk_timer_handler(unsigned long data)
return;
}
drop:
- inet_csk_reqsk_queue_drop(sk_listener, req);
- reqsk_put(req);
+ inet_csk_reqsk_queue_drop_and_put(sk_listener, req);
}
static void reqsk_queue_hash_req(struct request_sock *req,
@@ -727,14 +733,14 @@ void inet_csk_prepare_forced_close(struct sock *sk)
}
EXPORT_SYMBOL(inet_csk_prepare_forced_close);
-int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
+int inet_csk_listen_start(struct sock *sk, int backlog)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
reqsk_queue_alloc(&icsk->icsk_accept_queue);
- sk->sk_max_ack_backlog = 0;
+ sk->sk_max_ack_backlog = backlog;
sk->sk_ack_backlog = 0;
inet_csk_delack_init(sk);
@@ -758,6 +764,53 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
+static void inet_child_forget(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ sk->sk_prot->disconnect(child, O_NONBLOCK);
+
+ sock_orphan(child);
+
+ percpu_counter_inc(sk->sk_prot->orphan_count);
+
+ if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
+ BUG_ON(tcp_sk(child)->fastopen_rsk != req);
+ BUG_ON(sk != req->rsk_listener);
+
+ /* Paranoid, to prevent race condition if
+ * an inbound pkt destined for child is
+ * blocked by sock lock in tcp_v4_rcv().
+ * Also to satisfy an assertion in
+ * tcp_v4_destroy_sock().
+ */
+ tcp_sk(child)->fastopen_rsk = NULL;
+ }
+ inet_csk_destroy_sock(child);
+ reqsk_put(req);
+}
+
+void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
+ struct sock *child)
+{
+ struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
+
+ spin_lock(&queue->rskq_lock);
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
+ inet_child_forget(sk, req, child);
+ } else {
+ req->sk = child;
+ req->dl_next = NULL;
+ if (queue->rskq_accept_head == NULL)
+ queue->rskq_accept_head = req;
+ else
+ queue->rskq_accept_tail->dl_next = req;
+ queue->rskq_accept_tail = req;
+ sk_acceptq_added(sk);
+ }
+ spin_unlock(&queue->rskq_lock);
+}
+EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
+
/*
* This routine closes sockets which have been at least partially
* opened, but not yet accepted.
@@ -784,31 +837,11 @@ void inet_csk_listen_stop(struct sock *sk)
WARN_ON(sock_owned_by_user(child));
sock_hold(child);
- sk->sk_prot->disconnect(child, O_NONBLOCK);
-
- sock_orphan(child);
-
- percpu_counter_inc(sk->sk_prot->orphan_count);
-
- if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
- BUG_ON(tcp_sk(child)->fastopen_rsk != req);
- BUG_ON(sk != req->rsk_listener);
-
- /* Paranoid, to prevent race condition if
- * an inbound pkt destined for child is
- * blocked by sock lock in tcp_v4_rcv().
- * Also to satisfy an assertion in
- * tcp_v4_destroy_sock().
- */
- tcp_sk(child)->fastopen_rsk = NULL;
- }
- inet_csk_destroy_sock(child);
-
+ inet_child_forget(sk, req, child);
bh_unlock_sock(child);
local_bh_enable();
sock_put(child);
- reqsk_put(req);
cond_resched();
}
if (queue->fastopenq.rskq_rst_head) {
@@ -823,7 +856,7 @@ void inet_csk_listen_stop(struct sock *sk)
req = next;
}
}
- WARN_ON(sk->sk_ack_backlog);
+ WARN_ON_ONCE(sk->sk_ack_backlog);
}
EXPORT_SYMBOL_GPL(inet_csk_listen_stop);
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index bed8886a4b6c..958728a22001 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -137,6 +137,10 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
spin_lock(&head->lock);
tb = inet_csk(sk)->icsk_bind_hash;
+ if (unlikely(!tb)) {
+ spin_unlock(&head->lock);
+ return -ENOENT;
+ }
if (tb->port != port) {
/* NOTE: using tproxy and redirecting skbs to a proxy
* on a different listener port breaks the assumption
@@ -185,6 +189,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
}
return score;
}
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index d66cfb35ba74..da0d7ce85844 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -72,7 +72,7 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
ip_forward_options(skb);
skb_sender_cpu_clear(skb);
- return dst_output(sk, skb);
+ return dst_output(net, sk, skb);
}
int ip_forward(struct sk_buff *skb)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 9772b789adf3..5482745d5d68 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -654,11 +654,10 @@ out_fail:
}
/* Process an incoming IP datagram fragment. */
-int ip_defrag(struct sk_buff *skb, u32 user)
+int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
int vif = l3mdev_master_ifindex_rcu(dev);
- struct net *net = dev_net(dev);
struct ipq *qp;
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
@@ -683,7 +682,7 @@ int ip_defrag(struct sk_buff *skb, u32 user)
}
EXPORT_SYMBOL(ip_defrag);
-struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
+struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
struct iphdr iph;
int netoff;
@@ -712,7 +711,7 @@ struct sk_buff *ip_check_defrag(struct sk_buff *skb, u32 user)
if (pskb_trim_rcsum(skb, netoff + len))
return skb;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- if (ip_defrag(skb, user))
+ if (ip_defrag(net, skb, user))
return NULL;
skb_clear_hash(skb);
}
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 7cc9f7bb7fb7..b1209b63381f 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -157,6 +157,7 @@ bool ip_call_ra_chain(struct sk_buff *skb)
u8 protocol = ip_hdr(skb)->protocol;
struct sock *last = NULL;
struct net_device *dev = skb->dev;
+ struct net *net = dev_net(dev);
for (ra = rcu_dereference(ip_ra_chain); ra; ra = rcu_dereference(ra->next)) {
struct sock *sk = ra->sk;
@@ -167,9 +168,9 @@ bool ip_call_ra_chain(struct sk_buff *skb)
if (sk && inet_sk(sk)->inet_num == protocol &&
(!sk->sk_bound_dev_if ||
sk->sk_bound_dev_if == dev->ifindex) &&
- net_eq(sock_net(sk), dev_net(dev))) {
+ net_eq(sock_net(sk), net)) {
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN))
+ if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
return true;
}
if (last) {
@@ -246,14 +247,15 @@ int ip_local_deliver(struct sk_buff *skb)
/*
* Reassemble IP fragments.
*/
+ struct net *net = dev_net(skb->dev);
if (ip_is_fragment(ip_hdr(skb))) {
- if (ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
+ if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
return 0;
}
return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
- dev_net(skb->dev), NULL, skb, skb->dev, NULL,
+ net, NULL, skb, skb->dev, NULL,
ip_local_deliver_finish);
}
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index aff6766922e8..67404e1fe7d4 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -83,9 +83,10 @@
int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
EXPORT_SYMBOL(sysctl_ip_default_ttl);
-static int ip_fragment(struct sock *sk, struct sk_buff *skb,
- unsigned int mtu,
- int (*output)(struct sock *, struct sk_buff *));
+static int
+ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ unsigned int mtu,
+ int (*output)(struct net *, struct sock *, struct sk_buff *));
/* Generate a checksum for an outgoing IP datagram. */
void ip_send_check(struct iphdr *iph)
@@ -95,34 +96,28 @@ void ip_send_check(struct iphdr *iph)
}
EXPORT_SYMBOL(ip_send_check);
-static int __ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, skb_dst(skb)->dev,
- dst_output_okfn);
-}
-
-int __ip_local_out(struct sk_buff *skb)
-{
- return __ip_local_out_sk(skb->sk, skb);
+ dst_output);
}
-int ip_local_out_sk(struct sock *sk, struct sk_buff *skb)
+int ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
int err;
- err = __ip_local_out(skb);
+ err = __ip_local_out(net, sk, skb);
if (likely(err == 1))
- err = dst_output(sk, skb);
+ err = dst_output(net, sk, skb);
return err;
}
-EXPORT_SYMBOL_GPL(ip_local_out_sk);
+EXPORT_SYMBOL_GPL(ip_local_out);
static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
@@ -142,6 +137,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
{
struct inet_sock *inet = inet_sk(sk);
struct rtable *rt = skb_rtable(skb);
+ struct net *net = sock_net(sk);
struct iphdr *iph;
/* Build the IP header. */
@@ -160,7 +156,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
iph->id = 0;
} else {
iph->frag_off = 0;
- __ip_select_ident(sock_net(sk), iph, 1);
+ __ip_select_ident(net, iph, 1);
}
if (opt && opt->opt.optlen) {
@@ -172,16 +168,15 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk,
skb->mark = sk->sk_mark;
/* Send it out. */
- return ip_local_out(skb);
+ return ip_local_out(net, skb->sk, skb);
}
EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
-static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
+static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
struct rtable *rt = (struct rtable *)dst;
struct net_device *dev = dst->dev;
- struct net *net = dev_net(dev);
unsigned int hh_len = LL_RESERVED_SPACE(dev);
struct neighbour *neigh;
u32 nexthop;
@@ -225,8 +220,8 @@ static int ip_finish_output2(struct sock *sk, struct sk_buff *skb)
return -EINVAL;
}
-static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
- unsigned int mtu)
+static int ip_finish_output_gso(struct net *net, struct sock *sk,
+ struct sk_buff *skb, unsigned int mtu)
{
netdev_features_t features;
struct sk_buff *segs;
@@ -235,7 +230,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
/* common case: locally created skb or seglen is <= mtu */
if (((IPCB(skb)->flags & IPSKB_FORWARDED) == 0) ||
skb_gso_network_seglen(skb) <= mtu)
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
/* Slowpath - GSO segment length is exceeding the dst MTU.
*
@@ -258,7 +253,7 @@ static int ip_finish_output_gso(struct sock *sk, struct sk_buff *skb,
int err;
segs->next = NULL;
- err = ip_fragment(sk, segs, mtu, ip_finish_output2);
+ err = ip_fragment(net, sk, segs, mtu, ip_finish_output2);
if (err && ret == 0)
ret = err;
@@ -276,24 +271,23 @@ static int ip_finish_output(struct net *net, struct sock *sk, struct sk_buff *sk
/* Policy lookup after SNAT yielded a new policy */
if (skb_dst(skb)->xfrm) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
mtu = ip_skb_dst_mtu(skb);
if (skb_is_gso(skb))
- return ip_finish_output_gso(sk, skb, mtu);
+ return ip_finish_output_gso(net, sk, skb, mtu);
if (skb->len > mtu || (IPCB(skb)->flags & IPSKB_FRAG_PMTU))
- return ip_fragment(sk, skb, mtu, ip_finish_output2);
+ return ip_fragment(net, sk, skb, mtu, ip_finish_output2);
- return ip_finish_output2(sk, skb);
+ return ip_finish_output2(net, sk, skb);
}
-int ip_mc_output(struct sock *sk, struct sk_buff *skb)
+int ip_mc_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct rtable *rt = skb_rtable(skb);
struct net_device *dev = rt->dst.dev;
- struct net *net = dev_net(dev);
/*
* If the indicated interface is up and running, send the packet.
@@ -352,10 +346,9 @@ int ip_mc_output(struct sock *sk, struct sk_buff *skb)
!(IPCB(skb)->flags & IPSKB_REROUTED));
}
-int ip_output(struct sock *sk, struct sk_buff *skb)
+int ip_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct net_device *dev = skb_dst(skb)->dev;
- struct net *net = dev_net(dev);
IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
@@ -386,6 +379,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ip_options_rcu *inet_opt;
struct flowi4 *fl4;
struct rtable *rt;
@@ -416,7 +410,7 @@ int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)
* keep trying until route appears or the connection times
* itself out.
*/
- rt = ip_route_output_ports(sock_net(sk), fl4, sk,
+ rt = ip_route_output_ports(net, fl4, sk,
daddr, inet->inet_saddr,
inet->inet_dport,
inet->inet_sport,
@@ -453,20 +447,20 @@ packet_routed:
ip_options_build(skb, &inet_opt->opt, inet->inet_daddr, rt, 0);
}
- ip_select_ident_segs(sock_net(sk), skb, sk,
+ ip_select_ident_segs(net, skb, sk,
skb_shinfo(skb)->gso_segs ?: 1);
/* TODO : should we use skb->sk here instead of sk ? */
skb->priority = sk->sk_priority;
skb->mark = sk->sk_mark;
- res = ip_local_out(skb);
+ res = ip_local_out(net, sk, skb);
rcu_read_unlock();
return res;
no_route:
rcu_read_unlock();
- IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
+ IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
kfree_skb(skb);
return -EHOSTUNREACH;
}
@@ -495,20 +489,18 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
skb_copy_secmark(to, from);
}
-static int ip_fragment(struct sock *sk, struct sk_buff *skb,
+static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
unsigned int mtu,
- int (*output)(struct sock *, struct sk_buff *))
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph = ip_hdr(skb);
if ((iph->frag_off & htons(IP_DF)) == 0)
- return ip_do_fragment(sk, skb, output);
+ return ip_do_fragment(net, sk, skb, output);
if (unlikely(!skb->ignore_df ||
(IPCB(skb)->frag_max_size &&
IPCB(skb)->frag_max_size > mtu))) {
- struct net *net = dev_net(skb_rtable(skb)->dst.dev);
-
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
htonl(mtu));
@@ -516,7 +508,7 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb,
return -EMSGSIZE;
}
- return ip_do_fragment(sk, skb, output);
+ return ip_do_fragment(net, sk, skb, output);
}
/*
@@ -526,8 +518,8 @@ static int ip_fragment(struct sock *sk, struct sk_buff *skb,
* single device frame, and queue such a frame for sending.
*/
-int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
- int (*output)(struct sock *, struct sk_buff *))
+int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
+ int (*output)(struct net *, struct sock *, struct sk_buff *))
{
struct iphdr *iph;
int ptr;
@@ -537,11 +529,9 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
int offset;
__be16 not_last_frag;
struct rtable *rt = skb_rtable(skb);
- struct net *net;
int err = 0;
dev = rt->dst.dev;
- net = dev_net(dev);
/*
* Point into the IP datagram header.
@@ -631,7 +621,7 @@ int ip_do_fragment(struct sock *sk, struct sk_buff *skb,
ip_send_check(iph);
}
- err = output(sk, skb);
+ err = output(net, sk, skb);
if (!err)
IP_INC_STATS(net, IPSTATS_MIB_FRAGCREATES);
@@ -771,7 +761,7 @@ slow_path:
ip_send_check(iph);
- err = output(sk, skb2);
+ err = output(net, sk, skb2);
if (err)
goto fail;
@@ -1444,7 +1434,7 @@ int ip_send_skb(struct net *net, struct sk_buff *skb)
{
int err;
- err = ip_local_out(skb);
+ err = ip_local_out(net, skb->sk, skb);
if (err) {
if (err > 0)
err = net_xmit_errno(err);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 84dce6a92f93..6cb9009c3d96 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -53,6 +53,7 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
__u8 tos, __u8 ttl, __be16 df, bool xnet)
{
int pkt_len = skb->len - skb_inner_network_offset(skb);
+ struct net *net = dev_net(rt->dst.dev);
struct iphdr *iph;
int err;
@@ -76,10 +77,9 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
- __ip_select_ident(dev_net(rt->dst.dev), iph,
- skb_shinfo(skb)->gso_segs ?: 1);
+ __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1);
- err = ip_local_out_sk(sk, skb);
+ err = ip_local_out(net, sk, skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
return pkt_len;
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 3b87ec5178f9..4d8f0b698777 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -197,7 +197,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
- err = dst_output(skb->sk, skb);
+ err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
err = skb->len;
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index cfcb996ec51b..fc42525d8694 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1689,7 +1689,7 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
if (unlikely(opt->optlen))
ip_forward_options(skb);
- return dst_output(sk, skb);
+ return dst_output(net, sk, skb);
}
/*
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index a1058363d2e9..5fdc556514ba 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -63,7 +63,7 @@ synproxy_send_tcp(const struct synproxy_net *snet,
nf_conntrack_get(nfct);
}
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
free_nskb:
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 9df3f93269d3..0e5591c2ee9f 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -22,14 +22,15 @@
#endif
#include <net/netfilter/nf_conntrack_zones.h>
-static int nf_ct_ipv4_gather_frags(struct sk_buff *skb, u_int32_t user)
+static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
+ u_int32_t user)
{
int err;
skb_orphan(skb);
local_bh_disable();
- err = ip_defrag(skb, user);
+ err = ip_defrag(net, skb, user);
local_bh_enable();
if (!err) {
@@ -85,7 +86,7 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
enum ip_defrag_users user =
nf_ct_defrag_user(state->hook, skb);
- if (nf_ct_ipv4_gather_frags(skb, user))
+ if (nf_ct_ipv4_gather_frags(state->net, skb, user))
return NF_STOLEN;
}
return NF_ACCEPT;
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index ce2a59e5c665..ceb187308120 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -92,7 +92,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum,
if (nf_dup_ipv4_route(net, skb, gw, oif)) {
__this_cpu_write(nf_skb_duplicated, true);
- ip_local_out(skb);
+ ip_local_out(net, skb->sk, skb);
__this_cpu_write(nf_skb_duplicated, false);
} else {
kfree_skb(skb);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 2f5e925d3264..c747b2d9eb77 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -157,7 +157,7 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook)
dev_queue_xmit(nskb);
} else
#endif
- ip_local_out(nskb);
+ ip_local_out(net, nskb->sk, nskb);
return;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 28ef8a913130..8c0d0bdc2a7c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -413,7 +413,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, rt->dst.dev,
- dst_output_okfn);
+ dst_output);
if (err > 0)
err = net_xmit_errno(err);
if (err)
@@ -484,6 +484,7 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
struct rtable *rt = NULL;
struct flowi4 fl4;
@@ -543,7 +544,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
ipc.oif = sk->sk_bound_dev_if;
if (msg->msg_controllen) {
- err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
+ err = ip_cmsg_send(net, msg, &ipc, false);
if (err)
goto out;
if (ipc.opt)
@@ -598,6 +599,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
+ if (!saddr && ipc.oif)
+ l3mdev_get_saddr(net, ipc.oif, &fl4);
+
if (!inet->hdrincl) {
rfv.msg = msg;
rfv.hlen = 0;
@@ -608,7 +612,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
}
security_sk_classify_flow(sk, flowi4_to_flowi(&fl4));
- rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
+ rt = ip_route_output_flow(net, &fl4, sk);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
rt = NULL;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 76ca4e75f785..85f184e429c6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1152,7 +1152,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
dst_set_expires(&rt->dst, 0);
}
-static int ip_rt_bug(struct sock *sk, struct sk_buff *skb)
+static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
pr_debug("%s: %pI4 -> %pI4, %s\n",
__func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
@@ -1651,6 +1651,48 @@ out:
return err;
}
+#ifdef CONFIG_IP_ROUTE_MULTIPATH
+
+/* To make ICMP packets follow the right flow, the multipath hash is
+ * calculated from the inner IP addresses in reverse order.
+ */
+static int ip_multipath_icmp_hash(struct sk_buff *skb)
+{
+ const struct iphdr *outer_iph = ip_hdr(skb);
+ struct icmphdr _icmph;
+ const struct icmphdr *icmph;
+ struct iphdr _inner_iph;
+ const struct iphdr *inner_iph;
+
+ if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
+ goto standard_hash;
+
+ icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
+ &_icmph);
+ if (!icmph)
+ goto standard_hash;
+
+ if (icmph->type != ICMP_DEST_UNREACH &&
+ icmph->type != ICMP_REDIRECT &&
+ icmph->type != ICMP_TIME_EXCEEDED &&
+ icmph->type != ICMP_PARAMETERPROB) {
+ goto standard_hash;
+ }
+
+ inner_iph = skb_header_pointer(skb,
+ outer_iph->ihl * 4 + sizeof(_icmph),
+ sizeof(_inner_iph), &_inner_iph);
+ if (!inner_iph)
+ goto standard_hash;
+
+ return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
+
+standard_hash:
+ return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
+}
+
+#endif /* CONFIG_IP_ROUTE_MULTIPATH */
+
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
@@ -1658,8 +1700,15 @@ static int ip_mkroute_input(struct sk_buff *skb,
__be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1)
- fib_select_multipath(res);
+ if (res->fi && res->fi->fib_nhs > 1) {
+ int h;
+
+ if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
+ h = ip_multipath_icmp_hash(skb);
+ else
+ h = fib_multipath_hash(saddr, daddr);
+ fib_select_multipath(res, h);
+ }
#endif
/* create a routing cache entry */
@@ -2026,7 +2075,8 @@ add:
* Major route resolver routine.
*/
-struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
+struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
+ int mp_hash)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
@@ -2146,7 +2196,8 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
if (err) {
res.fi = NULL;
res.table = NULL;
- if (fl4->flowi4_oif) {
+ if (fl4->flowi4_oif &&
+ !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
/* Apparently, routing tables are wrong. Assume,
that the destination is on link.
@@ -2188,18 +2239,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
goto make_route;
}
-#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
- fib_select_multipath(&res);
- else
-#endif
- if (!res.prefixlen &&
- res.table->tb_num_default > 1 &&
- res.type == RTN_UNICAST && !fl4->flowi4_oif)
- fib_select_default(fl4, &res);
-
- if (!fl4->saddr)
- fl4->saddr = FIB_RES_PREFSRC(net, res);
+ fib_select_path(net, &res, fl4, mp_hash);
dev_out = FIB_RES_DEV(res);
fl4->flowi4_oif = dev_out->ifindex;
@@ -2212,7 +2252,7 @@ out:
rcu_read_unlock();
return rth;
}
-EXPORT_SYMBOL_GPL(__ip_route_output_key);
+EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
@@ -2264,7 +2304,7 @@ struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_or
new->__use = 1;
new->input = dst_discard;
- new->output = dst_discard_sk;
+ new->output = dst_discard_out;
new->dev = ort->dst.dev;
if (new->dev)
@@ -2468,6 +2508,9 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
fl4.flowi4_mark = mark;
+ if (netif_index_is_l3_master(net, fl4.flowi4_oif))
+ fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
+
if (iif) {
struct net_device *dev;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 729ceb5f63c6..4c0892badb8b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -225,6 +225,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
if (child) {
atomic_set(&req->rsk_refcnt, 1);
+ sock_rps_save_rxhash(child, skb);
inet_csk_reqsk_queue_add(sk, req, child);
} else {
reqsk_free(req);
@@ -326,7 +327,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
ret = NULL;
- req = inet_reqsk_alloc(&tcp_request_sock_ops, sk); /* for safety */
+ req = inet_reqsk_alloc(&tcp_request_sock_ops, sk, false); /* for safety */
if (!req)
goto out;
@@ -381,10 +382,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
/* Try to redo what tcp_v4_send_synack did. */
- req->window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
+ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW);
tcp_select_initial_window(tcp_full_space(sk), req->mss,
- &req->rcv_wnd, &req->window_clamp,
+ &req->rsk_rcv_wnd, &req->rsk_window_clamp,
ireq->wscale_ok, &rcv_wscale,
dst_metric(&rt->dst, RTAX_INITRWND));
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3c96fa87ff9e..ac1bdbb50352 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -900,7 +900,8 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto out_err;
}
@@ -967,7 +968,8 @@ new_segment:
copied += copy;
offset += copy;
- if (!(size -= copy)) {
+ size -= copy;
+ if (!size) {
tcp_tx_timestamp(sk, skb);
goto out;
}
@@ -988,7 +990,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1111,7 +1114,8 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
*/
if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) &&
!tcp_passive_fastopen(sk)) {
- if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)
+ err = sk_stream_wait_connect(sk, &timeo);
+ if (err != 0)
goto do_error;
}
@@ -1267,7 +1271,8 @@ wait_for_memory:
tcp_push(sk, flags & ~MSG_MORE, mss_now,
TCP_NAGLE_PUSH, size_goal);
- if ((err = sk_stream_wait_memory(sk, &timeo)) != 0)
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err != 0)
goto do_error;
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1767,7 +1772,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
/* __ Restore normal policy in scheduler __ */
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk);
len -= chunk;
copied += chunk;
@@ -1778,7 +1784,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
do_prequeue:
tcp_prequeue_process(sk);
- if ((chunk = len - tp->ucopy.len) != 0) {
+ chunk = len - tp->ucopy.len;
+ if (chunk != 0) {
NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk);
len -= chunk;
copied += chunk;
@@ -2230,7 +2237,8 @@ int tcp_disconnect(struct sock *sk, int flags)
sk->sk_shutdown = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->srtt_us = 0;
- if ((tp->write_seq += tp->max_window + 2) == 0)
+ tp->write_seq += tp->max_window + 2;
+ if (tp->write_seq == 0)
tp->write_seq = 1;
icsk->icsk_backoff = 0;
tp->snd_cwnd = 2;
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 410ac481fda0..93396bf7b475 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -168,8 +168,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
TCP_TIMEOUT_INIT, TCP_RTO_MAX);
atomic_set(&req->rsk_refcnt, 2);
- /* Add the child socket directly into the accept queue */
- inet_csk_reqsk_queue_add(sk, req, child);
/* Now finish processing the fastopen child socket. */
inet_csk(child)->icsk_af_ops->rebuild_header(child);
@@ -178,12 +176,10 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_init_metrics(child);
tcp_init_buffer_space(child);
- /* Queue the data carried in the SYN packet. We need to first
- * bump skb's refcnt because the caller will attempt to free it.
- * Note that IPv6 might also have used skb_get() trick
- * in tcp_v6_conn_request() to keep this SYN around (treq->pktopts)
- * So we need to eventually get a clone of the packet,
- * before inserting it in sk_receive_queue.
+ /* Queue the data carried in the SYN packet.
+ * We used to play tricky games with skb_get().
+ * With lockless listener, it is a dead end.
+ * Do not think about it.
*
* XXX (TFO) - we honor a zero-payload TFO request for now,
* (any reason not to?) but no need to queue the skb since
@@ -191,12 +187,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
*/
end_seq = TCP_SKB_CB(skb)->end_seq;
if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
- struct sk_buff *skb2;
-
- if (unlikely(skb_shared(skb)))
- skb2 = skb_clone(skb, GFP_ATOMIC);
- else
- skb2 = skb_get(skb);
+ struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
if (likely(skb2)) {
skb_dst_drop(skb2);
@@ -214,12 +205,9 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
}
}
tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
- sk->sk_data_ready(sk);
- bh_unlock_sock(child);
- /* Note: sock_put(child) will be done by tcp_conn_request()
- * after SYNACK packet is sent.
+ /* tcp_conn_request() is sending the SYNACK,
+ * and queues the child into listener accept queue.
*/
- WARN_ON(!req->sk);
return child;
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 27108757c310..3b35c3f4d268 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6022,7 +6022,7 @@ static void tcp_openreq_init(struct request_sock *req,
{
struct inet_request_sock *ireq = inet_rsk(req);
- req->rcv_wnd = 0; /* So that tcp_send_synack() knows! */
+ req->rsk_rcv_wnd = 0; /* So that tcp_send_synack() knows! */
req->cookie_ts = 0;
tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
@@ -6042,9 +6042,11 @@ static void tcp_openreq_init(struct request_sock *req,
}
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
- struct sock *sk_listener)
+ struct sock *sk_listener,
+ bool attach_listener)
{
- struct request_sock *req = reqsk_alloc(ops, sk_listener);
+ struct request_sock *req = reqsk_alloc(ops, sk_listener,
+ attach_listener);
if (req) {
struct inet_request_sock *ireq = inet_rsk(req);
@@ -6143,7 +6145,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop;
}
- req = inet_reqsk_alloc(rsk_ops, sk);
+ req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
if (!req)
goto drop;
@@ -6229,12 +6231,16 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
if (!want_cookie) {
- fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
tcp_reqsk_record_syn(sk, req, skb);
+ fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
skb_get_queue_mapping(skb), &foc, false);
+ /* Add the child socket directly into the accept queue */
+ inet_csk_reqsk_queue_add(sk, req, fastopen_sk);
+ sk->sk_data_ready(sk);
+ bh_unlock_sock(fastopen_sk);
sock_put(fastopen_sk);
} else {
tcp_rsk(req)->tfo_listener = false;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 34310748a365..9c68cf3762c4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -324,7 +324,6 @@ void tcp_req_err(struct sock *sk, u32 seq)
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- reqsk_put(req);
} else {
/*
* Still in SYN_RECV, just remove it silently.
@@ -332,9 +331,10 @@ void tcp_req_err(struct sock *sk, u32 seq)
* created socket, and POSIX does not want network
* errors returned from accept().
*/
- NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
inet_csk_reqsk_queue_drop(req->rsk_listener, req);
+ NET_INC_STATS_BH(net, LINUX_MIB_LISTENDROPS);
}
+ reqsk_put(req);
}
EXPORT_SYMBOL(tcp_req_err);
@@ -803,7 +803,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
*/
tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
- tcp_rsk(req)->rcv_nxt, req->rcv_wnd,
+ tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
0,
@@ -1572,6 +1572,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
TCP_SKB_CB(skb)->sacked = 0;
+lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
if (!sk)
goto no_tcp_socket;
@@ -1587,8 +1588,12 @@ process:
sk = req->rsk_listener;
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard_and_relse;
- if (sk->sk_state == TCP_LISTEN)
+ if (likely(sk->sk_state == TCP_LISTEN)) {
nsk = tcp_check_req(sk, skb, req, false);
+ } else {
+ inet_csk_reqsk_queue_drop_and_put(sk, req);
+ goto lookup;
+ }
if (!nsk) {
reqsk_put(req);
goto discard_it;
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 9adf1e2c3170..41828bdc5d32 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -381,18 +381,18 @@ void tcp_openreq_init_rwin(struct request_sock *req,
window_clamp = READ_ONCE(tp->window_clamp);
/* Set this up on the first call only */
- req->window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
+ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW);
/* limit the window selection if the user enforce a smaller rx buffer */
if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK &&
- (req->window_clamp > full_space || req->window_clamp == 0))
- req->window_clamp = full_space;
+ (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0))
+ req->rsk_window_clamp = full_space;
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(full_space,
mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
- &req->rcv_wnd,
- &req->window_clamp,
+ &req->rsk_rcv_wnd,
+ &req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
dst_metric(dst, RTAX_INITRWND));
@@ -512,9 +512,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
if (sysctl_tcp_fack)
tcp_enable_fack(newtp);
}
- newtp->window_clamp = req->window_clamp;
- newtp->rcv_ssthresh = req->rcv_wnd;
- newtp->rcv_wnd = req->rcv_wnd;
+ newtp->window_clamp = req->rsk_window_clamp;
+ newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
newtp->rx_opt.snd_wscale = ireq->snd_wscale;
@@ -707,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
/* RFC793: "first check sequence number". */
if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
- tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rcv_wnd)) {
+ tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) {
/* Out of window: send ACK and drop. */
if (!(flg & TCP_FLAG_RST))
req->rsk_ops->send_ack(sk, skb, req);
@@ -768,6 +768,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!child)
goto listen_overflow;
+ sock_rps_save_rxhash(child, skb);
tcp_synack_rtt_meas(child, req);
inet_csk_reqsk_queue_drop(sk, req);
inet_csk_reqsk_queue_add(sk, req, child);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 55ed3266b05f..6e79fcb0addb 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3023,7 +3023,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
- th->window = htons(min(req->rcv_wnd, 65535U));
+ th->window = htons(min(req->rsk_rcv_wnd, 65535U));
tcp_options_write((__be32 *)(th + 1), NULL, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 7149ebc820c7..c9c716a483e4 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -83,7 +83,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
}
/* Calculate maximal number or retries on an orphaned socket. */
-static int tcp_orphan_retries(struct sock *sk, int alive)
+static int tcp_orphan_retries(struct sock *sk, bool alive)
{
int retries = sysctl_tcp_orphan_retries; /* May be zero. */
@@ -184,7 +184,7 @@ static int tcp_write_timeout(struct sock *sk)
retry_until = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = icsk->icsk_rto < TCP_RTO_MAX;
+ const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
retry_until = tcp_orphan_retries(sk, alive);
do_reset = alive ||
@@ -298,7 +298,7 @@ static void tcp_probe_timer(struct sock *sk)
max_probes = sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
- const int alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
+ const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
max_probes = tcp_orphan_retries(sk, alive);
if (!alive && icsk->icsk_backoff >= max_probes)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 156ba75b6000..24ec14f9825c 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -375,7 +375,8 @@ static inline int compute_score(struct sock *sk, struct net *net,
return -1;
score += 4;
}
-
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
return score;
}
@@ -419,6 +420,9 @@ static inline int compute_score2(struct sock *sk, struct net *net,
score += 4;
}
+ if (sk->sk_incoming_cpu == raw_smp_processor_id())
+ score++;
+
return score;
}
@@ -1017,30 +1021,14 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
fl4 = &fl4_stack;
- /* unconnected socket. If output device is enslaved to a VRF
- * device lookup source address from VRF table. This mimics
- * behavior of ip_route_connect{_init}.
- */
- if (netif_index_is_l3_master(net, ipc.oif)) {
- flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
- RT_SCOPE_UNIVERSE, sk->sk_protocol,
- (flow_flags | FLOWI_FLAG_VRFSRC |
- FLOWI_FLAG_SKIP_NH_OIF),
- faddr, saddr, dport,
- inet->inet_sport);
-
- rt = ip_route_output_flow(net, fl4, sk);
- if (!IS_ERR(rt)) {
- saddr = fl4->saddr;
- ip_rt_put(rt);
- }
- }
-
flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos,
RT_SCOPE_UNIVERSE, sk->sk_protocol,
flow_flags,
faddr, saddr, dport, inet->inet_sport);
+ if (!saddr && ipc.oif)
+ l3mdev_get_saddr(net, ipc.oif, fl4);
+
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
if (IS_ERR(rt)) {
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index cd6be736e19f..9f298d0dc9a1 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -87,17 +87,15 @@ static int __xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
#ifdef CONFIG_NETFILTER
if (!x) {
IPCB(skb)->flags |= IPSKB_REROUTED;
- return dst_output(sk, skb);
+ return dst_output(net, sk, skb);
}
#endif
return x->outer_mode->afinfo->output_finish(sk, skb);
}
-int xfrm4_output(struct sock *sk, struct sk_buff *skb)
+int xfrm4_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- struct net *net = dev_net(skb_dst(skb)->dev);
-
return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING,
net, sk, skb, NULL, skb_dst(skb)->dev,
__xfrm4_output,