diff options
Diffstat (limited to 'net/ipv4')
-rw-r--r-- | net/ipv4/devinet.c | 3 | ||||
-rw-r--r-- | net/ipv4/ip_forward.c | 71 | ||||
-rw-r--r-- | net/ipv4/ip_tunnel.c | 29 | ||||
-rw-r--r-- | net/ipv4/ipconfig.c | 2 | ||||
-rw-r--r-- | net/ipv4/netfilter/Kconfig | 5 | ||||
-rw-r--r-- | net/ipv4/netfilter/Makefile | 1 | ||||
-rw-r--r-- | net/ipv4/netfilter/nf_nat_h323.c | 5 | ||||
-rw-r--r-- | net/ipv4/netfilter/nft_reject_ipv4.c | 75 | ||||
-rw-r--r-- | net/ipv4/route.c | 13 | ||||
-rw-r--r-- | net/ipv4/tcp.c | 2 | ||||
-rw-r--r-- | net/ipv4/tcp_input.c | 18 | ||||
-rw-r--r-- | net/ipv4/tcp_output.c | 15 | ||||
-rw-r--r-- | net/ipv4/udp_offload.c | 17 |
13 files changed, 209 insertions, 47 deletions
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index ac2dff3c2c1c..bdbf68bb2e2d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1443,7 +1443,8 @@ static size_t inet_nlmsg_size(void) + nla_total_size(4) /* IFA_LOCAL */ + nla_total_size(4) /* IFA_BROADCAST */ + nla_total_size(IFNAMSIZ) /* IFA_LABEL */ - + nla_total_size(4); /* IFA_FLAGS */ + + nla_total_size(4) /* IFA_FLAGS */ + + nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */ } static inline u32 cstamp_delta(unsigned long cstamp) diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index e9f1217a8afd..f3869c186d97 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -39,6 +39,71 @@ #include <net/route.h> #include <net/xfrm.h> +static bool ip_may_fragment(const struct sk_buff *skb) +{ + return unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0) || + !skb->local_df; +} + +static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) +{ + if (skb->len <= mtu || skb->local_df) + return false; + + if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu) + return false; + + return true; +} + +static bool ip_gso_exceeds_dst_mtu(const struct sk_buff *skb) +{ + unsigned int mtu; + + if (skb->local_df || !skb_is_gso(skb)) + return false; + + mtu = ip_dst_mtu_maybe_forward(skb_dst(skb), true); + + /* if seglen > mtu, do software segmentation for IP fragmentation on + * output. DF bit cannot be set since ip_forward would have sent + * icmp error. + */ + return skb_gso_network_seglen(skb) > mtu; +} + +/* called if GSO skb needs to be fragmented on forward */ +static int ip_forward_finish_gso(struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + netdev_features_t features; + struct sk_buff *segs; + int ret = 0; + + features = netif_skb_dev_features(skb, dst->dev); + segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); + if (IS_ERR(segs)) { + kfree_skb(skb); + return -ENOMEM; + } + + consume_skb(skb); + + do { + struct sk_buff *nskb = segs->next; + int err; + + segs->next = NULL; + err = dst_output(segs); + + if (err && ret == 0) + ret = err; + segs = nskb; + } while (segs); + + return ret; +} + static int ip_forward_finish(struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); @@ -49,6 +114,9 @@ static int ip_forward_finish(struct sk_buff *skb) if (unlikely(opt->optlen)) ip_forward_options(skb); + if (ip_gso_exceeds_dst_mtu(skb)) + return ip_forward_finish_gso(skb); + return dst_output(skb); } @@ -91,8 +159,7 @@ int ip_forward(struct sk_buff *skb) IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); - if (unlikely(skb->len > mtu && !skb_is_gso(skb) && - (ip_hdr(skb)->frag_off & htons(IP_DF))) && !skb->local_df) { + if (!ip_may_fragment(skb) && ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(dev_net(rt->dst.dev), IPSTATS_MIB_FRAGFAILS); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index bd28f386bd02..50228be5c17b 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -101,28 +101,22 @@ static void tunnel_dst_reset_all(struct ip_tunnel *t) __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL); } -static struct dst_entry *tunnel_dst_get(struct ip_tunnel *t) +static struct rtable *tunnel_rtable_get(struct ip_tunnel *t, u32 cookie) { struct dst_entry *dst; rcu_read_lock(); dst = rcu_dereference(this_cpu_ptr(t->dst_cache)->dst); - if (dst) + if (dst) { + if (dst->obsolete && dst->ops->check(dst, cookie) == NULL) { + rcu_read_unlock(); + tunnel_dst_reset(t); + return NULL; + } dst_hold(dst); - rcu_read_unlock(); - return dst; -} - -static struct dst_entry *tunnel_dst_check(struct ip_tunnel *t, u32 cookie) -{ - struct dst_entry *dst = tunnel_dst_get(t); - - if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) { - tunnel_dst_reset(t); - return NULL; } - - return dst; + rcu_read_unlock(); + return (struct rtable *)dst; } /* Often modified stats are per cpu, other are shared (netdev->stats) */ @@ -584,7 +578,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4; u8 tos, ttl; __be16 df; - struct rtable *rt = NULL; /* Route to the other host */ + struct rtable *rt; /* Route to the other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst; int err; @@ -657,8 +651,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); - if (connected) - rt = (struct rtable *)tunnel_dst_check(tunnel, 0); + rt = connected ? tunnel_rtable_get(tunnel, 0) : NULL; if (!rt) { rt = ip_route_output_key(tunnel->net, &fl4); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index efa1138fa523..b3e86ea7b71b 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -273,7 +273,7 @@ static int __init ic_open_devs(void) msleep(1); - if time_before(jiffies, next_msg) + if (time_before(jiffies, next_msg)) continue; elapsed = jiffies_to_msecs(jiffies - start); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 81c6910cfa92..a26ce035e3fa 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -61,6 +61,11 @@ config NFT_CHAIN_NAT_IPV4 packet transformations such as the source, destination address and source and destination ports. +config NFT_REJECT_IPV4 + depends on NF_TABLES_IPV4 + default NFT_REJECT + tristate + config NF_TABLES_ARP depends on NF_TABLES tristate "ARP nf_tables support" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index c16be9d58420..90b82405331e 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_NF_NAT_PROTO_GRE) += nf_nat_proto_gre.o obj-$(CONFIG_NF_TABLES_IPV4) += nf_tables_ipv4.o obj-$(CONFIG_NFT_CHAIN_ROUTE_IPV4) += nft_chain_route_ipv4.o obj-$(CONFIG_NFT_CHAIN_NAT_IPV4) += nft_chain_nat_ipv4.o +obj-$(CONFIG_NFT_REJECT_IPV4) += nft_reject_ipv4.o obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o # generic IP tables diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 9eea059dd621..574f7ebba0b6 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -229,7 +229,10 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, ret = nf_ct_expect_related(rtcp_exp); if (ret == 0) break; - else if (ret != -EBUSY) { + else if (ret == -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + continue; + } else if (ret < 0) { nf_ct_unexpect_related(rtp_exp); nated_port = 0; break; diff --git a/net/ipv4/netfilter/nft_reject_ipv4.c b/net/ipv4/netfilter/nft_reject_ipv4.c new file mode 100644 index 000000000000..e79718a382f2 --- /dev/null +++ b/net/ipv4/netfilter/nft_reject_ipv4.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> + * Copyright (c) 2013 Eric Leblond <eric@regit.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Development of this code funded by Astaro AG (http://www.astaro.com/) + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables.h> +#include <net/icmp.h> +#include <net/netfilter/ipv4/nf_reject.h> +#include <net/netfilter/nft_reject.h> + +void nft_reject_ipv4_eval(const struct nft_expr *expr, + struct nft_data data[NFT_REG_MAX + 1], + const struct nft_pktinfo *pkt) +{ + struct nft_reject *priv = nft_expr_priv(expr); + + switch (priv->type) { + case NFT_REJECT_ICMP_UNREACH: + nf_send_unreach(pkt->skb, priv->icmp_code); + break; + case NFT_REJECT_TCP_RST: + nf_send_reset(pkt->skb, pkt->ops->hooknum); + break; + } + + data[NFT_REG_VERDICT].verdict = NF_DROP; +} +EXPORT_SYMBOL_GPL(nft_reject_ipv4_eval); + +static struct nft_expr_type nft_reject_ipv4_type; +static const struct nft_expr_ops nft_reject_ipv4_ops = { + .type = &nft_reject_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_reject)), + .eval = nft_reject_ipv4_eval, + .init = nft_reject_init, + .dump = nft_reject_dump, +}; + +static struct nft_expr_type nft_reject_ipv4_type __read_mostly = { + .family = NFPROTO_IPV4, + .name = "reject", + .ops = &nft_reject_ipv4_ops, + .policy = nft_reject_policy, + .maxattr = NFTA_REJECT_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_reject_ipv4_module_init(void) +{ + return nft_register_expr(&nft_reject_ipv4_type); +} + +static void __exit nft_reject_ipv4_module_exit(void) +{ + nft_unregister_expr(&nft_reject_ipv4_type); +} + +module_init(nft_reject_ipv4_module_init); +module_exit(nft_reject_ipv4_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); +MODULE_ALIAS_NFT_AF_EXPR(AF_INET, "reject"); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 25071b48921c..4c011ec69ed4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1597,6 +1597,7 @@ static int __mkroute_input(struct sk_buff *skb, rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); rth->dst.input = ip_forward; rth->dst.output = ip_output; @@ -1695,10 +1696,11 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.daddr = daddr; fl4.saddr = saddr; err = fib_lookup(net, &fl4, &res); - if (err != 0) + if (err != 0) { + if (!IN_DEV_FORWARD(in_dev)) + err = -EHOSTUNREACH; goto no_route; - - RT_CACHE_STAT_INC(in_slow_tot); + } if (res.type == RTN_BROADCAST) goto brd_input; @@ -1712,8 +1714,10 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto local_input; } - if (!IN_DEV_FORWARD(in_dev)) + if (!IN_DEV_FORWARD(in_dev)) { + err = -EHOSTUNREACH; goto no_route; + } if (res.type != RTN_UNICAST) goto martian_destination; @@ -1768,6 +1772,7 @@ local_input: rth->rt_gateway = 0; rth->rt_uses_gateway = 0; INIT_LIST_HEAD(&rth->rt_uncached); + RT_CACHE_STAT_INC(in_slow_tot); if (res.type == RTN_UNREACHABLE) { rth->dst.input= ip_error; rth->dst.error= -err; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4475b3bb494d..9f3a2db9109e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2229,7 +2229,7 @@ adjudge_to_death: /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. - * We use a 3 minute timeout (about the same as BSD) then kill + * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 65cf90e063d5..227cba79fa6b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -671,6 +671,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) { struct tcp_sock *tp = tcp_sk(sk); long m = mrtt; /* RTT */ + u32 srtt = tp->srtt; /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev @@ -688,11 +689,9 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if (m == 0) - m = 1; - if (tp->srtt != 0) { - m -= (tp->srtt >> 3); /* m is now error in rtt est */ - tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (srtt != 0) { + m -= (srtt >> 3); /* m is now error in rtt est */ + srtt += m; /* rtt = 7/8 rtt + 1/8 new */ if (m < 0) { m = -m; /* m is now abs(error) */ m -= (tp->mdev >> 2); /* similar update on mdev */ @@ -723,11 +722,12 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) } } else { /* no previous measure. */ - tp->srtt = m << 3; /* take the measured time to be rtt */ + srtt = m << 3; /* take the measured time to be rtt */ tp->mdev = m << 1; /* make sure rto = 3*rtt */ tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } + tp->srtt = max(1U, srtt); } /* Set the sk_pacing_rate to allow proper sizing of TSO packets. @@ -746,8 +746,10 @@ static void tcp_update_pacing_rate(struct sock *sk) rate *= max(tp->snd_cwnd, tp->packets_out); - /* Correction for small srtt : minimum srtt being 8 (1 jiffy << 3), - * be conservative and assume srtt = 1 (125 us instead of 1.25 ms) + /* Correction for small srtt and scheduling constraints. + * For small rtt, consider noise is too high, and use + * the minimal value (srtt = 1 -> 125 us for HZ=1000) + * * We probably need usec resolution in the future. * Note: This also takes care of possible srtt=0 case, * when tcp_rtt_estimator() was not yet called. diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 03d26b85eab8..3be16727f058 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -698,7 +698,8 @@ static void tcp_tsq_handler(struct sock *sk) if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_CLOSE_WAIT | TCPF_LAST_ACK)) - tcp_write_xmit(sk, tcp_current_mss(sk), 0, 0, GFP_ATOMIC); + tcp_write_xmit(sk, tcp_current_mss(sk), tcp_sk(sk)->nonagle, + 0, GFP_ATOMIC); } /* * One tasklet per cpu tries to send more skbs. @@ -1904,7 +1905,15 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, if (atomic_read(&sk->sk_wmem_alloc) > limit) { set_bit(TSQ_THROTTLED, &tp->tsq_flags); - break; + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED, so we must + * test again the condition. + * We abuse smp_mb__after_clear_bit() because + * there is no smp_mb__after_set_bit() yet + */ + smp_mb__after_clear_bit(); + if (atomic_read(&sk->sk_wmem_alloc) > limit) + break; } limit = mss_now; @@ -1977,7 +1986,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ - if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || + if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out || !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) return false; diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 25f5cee3a08a..88b4023ecfcf 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -17,6 +17,8 @@ static DEFINE_SPINLOCK(udp_offload_lock); static struct udp_offload_priv __rcu *udp_offload_base __read_mostly; +#define udp_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&udp_offload_lock)) + struct udp_offload_priv { struct udp_offload *offload; struct rcu_head rcu; @@ -100,8 +102,7 @@ out: int udp_add_offload(struct udp_offload *uo) { - struct udp_offload_priv __rcu **head = &udp_offload_base; - struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_KERNEL); + struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC); if (!new_offload) return -ENOMEM; @@ -109,8 +110,8 @@ int udp_add_offload(struct udp_offload *uo) new_offload->offload = uo; spin_lock(&udp_offload_lock); - rcu_assign_pointer(new_offload->next, rcu_dereference(*head)); - rcu_assign_pointer(*head, new_offload); + new_offload->next = udp_offload_base; + rcu_assign_pointer(udp_offload_base, new_offload); spin_unlock(&udp_offload_lock); return 0; @@ -130,12 +131,12 @@ void udp_del_offload(struct udp_offload *uo) spin_lock(&udp_offload_lock); - uo_priv = rcu_dereference(*head); + uo_priv = udp_deref_protected(*head); for (; uo_priv != NULL; - uo_priv = rcu_dereference(*head)) { - + uo_priv = udp_deref_protected(*head)) { if (uo_priv->offload == uo) { - rcu_assign_pointer(*head, rcu_dereference(uo_priv->next)); + rcu_assign_pointer(*head, + udp_deref_protected(uo_priv->next)); goto unlock; } head = &uo_priv->next; |