From 4648dc97af9d496218a05353b0e442b3dfa6aaab Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 5 Mar 2012 19:35:04 +0000 Subject: tcp: fix tcp_shift_skb_data() to not shift SACKed data below snd_una This commit fixes tcp_shift_skb_data() so that it does not shift SACKed data below snd_una. This fixes an issue whose symptoms exactly match reports showing tp->sacked_out going negative since 3.3.0-rc4 (see "WARNING: at net/ipv4/tcp_input.c:3418" thread on netdev). Since 2008 (832d11c5cd076abc0aa1eaf7be96c81d1a59ce41) tcp_shift_skb_data() had been shifting SACKed ranges that were below snd_una. It checked that the *end* of the skb it was about to shift from was above snd_una, but did not check that the end of the actual shifted range was above snd_una; this commit adds that check. Shifting SACKed ranges below snd_una is problematic because for such ranges tcp_sacktag_one() short-circuits: it does not declare anything as SACKed and does not increase sacked_out. Before the fixes in commits cc9a672ee522d4805495b98680f4a3db5d0a0af9 and daef52bab1fd26e24e8e9578f8fb33ba1d0cb412, shifting SACKed ranges below snd_una happened to work because tcp_shifted_skb() was always (incorrectly) passing in to tcp_sacktag_one() an skb whose end_seq tcp_shift_skb_data() had already guaranteed was beyond snd_una. Hence tcp_sacktag_one() never short-circuited and always increased tp->sacked_out in this case. After those two fixes, my testing has verified that shifting SACKed ranges below snd_una could cause tp->sacked_out to go negative with the following sequence of events: (1) tcp_shift_skb_data() sees an skb whose end_seq is beyond snd_una, then shifts a prefix of that skb that is below snd_una (2) tcp_shifted_skb() increments the packet count of the already-SACKed prev sk_buff (3) tcp_sacktag_one() sees the end of the new SACKed range is below snd_una, so it short-circuits and doesn't increase tp->sacked_out (5) tcp_clean_rtx_queue() sees the SACKed skb has been ACKed, decrements tp->sacked_out by this "inflated" pcount that was missing a matching increase in tp->sacked_out, and hence tp->sacked_out underflows to a u32 like 0xFFFFFFFF, which casted to s32 is negative. (6) this leads to the warnings seen in the recent "WARNING: at net/ipv4/tcp_input.c:3418" thread on the netdev list; e.g.: tcp_input.c:3418 WARN_ON((int)tp->sacked_out < 0); More generally, I think this bug can be tickled in some cases where two or more ACKs from the receiver are lost and then a DSACK arrives that is immediately above an existing SACKed skb in the write queue. This fix changes tcp_shift_skb_data() to abort this sequence at step (1) in the scenario above by noticing that the bytes are below snd_una and not shifting them. Signed-off-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d9b83d198c3d..b5e315f13641 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1585,6 +1585,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } } + /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ + if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) + goto fallback; + if (!skb_shift(prev, skb, len)) goto fallback; if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) -- cgit v1.2.3-58-ga151 From 848edc69192a38bf9d261032f248b14f47e6af8b Mon Sep 17 00:00:00 2001 From: Santosh Nayak Date: Tue, 6 Mar 2012 01:22:50 +0000 Subject: netfilter: ebtables: fix wrong name length while copying to user-space user-space ebtables expects 32 bytes-long names, but xt_match names use 29 bytes. We have to copy less 29 bytes and then, make sure we fill the remaining bytes with zeroes. Signed-off-by: Santosh Nayak Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/bridge/netfilter/ebtables.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 8aa4ad0e06af..15e9575f7207 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1335,7 +1335,12 @@ static inline int ebt_make_matchname(const struct ebt_entry_match *m, const char *base, char __user *ubase) { char __user *hlp = ubase + ((char *)m - base); - if (copy_to_user(hlp, m->u.match->name, EBT_FUNCTION_MAXNAMELEN)) + char name[EBT_FUNCTION_MAXNAMELEN] = {}; + + /* ebtables expects 32 bytes long names but xt_match names are 29 bytes + long. Copy 29 bytes and fill remaining bytes with zeroes. */ + strncpy(name, m->u.match->name, sizeof(name)); + if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; } @@ -1344,7 +1349,10 @@ static inline int ebt_make_watchername(const struct ebt_entry_watcher *w, const char *base, char __user *ubase) { char __user *hlp = ubase + ((char *)w - base); - if (copy_to_user(hlp , w->u.watcher->name, EBT_FUNCTION_MAXNAMELEN)) + char name[EBT_FUNCTION_MAXNAMELEN] = {}; + + strncpy(name, w->u.watcher->name, sizeof(name)); + if (copy_to_user(hlp , name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; } @@ -1355,10 +1363,12 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) int ret; char __user *hlp; const struct ebt_entry_target *t; + char name[EBT_FUNCTION_MAXNAMELEN] = {}; if (e->bitmask == 0) return 0; + strncpy(name, t->u.target->name, sizeof(name)); hlp = ubase + (((char *)e + e->target_offset) - base); t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); @@ -1368,7 +1378,7 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); if (ret != 0) return ret; - if (copy_to_user(hlp, t->u.target->name, EBT_FUNCTION_MAXNAMELEN)) + if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; } -- cgit v1.2.3-58-ga151 From 8be619d1e430fd87a02587a2a6830b692cb91b84 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Mar 2012 01:22:51 +0000 Subject: netfilter: ctnetlink: remove incorrect spin_[un]lock_bh on NAT module autoload Since 7d367e0, ctnetlink_new_conntrack is called without holding the nf_conntrack_lock spinlock. Thus, ctnetlink_parse_nat_setup does not require to release that spinlock anymore in the NAT module autoload case. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_netlink.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 30c9d4ca0218..10687692831e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1041,16 +1041,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, if (!parse_nat_setup) { #ifdef CONFIG_MODULES rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); nfnl_unlock(); if (request_module("nf-nat-ipv4") < 0) { nfnl_lock(); - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); return -EOPNOTSUPP; } nfnl_lock(); - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); if (nfnetlink_parse_nat_setup_hook) return -EAGAIN; -- cgit v1.2.3-58-ga151 From a157b9d5b5b626e46eba2ac4e342da8db25cabc4 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Mar 2012 01:22:53 +0000 Subject: netfilter: bridge: fix wrong pointer dereference In adf7ff8, a invalid dereference was added in ebt_make_names. CC [M] net/bridge/netfilter/ebtables.o net/bridge/netfilter/ebtables.c: In function `ebt_make_names': net/bridge/netfilter/ebtables.c:1371:20: warning: `t' may be used uninitialized in this function [-Wuninitialized] Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/bridge/netfilter/ebtables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 15e9575f7207..5fe2ff3b01ef 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1368,7 +1368,6 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) if (e->bitmask == 0) return 0; - strncpy(name, t->u.target->name, sizeof(name)); hlp = ubase + (((char *)e + e->target_offset) - base); t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); @@ -1378,6 +1377,7 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); if (ret != 0) return ret; + strncpy(name, t->u.target->name, sizeof(name)); if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; -- cgit v1.2.3-58-ga151 From 739e4505a0e8209622dc71743bfa1c804eacf7f4 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 6 Mar 2012 01:22:54 +0000 Subject: bridge: netfilter: don't call iptables on vlan packets if sysctl is off When net.bridge.bridge-nf-filter-vlan-tagged is 0 (default), vlan packets arriving should not be sent to ip(6)tables by bridge netfilter. However, it turns out that we currently always send VLAN packets to netfilter, if .. a), CONFIG_VLAN_8021Q is enabled ; or b), CONFIG_VLAN_8021Q is not set but rx vlan offload is enabled on the bridge port. This is because bridge netfilter treats skb with skb->protocol == ETH_P_IP{V6} as "non-vlan packet". With rx vlan offload on or CONFIG_VLAN_8021Q=y, the vlan header has already been removed here, and we cannot rely on skb->protocol alone. Fix this by only using skb->protocol if the skb has no vlan tag, or if a vlan tag is present and filter-vlan-tagged bridge netfilter sysctl is enabled. We cannot remove the skb->protocol == htons(ETH_P_8021Q) test because the vlan tag is still around in the CONFIG_VLAN_8021Q=n && "ethtool -K $itf rxvlan off" case. reproducer: iptables -t raw -I PREROUTING -i br0 iptables -t raw -I PREROUTING -i br0.1 Then send packets to an ip address configured on br0.1 interface. Even with net.bridge.bridge-nf-filter-vlan-tagged=0, the 1st rule will match instead of the 2nd one. With this patch applied, the 2nd rule will match instead. In the non-local address case, netfilter won't be consulted after this patch unless the sysctl is switched on. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 84122472656c..dec4f3817133 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -62,6 +62,15 @@ static int brnf_filter_pppoe_tagged __read_mostly = 0; #define brnf_filter_pppoe_tagged 0 #endif +#define IS_IP(skb) \ + (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) + +#define IS_IPV6(skb) \ + (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) + +#define IS_ARP(skb) \ + (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) + static inline __be16 vlan_proto(const struct sk_buff *skb) { if (vlan_tx_tag_present(skb)) @@ -639,8 +648,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, return NF_DROP; br = p->br; - if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || - IS_PPPOE_IPV6(skb)) { + if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { if (!brnf_call_ip6tables && !br->nf_call_ip6tables) return NF_ACCEPT; @@ -651,8 +659,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, if (!brnf_call_iptables && !br->nf_call_iptables) return NF_ACCEPT; - if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) && - !IS_PPPOE_IP(skb)) + if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); @@ -701,7 +708,7 @@ static int br_nf_forward_finish(struct sk_buff *skb) struct nf_bridge_info *nf_bridge = skb->nf_bridge; struct net_device *in; - if (skb->protocol != htons(ETH_P_ARP) && !IS_VLAN_ARP(skb)) { + if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { in = nf_bridge->physindev; if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; @@ -718,6 +725,7 @@ static int br_nf_forward_finish(struct sk_buff *skb) return 0; } + /* This is the 'purely bridged' case. For IP, we pass the packet to * netfilter with indev and outdev set to the bridge device, * but we are still able to filter on the 'real' indev/outdev @@ -744,11 +752,9 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, if (!parent) return NF_DROP; - if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) || - IS_PPPOE_IP(skb)) + if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) pf = PF_INET; - else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || - IS_PPPOE_IPV6(skb)) + else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) pf = PF_INET6; else return NF_ACCEPT; @@ -795,7 +801,7 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb, if (!brnf_call_arptables && !br->nf_call_arptables) return NF_ACCEPT; - if (skb->protocol != htons(ETH_P_ARP)) { + if (!IS_ARP(skb)) { if (!IS_VLAN_ARP(skb)) return NF_ACCEPT; nf_bridge_pull_encap_header(skb); @@ -853,11 +859,9 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, if (!realoutdev) return NF_DROP; - if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) || - IS_PPPOE_IP(skb)) + if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) pf = PF_INET; - else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || - IS_PPPOE_IPV6(skb)) + else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) pf = PF_INET6; else return NF_ACCEPT; -- cgit v1.2.3-58-ga151 From 741385119706d4370eb7899c5ca96ad125c520e5 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 6 Mar 2012 01:22:55 +0000 Subject: netfilter: nf_conntrack: fix early_drop with reliable event delivery If reliable event delivery is enabled and ctnetlink fails to deliver the destroy event in early_drop, the conntrack subsystem cannot drop any the candidate flow that was planned to be evicted. Reported-by: Kerin Millar Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ed86a3be678e..fa4b82c8ae80 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -635,8 +635,12 @@ static noinline int early_drop(struct net *net, unsigned int hash) if (del_timer(&ct->timeout)) { death_by_timeout((unsigned long)ct); - dropped = 1; - NF_CT_STAT_INC_ATOMIC(net, early_drop); + /* Check if we indeed killed this entry. Reliable event + delivery may have inserted it into the dying list. */ + if (test_bit(IPS_DYING_BIT, &ct->status)) { + dropped = 1; + NF_CT_STAT_INC_ATOMIC(net, early_drop); + } } nf_ct_put(ct); return dropped; -- cgit v1.2.3-58-ga151 From d6ddef9e641d1229d4ec841dc75ae703171c3e92 Mon Sep 17 00:00:00 2001 From: Li Wei Date: Mon, 5 Mar 2012 14:45:17 +0000 Subject: IPv6: Fix not join all-router mcast group when forwarding set. When forwarding was set and a new net device is register, we need add this device to the all-router mcast group. Signed-off-by: Li Wei Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index c02280a4d126..6b8ebc5da0e1 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -434,6 +434,10 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) /* Join all-node multicast group */ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); + /* Join all-router multicast group if forwarding is set */ + if (ndev->cnf.forwarding && dev && (dev->flags & IFF_MULTICAST)) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + return ndev; } -- cgit v1.2.3-58-ga151 From 651a68ea2ce9738b84e928836053b2e0fb5db2ba Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Tue, 6 Mar 2012 15:04:04 -0800 Subject: openvswitch: Honor dp_ifindex, when specified, for vport lookup by name. When OVS_VPORT_ATTR_NAME is specified and dp_ifindex is nonzero, the logical behavior would be for the vport name lookup scope to be limited to the specified datapath, but in fact the dp_ifindex value was ignored. This commit causes the search scope to be honored. Signed-off-by: Ben Pfaff Signed-off-by: Jesse Gross --- net/openvswitch/datapath.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ce64c18b8c79..2c030505b335 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1521,6 +1521,9 @@ static struct vport *lookup_vport(struct ovs_header *ovs_header, vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) return ERR_PTR(-ENODEV); + if (ovs_header->dp_ifindex && + ovs_header->dp_ifindex != get_dpifindex(vport->dp)) + return ERR_PTR(-ENODEV); return vport; } else if (a[OVS_VPORT_ATTR_PORT_NO]) { u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); -- cgit v1.2.3-58-ga151 From 81e5d41d7ed4f6c61ba3d2414f4f9ddf6d934ebb Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Tue, 6 Mar 2012 15:05:46 -0800 Subject: openvswitch: Fix checksum update for actions on UDP packets. When modifying IP addresses or ports on a UDP packet we don't correctly follow the rules for unchecksummed packets. This meant that packets without a checksum can be given a incorrect new checksum and packets with a checksum can become marked as being unchecksummed. This fixes it to handle those requirements. Signed-off-by: Jesse Gross --- net/openvswitch/actions.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2725d1bdf291..48badffaafc1 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira Networks. + * Copyright (c) 2007-2012 Nicira Networks. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -145,9 +145,16 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, *addr, new_addr, 1); } else if (nh->protocol == IPPROTO_UDP) { - if (likely(transport_len >= sizeof(struct udphdr))) - inet_proto_csum_replace4(&udp_hdr(skb)->check, skb, - *addr, new_addr, 1); + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&uh->check, skb, + *addr, new_addr, 1); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } } csum_replace4(&nh->check, *addr, new_addr); @@ -197,8 +204,22 @@ static void set_tp_port(struct sk_buff *skb, __be16 *port, skb->rxhash = 0; } -static int set_udp_port(struct sk_buff *skb, - const struct ovs_key_udp *udp_port_key) +static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) +{ + struct udphdr *uh = udp_hdr(skb); + + if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { + set_tp_port(skb, port, new_port, &uh->check); + + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } else { + *port = new_port; + skb->rxhash = 0; + } +} + +static int set_udp(struct sk_buff *skb, const struct ovs_key_udp *udp_port_key) { struct udphdr *uh; int err; @@ -210,16 +231,15 @@ static int set_udp_port(struct sk_buff *skb, uh = udp_hdr(skb); if (udp_port_key->udp_src != uh->source) - set_tp_port(skb, &uh->source, udp_port_key->udp_src, &uh->check); + set_udp_port(skb, &uh->source, udp_port_key->udp_src); if (udp_port_key->udp_dst != uh->dest) - set_tp_port(skb, &uh->dest, udp_port_key->udp_dst, &uh->check); + set_udp_port(skb, &uh->dest, udp_port_key->udp_dst); return 0; } -static int set_tcp_port(struct sk_buff *skb, - const struct ovs_key_tcp *tcp_port_key) +static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) { struct tcphdr *th; int err; @@ -328,11 +348,11 @@ static int execute_set_action(struct sk_buff *skb, break; case OVS_KEY_ATTR_TCP: - err = set_tcp_port(skb, nla_data(nested_attr)); + err = set_tcp(skb, nla_data(nested_attr)); break; case OVS_KEY_ATTR_UDP: - err = set_udp_port(skb, nla_data(nested_attr)); + err = set_udp(skb, nla_data(nested_attr)); break; } -- cgit v1.2.3-58-ga151 From d9e179ecec0805c41b17f9a0c3b925d415677772 Mon Sep 17 00:00:00 2001 From: Paulius Zaleckas Date: Tue, 6 Mar 2012 22:25:14 +0000 Subject: bridge: br_log_state() s/entering/entered/ When br_log_state() is reporting state it should say "entered" istead of "entering" since state at this point is already changed. Signed-off-by: Paulius Zaleckas Signed-off-by: David S. Miller --- net/bridge/br_stp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 6751ed4e0c07..8c836d96ba76 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -31,7 +31,7 @@ static const char *const br_port_state_names[] = { void br_log_state(const struct net_bridge_port *p) { - br_info(p->br, "port %u(%s) entering %s state\n", + br_info(p->br, "port %u(%s) entered %s state\n", (unsigned) p->port_no, p->dev->name, br_port_state_names[p->state]); } -- cgit v1.2.3-58-ga151 From 5200959b833ddacf28b6ffce8c331dfd6e0ca797 Mon Sep 17 00:00:00 2001 From: Paulius Zaleckas Date: Tue, 6 Mar 2012 22:25:22 +0000 Subject: bridge: fix state reporting when port is disabled Now we have: eth0: link *down* br0: port 1(eth0) entered *forwarding* state br_log_state(p) should be called *after* p->state is set to BR_STATE_DISABLED. Reported-by: Zilvinas Valinskas Signed-off-by: Paulius Zaleckas Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 19308e305d85..f494496373d6 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -98,14 +98,13 @@ void br_stp_disable_port(struct net_bridge_port *p) struct net_bridge *br = p->br; int wasroot; - br_log_state(p); - wasroot = br_is_root_bridge(br); br_become_designated_port(p); p->state = BR_STATE_DISABLED; p->topology_change_ack = 0; p->config_pending = 0; + br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->message_age_timer); -- cgit v1.2.3-58-ga151 From 5faa5df1fa2024bd750089ff21dcc4191798263d Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 6 Mar 2012 21:20:26 +0000 Subject: inetpeer: Invalidate the inetpeer tree along with the routing cache We initialize the routing metrics with the values cached on the inetpeer in rt_init_metrics(). So if we have the metrics cached on the inetpeer, we ignore the user configured fib_metrics. To fix this issue, we replace the old tree with a fresh initialized inet_peer_base. The old tree is removed later with a delayed work queue. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/inetpeer.h | 3 ++ net/ipv4/inetpeer.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++- net/ipv4/route.c | 1 + 3 files changed, 83 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 06b795dd5906..ff04a33acf00 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -41,6 +41,7 @@ struct inet_peer { u32 pmtu_orig; u32 pmtu_learned; struct inetpeer_addr_base redirect_learned; + struct list_head gc_list; /* * Once inet_peer is queued for deletion (refcnt == -1), following fields * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp @@ -96,6 +97,8 @@ static inline struct inet_peer *inet_getpeer_v6(const struct in6_addr *v6daddr, extern void inet_putpeer(struct inet_peer *p); extern bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout); +extern void inetpeer_invalidate_tree(int family); + /* * temporary check to make sure we dont access rid, ip_id_count, tcp_ts, * tcp_ts_stamp if no refcount is taken on inet_peer diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bf4a9c4808e1..deea2e96b7f2 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -66,6 +67,11 @@ static struct kmem_cache *peer_cachep __read_mostly; +static LIST_HEAD(gc_list); +static const int gc_delay = 60 * HZ; +static struct delayed_work gc_work; +static DEFINE_SPINLOCK(gc_lock); + #define node_height(x) x->avl_height #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) @@ -102,6 +108,50 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ +static void inetpeer_gc_worker(struct work_struct *work) +{ + struct inet_peer *p, *n; + LIST_HEAD(list); + + spin_lock_bh(&gc_lock); + list_replace_init(&gc_list, &list); + spin_unlock_bh(&gc_lock); + + if (list_empty(&list)) + return; + + list_for_each_entry_safe(p, n, &list, gc_list) { + + if(need_resched()) + cond_resched(); + + if (p->avl_left != peer_avl_empty) { + list_add_tail(&p->avl_left->gc_list, &list); + p->avl_left = peer_avl_empty; + } + + if (p->avl_right != peer_avl_empty) { + list_add_tail(&p->avl_right->gc_list, &list); + p->avl_right = peer_avl_empty; + } + + n = list_entry(p->gc_list.next, struct inet_peer, gc_list); + + if (!atomic_read(&p->refcnt)) { + list_del(&p->gc_list); + kmem_cache_free(peer_cachep, p); + } + } + + if (list_empty(&list)) + return; + + spin_lock_bh(&gc_lock); + list_splice(&list, &gc_list); + spin_unlock_bh(&gc_lock); + + schedule_delayed_work(&gc_work, gc_delay); +} /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) @@ -126,6 +176,7 @@ void __init inet_initpeers(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker); } static int addr_compare(const struct inetpeer_addr *a, @@ -449,7 +500,7 @@ relookup: p->pmtu_orig = 0; p->redirect_genid = 0; memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); - + INIT_LIST_HEAD(&p->gc_list); /* Link the node. */ link_to_pool(p, base); @@ -509,3 +560,30 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) return rc; } EXPORT_SYMBOL(inet_peer_xrlim_allow); + +void inetpeer_invalidate_tree(int family) +{ + struct inet_peer *old, *new, *prev; + struct inet_peer_base *base = family_to_base(family); + + write_seqlock_bh(&base->lock); + + old = base->root; + if (old == peer_avl_empty_rcu) + goto out; + + new = peer_avl_empty_rcu; + + prev = cmpxchg(&base->root, old, new); + if (prev == old) { + base->total = 0; + spin_lock(&gc_lock); + list_add_tail(&prev->gc_list, &gc_list); + spin_unlock(&gc_lock); + schedule_delayed_work(&gc_work, gc_delay); + } + +out: + write_sequnlock_bh(&base->lock); +} +EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bcacf54e5418..23ce0c1287ab 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -938,6 +938,7 @@ static void rt_cache_invalidate(struct net *net) get_random_bytes(&shuffle, sizeof(shuffle)); atomic_add(shuffle + 1U, &net->ipv4.rt_genid); redirect_genid++; + inetpeer_invalidate_tree(AF_INET); } /* -- cgit v1.2.3-58-ga151 From ac3f48de09d8f4b73397047e413fadff7f65cfa7 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Tue, 6 Mar 2012 21:21:10 +0000 Subject: route: Remove redirect_genid As we invalidate the inetpeer tree along with the routing cache now, we don't need a genid to reset the redirect handling when the routing cache is flushed. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/inetpeer.h | 1 - net/ipv4/inetpeer.c | 1 - net/ipv4/route.c | 11 ++--------- 3 files changed, 2 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index ff04a33acf00..b94765e38e80 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -35,7 +35,6 @@ struct inet_peer { u32 metrics[RTAX_MAX]; u32 rate_tokens; /* rate limiting for ICMP */ - int redirect_genid; unsigned long rate_last; unsigned long pmtu_expires; u32 pmtu_orig; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index deea2e96b7f2..d4d61b694fab 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -498,7 +498,6 @@ relookup: p->rate_last = 0; p->pmtu_expires = 0; p->pmtu_orig = 0; - p->redirect_genid = 0; memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); INIT_LIST_HEAD(&p->gc_list); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 23ce0c1287ab..019774796174 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -132,7 +132,6 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; -static int redirect_genid; static struct delayed_work expires_work; static unsigned long expires_ljiffies; @@ -937,7 +936,6 @@ static void rt_cache_invalidate(struct net *net) get_random_bytes(&shuffle, sizeof(shuffle)); atomic_add(shuffle + 1U, &net->ipv4.rt_genid); - redirect_genid++; inetpeer_invalidate_tree(AF_INET); } @@ -1486,10 +1484,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, peer = rt->peer; if (peer) { - if (peer->redirect_learned.a4 != new_gw || - peer->redirect_genid != redirect_genid) { + if (peer->redirect_learned.a4 != new_gw) { peer->redirect_learned.a4 = new_gw; - peer->redirect_genid = redirect_genid; atomic_inc(&__rt_peer_genid); } check_peer_redir(&rt->dst, peer); @@ -1794,8 +1790,6 @@ static void ipv4_validate_peer(struct rtable *rt) if (peer) { check_peer_pmtu(&rt->dst, peer); - if (peer->redirect_genid != redirect_genid) - peer->redirect_learned.a4 = 0; if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) check_peer_redir(&rt->dst, peer); @@ -1959,8 +1953,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, dst_init_metrics(&rt->dst, peer->metrics, false); check_peer_pmtu(&rt->dst, peer); - if (peer->redirect_genid != redirect_genid) - peer->redirect_learned.a4 = 0; + if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { rt->rt_gateway = peer->redirect_learned.a4; -- cgit v1.2.3-58-ga151