From d7f5762c5e532dfe8247ce1bc60d97af27ff8d00 Mon Sep 17 00:00:00 2001
From: Alexander Alemayhu <alexander@alemayhu.com>
Date: Wed, 4 Jan 2017 23:16:41 +0100
Subject: netfilter: nf_tables: fix spelling mistakes

o s/numerice/numeric
o s/opertaor/operator

Signed-off-by: Alexander Alemayhu <alexander@alemayhu.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_tables.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h
index 881d49e94569..e3f27e09eb2b 100644
--- a/include/uapi/linux/netfilter/nf_tables.h
+++ b/include/uapi/linux/netfilter/nf_tables.h
@@ -235,7 +235,7 @@ enum nft_rule_compat_flags {
 /**
  * enum nft_rule_compat_attributes - nf_tables rule compat attributes
  *
- * @NFTA_RULE_COMPAT_PROTO: numerice value of handled protocol (NLA_U32)
+ * @NFTA_RULE_COMPAT_PROTO: numeric value of handled protocol (NLA_U32)
  * @NFTA_RULE_COMPAT_FLAGS: bitmask of enum nft_rule_compat_flags (NLA_U32)
  */
 enum nft_rule_compat_attributes {
@@ -499,7 +499,7 @@ enum nft_bitwise_attributes {
  * enum nft_byteorder_ops - nf_tables byteorder operators
  *
  * @NFT_BYTEORDER_NTOH: network to host operator
- * @NFT_BYTEORDER_HTON: host to network opertaor
+ * @NFT_BYTEORDER_HTON: host to network operator
  */
 enum nft_byteorder_ops {
 	NFT_BYTEORDER_NTOH,
-- 
cgit v1.2.3-58-ga151


From 6443ebc3fdd6f3c766d9442c18be274b3d736050 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Sat, 7 Jan 2017 20:49:18 +0800
Subject: netfilter: rpfilter: fix incorrect loopback packet judgment

Currently, we check the existing rtable in PREROUTING hook, if RTCF_LOCAL
is set, we assume that the packet is loopback.

But this assumption is incorrect, for example, a packet encapsulated
in ipsec transport mode was received and routed to local, after
decapsulation, it would be delivered to local again, and the rtable
was not dropped, so RTCF_LOCAL check would trigger. But actually, the
packet was not loopback.

So for these normal loopback packets, we can check whether the in device
is IFF_LOOPBACK or not. For these locally generated broadcast/multicast,
we can check whether the skb->pkt_type is PACKET_LOOPBACK or not.

Finally, there's a subtle difference between nft fib expr and xtables
rpfilter extension, user can add the following nft rule to do strict
rpfilter check:
  # nft add rule x y meta iif eth0 fib saddr . iif oif != eth0 drop

So when the packet is loopback, it's better to store the in device
instead of the LOOPBACK_IFINDEX, otherwise, after adding the above
nft rule, locally generated broad/multicast packets will be dropped
incorrectly.

Fixes: f83a7ea2075c ("netfilter: xt_rpfilter: skip locally generated broadcast/multicast, too")
Fixes: f6d0cbcf09c5 ("netfilter: nf_tables: add fib expression")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nft_fib.h    |  6 ++++++
 net/ipv4/netfilter/ipt_rpfilter.c  |  8 ++++----
 net/ipv4/netfilter/nft_fib_ipv4.c  | 15 +++++----------
 net/ipv6/netfilter/ip6t_rpfilter.c |  8 ++++----
 net/ipv6/netfilter/nft_fib_ipv6.c  | 13 ++++---------
 5 files changed, 23 insertions(+), 27 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h
index cbedda077db2..5ceb2205e4e3 100644
--- a/include/net/netfilter/nft_fib.h
+++ b/include/net/netfilter/nft_fib.h
@@ -9,6 +9,12 @@ struct nft_fib {
 
 extern const struct nla_policy nft_fib_policy[];
 
+static inline bool
+nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in)
+{
+	return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
+}
+
 int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr);
 int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 		 const struct nlattr * const tb[]);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index f273098e48fd..37fb9552e858 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -63,10 +63,10 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4,
 	return dev_match || flags & XT_RPFILTER_LOOSE;
 }
 
-static bool rpfilter_is_local(const struct sk_buff *skb)
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
 {
-	const struct rtable *rt = skb_rtable(skb);
-	return rt && (rt->rt_flags & RTCF_LOCAL);
+	return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
 }
 
 static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -79,7 +79,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	info = par->matchinfo;
 	invert = info->flags & XT_RPFILTER_INVERT;
 
-	if (rpfilter_is_local(skb))
+	if (rpfilter_is_loopback(skb, xt_in(par)))
 		return true ^ invert;
 
 	iph = ip_hdr(skb);
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 965b1a161369..2981291910dd 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -26,13 +26,6 @@ static __be32 get_saddr(__be32 addr)
 	return addr;
 }
 
-static bool fib4_is_local(const struct sk_buff *skb)
-{
-	const struct rtable *rt = skb_rtable(skb);
-
-	return rt && (rt->rt_flags & RTCF_LOCAL);
-}
-
 #define DSCP_BITS     0xfc
 
 void nft_fib4_eval_type(const struct nft_expr *expr, struct nft_regs *regs,
@@ -95,8 +88,10 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
 	else
 		oif = NULL;
 
-	if (nft_hook(pkt) == NF_INET_PRE_ROUTING && fib4_is_local(pkt->skb)) {
-		nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX);
+	if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
+	    nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
+		nft_fib_store_result(dest, priv->result, pkt,
+				     nft_in(pkt)->ifindex);
 		return;
 	}
 
@@ -131,7 +126,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
 	switch (res.type) {
 	case RTN_UNICAST:
 		break;
-	case RTN_LOCAL:	/* should not appear here, see fib4_is_local() above */
+	case RTN_LOCAL: /* Should not see RTN_LOCAL here */
 		return;
 	default:
 		break;
diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c
index d5263dc364a9..b12e61b7b16c 100644
--- a/net/ipv6/netfilter/ip6t_rpfilter.c
+++ b/net/ipv6/netfilter/ip6t_rpfilter.c
@@ -72,10 +72,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb,
 	return ret;
 }
 
-static bool rpfilter_is_local(const struct sk_buff *skb)
+static bool
+rpfilter_is_loopback(const struct sk_buff *skb, const struct net_device *in)
 {
-	const struct rt6_info *rt = (const void *) skb_dst(skb);
-	return rt && (rt->rt6i_flags & RTF_LOCAL);
+	return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK;
 }
 
 static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
@@ -85,7 +85,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	struct ipv6hdr *iph;
 	bool invert = info->flags & XT_RPFILTER_INVERT;
 
-	if (rpfilter_is_local(skb))
+	if (rpfilter_is_loopback(skb, xt_in(par)))
 		return true ^ invert;
 
 	iph = ipv6_hdr(skb);
diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c
index c947aad8bcc6..765facf03d45 100644
--- a/net/ipv6/netfilter/nft_fib_ipv6.c
+++ b/net/ipv6/netfilter/nft_fib_ipv6.c
@@ -18,13 +18,6 @@
 #include <net/ip6_fib.h>
 #include <net/ip6_route.h>
 
-static bool fib6_is_local(const struct sk_buff *skb)
-{
-	const struct rt6_info *rt = (const void *)skb_dst(skb);
-
-	return rt && (rt->rt6i_flags & RTF_LOCAL);
-}
-
 static int get_ifindex(const struct net_device *dev)
 {
 	return dev ? dev->ifindex : 0;
@@ -164,8 +157,10 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs,
 
 	lookup_flags = nft_fib6_flowi_init(&fl6, priv, pkt, oif);
 
-	if (nft_hook(pkt) == NF_INET_PRE_ROUTING && fib6_is_local(pkt->skb)) {
-		nft_fib_store_result(dest, priv->result, pkt, LOOPBACK_IFINDEX);
+	if (nft_hook(pkt) == NF_INET_PRE_ROUTING &&
+	    nft_fib_is_loopback(pkt->skb, nft_in(pkt))) {
+		nft_fib_store_result(dest, priv->result, pkt,
+				     nft_in(pkt)->ifindex);
 		return;
 	}
 
-- 
cgit v1.2.3-58-ga151


From 501db511397fd6efff3aa5b4e8de415b55559550 Mon Sep 17 00:00:00 2001
From: Rolf Neugebauer <rolf.neugebauer@docker.com>
Date: Tue, 17 Jan 2017 18:13:51 +0000
Subject: virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on xmit

This patch part reverts fd2a0437dc33 and e858fae2b0b8 which introduced a
subtle change in how the virtio_net flags are derived from the SKBs
ip_summed field.

With the above commits, the flags are set to VIRTIO_NET_HDR_F_DATA_VALID
when ip_summed == CHECKSUM_UNNECESSARY, thus treating it differently to
ip_summed == CHECKSUM_NONE, which should be the same.

Further, the virtio spec 1.0 / CS04 explicitly says that
VIRTIO_NET_HDR_F_DATA_VALID must not be set by the driver.

Fixes: fd2a0437dc33 ("virtio_net: introduce virtio_net_hdr_{from,to}_skb")
Fixes: e858fae2b0b8 (" virtio_net: use common code for virtio_net_hdr and skb GSO conversion")
Signed-off-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/virtio_net.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 66204007d7ac..56436472ccc7 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -91,8 +91,6 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 				skb_checksum_start_offset(skb));
 		hdr->csum_offset = __cpu_to_virtio16(little_endian,
 				skb->csum_offset);
-	} else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
-		hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
 	} /* else everything is zero */
 
 	return 0;
-- 
cgit v1.2.3-58-ga151


From 9ed59592e3e379b2e9557dc1d9e9ec8fcbb33f16 Mon Sep 17 00:00:00 2001
From: David Ahern <dsa@cumulusnetworks.com>
Date: Tue, 17 Jan 2017 14:57:36 -0800
Subject: lwtunnel: fix autoload of lwt modules

Trying to add an mpls encap route when the MPLS modules are not loaded
hangs. For example:

    CONFIG_MPLS=y
    CONFIG_NET_MPLS_GSO=m
    CONFIG_MPLS_ROUTING=m
    CONFIG_MPLS_IPTUNNEL=m

    $ ip route add 10.10.10.10/32 encap mpls 100 via inet 10.100.1.2

The ip command hangs:
root       880   826  0 21:25 pts/0    00:00:00 ip route add 10.10.10.10/32 encap mpls 100 via inet 10.100.1.2

    $ cat /proc/880/stack
    [<ffffffff81065a9b>] call_usermodehelper_exec+0xd6/0x134
    [<ffffffff81065efc>] __request_module+0x27b/0x30a
    [<ffffffff814542f6>] lwtunnel_build_state+0xe4/0x178
    [<ffffffff814aa1e4>] fib_create_info+0x47f/0xdd4
    [<ffffffff814ae451>] fib_table_insert+0x90/0x41f
    [<ffffffff814a8010>] inet_rtm_newroute+0x4b/0x52
    ...

modprobe is trying to load rtnl-lwt-MPLS:

root       881     5  0 21:25 ?        00:00:00 /sbin/modprobe -q -- rtnl-lwt-MPLS

and it hangs after loading mpls_router:

    $ cat /proc/881/stack
    [<ffffffff81441537>] rtnl_lock+0x12/0x14
    [<ffffffff8142ca2a>] register_netdevice_notifier+0x16/0x179
    [<ffffffffa0033025>] mpls_init+0x25/0x1000 [mpls_router]
    [<ffffffff81000471>] do_one_initcall+0x8e/0x13f
    [<ffffffff81119961>] do_init_module+0x5a/0x1e5
    [<ffffffff810bd070>] load_module+0x13bd/0x17d6
    ...

The problem is that lwtunnel_build_state is called with rtnl lock
held preventing mpls_init from registering.

Given the potential references held by the time lwtunnel_build_state it
can not drop the rtnl lock to the load module. So, extract the module
loading code from lwtunnel_build_state into a new function to validate
the encap type. The new function is called while converting the user
request into a fib_config which is well before any table, device or
fib entries are examined.

Fixes: 745041e2aaf1 ("lwtunnel: autoload of lwt modules")
Signed-off-by: David Ahern <dsa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h  | 11 +++++++++
 net/core/lwtunnel.c     | 62 ++++++++++++++++++++++++++++++++++++++++++++-----
 net/ipv4/fib_frontend.c |  8 +++++++
 net/ipv6/route.c        | 12 +++++++++-
 4 files changed, 86 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index d4c1c75b8862..0b585f1fd340 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -105,6 +105,8 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
 			   unsigned int num);
 int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
 			   unsigned int num);
+int lwtunnel_valid_encap_type(u16 encap_type);
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len);
 int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 			 struct nlattr *encap,
 			 unsigned int family, const void *cfg,
@@ -168,6 +170,15 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
 	return -EOPNOTSUPP;
 }
 
+static inline int lwtunnel_valid_encap_type(u16 encap_type)
+{
+	return -EOPNOTSUPP;
+}
+static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len)
+{
+	return -EOPNOTSUPP;
+}
+
 static inline int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 				       struct nlattr *encap,
 				       unsigned int family, const void *cfg,
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index a5d4e866ce88..47b1dd65947b 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -26,6 +26,7 @@
 #include <net/lwtunnel.h>
 #include <net/rtnetlink.h>
 #include <net/ip6_fib.h>
+#include <net/nexthop.h>
 
 #ifdef CONFIG_MODULES
 
@@ -114,25 +115,74 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
 	ret = -EOPNOTSUPP;
 	rcu_read_lock();
 	ops = rcu_dereference(lwtun_encaps[encap_type]);
+	if (likely(ops && ops->build_state))
+		ret = ops->build_state(dev, encap, family, cfg, lws);
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_build_state);
+
+int lwtunnel_valid_encap_type(u16 encap_type)
+{
+	const struct lwtunnel_encap_ops *ops;
+	int ret = -EINVAL;
+
+	if (encap_type == LWTUNNEL_ENCAP_NONE ||
+	    encap_type > LWTUNNEL_ENCAP_MAX)
+		return ret;
+
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[encap_type]);
+	rcu_read_unlock();
 #ifdef CONFIG_MODULES
 	if (!ops) {
 		const char *encap_type_str = lwtunnel_encap_str(encap_type);
 
 		if (encap_type_str) {
-			rcu_read_unlock();
+			__rtnl_unlock();
 			request_module("rtnl-lwt-%s", encap_type_str);
+			rtnl_lock();
+
 			rcu_read_lock();
 			ops = rcu_dereference(lwtun_encaps[encap_type]);
+			rcu_read_unlock();
 		}
 	}
 #endif
-	if (likely(ops && ops->build_state))
-		ret = ops->build_state(dev, encap, family, cfg, lws);
-	rcu_read_unlock();
+	return ops ? 0 : -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(lwtunnel_valid_encap_type);
 
-	return ret;
+int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining)
+{
+	struct rtnexthop *rtnh = (struct rtnexthop *)attr;
+	struct nlattr *nla_entype;
+	struct nlattr *attrs;
+	struct nlattr *nla;
+	u16 encap_type;
+	int attrlen;
+
+	while (rtnh_ok(rtnh, remaining)) {
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {
+			attrs = rtnh_attrs(rtnh);
+			nla = nla_find(attrs, attrlen, RTA_ENCAP);
+			nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+
+			if (nla_entype) {
+				encap_type = nla_get_u16(nla_entype);
+
+				if (lwtunnel_valid_encap_type(encap_type) != 0)
+					return -EOPNOTSUPP;
+			}
+		}
+		rtnh = rtnh_next(rtnh, &remaining);
+	}
+
+	return 0;
 }
-EXPORT_SYMBOL(lwtunnel_build_state);
+EXPORT_SYMBOL(lwtunnel_valid_encap_type_attr);
 
 void lwtstate_free(struct lwtunnel_state *lws)
 {
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index eae0332b0e8c..7db2ad2e82d3 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -46,6 +46,7 @@
 #include <net/rtnetlink.h>
 #include <net/xfrm.h>
 #include <net/l3mdev.h>
+#include <net/lwtunnel.h>
 #include <trace/events/fib.h>
 
 #ifndef CONFIG_IP_MULTIPLE_TABLES
@@ -677,6 +678,10 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 			cfg->fc_mx_len = nla_len(attr);
 			break;
 		case RTA_MULTIPATH:
+			err = lwtunnel_valid_encap_type_attr(nla_data(attr),
+							     nla_len(attr));
+			if (err < 0)
+				goto errout;
 			cfg->fc_mp = nla_data(attr);
 			cfg->fc_mp_len = nla_len(attr);
 			break;
@@ -691,6 +696,9 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb,
 			break;
 		case RTA_ENCAP_TYPE:
 			cfg->fc_encap_type = nla_get_u16(attr);
+			err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
+			if (err < 0)
+				goto errout;
 			break;
 		}
 	}
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4f6b067c8753..7ea85370c11c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2896,6 +2896,11 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (tb[RTA_MULTIPATH]) {
 		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
 		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
+
+		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
+						     cfg->fc_mp_len);
+		if (err < 0)
+			goto errout;
 	}
 
 	if (tb[RTA_PREF]) {
@@ -2909,9 +2914,14 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
 	if (tb[RTA_ENCAP])
 		cfg->fc_encap = tb[RTA_ENCAP];
 
-	if (tb[RTA_ENCAP_TYPE])
+	if (tb[RTA_ENCAP_TYPE]) {
 		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
 
+		err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
+		if (err < 0)
+			goto errout;
+	}
+
 	if (tb[RTA_EXPIRES]) {
 		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
 
-- 
cgit v1.2.3-58-ga151


From d407bd25a204bd66b7346dde24bd3d37ef0e0b05 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 18 Jan 2017 15:14:17 +0100
Subject: bpf: don't trigger OOM killer under pressure with map alloc

This patch adds two helpers, bpf_map_area_alloc() and bpf_map_area_free(),
that are to be used for map allocations. Using kmalloc() for very large
allocations can cause excessive work within the page allocator, so i) fall
back earlier to vmalloc() when the attempt is considered costly anyway,
and even more importantly ii) don't trigger OOM killer with any of the
allocators.

Since this is based on a user space request, for example, when creating
maps with element pre-allocation, we really want such requests to fail
instead of killing other user space processes.

Also, don't spam the kernel log with warnings should any of the allocations
fail under pressure. Given that, we can make backend selection in
bpf_map_area_alloc() generic, and convert all maps over to use this API
for spots with potentially large allocation requests.

Note, replacing the one kmalloc_array() is fine as overflow checks happen
earlier in htab_map_alloc(), since it must also protect the multiplication
for vmalloc() should kmalloc_array() fail.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/bpf.h   |  2 ++
 kernel/bpf/arraymap.c | 18 +++++++-----------
 kernel/bpf/hashtab.c  | 22 +++++++++-------------
 kernel/bpf/stackmap.c | 20 ++++++++------------
 kernel/bpf/syscall.c  | 26 ++++++++++++++++++++++++++
 5 files changed, 52 insertions(+), 36 deletions(-)

(limited to 'include')

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 05cf951df3fe..3ed1f3b1d594 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -247,6 +247,8 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
 void bpf_map_put_with_uref(struct bpf_map *map);
 void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
+void *bpf_map_area_alloc(size_t size);
+void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 229a5d5df977..3d55d95dcf49 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,7 +11,6 @@
  */
 #include <linux/bpf.h>
 #include <linux/err.h>
-#include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/filter.h>
@@ -74,14 +73,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	if (array_size >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-ENOMEM);
 
-
 	/* allocate all map elements and zero-initialize them */
-	array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
-	if (!array) {
-		array = vzalloc(array_size);
-		if (!array)
-			return ERR_PTR(-ENOMEM);
-	}
+	array = bpf_map_area_alloc(array_size);
+	if (!array)
+		return ERR_PTR(-ENOMEM);
 
 	/* copy mandatory map attributes */
 	array->map.map_type = attr->map_type;
@@ -97,7 +92,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
 	if (array_size >= U32_MAX - PAGE_SIZE ||
 	    elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
-		kvfree(array);
+		bpf_map_area_free(array);
 		return ERR_PTR(-ENOMEM);
 	}
 out:
@@ -262,7 +257,7 @@ static void array_map_free(struct bpf_map *map)
 	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
 		bpf_array_free_percpu(array);
 
-	kvfree(array);
+	bpf_map_area_free(array);
 }
 
 static const struct bpf_map_ops array_ops = {
@@ -319,7 +314,8 @@ static void fd_array_map_free(struct bpf_map *map)
 	/* make sure it's empty */
 	for (i = 0; i < array->map.max_entries; i++)
 		BUG_ON(array->ptrs[i] != NULL);
-	kvfree(array);
+
+	bpf_map_area_free(array);
 }
 
 static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3f2bb58952d8..a753bbe7df0a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,7 +13,6 @@
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
-#include <linux/vmalloc.h>
 #include "percpu_freelist.h"
 #include "bpf_lru_list.h"
 
@@ -103,7 +102,7 @@ static void htab_free_elems(struct bpf_htab *htab)
 		free_percpu(pptr);
 	}
 free_elems:
-	vfree(htab->elems);
+	bpf_map_area_free(htab->elems);
 }
 
 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
@@ -125,7 +124,8 @@ static int prealloc_init(struct bpf_htab *htab)
 {
 	int err = -ENOMEM, i;
 
-	htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+	htab->elems = bpf_map_area_alloc(htab->elem_size *
+					 htab->map.max_entries);
 	if (!htab->elems)
 		return -ENOMEM;
 
@@ -320,14 +320,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		goto free_htab;
 
 	err = -ENOMEM;
-	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
-				      GFP_USER | __GFP_NOWARN);
-
-	if (!htab->buckets) {
-		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
-		if (!htab->buckets)
-			goto free_htab;
-	}
+	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
+					   sizeof(struct bucket));
+	if (!htab->buckets)
+		goto free_htab;
 
 	for (i = 0; i < htab->n_buckets; i++) {
 		INIT_HLIST_HEAD(&htab->buckets[i].head);
@@ -354,7 +350,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 free_extra_elems:
 	free_percpu(htab->extra_elems);
 free_buckets:
-	kvfree(htab->buckets);
+	bpf_map_area_free(htab->buckets);
 free_htab:
 	kfree(htab);
 	return ERR_PTR(err);
@@ -1014,7 +1010,7 @@ static void htab_map_free(struct bpf_map *map)
 		prealloc_destroy(htab);
 
 	free_percpu(htab->extra_elems);
-	kvfree(htab->buckets);
+	bpf_map_area_free(htab->buckets);
 	kfree(htab);
 }
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 732ae16d12b7..be8519148c25 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,7 +7,6 @@
 #include <linux/bpf.h>
 #include <linux/jhash.h>
 #include <linux/filter.h>
-#include <linux/vmalloc.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
 #include "percpu_freelist.h"
@@ -32,7 +31,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
 	int err;
 
-	smap->elems = vzalloc(elem_size * smap->map.max_entries);
+	smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
 	if (!smap->elems)
 		return -ENOMEM;
 
@@ -45,7 +44,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 	return 0;
 
 free_elems:
-	vfree(smap->elems);
+	bpf_map_area_free(smap->elems);
 	return err;
 }
 
@@ -76,12 +75,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	if (cost >= U32_MAX - PAGE_SIZE)
 		return ERR_PTR(-E2BIG);
 
-	smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
-	if (!smap) {
-		smap = vzalloc(cost);
-		if (!smap)
-			return ERR_PTR(-ENOMEM);
-	}
+	smap = bpf_map_area_alloc(cost);
+	if (!smap)
+		return ERR_PTR(-ENOMEM);
 
 	err = -E2BIG;
 	cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
@@ -112,7 +108,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 put_buffers:
 	put_callchain_buffers();
 free_smap:
-	kvfree(smap);
+	bpf_map_area_free(smap);
 	return ERR_PTR(err);
 }
 
@@ -262,9 +258,9 @@ static void stack_map_free(struct bpf_map *map)
 	/* wait for bpf programs to complete before freeing stack map */
 	synchronize_rcu();
 
-	vfree(smap->elems);
+	bpf_map_area_free(smap->elems);
 	pcpu_freelist_destroy(&smap->freelist);
-	kvfree(smap);
+	bpf_map_area_free(smap);
 	put_callchain_buffers();
 }
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1d6b29e4e2c3..19b6129eab23 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -12,6 +12,8 @@
 #include <linux/bpf.h>
 #include <linux/syscalls.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmzone.h>
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
 #include <linux/license.h>
@@ -49,6 +51,30 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
 	list_add(&tl->list_node, &bpf_map_types);
 }
 
+void *bpf_map_area_alloc(size_t size)
+{
+	/* We definitely need __GFP_NORETRY, so OOM killer doesn't
+	 * trigger under memory pressure as we really just want to
+	 * fail instead.
+	 */
+	const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+	void *area;
+
+	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+		area = kmalloc(size, GFP_USER | flags);
+		if (area != NULL)
+			return area;
+	}
+
+	return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags,
+			 PAGE_KERNEL);
+}
+
+void bpf_map_area_free(void *area)
+{
+	kvfree(area);
+}
+
 int bpf_map_precharge_memlock(u32 pages)
 {
 	struct user_struct *user = get_current_user();
-- 
cgit v1.2.3-58-ga151


From 6391a4481ba0796805d6581e42f9f0418c099e34 Mon Sep 17 00:00:00 2001
From: Jason Wang <jasowang@redhat.com>
Date: Fri, 20 Jan 2017 14:32:42 +0800
Subject: virtio-net: restore VIRTIO_HDR_F_DATA_VALID on receiving

Commit 501db511397f ("virtio: don't set VIRTIO_NET_HDR_F_DATA_VALID on
xmit") in fact disables VIRTIO_HDR_F_DATA_VALID on receiving path too,
fixing this by adding a hint (has_data_valid) and set it only on the
receiving path.

Cc: Rolf Neugebauer <rolf.neugebauer@docker.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Rolf Neugebauer <rolf.neugebauer@docker.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/macvtap.c      | 2 +-
 drivers/net/tun.c          | 2 +-
 drivers/net/virtio_net.c   | 2 +-
 include/linux/virtio_net.h | 6 +++++-
 net/packet/af_packet.c     | 4 ++--
 5 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 5c26653eceb5..402618565838 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -825,7 +825,7 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 			return -EINVAL;
 
 		if (virtio_net_hdr_from_skb(skb, &vnet_hdr,
-					    macvtap_is_little_endian(q)))
+					    macvtap_is_little_endian(q), true))
 			BUG();
 
 		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index cd8e02c94be0..2cd10b26b650 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1360,7 +1360,7 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			return -EINVAL;
 
 		if (virtio_net_hdr_from_skb(skb, &gso,
-					    tun_is_little_endian(tun))) {
+					    tun_is_little_endian(tun), true)) {
 			struct skb_shared_info *sinfo = skb_shinfo(skb);
 			pr_err("unexpected GSO type: "
 			       "0x%x, gso_size %d, hdr_len %d\n",
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4a105006ca63..347424351ade 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1104,7 +1104,7 @@ static int xmit_skb(struct send_queue *sq, struct sk_buff *skb)
 		hdr = skb_vnet_hdr(skb);
 
 	if (virtio_net_hdr_from_skb(skb, &hdr->hdr,
-				    virtio_is_little_endian(vi->vdev)))
+				    virtio_is_little_endian(vi->vdev), false))
 		BUG();
 
 	if (vi->mergeable_rx_bufs)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h
index 56436472ccc7..5209b5ed2a64 100644
--- a/include/linux/virtio_net.h
+++ b/include/linux/virtio_net.h
@@ -56,7 +56,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb,
 
 static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 					  struct virtio_net_hdr *hdr,
-					  bool little_endian)
+					  bool little_endian,
+					  bool has_data_valid)
 {
 	memset(hdr, 0, sizeof(*hdr));   /* no info leak */
 
@@ -91,6 +92,9 @@ static inline int virtio_net_hdr_from_skb(const struct sk_buff *skb,
 				skb_checksum_start_offset(skb));
 		hdr->csum_offset = __cpu_to_virtio16(little_endian,
 				skb->csum_offset);
+	} else if (has_data_valid &&
+		   skb->ip_summed == CHECKSUM_UNNECESSARY) {
+		hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
 	} /* else everything is zero */
 
 	return 0;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index b9e1a13b4ba3..3d555c79a7b5 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1976,7 +1976,7 @@ static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
 		return -EINVAL;
 	*len -= sizeof(vnet_hdr);
 
-	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le()))
+	if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true))
 		return -EINVAL;
 
 	return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
@@ -2237,7 +2237,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (po->has_vnet_hdr) {
 		if (virtio_net_hdr_from_skb(skb, h.raw + macoff -
 					    sizeof(struct virtio_net_hdr),
-					    vio_le())) {
+					    vio_le(), true)) {
 			spin_lock(&sk->sk_receive_queue.lock);
 			goto drop_n_account;
 		}
-- 
cgit v1.2.3-58-ga151


From 5ce6b04ce96896e8a79e6f60740ced911eaac7a4 Mon Sep 17 00:00:00 2001
From: Liping Zhang <zlpnobody@gmail.com>
Date: Sun, 22 Jan 2017 22:10:32 +0800
Subject: netfilter: nft_log: restrict the log prefix length to 127

First, log prefix will be truncated to NF_LOG_PREFIXLEN-1, i.e. 127,
at nf_log_packet(), so the extra part is useless.

Second, after adding a log rule with a very very long prefix, we will
fail to dump the nft rules after this _special_ one, but acctually,
they do exist. For example:
  # name_65000=$(printf "%0.sQ" {1..65000})
  # nft add rule filter output log prefix "$name_65000"
  # nft add rule filter output counter
  # nft add rule filter output counter
  # nft list chain filter output
  table ip filter {
      chain output {
          type filter hook output priority 0; policy accept;
      }
  }

So now, restrict the log prefix length to NF_LOG_PREFIXLEN-1.

Fixes: 96518518cc41 ("netfilter: add nftables")
Signed-off-by: Liping Zhang <zlpnobody@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/uapi/linux/netfilter/nf_log.h | 2 ++
 net/netfilter/nf_log.c                | 1 -
 net/netfilter/nft_log.c               | 3 ++-
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/uapi/linux/netfilter/nf_log.h b/include/uapi/linux/netfilter/nf_log.h
index 8be21e02387d..d0b5fa91ff54 100644
--- a/include/uapi/linux/netfilter/nf_log.h
+++ b/include/uapi/linux/netfilter/nf_log.h
@@ -9,4 +9,6 @@
 #define NF_LOG_MACDECODE	0x20	/* Decode MAC header */
 #define NF_LOG_MASK		0x2f
 
+#define NF_LOG_PREFIXLEN	128
+
 #endif /* _NETFILTER_NF_LOG_H */
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 3dca90dc24ad..ffb9e8ada899 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -13,7 +13,6 @@
 /* Internal logging interface, which relies on the real
    LOG target modules */
 
-#define NF_LOG_PREFIXLEN		128
 #define NFLOGGER_NAME_LEN		64
 
 static struct nf_logger __rcu *loggers[NFPROTO_NUMPROTO][NF_LOG_TYPE_MAX] __read_mostly;
diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c
index 6271e40a3dd6..6f6e64423643 100644
--- a/net/netfilter/nft_log.c
+++ b/net/netfilter/nft_log.c
@@ -39,7 +39,8 @@ static void nft_log_eval(const struct nft_expr *expr,
 
 static const struct nla_policy nft_log_policy[NFTA_LOG_MAX + 1] = {
 	[NFTA_LOG_GROUP]	= { .type = NLA_U16 },
-	[NFTA_LOG_PREFIX]	= { .type = NLA_STRING },
+	[NFTA_LOG_PREFIX]	= { .type = NLA_STRING,
+				    .len = NF_LOG_PREFIXLEN - 1 },
 	[NFTA_LOG_SNAPLEN]	= { .type = NLA_U32 },
 	[NFTA_LOG_QTHRESHOLD]	= { .type = NLA_U16 },
 	[NFTA_LOG_LEVEL]	= { .type = NLA_U32 },
-- 
cgit v1.2.3-58-ga151


From de70185de0333783154863278ac87bfbbc54e384 Mon Sep 17 00:00:00 2001
From: Pablo Neira Ayuso <pablo@netfilter.org>
Date: Tue, 24 Jan 2017 00:51:41 +0100
Subject: netfilter: nf_tables: deconstify walk callback function

The flush operation needs to modify set and element objects, so let's
deconstify this.

Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/net/netfilter/nf_tables.h |  6 +++---
 net/netfilter/nf_tables_api.c     | 24 ++++++++++++------------
 net/netfilter/nft_set_hash.c      |  2 +-
 net/netfilter/nft_set_rbtree.c    |  2 +-
 4 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'include')

diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h
index 924325c46aab..7dfdb517f0be 100644
--- a/include/net/netfilter/nf_tables.h
+++ b/include/net/netfilter/nf_tables.h
@@ -207,9 +207,9 @@ struct nft_set_iter {
 	unsigned int	skip;
 	int		err;
 	int		(*fn)(const struct nft_ctx *ctx,
-			      const struct nft_set *set,
+			      struct nft_set *set,
 			      const struct nft_set_iter *iter,
-			      const struct nft_set_elem *elem);
+			      struct nft_set_elem *elem);
 };
 
 /**
@@ -301,7 +301,7 @@ struct nft_set_ops {
 	void				(*remove)(const struct nft_set *set,
 						  const struct nft_set_elem *elem);
 	void				(*walk)(const struct nft_ctx *ctx,
-						const struct nft_set *set,
+						struct nft_set *set,
 						struct nft_set_iter *iter);
 
 	unsigned int			(*privsize)(const struct nlattr * const nla[]);
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 831a9a16f563..5bd0068320fb 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -3087,9 +3087,9 @@ static int nf_tables_delset(struct net *net, struct sock *nlsk,
 }
 
 static int nf_tables_bind_check_setelem(const struct nft_ctx *ctx,
-					const struct nft_set *set,
+					struct nft_set *set,
 					const struct nft_set_iter *iter,
-					const struct nft_set_elem *elem)
+					struct nft_set_elem *elem)
 {
 	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
 	enum nft_registers dreg;
@@ -3308,9 +3308,9 @@ struct nft_set_dump_args {
 };
 
 static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
-				  const struct nft_set *set,
+				  struct nft_set *set,
 				  const struct nft_set_iter *iter,
-				  const struct nft_set_elem *elem)
+				  struct nft_set_elem *elem)
 {
 	struct nft_set_dump_args *args;
 
@@ -3322,7 +3322,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	struct net *net = sock_net(skb->sk);
 	u8 genmask = nft_genmask_cur(net);
-	const struct nft_set *set;
+	struct nft_set *set;
 	struct nft_set_dump_args args;
 	struct nft_ctx ctx;
 	struct nlattr *nla[NFTA_SET_ELEM_LIST_MAX + 1];
@@ -3890,9 +3890,9 @@ err1:
 }
 
 static int nft_flush_set(const struct nft_ctx *ctx,
-			 const struct nft_set *set,
+			 struct nft_set *set,
 			 const struct nft_set_iter *iter,
-			 const struct nft_set_elem *elem)
+			 struct nft_set_elem *elem)
 {
 	struct nft_trans *trans;
 	int err;
@@ -3907,8 +3907,8 @@ static int nft_flush_set(const struct nft_ctx *ctx,
 		goto err1;
 	}
 
-	nft_trans_elem_set(trans) = (struct nft_set *)set;
-	nft_trans_elem(trans) = *((struct nft_set_elem *)elem);
+	nft_trans_elem_set(trans) = set;
+	nft_trans_elem(trans) = *elem;
 	list_add_tail(&trans->list, &ctx->net->nft.commit_list);
 
 	return 0;
@@ -5019,9 +5019,9 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 				 const struct nft_chain *chain);
 
 static int nf_tables_loop_check_setelem(const struct nft_ctx *ctx,
-					const struct nft_set *set,
+					struct nft_set *set,
 					const struct nft_set_iter *iter,
-					const struct nft_set_elem *elem)
+					struct nft_set_elem *elem)
 {
 	const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv);
 	const struct nft_data *data;
@@ -5045,7 +5045,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx,
 {
 	const struct nft_rule *rule;
 	const struct nft_expr *expr, *last;
-	const struct nft_set *set;
+	struct nft_set *set;
 	struct nft_set_binding *binding;
 	struct nft_set_iter iter;
 
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 1e20e2bbb6d9..e36069fb76ae 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -212,7 +212,7 @@ static void nft_hash_remove(const struct nft_set *set,
 	rhashtable_remove_fast(&priv->ht, &he->node, nft_hash_params);
 }
 
-static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
+static void nft_hash_walk(const struct nft_ctx *ctx, struct nft_set *set,
 			  struct nft_set_iter *iter)
 {
 	struct nft_hash *priv = nft_set_priv(set);
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index 08376e50f6cd..f06f55ee516d 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -221,7 +221,7 @@ static void *nft_rbtree_deactivate(const struct net *net,
 }
 
 static void nft_rbtree_walk(const struct nft_ctx *ctx,
-			    const struct nft_set *set,
+			    struct nft_set *set,
 			    struct nft_set_iter *iter)
 {
 	const struct nft_rbtree *priv = nft_set_priv(set);
-- 
cgit v1.2.3-58-ga151


From 88ff7334f25909802140e690c0e16433e485b0a0 Mon Sep 17 00:00:00 2001
From: Robert Shearman <rshearma@brocade.com>
Date: Tue, 24 Jan 2017 16:26:47 +0000
Subject: net: Specify the owning module for lwtunnel ops

Modules implementing lwtunnel ops should not be allowed to unload
while there is state alive using those ops, so specify the owning
module for all lwtunnel ops.

Signed-off-by: Robert Shearman <rshearma@brocade.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/lwtunnel.h    | 2 ++
 net/core/lwt_bpf.c        | 1 +
 net/ipv4/ip_tunnel_core.c | 2 ++
 net/ipv6/ila/ila_lwt.c    | 1 +
 net/ipv6/seg6_iptunnel.c  | 1 +
 net/mpls/mpls_iptunnel.c  | 1 +
 6 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index 0b585f1fd340..73dd87647460 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h
@@ -44,6 +44,8 @@ struct lwtunnel_encap_ops {
 	int (*get_encap_size)(struct lwtunnel_state *lwtstate);
 	int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
 	int (*xmit)(struct sk_buff *skb);
+
+	struct module *owner;
 };
 
 #ifdef CONFIG_LWTUNNEL
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index 71bb3e2eca08..b3eef90b2df9 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -386,6 +386,7 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
 	.fill_encap	= bpf_fill_encap_info,
 	.get_encap_size = bpf_encap_nlsize,
 	.cmp_encap	= bpf_encap_cmp,
+	.owner		= THIS_MODULE,
 };
 
 static int __init bpf_lwt_init(void)
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index fed3d29f9eb3..0fd1976ab63b 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -313,6 +313,7 @@ static const struct lwtunnel_encap_ops ip_tun_lwt_ops = {
 	.fill_encap = ip_tun_fill_encap_info,
 	.get_encap_size = ip_tun_encap_nlsize,
 	.cmp_encap = ip_tun_cmp_encap,
+	.owner = THIS_MODULE,
 };
 
 static const struct nla_policy ip6_tun_policy[LWTUNNEL_IP6_MAX + 1] = {
@@ -403,6 +404,7 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
 	.fill_encap = ip6_tun_fill_encap_info,
 	.get_encap_size = ip6_tun_encap_nlsize,
 	.cmp_encap = ip_tun_cmp_encap,
+	.owner = THIS_MODULE,
 };
 
 void __init ip_tunnel_core_init(void)
diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c
index a7bc54ab46e2..13b5e85fe0d5 100644
--- a/net/ipv6/ila/ila_lwt.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -238,6 +238,7 @@ static const struct lwtunnel_encap_ops ila_encap_ops = {
 	.fill_encap = ila_fill_encap_info,
 	.get_encap_size = ila_encap_nlsize,
 	.cmp_encap = ila_encap_cmp,
+	.owner = THIS_MODULE,
 };
 
 int ila_lwt_init(void)
diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c
index 1d60cb132835..c46f8cbf5ab5 100644
--- a/net/ipv6/seg6_iptunnel.c
+++ b/net/ipv6/seg6_iptunnel.c
@@ -422,6 +422,7 @@ static const struct lwtunnel_encap_ops seg6_iptun_ops = {
 	.fill_encap = seg6_fill_encap_info,
 	.get_encap_size = seg6_encap_nlsize,
 	.cmp_encap = seg6_encap_cmp,
+	.owner = THIS_MODULE,
 };
 
 int __init seg6_iptunnel_init(void)
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 2f7ccd934416..1d281c1ff7c1 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -215,6 +215,7 @@ static const struct lwtunnel_encap_ops mpls_iptun_ops = {
 	.fill_encap = mpls_fill_encap_info,
 	.get_encap_size = mpls_encap_nlsize,
 	.cmp_encap = mpls_encap_cmp,
+	.owner = THIS_MODULE,
 };
 
 static int __init mpls_iptunnel_init(void)
-- 
cgit v1.2.3-58-ga151


From d6f8cfa3dea294eabf8f302e90176dd6381fb66e Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 25 Jan 2017 11:39:49 +0100
Subject: net: phy: leds: Break dependency of phy.h on phy_led_triggers.h

<linux/phy.h> includes <linux/phy_led_triggers.h>, which is not really
needed.  Drop the include from <linux/phy.h>, and add it to all users
that didn't include it explicitly.

Suggested-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c              | 1 +
 drivers/net/phy/phy_led_triggers.c | 1 +
 include/linux/phy.h                | 1 -
 3 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index e687a9cb4a37..7cc1b7dcfe05 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -29,6 +29,7 @@
 #include <linux/mii.h>
 #include <linux/ethtool.h>
 #include <linux/phy.h>
+#include <linux/phy_led_triggers.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/mdio.h>
diff --git a/drivers/net/phy/phy_led_triggers.c b/drivers/net/phy/phy_led_triggers.c
index 3f619e7371e9..94ca42e630bb 100644
--- a/drivers/net/phy/phy_led_triggers.c
+++ b/drivers/net/phy/phy_led_triggers.c
@@ -12,6 +12,7 @@
  */
 #include <linux/leds.h>
 #include <linux/phy.h>
+#include <linux/phy_led_triggers.h>
 #include <linux/netdevice.h>
 
 static struct phy_led_trigger *phy_speed_to_led_trigger(struct phy_device *phy,
diff --git a/include/linux/phy.h b/include/linux/phy.h
index f7d95f644eed..7fc1105605bf 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -25,7 +25,6 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/mod_devicetable.h>
-#include <linux/phy_led_triggers.h>
 
 #include <linux/atomic.h>
 
-- 
cgit v1.2.3-58-ga151


From 3c880eb0205222bb062970085ebedc73ec8dfd14 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert+renesas@glider.be>
Date: Wed, 25 Jan 2017 11:39:50 +0100
Subject: net: phy: leds: Fix truncated LED trigger names

Commit 4567d686f5c6d955 ("phy: increase size of MII_BUS_ID_SIZE and
bus_id") increased the size of MII bus IDs, but forgot to update the
private definition in <linux/phy_led_triggers.h>.
This may cause:
  1. Truncation of LED trigger names,
  2. Duplicate LED trigger names,
  3. Failures registering LED triggers,
  4. Crashes due to bad error handling in the LED trigger failure path.

To fix this, and prevent the definitions going out of sync again in the
future, let the PHY LED trigger code use the existing MII_BUS_ID_SIZE
definition.

Example:
  - Before I had triggers "ee700000.etherne:01:100Mbps" and
    "ee700000.etherne:01:10Mbps",
  - After the increase of MII_BUS_ID_SIZE, both became
    "ee700000.ethernet-ffffffff:01:" => FAIL,
  - Now, the triggers are "ee700000.ethernet-ffffffff:01:100Mbps" and
    "ee700000.ethernet-ffffffff:01:10Mbps", which are unique again.

Fixes: 4567d686f5c6d955 ("phy: increase size of MII_BUS_ID_SIZE and bus_id")
Fixes: 2e0bc452f4721520 ("net: phy: leds: add support for led triggers on phy link state change")
Signed-off-by: Geert Uytterhoeven <geert+renesas@glider.be>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/phy_led_triggers.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/phy_led_triggers.h b/include/linux/phy_led_triggers.h
index a2daea0a37d2..b37b05bfd1a6 100644
--- a/include/linux/phy_led_triggers.h
+++ b/include/linux/phy_led_triggers.h
@@ -18,11 +18,11 @@ struct phy_device;
 #ifdef CONFIG_LED_TRIGGER_PHY
 
 #include <linux/leds.h>
+#include <linux/phy.h>
 
 #define PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE	10
-#define PHY_MII_BUS_ID_SIZE	(20 - 3)
 
-#define PHY_LINK_LED_TRIGGER_NAME_SIZE (PHY_MII_BUS_ID_SIZE + \
+#define PHY_LINK_LED_TRIGGER_NAME_SIZE (MII_BUS_ID_SIZE + \
 				       FIELD_SIZEOF(struct mdio_device, addr)+\
 				       PHY_LED_TRIGGER_SPEED_SUFFIX_SIZE)
 
-- 
cgit v1.2.3-58-ga151


From 92e55f412cffd016cc245a74278cb4d7b89bb3bc Mon Sep 17 00:00:00 2001
From: Pablo Neira <pablo@netfilter.org>
Date: Thu, 26 Jan 2017 22:56:21 +0100
Subject: tcp: don't annotate mark on control socket from
 tcp_v6_send_response()

Unlike ipv4, this control socket is shared by all cpus so we cannot use
it as scratchpad area to annotate the mark that we pass to ip6_xmit().

Add a new parameter to ip6_xmit() to indicate the mark. The SCTP socket
family caches the flowi6 structure in the sctp_transport structure, so
we cannot use to carry the mark unless we later on reset it back, which
I discarded since it looks ugly to me.

Fixes: bf99b4ded5f8 ("tcp: fix mark propagation with fwmark_reflect enabled")
Suggested-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/ipv6.h               | 2 +-
 net/dccp/ipv6.c                  | 4 ++--
 net/ipv6/inet6_connection_sock.c | 2 +-
 net/ipv6/ip6_output.c            | 4 ++--
 net/ipv6/tcp_ipv6.c              | 5 ++---
 net/sctp/ipv6.c                  | 3 ++-
 6 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 487e57391664..7afe991e900e 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -871,7 +871,7 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
  *	upper-layer output functions
  */
 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
-	     struct ipv6_txoptions *opt, int tclass);
+	     __u32 mark, struct ipv6_txoptions *opt, int tclass);
 
 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr);
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index adfc790f7193..c4e879c02186 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -227,7 +227,7 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
 		opt = ireq->ipv6_opt;
 		if (!opt)
 			opt = rcu_dereference(np->opt);
-		err = ip6_xmit(sk, skb, &fl6, opt, np->tclass);
+		err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
@@ -281,7 +281,7 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
 	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(skb, dst);
-		ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
+		ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0);
 		DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
 		DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
 		return;
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 7396e75e161b..75c308239243 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -176,7 +176,7 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
 	/* Restore final destination back after routing done */
 	fl6.daddr = sk->sk_v6_daddr;
 
-	res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
+	res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
 		       np->tclass);
 	rcu_read_unlock();
 	return res;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 38122d04fadc..2c0df09e9036 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -172,7 +172,7 @@ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
  * which are using proper atomic operations or spinlocks.
  */
 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
-	     struct ipv6_txoptions *opt, int tclass)
+	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
 {
 	struct net *net = sock_net(sk);
 	const struct ipv6_pinfo *np = inet6_sk(sk);
@@ -240,7 +240,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 
 	skb->protocol = htons(ETH_P_IPV6);
 	skb->priority = sk->sk_priority;
-	skb->mark = sk->sk_mark;
+	skb->mark = mark;
 
 	mtu = dst_mtu(dst);
 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2b20622a5824..cb8929681dc7 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -469,7 +469,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
 		opt = ireq->ipv6_opt;
 		if (!opt)
 			opt = rcu_dereference(np->opt);
-		err = ip6_xmit(sk, skb, fl6, opt, np->tclass);
+		err = ip6_xmit(sk, skb, fl6, sk->sk_mark, opt, np->tclass);
 		rcu_read_unlock();
 		err = net_xmit_eval(err);
 	}
@@ -840,8 +840,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
 	dst = ip6_dst_lookup_flow(ctl_sk, &fl6, NULL);
 	if (!IS_ERR(dst)) {
 		skb_dst_set(buff, dst);
-		ctl_sk->sk_mark = fl6.flowi6_mark;
-		ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
+		ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass);
 		TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
 		if (rst)
 			TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 5ed8e79bf102..64dfd35ccdcc 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -222,7 +222,8 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
 	SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
 
 	rcu_read_lock();
-	res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass);
+	res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
+		       np->tclass);
 	rcu_read_unlock();
 	return res;
 }
-- 
cgit v1.2.3-58-ga151


From 9d162ed69f51cbd9ee5a0c7e82aba7acc96362ff Mon Sep 17 00:00:00 2001
From: Sean Nyekjaer <sean.nyekjaer@prevas.dk>
Date: Fri, 27 Jan 2017 08:46:23 +0100
Subject: net: phy: micrel: add support for KSZ8795

This is adds support for the PHYs in the KSZ8795 5port managed switch.

It will allow to detect the link between the switch and the soc
and uses the same read_status functions as the KSZ8873MLL switch.

Signed-off-by: Sean Nyekjaer <sean.nyekjaer@prevas.dk>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/micrel.c   | 14 ++++++++++++++
 include/linux/micrel_phy.h |  2 ++
 2 files changed, 16 insertions(+)

(limited to 'include')

diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 9a77289109b7..e55809c5beb7 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -1008,6 +1008,20 @@ static struct phy_driver ksphy_driver[] = {
 	.get_stats	= kszphy_get_stats,
 	.suspend	= genphy_suspend,
 	.resume		= genphy_resume,
+}, {
+	.phy_id		= PHY_ID_KSZ8795,
+	.phy_id_mask	= MICREL_PHY_ID_MASK,
+	.name		= "Micrel KSZ8795",
+	.features	= (SUPPORTED_Pause | SUPPORTED_Asym_Pause),
+	.flags		= PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT,
+	.config_init	= kszphy_config_init,
+	.config_aneg	= ksz8873mll_config_aneg,
+	.read_status	= ksz8873mll_read_status,
+	.get_sset_count = kszphy_get_sset_count,
+	.get_strings	= kszphy_get_strings,
+	.get_stats	= kszphy_get_stats,
+	.suspend	= genphy_suspend,
+	.resume		= genphy_resume,
 } };
 
 module_phy_driver(ksphy_driver);
diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h
index 257173e0095e..f541da68d1e7 100644
--- a/include/linux/micrel_phy.h
+++ b/include/linux/micrel_phy.h
@@ -35,6 +35,8 @@
 #define PHY_ID_KSZ886X		0x00221430
 #define PHY_ID_KSZ8863		0x00221435
 
+#define PHY_ID_KSZ8795		0x00221550
+
 /* struct phy_device dev_flags definitions */
 #define MICREL_PHY_50MHZ_CLK	0x00000001
 #define MICREL_PHY_FXEN		0x00000002
-- 
cgit v1.2.3-58-ga151